diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6398 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 2000, + "global_step": 7916, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0012632642748863063, + "grad_norm": 0.3052225708961487, + "learning_rate": 1.4705882352941177e-06, + "loss": 1.3368, + "num_input_tokens_seen": 1013408, + "step": 10 + }, + { + "epoch": 0.0025265285497726125, + "grad_norm": 0.2797602713108063, + "learning_rate": 2.9411764705882355e-06, + "loss": 1.3885, + "num_input_tokens_seen": 1928736, + "step": 20 + }, + { + "epoch": 0.0037897928246589186, + "grad_norm": 0.2704920470714569, + "learning_rate": 4.4117647058823526e-06, + "loss": 1.4081, + "num_input_tokens_seen": 2911616, + "step": 30 + }, + { + "epoch": 0.005053057099545225, + "grad_norm": 0.26865851879119873, + "learning_rate": 5.882352941176471e-06, + "loss": 1.4143, + "num_input_tokens_seen": 3775872, + "step": 40 + }, + { + "epoch": 0.006316321374431531, + "grad_norm": 0.29596275091171265, + "learning_rate": 7.352941176470588e-06, + "loss": 1.4061, + "num_input_tokens_seen": 4755264, + "step": 50 + }, + { + "epoch": 0.007579585649317837, + "grad_norm": 0.26764747500419617, + "learning_rate": 8.823529411764705e-06, + "loss": 1.3282, + "num_input_tokens_seen": 5670016, + "step": 60 + }, + { + "epoch": 0.008842849924204144, + "grad_norm": 0.2855396866798401, + "learning_rate": 1.0294117647058823e-05, + "loss": 1.3441, + "num_input_tokens_seen": 6541472, + "step": 70 + }, + { + "epoch": 0.01010611419909045, + "grad_norm": 0.27064523100852966, + "learning_rate": 1.1764705882352942e-05, + "loss": 1.3611, + "num_input_tokens_seen": 7502848, + "step": 80 + }, + { + "epoch": 0.011369378473976757, + "grad_norm": 0.26372382044792175, + "learning_rate": 1.3235294117647058e-05, + "loss": 1.3355, + "num_input_tokens_seen": 8450880, + "step": 90 + }, + { + "epoch": 0.012632642748863061, + "grad_norm": 0.2819940149784088, + "learning_rate": 1.4705882352941175e-05, + "loss": 1.4074, + "num_input_tokens_seen": 9363776, + "step": 100 + }, + { + "epoch": 0.013895907023749368, + "grad_norm": 0.27858778834342957, + "learning_rate": 1.6176470588235293e-05, + "loss": 1.3922, + "num_input_tokens_seen": 10362848, + "step": 110 + }, + { + "epoch": 0.015159171298635674, + "grad_norm": 0.26853179931640625, + "learning_rate": 1.764705882352941e-05, + "loss": 1.3606, + "num_input_tokens_seen": 11337152, + "step": 120 + }, + { + "epoch": 0.016422435573521982, + "grad_norm": 0.29751917719841003, + "learning_rate": 1.9117647058823524e-05, + "loss": 1.3469, + "num_input_tokens_seen": 12268864, + "step": 130 + }, + { + "epoch": 0.017685699848408287, + "grad_norm": 0.29996374249458313, + "learning_rate": 2.0588235294117645e-05, + "loss": 1.3455, + "num_input_tokens_seen": 13304544, + "step": 140 + }, + { + "epoch": 0.018948964123294592, + "grad_norm": 0.26638367772102356, + "learning_rate": 2.2058823529411763e-05, + "loss": 1.3529, + "num_input_tokens_seen": 14245088, + "step": 150 + }, + { + "epoch": 0.0202122283981809, + "grad_norm": 0.2829771041870117, + "learning_rate": 2.3529411764705884e-05, + "loss": 1.3517, + "num_input_tokens_seen": 15277408, + "step": 160 + }, + { + "epoch": 0.021475492673067205, + "grad_norm": 0.28468722105026245, + "learning_rate": 2.4999999999999998e-05, + "loss": 1.3756, + "num_input_tokens_seen": 16296480, + "step": 170 + }, + { + "epoch": 0.022738756947953513, + "grad_norm": 0.2717965841293335, + "learning_rate": 2.6470588235294115e-05, + "loss": 1.3094, + "num_input_tokens_seen": 17249088, + "step": 180 + }, + { + "epoch": 0.024002021222839818, + "grad_norm": 0.2902025878429413, + "learning_rate": 2.7941176470588236e-05, + "loss": 1.3894, + "num_input_tokens_seen": 18267872, + "step": 190 + }, + { + "epoch": 0.025265285497726123, + "grad_norm": 0.27164924144744873, + "learning_rate": 2.941176470588235e-05, + "loss": 1.3471, + "num_input_tokens_seen": 19228288, + "step": 200 + }, + { + "epoch": 0.02652854977261243, + "grad_norm": 0.2791699767112732, + "learning_rate": 3.088235294117647e-05, + "loss": 1.3676, + "num_input_tokens_seen": 20112768, + "step": 210 + }, + { + "epoch": 0.027791814047498736, + "grad_norm": 0.27457180619239807, + "learning_rate": 3.2352941176470585e-05, + "loss": 1.3667, + "num_input_tokens_seen": 21080384, + "step": 220 + }, + { + "epoch": 0.029055078322385044, + "grad_norm": 0.2744538486003876, + "learning_rate": 3.38235294117647e-05, + "loss": 1.3791, + "num_input_tokens_seen": 21978464, + "step": 230 + }, + { + "epoch": 0.03031834259727135, + "grad_norm": 0.27631092071533203, + "learning_rate": 3.49999941403517e-05, + "loss": 1.3032, + "num_input_tokens_seen": 22891136, + "step": 240 + }, + { + "epoch": 0.03158160687215766, + "grad_norm": 0.25807875394821167, + "learning_rate": 3.499978905307333e-05, + "loss": 1.3203, + "num_input_tokens_seen": 23788384, + "step": 250 + }, + { + "epoch": 0.032844871147043965, + "grad_norm": 0.282926470041275, + "learning_rate": 3.499929098730414e-05, + "loss": 1.3487, + "num_input_tokens_seen": 24732448, + "step": 260 + }, + { + "epoch": 0.034108135421930266, + "grad_norm": 0.3243197500705719, + "learning_rate": 3.499849995138268e-05, + "loss": 1.3335, + "num_input_tokens_seen": 25651072, + "step": 270 + }, + { + "epoch": 0.035371399696816574, + "grad_norm": 0.28631719946861267, + "learning_rate": 3.499741595855233e-05, + "loss": 1.3104, + "num_input_tokens_seen": 26588256, + "step": 280 + }, + { + "epoch": 0.03663466397170288, + "grad_norm": 0.2739802598953247, + "learning_rate": 3.499603902696111e-05, + "loss": 1.3294, + "num_input_tokens_seen": 27506400, + "step": 290 + }, + { + "epoch": 0.037897928246589184, + "grad_norm": 0.25884002447128296, + "learning_rate": 3.499436917966138e-05, + "loss": 1.3253, + "num_input_tokens_seen": 28436096, + "step": 300 + }, + { + "epoch": 0.03916119252147549, + "grad_norm": 0.3526857793331146, + "learning_rate": 3.4992406444609434e-05, + "loss": 1.3731, + "num_input_tokens_seen": 29415744, + "step": 310 + }, + { + "epoch": 0.0404244567963618, + "grad_norm": 0.3010634183883667, + "learning_rate": 3.499015085466505e-05, + "loss": 1.3604, + "num_input_tokens_seen": 30396288, + "step": 320 + }, + { + "epoch": 0.0416877210712481, + "grad_norm": 0.30412164330482483, + "learning_rate": 3.498760244759094e-05, + "loss": 1.3192, + "num_input_tokens_seen": 31281632, + "step": 330 + }, + { + "epoch": 0.04295098534613441, + "grad_norm": 0.28709614276885986, + "learning_rate": 3.498476126605209e-05, + "loss": 1.3405, + "num_input_tokens_seen": 32139296, + "step": 340 + }, + { + "epoch": 0.04421424962102072, + "grad_norm": 0.2636132836341858, + "learning_rate": 3.4981627357615085e-05, + "loss": 1.3796, + "num_input_tokens_seen": 33140544, + "step": 350 + }, + { + "epoch": 0.045477513895907026, + "grad_norm": 0.27414971590042114, + "learning_rate": 3.497820077474728e-05, + "loss": 1.3502, + "num_input_tokens_seen": 34072480, + "step": 360 + }, + { + "epoch": 0.04674077817079333, + "grad_norm": 0.29717832803726196, + "learning_rate": 3.4974481574815955e-05, + "loss": 1.3218, + "num_input_tokens_seen": 35043552, + "step": 370 + }, + { + "epoch": 0.048004042445679636, + "grad_norm": 0.274935781955719, + "learning_rate": 3.49704698200873e-05, + "loss": 1.3101, + "num_input_tokens_seen": 36057536, + "step": 380 + }, + { + "epoch": 0.049267306720565944, + "grad_norm": 0.2995646893978119, + "learning_rate": 3.496616557772545e-05, + "loss": 1.3231, + "num_input_tokens_seen": 37053280, + "step": 390 + }, + { + "epoch": 0.050530570995452245, + "grad_norm": 0.2813841998577118, + "learning_rate": 3.4961568919791295e-05, + "loss": 1.3073, + "num_input_tokens_seen": 37949760, + "step": 400 + }, + { + "epoch": 0.051793835270338554, + "grad_norm": 0.25323453545570374, + "learning_rate": 3.49566799232413e-05, + "loss": 1.4188, + "num_input_tokens_seen": 38825888, + "step": 410 + }, + { + "epoch": 0.05305709954522486, + "grad_norm": 0.3185766339302063, + "learning_rate": 3.4951498669926205e-05, + "loss": 1.2551, + "num_input_tokens_seen": 39816832, + "step": 420 + }, + { + "epoch": 0.05432036382011117, + "grad_norm": 0.282988041639328, + "learning_rate": 3.494602524658968e-05, + "loss": 1.3429, + "num_input_tokens_seen": 40746208, + "step": 430 + }, + { + "epoch": 0.05558362809499747, + "grad_norm": 0.29383236169815063, + "learning_rate": 3.494025974486684e-05, + "loss": 1.2908, + "num_input_tokens_seen": 41732576, + "step": 440 + }, + { + "epoch": 0.05684689236988378, + "grad_norm": 0.2495247721672058, + "learning_rate": 3.4934202261282736e-05, + "loss": 1.3379, + "num_input_tokens_seen": 42725664, + "step": 450 + }, + { + "epoch": 0.05811015664477009, + "grad_norm": 0.27226462960243225, + "learning_rate": 3.4927852897250736e-05, + "loss": 1.2906, + "num_input_tokens_seen": 43636000, + "step": 460 + }, + { + "epoch": 0.05937342091965639, + "grad_norm": 0.2738124430179596, + "learning_rate": 3.49212117590708e-05, + "loss": 1.3382, + "num_input_tokens_seen": 44584384, + "step": 470 + }, + { + "epoch": 0.0606366851945427, + "grad_norm": 0.2823927700519562, + "learning_rate": 3.4914278957927746e-05, + "loss": 1.3572, + "num_input_tokens_seen": 45563296, + "step": 480 + }, + { + "epoch": 0.061899949469429005, + "grad_norm": 0.3090139329433441, + "learning_rate": 3.490705460988934e-05, + "loss": 1.3633, + "num_input_tokens_seen": 46504000, + "step": 490 + }, + { + "epoch": 0.06316321374431531, + "grad_norm": 0.2648494839668274, + "learning_rate": 3.4899538835904395e-05, + "loss": 1.296, + "num_input_tokens_seen": 47469568, + "step": 500 + }, + { + "epoch": 0.06442647801920162, + "grad_norm": 0.26772260665893555, + "learning_rate": 3.489173176180072e-05, + "loss": 1.3468, + "num_input_tokens_seen": 48428992, + "step": 510 + }, + { + "epoch": 0.06568974229408793, + "grad_norm": 0.2722509503364563, + "learning_rate": 3.488363351828301e-05, + "loss": 1.3298, + "num_input_tokens_seen": 49435616, + "step": 520 + }, + { + "epoch": 0.06695300656897422, + "grad_norm": 0.33240431547164917, + "learning_rate": 3.48752442409307e-05, + "loss": 1.3395, + "num_input_tokens_seen": 50444960, + "step": 530 + }, + { + "epoch": 0.06821627084386053, + "grad_norm": 0.33877724409103394, + "learning_rate": 3.4866564070195623e-05, + "loss": 1.3627, + "num_input_tokens_seen": 51354144, + "step": 540 + }, + { + "epoch": 0.06947953511874684, + "grad_norm": 0.25358885526657104, + "learning_rate": 3.485759315139974e-05, + "loss": 1.3665, + "num_input_tokens_seen": 52353568, + "step": 550 + }, + { + "epoch": 0.07074279939363315, + "grad_norm": 0.3228625953197479, + "learning_rate": 3.484833163473263e-05, + "loss": 1.3603, + "num_input_tokens_seen": 53330208, + "step": 560 + }, + { + "epoch": 0.07200606366851946, + "grad_norm": 0.27047306299209595, + "learning_rate": 3.483877967524903e-05, + "loss": 1.3918, + "num_input_tokens_seen": 54292704, + "step": 570 + }, + { + "epoch": 0.07326932794340577, + "grad_norm": 0.23836977779865265, + "learning_rate": 3.482893743286624e-05, + "loss": 1.3265, + "num_input_tokens_seen": 55289088, + "step": 580 + }, + { + "epoch": 0.07453259221829207, + "grad_norm": 0.2790107727050781, + "learning_rate": 3.4818805072361394e-05, + "loss": 1.34, + "num_input_tokens_seen": 56191520, + "step": 590 + }, + { + "epoch": 0.07579585649317837, + "grad_norm": 0.2909539043903351, + "learning_rate": 3.4808382763368746e-05, + "loss": 1.3827, + "num_input_tokens_seen": 57130144, + "step": 600 + }, + { + "epoch": 0.07705912076806468, + "grad_norm": 0.2930690050125122, + "learning_rate": 3.479767068037682e-05, + "loss": 1.2993, + "num_input_tokens_seen": 58166976, + "step": 610 + }, + { + "epoch": 0.07832238504295098, + "grad_norm": 0.2910405993461609, + "learning_rate": 3.4786669002725486e-05, + "loss": 1.4025, + "num_input_tokens_seen": 59115968, + "step": 620 + }, + { + "epoch": 0.07958564931783729, + "grad_norm": 0.2609618008136749, + "learning_rate": 3.477537791460297e-05, + "loss": 1.3454, + "num_input_tokens_seen": 60097152, + "step": 630 + }, + { + "epoch": 0.0808489135927236, + "grad_norm": 0.2621832489967346, + "learning_rate": 3.4763797605042735e-05, + "loss": 1.3193, + "num_input_tokens_seen": 61038400, + "step": 640 + }, + { + "epoch": 0.08211217786760991, + "grad_norm": 0.2869206666946411, + "learning_rate": 3.475192826792036e-05, + "loss": 1.3755, + "num_input_tokens_seen": 62005408, + "step": 650 + }, + { + "epoch": 0.0833754421424962, + "grad_norm": 0.2955986261367798, + "learning_rate": 3.473977010195027e-05, + "loss": 1.3446, + "num_input_tokens_seen": 62938944, + "step": 660 + }, + { + "epoch": 0.08463870641738251, + "grad_norm": 0.27759358286857605, + "learning_rate": 3.47273233106824e-05, + "loss": 1.3243, + "num_input_tokens_seen": 63825280, + "step": 670 + }, + { + "epoch": 0.08590197069226882, + "grad_norm": 0.2854154706001282, + "learning_rate": 3.471458810249883e-05, + "loss": 1.3274, + "num_input_tokens_seen": 64772224, + "step": 680 + }, + { + "epoch": 0.08716523496715513, + "grad_norm": 0.26865917444229126, + "learning_rate": 3.470156469061023e-05, + "loss": 1.3368, + "num_input_tokens_seen": 65757408, + "step": 690 + }, + { + "epoch": 0.08842849924204144, + "grad_norm": 0.3124206066131592, + "learning_rate": 3.468825329305235e-05, + "loss": 1.3619, + "num_input_tokens_seen": 66653856, + "step": 700 + }, + { + "epoch": 0.08969176351692774, + "grad_norm": 0.257878839969635, + "learning_rate": 3.467465413268235e-05, + "loss": 1.3705, + "num_input_tokens_seen": 67551136, + "step": 710 + }, + { + "epoch": 0.09095502779181405, + "grad_norm": 0.3039745092391968, + "learning_rate": 3.466076743717506e-05, + "loss": 1.3407, + "num_input_tokens_seen": 68461888, + "step": 720 + }, + { + "epoch": 0.09221829206670035, + "grad_norm": 0.297577828168869, + "learning_rate": 3.4646593439019164e-05, + "loss": 1.3068, + "num_input_tokens_seen": 69439936, + "step": 730 + }, + { + "epoch": 0.09348155634158666, + "grad_norm": 0.26858824491500854, + "learning_rate": 3.463213237551333e-05, + "loss": 1.3362, + "num_input_tokens_seen": 70315520, + "step": 740 + }, + { + "epoch": 0.09474482061647296, + "grad_norm": 0.32382968068122864, + "learning_rate": 3.461738448876223e-05, + "loss": 1.2972, + "num_input_tokens_seen": 71249088, + "step": 750 + }, + { + "epoch": 0.09600808489135927, + "grad_norm": 0.2890531122684479, + "learning_rate": 3.460235002567247e-05, + "loss": 1.2899, + "num_input_tokens_seen": 72123200, + "step": 760 + }, + { + "epoch": 0.09727134916624558, + "grad_norm": 0.2724192440509796, + "learning_rate": 3.458702923794847e-05, + "loss": 1.3435, + "num_input_tokens_seen": 73014048, + "step": 770 + }, + { + "epoch": 0.09853461344113189, + "grad_norm": 0.2698012888431549, + "learning_rate": 3.457142238208826e-05, + "loss": 1.3823, + "num_input_tokens_seen": 73970912, + "step": 780 + }, + { + "epoch": 0.0997978777160182, + "grad_norm": 0.25855422019958496, + "learning_rate": 3.455552971937915e-05, + "loss": 1.3545, + "num_input_tokens_seen": 74960032, + "step": 790 + }, + { + "epoch": 0.10106114199090449, + "grad_norm": 0.3183737099170685, + "learning_rate": 3.453935151589341e-05, + "loss": 1.3597, + "num_input_tokens_seen": 75886048, + "step": 800 + }, + { + "epoch": 0.1023244062657908, + "grad_norm": 0.2935165464878082, + "learning_rate": 3.4522888042483766e-05, + "loss": 1.3745, + "num_input_tokens_seen": 76882752, + "step": 810 + }, + { + "epoch": 0.10358767054067711, + "grad_norm": 0.2568333148956299, + "learning_rate": 3.450613957477889e-05, + "loss": 1.3502, + "num_input_tokens_seen": 77780736, + "step": 820 + }, + { + "epoch": 0.10485093481556342, + "grad_norm": 0.29373618960380554, + "learning_rate": 3.4489106393178774e-05, + "loss": 1.33, + "num_input_tokens_seen": 78738272, + "step": 830 + }, + { + "epoch": 0.10611419909044972, + "grad_norm": 0.2722548246383667, + "learning_rate": 3.447178878285004e-05, + "loss": 1.3533, + "num_input_tokens_seen": 79636736, + "step": 840 + }, + { + "epoch": 0.10737746336533603, + "grad_norm": 0.29016321897506714, + "learning_rate": 3.445418703372119e-05, + "loss": 1.365, + "num_input_tokens_seen": 80603008, + "step": 850 + }, + { + "epoch": 0.10864072764022234, + "grad_norm": 0.2636987268924713, + "learning_rate": 3.443630144047771e-05, + "loss": 1.3284, + "num_input_tokens_seen": 81556992, + "step": 860 + }, + { + "epoch": 0.10990399191510863, + "grad_norm": 0.2925853133201599, + "learning_rate": 3.441813230255714e-05, + "loss": 1.306, + "num_input_tokens_seen": 82544128, + "step": 870 + }, + { + "epoch": 0.11116725618999494, + "grad_norm": 0.32026803493499756, + "learning_rate": 3.439967992414412e-05, + "loss": 1.2703, + "num_input_tokens_seen": 83488864, + "step": 880 + }, + { + "epoch": 0.11243052046488125, + "grad_norm": 0.2739593982696533, + "learning_rate": 3.438094461416522e-05, + "loss": 1.3276, + "num_input_tokens_seen": 84447232, + "step": 890 + }, + { + "epoch": 0.11369378473976756, + "grad_norm": 0.26780998706817627, + "learning_rate": 3.4361926686283805e-05, + "loss": 1.3311, + "num_input_tokens_seen": 85353344, + "step": 900 + }, + { + "epoch": 0.11495704901465387, + "grad_norm": 0.3547651469707489, + "learning_rate": 3.43426264588948e-05, + "loss": 1.3696, + "num_input_tokens_seen": 86331744, + "step": 910 + }, + { + "epoch": 0.11622031328954018, + "grad_norm": 0.2572576105594635, + "learning_rate": 3.4323044255119314e-05, + "loss": 1.3226, + "num_input_tokens_seen": 87350592, + "step": 920 + }, + { + "epoch": 0.11748357756442648, + "grad_norm": 0.26348087191581726, + "learning_rate": 3.430318040279929e-05, + "loss": 1.339, + "num_input_tokens_seen": 88312000, + "step": 930 + }, + { + "epoch": 0.11874684183931278, + "grad_norm": 0.2919277846813202, + "learning_rate": 3.428303523449194e-05, + "loss": 1.3158, + "num_input_tokens_seen": 89257856, + "step": 940 + }, + { + "epoch": 0.12001010611419909, + "grad_norm": 0.2658417820930481, + "learning_rate": 3.426260908746427e-05, + "loss": 1.3073, + "num_input_tokens_seen": 90244352, + "step": 950 + }, + { + "epoch": 0.1212733703890854, + "grad_norm": 0.28189846873283386, + "learning_rate": 3.424190230368733e-05, + "loss": 1.3125, + "num_input_tokens_seen": 91129440, + "step": 960 + }, + { + "epoch": 0.1225366346639717, + "grad_norm": 0.279550701379776, + "learning_rate": 3.422091522983059e-05, + "loss": 1.2755, + "num_input_tokens_seen": 92033408, + "step": 970 + }, + { + "epoch": 0.12379989893885801, + "grad_norm": 0.28984683752059937, + "learning_rate": 3.419964821725607e-05, + "loss": 1.3188, + "num_input_tokens_seen": 92960864, + "step": 980 + }, + { + "epoch": 0.1250631632137443, + "grad_norm": 0.2627594769001007, + "learning_rate": 3.417810162201247e-05, + "loss": 1.3248, + "num_input_tokens_seen": 93996960, + "step": 990 + }, + { + "epoch": 0.12632642748863063, + "grad_norm": 0.2966674864292145, + "learning_rate": 3.415627580482923e-05, + "loss": 1.3486, + "num_input_tokens_seen": 94925600, + "step": 1000 + }, + { + "epoch": 0.12758969176351692, + "grad_norm": 0.2634032666683197, + "learning_rate": 3.413417113111045e-05, + "loss": 1.3315, + "num_input_tokens_seen": 95851200, + "step": 1010 + }, + { + "epoch": 0.12885295603840324, + "grad_norm": 0.29642611742019653, + "learning_rate": 3.4111787970928835e-05, + "loss": 1.2694, + "num_input_tokens_seen": 96800640, + "step": 1020 + }, + { + "epoch": 0.13011622031328954, + "grad_norm": 0.25690603256225586, + "learning_rate": 3.408912669901943e-05, + "loss": 1.3334, + "num_input_tokens_seen": 97827232, + "step": 1030 + }, + { + "epoch": 0.13137948458817586, + "grad_norm": 0.2836136817932129, + "learning_rate": 3.40661876947734e-05, + "loss": 1.3122, + "num_input_tokens_seen": 98797088, + "step": 1040 + }, + { + "epoch": 0.13264274886306215, + "grad_norm": 0.2613033354282379, + "learning_rate": 3.4042971342231655e-05, + "loss": 1.3665, + "num_input_tokens_seen": 99772384, + "step": 1050 + }, + { + "epoch": 0.13390601313794845, + "grad_norm": 0.2632371485233307, + "learning_rate": 3.401947803007841e-05, + "loss": 1.342, + "num_input_tokens_seen": 100704544, + "step": 1060 + }, + { + "epoch": 0.13516927741283477, + "grad_norm": 0.25628045201301575, + "learning_rate": 3.399570815163471e-05, + "loss": 1.3686, + "num_input_tokens_seen": 101608800, + "step": 1070 + }, + { + "epoch": 0.13643254168772107, + "grad_norm": 0.23973917961120605, + "learning_rate": 3.397166210485182e-05, + "loss": 1.393, + "num_input_tokens_seen": 102571712, + "step": 1080 + }, + { + "epoch": 0.1376958059626074, + "grad_norm": 0.32102668285369873, + "learning_rate": 3.394734029230454e-05, + "loss": 1.2795, + "num_input_tokens_seen": 103472640, + "step": 1090 + }, + { + "epoch": 0.13895907023749368, + "grad_norm": 0.2778148651123047, + "learning_rate": 3.3922743121184533e-05, + "loss": 1.2751, + "num_input_tokens_seen": 104464224, + "step": 1100 + }, + { + "epoch": 0.14022233451238, + "grad_norm": 0.2992386221885681, + "learning_rate": 3.3897871003293454e-05, + "loss": 1.2715, + "num_input_tokens_seen": 105472736, + "step": 1110 + }, + { + "epoch": 0.1414855987872663, + "grad_norm": 0.2530061900615692, + "learning_rate": 3.3872724355036066e-05, + "loss": 1.3162, + "num_input_tokens_seen": 106384480, + "step": 1120 + }, + { + "epoch": 0.1427488630621526, + "grad_norm": 0.2719084918498993, + "learning_rate": 3.384730359741327e-05, + "loss": 1.2827, + "num_input_tokens_seen": 107319712, + "step": 1130 + }, + { + "epoch": 0.14401212733703891, + "grad_norm": 0.26223063468933105, + "learning_rate": 3.3821609156015086e-05, + "loss": 1.3352, + "num_input_tokens_seen": 108260576, + "step": 1140 + }, + { + "epoch": 0.1452753916119252, + "grad_norm": 0.28642159700393677, + "learning_rate": 3.3795641461013454e-05, + "loss": 1.3423, + "num_input_tokens_seen": 109234720, + "step": 1150 + }, + { + "epoch": 0.14653865588681153, + "grad_norm": 0.3532911539077759, + "learning_rate": 3.376940094715512e-05, + "loss": 1.3319, + "num_input_tokens_seen": 110154176, + "step": 1160 + }, + { + "epoch": 0.14780192016169783, + "grad_norm": 0.2519535720348358, + "learning_rate": 3.3742888053754295e-05, + "loss": 1.3348, + "num_input_tokens_seen": 111066432, + "step": 1170 + }, + { + "epoch": 0.14906518443658415, + "grad_norm": 0.28797778487205505, + "learning_rate": 3.371610322468534e-05, + "loss": 1.3478, + "num_input_tokens_seen": 112032064, + "step": 1180 + }, + { + "epoch": 0.15032844871147044, + "grad_norm": 0.2780948281288147, + "learning_rate": 3.368904690837529e-05, + "loss": 1.3099, + "num_input_tokens_seen": 113065184, + "step": 1190 + }, + { + "epoch": 0.15159171298635674, + "grad_norm": 0.3206534683704376, + "learning_rate": 3.3661719557796405e-05, + "loss": 1.3218, + "num_input_tokens_seen": 114056096, + "step": 1200 + }, + { + "epoch": 0.15285497726124306, + "grad_norm": 0.30456361174583435, + "learning_rate": 3.363412163045853e-05, + "loss": 1.3439, + "num_input_tokens_seen": 115039808, + "step": 1210 + }, + { + "epoch": 0.15411824153612935, + "grad_norm": 0.27767330408096313, + "learning_rate": 3.3606253588401474e-05, + "loss": 1.2642, + "num_input_tokens_seen": 115943872, + "step": 1220 + }, + { + "epoch": 0.15538150581101567, + "grad_norm": 0.25447219610214233, + "learning_rate": 3.357811589818724e-05, + "loss": 1.3209, + "num_input_tokens_seen": 116934144, + "step": 1230 + }, + { + "epoch": 0.15664477008590197, + "grad_norm": 0.28984275460243225, + "learning_rate": 3.354970903089228e-05, + "loss": 1.2694, + "num_input_tokens_seen": 117866592, + "step": 1240 + }, + { + "epoch": 0.15790803436078826, + "grad_norm": 0.2603750228881836, + "learning_rate": 3.3521033462099505e-05, + "loss": 1.3538, + "num_input_tokens_seen": 118792000, + "step": 1250 + }, + { + "epoch": 0.15917129863567459, + "grad_norm": 0.2679465413093567, + "learning_rate": 3.3492089671890414e-05, + "loss": 1.3708, + "num_input_tokens_seen": 119700608, + "step": 1260 + }, + { + "epoch": 0.16043456291056088, + "grad_norm": 0.2753802537918091, + "learning_rate": 3.346287814483703e-05, + "loss": 1.2785, + "num_input_tokens_seen": 120664544, + "step": 1270 + }, + { + "epoch": 0.1616978271854472, + "grad_norm": 0.2532285153865814, + "learning_rate": 3.3433399369993764e-05, + "loss": 1.3176, + "num_input_tokens_seen": 121630656, + "step": 1280 + }, + { + "epoch": 0.1629610914603335, + "grad_norm": 0.2713632583618164, + "learning_rate": 3.340365384088924e-05, + "loss": 1.2721, + "num_input_tokens_seen": 122593728, + "step": 1290 + }, + { + "epoch": 0.16422435573521982, + "grad_norm": 0.31818637251853943, + "learning_rate": 3.337364205551805e-05, + "loss": 1.3474, + "num_input_tokens_seen": 123604064, + "step": 1300 + }, + { + "epoch": 0.1654876200101061, + "grad_norm": 0.28953075408935547, + "learning_rate": 3.3343364516332404e-05, + "loss": 1.3117, + "num_input_tokens_seen": 124606080, + "step": 1310 + }, + { + "epoch": 0.1667508842849924, + "grad_norm": 0.32029005885124207, + "learning_rate": 3.331282173023371e-05, + "loss": 1.3281, + "num_input_tokens_seen": 125569664, + "step": 1320 + }, + { + "epoch": 0.16801414855987873, + "grad_norm": 0.2608253061771393, + "learning_rate": 3.328201420856409e-05, + "loss": 1.2915, + "num_input_tokens_seen": 126460768, + "step": 1330 + }, + { + "epoch": 0.16927741283476502, + "grad_norm": 0.2563798725605011, + "learning_rate": 3.3250942467097835e-05, + "loss": 1.3308, + "num_input_tokens_seen": 127405408, + "step": 1340 + }, + { + "epoch": 0.17054067710965135, + "grad_norm": 0.26563408970832825, + "learning_rate": 3.3219607026032747e-05, + "loss": 1.294, + "num_input_tokens_seen": 128331968, + "step": 1350 + }, + { + "epoch": 0.17180394138453764, + "grad_norm": 0.2531772553920746, + "learning_rate": 3.318800840998146e-05, + "loss": 1.3276, + "num_input_tokens_seen": 129301248, + "step": 1360 + }, + { + "epoch": 0.17306720565942396, + "grad_norm": 0.2774362862110138, + "learning_rate": 3.3156147147962623e-05, + "loss": 1.2639, + "num_input_tokens_seen": 130282336, + "step": 1370 + }, + { + "epoch": 0.17433046993431026, + "grad_norm": 0.284277081489563, + "learning_rate": 3.312402377339206e-05, + "loss": 1.3216, + "num_input_tokens_seen": 131225056, + "step": 1380 + }, + { + "epoch": 0.17559373420919655, + "grad_norm": 0.2917383015155792, + "learning_rate": 3.309163882407384e-05, + "loss": 1.2568, + "num_input_tokens_seen": 132157504, + "step": 1390 + }, + { + "epoch": 0.17685699848408287, + "grad_norm": 0.2731410264968872, + "learning_rate": 3.305899284219128e-05, + "loss": 1.3375, + "num_input_tokens_seen": 133115200, + "step": 1400 + }, + { + "epoch": 0.17812026275896917, + "grad_norm": 0.28233301639556885, + "learning_rate": 3.302608637429786e-05, + "loss": 1.2466, + "num_input_tokens_seen": 134032192, + "step": 1410 + }, + { + "epoch": 0.1793835270338555, + "grad_norm": 0.2799434959888458, + "learning_rate": 3.2992919971308055e-05, + "loss": 1.2824, + "num_input_tokens_seen": 134994208, + "step": 1420 + }, + { + "epoch": 0.18064679130874178, + "grad_norm": 0.29594945907592773, + "learning_rate": 3.295949418848814e-05, + "loss": 1.3309, + "num_input_tokens_seen": 135938144, + "step": 1430 + }, + { + "epoch": 0.1819100555836281, + "grad_norm": 0.318526953458786, + "learning_rate": 3.29258095854469e-05, + "loss": 1.2905, + "num_input_tokens_seen": 136866336, + "step": 1440 + }, + { + "epoch": 0.1831733198585144, + "grad_norm": 0.2683306634426117, + "learning_rate": 3.289186672612621e-05, + "loss": 1.2648, + "num_input_tokens_seen": 137815456, + "step": 1450 + }, + { + "epoch": 0.1844365841334007, + "grad_norm": 0.27116644382476807, + "learning_rate": 3.2857666178791656e-05, + "loss": 1.2829, + "num_input_tokens_seen": 138780256, + "step": 1460 + }, + { + "epoch": 0.18569984840828702, + "grad_norm": 0.28254273533821106, + "learning_rate": 3.282320851602298e-05, + "loss": 1.3141, + "num_input_tokens_seen": 139750496, + "step": 1470 + }, + { + "epoch": 0.1869631126831733, + "grad_norm": 0.26385799050331116, + "learning_rate": 3.2788494314704503e-05, + "loss": 1.329, + "num_input_tokens_seen": 140654176, + "step": 1480 + }, + { + "epoch": 0.18822637695805963, + "grad_norm": 0.273930162191391, + "learning_rate": 3.275352415601548e-05, + "loss": 1.3267, + "num_input_tokens_seen": 141615424, + "step": 1490 + }, + { + "epoch": 0.18948964123294593, + "grad_norm": 0.2711365520954132, + "learning_rate": 3.2718298625420366e-05, + "loss": 1.2756, + "num_input_tokens_seen": 142543328, + "step": 1500 + }, + { + "epoch": 0.19075290550783225, + "grad_norm": 0.27136221528053284, + "learning_rate": 3.268281831265899e-05, + "loss": 1.3284, + "num_input_tokens_seen": 143524416, + "step": 1510 + }, + { + "epoch": 0.19201616978271854, + "grad_norm": 0.31618639826774597, + "learning_rate": 3.264708381173672e-05, + "loss": 1.3199, + "num_input_tokens_seen": 144454016, + "step": 1520 + }, + { + "epoch": 0.19327943405760484, + "grad_norm": 0.4721730053424835, + "learning_rate": 3.261109572091448e-05, + "loss": 1.3317, + "num_input_tokens_seen": 145434336, + "step": 1530 + }, + { + "epoch": 0.19454269833249116, + "grad_norm": 0.2652052342891693, + "learning_rate": 3.257485464269878e-05, + "loss": 1.3733, + "num_input_tokens_seen": 146342112, + "step": 1540 + }, + { + "epoch": 0.19580596260737745, + "grad_norm": 0.25424447655677795, + "learning_rate": 3.253836118383157e-05, + "loss": 1.2725, + "num_input_tokens_seen": 147287264, + "step": 1550 + }, + { + "epoch": 0.19706922688226378, + "grad_norm": 0.2884797751903534, + "learning_rate": 3.2501615955280134e-05, + "loss": 1.3223, + "num_input_tokens_seen": 148183456, + "step": 1560 + }, + { + "epoch": 0.19833249115715007, + "grad_norm": 0.2777753174304962, + "learning_rate": 3.2464619572226836e-05, + "loss": 1.3182, + "num_input_tokens_seen": 149094624, + "step": 1570 + }, + { + "epoch": 0.1995957554320364, + "grad_norm": 0.27247852087020874, + "learning_rate": 3.242737265405882e-05, + "loss": 1.3171, + "num_input_tokens_seen": 149997920, + "step": 1580 + }, + { + "epoch": 0.2008590197069227, + "grad_norm": 0.2738061249256134, + "learning_rate": 3.238987582435767e-05, + "loss": 1.2938, + "num_input_tokens_seen": 150960064, + "step": 1590 + }, + { + "epoch": 0.20212228398180898, + "grad_norm": 0.2913673520088196, + "learning_rate": 3.235212971088891e-05, + "loss": 1.3214, + "num_input_tokens_seen": 151918208, + "step": 1600 + }, + { + "epoch": 0.2033855482566953, + "grad_norm": 0.279725044965744, + "learning_rate": 3.231413494559156e-05, + "loss": 1.2746, + "num_input_tokens_seen": 152856864, + "step": 1610 + }, + { + "epoch": 0.2046488125315816, + "grad_norm": 0.27453747391700745, + "learning_rate": 3.227589216456752e-05, + "loss": 1.3174, + "num_input_tokens_seen": 153804192, + "step": 1620 + }, + { + "epoch": 0.20591207680646792, + "grad_norm": 0.22528155148029327, + "learning_rate": 3.223740200807091e-05, + "loss": 1.2745, + "num_input_tokens_seen": 154817632, + "step": 1630 + }, + { + "epoch": 0.20717534108135421, + "grad_norm": 0.27404505014419556, + "learning_rate": 3.2198665120497394e-05, + "loss": 1.3032, + "num_input_tokens_seen": 155756448, + "step": 1640 + }, + { + "epoch": 0.20843860535624054, + "grad_norm": 0.32085704803466797, + "learning_rate": 3.215968215037334e-05, + "loss": 1.3325, + "num_input_tokens_seen": 156763232, + "step": 1650 + }, + { + "epoch": 0.20970186963112683, + "grad_norm": 0.27827686071395874, + "learning_rate": 3.212045375034501e-05, + "loss": 1.2955, + "num_input_tokens_seen": 157709600, + "step": 1660 + }, + { + "epoch": 0.21096513390601312, + "grad_norm": 0.2595587968826294, + "learning_rate": 3.20809805771676e-05, + "loss": 1.2932, + "num_input_tokens_seen": 158695680, + "step": 1670 + }, + { + "epoch": 0.21222839818089945, + "grad_norm": 0.26113271713256836, + "learning_rate": 3.204126329169426e-05, + "loss": 1.2886, + "num_input_tokens_seen": 159651584, + "step": 1680 + }, + { + "epoch": 0.21349166245578574, + "grad_norm": 0.3666292428970337, + "learning_rate": 3.200130255886503e-05, + "loss": 1.3232, + "num_input_tokens_seen": 160621120, + "step": 1690 + }, + { + "epoch": 0.21475492673067206, + "grad_norm": 0.30534592270851135, + "learning_rate": 3.196109904769568e-05, + "loss": 1.3539, + "num_input_tokens_seen": 161585024, + "step": 1700 + }, + { + "epoch": 0.21601819100555836, + "grad_norm": 0.2684236466884613, + "learning_rate": 3.192065343126658e-05, + "loss": 1.2818, + "num_input_tokens_seen": 162539520, + "step": 1710 + }, + { + "epoch": 0.21728145528044468, + "grad_norm": 0.26715096831321716, + "learning_rate": 3.187996638671134e-05, + "loss": 1.2616, + "num_input_tokens_seen": 163462688, + "step": 1720 + }, + { + "epoch": 0.21854471955533097, + "grad_norm": 0.26400476694107056, + "learning_rate": 3.1839038595205555e-05, + "loss": 1.3017, + "num_input_tokens_seen": 164408768, + "step": 1730 + }, + { + "epoch": 0.21980798383021727, + "grad_norm": 0.2887386381626129, + "learning_rate": 3.1797870741955326e-05, + "loss": 1.2897, + "num_input_tokens_seen": 165382816, + "step": 1740 + }, + { + "epoch": 0.2210712481051036, + "grad_norm": 0.26668059825897217, + "learning_rate": 3.175646351618586e-05, + "loss": 1.3151, + "num_input_tokens_seen": 166320832, + "step": 1750 + }, + { + "epoch": 0.22233451237998988, + "grad_norm": 0.2531121075153351, + "learning_rate": 3.171481761112989e-05, + "loss": 1.3027, + "num_input_tokens_seen": 167349856, + "step": 1760 + }, + { + "epoch": 0.2235977766548762, + "grad_norm": 0.24423161149024963, + "learning_rate": 3.167293372401606e-05, + "loss": 1.3245, + "num_input_tokens_seen": 168295712, + "step": 1770 + }, + { + "epoch": 0.2248610409297625, + "grad_norm": 0.31519579887390137, + "learning_rate": 3.163081255605729e-05, + "loss": 1.2645, + "num_input_tokens_seen": 169282112, + "step": 1780 + }, + { + "epoch": 0.22612430520464882, + "grad_norm": 0.26210370659828186, + "learning_rate": 3.1588454812439e-05, + "loss": 1.3267, + "num_input_tokens_seen": 170222336, + "step": 1790 + }, + { + "epoch": 0.22738756947953512, + "grad_norm": 0.27912288904190063, + "learning_rate": 3.154586120230734e-05, + "loss": 1.277, + "num_input_tokens_seen": 171119488, + "step": 1800 + }, + { + "epoch": 0.2286508337544214, + "grad_norm": 0.26281440258026123, + "learning_rate": 3.150303243875727e-05, + "loss": 1.2892, + "num_input_tokens_seen": 172093984, + "step": 1810 + }, + { + "epoch": 0.22991409802930773, + "grad_norm": 0.2663213908672333, + "learning_rate": 3.1459969238820664e-05, + "loss": 1.3388, + "num_input_tokens_seen": 172993696, + "step": 1820 + }, + { + "epoch": 0.23117736230419403, + "grad_norm": 0.27080100774765015, + "learning_rate": 3.141667232345429e-05, + "loss": 1.3374, + "num_input_tokens_seen": 173906304, + "step": 1830 + }, + { + "epoch": 0.23244062657908035, + "grad_norm": 0.2679150104522705, + "learning_rate": 3.137314241752775e-05, + "loss": 1.288, + "num_input_tokens_seen": 174847680, + "step": 1840 + }, + { + "epoch": 0.23370389085396664, + "grad_norm": 0.2680162489414215, + "learning_rate": 3.1329380249811304e-05, + "loss": 1.3088, + "num_input_tokens_seen": 175814240, + "step": 1850 + }, + { + "epoch": 0.23496715512885297, + "grad_norm": 0.27686336636543274, + "learning_rate": 3.128538655296373e-05, + "loss": 1.2868, + "num_input_tokens_seen": 176805408, + "step": 1860 + }, + { + "epoch": 0.23623041940373926, + "grad_norm": 0.2732996344566345, + "learning_rate": 3.1241162063520015e-05, + "loss": 1.3692, + "num_input_tokens_seen": 177763168, + "step": 1870 + }, + { + "epoch": 0.23749368367862556, + "grad_norm": 0.25114187598228455, + "learning_rate": 3.1196707521879027e-05, + "loss": 1.3054, + "num_input_tokens_seen": 178689312, + "step": 1880 + }, + { + "epoch": 0.23875694795351188, + "grad_norm": 0.29648059606552124, + "learning_rate": 3.115202367229115e-05, + "loss": 1.3289, + "num_input_tokens_seen": 179578144, + "step": 1890 + }, + { + "epoch": 0.24002021222839817, + "grad_norm": 0.25034409761428833, + "learning_rate": 3.110711126284578e-05, + "loss": 1.305, + "num_input_tokens_seen": 180480192, + "step": 1900 + }, + { + "epoch": 0.2412834765032845, + "grad_norm": 0.26325249671936035, + "learning_rate": 3.106197104545884e-05, + "loss": 1.2645, + "num_input_tokens_seen": 181482336, + "step": 1910 + }, + { + "epoch": 0.2425467407781708, + "grad_norm": 0.279535710811615, + "learning_rate": 3.101660377586017e-05, + "loss": 1.2723, + "num_input_tokens_seen": 182353792, + "step": 1920 + }, + { + "epoch": 0.2438100050530571, + "grad_norm": 0.27417901158332825, + "learning_rate": 3.097101021358088e-05, + "loss": 1.2933, + "num_input_tokens_seen": 183284000, + "step": 1930 + }, + { + "epoch": 0.2450732693279434, + "grad_norm": 0.2854447066783905, + "learning_rate": 3.092519112194063e-05, + "loss": 1.2642, + "num_input_tokens_seen": 184244640, + "step": 1940 + }, + { + "epoch": 0.2463365336028297, + "grad_norm": 0.2935086190700531, + "learning_rate": 3.087914726803486e-05, + "loss": 1.3183, + "num_input_tokens_seen": 185157728, + "step": 1950 + }, + { + "epoch": 0.24759979787771602, + "grad_norm": 0.255464643239975, + "learning_rate": 3.0832879422721926e-05, + "loss": 1.2957, + "num_input_tokens_seen": 186099200, + "step": 1960 + }, + { + "epoch": 0.24886306215260232, + "grad_norm": 0.2608180642127991, + "learning_rate": 3.078638836061023e-05, + "loss": 1.3333, + "num_input_tokens_seen": 187017280, + "step": 1970 + }, + { + "epoch": 0.2501263264274886, + "grad_norm": 0.3294975459575653, + "learning_rate": 3.073967486004523e-05, + "loss": 1.332, + "num_input_tokens_seen": 187879360, + "step": 1980 + }, + { + "epoch": 0.25138959070237493, + "grad_norm": 0.2539006769657135, + "learning_rate": 3.069273970309639e-05, + "loss": 1.2726, + "num_input_tokens_seen": 188825632, + "step": 1990 + }, + { + "epoch": 0.25265285497726125, + "grad_norm": 0.282306969165802, + "learning_rate": 3.064558367554414e-05, + "loss": 1.32, + "num_input_tokens_seen": 189801824, + "step": 2000 + }, + { + "epoch": 0.25265285497726125, + "eval_loss": 1.321367859840393, + "eval_runtime": 11.9892, + "eval_samples_per_second": 12.511, + "eval_steps_per_second": 0.834, + "num_input_tokens_seen": 189801824, + "step": 2000 + }, + { + "epoch": 0.2539161192521476, + "grad_norm": 0.30715829133987427, + "learning_rate": 3.0598207566866656e-05, + "loss": 1.2423, + "num_input_tokens_seen": 190754304, + "step": 2010 + }, + { + "epoch": 0.25517938352703384, + "grad_norm": 0.2773028016090393, + "learning_rate": 3.055061217022669e-05, + "loss": 1.2411, + "num_input_tokens_seen": 191695456, + "step": 2020 + }, + { + "epoch": 0.25644264780192016, + "grad_norm": 0.267785906791687, + "learning_rate": 3.0502798282458278e-05, + "loss": 1.2461, + "num_input_tokens_seen": 192625312, + "step": 2030 + }, + { + "epoch": 0.2577059120768065, + "grad_norm": 0.2458842545747757, + "learning_rate": 3.0454766704053395e-05, + "loss": 1.2419, + "num_input_tokens_seen": 193574848, + "step": 2040 + }, + { + "epoch": 0.25896917635169275, + "grad_norm": 0.27695903182029724, + "learning_rate": 3.040651823914855e-05, + "loss": 1.3366, + "num_input_tokens_seen": 194470688, + "step": 2050 + }, + { + "epoch": 0.2602324406265791, + "grad_norm": 0.3028598725795746, + "learning_rate": 3.0358053695511335e-05, + "loss": 1.3199, + "num_input_tokens_seen": 195437280, + "step": 2060 + }, + { + "epoch": 0.2614957049014654, + "grad_norm": 0.2882876396179199, + "learning_rate": 3.030937388452689e-05, + "loss": 1.3221, + "num_input_tokens_seen": 196396320, + "step": 2070 + }, + { + "epoch": 0.2627589691763517, + "grad_norm": 0.29042840003967285, + "learning_rate": 3.026047962118433e-05, + "loss": 1.2693, + "num_input_tokens_seen": 197314176, + "step": 2080 + }, + { + "epoch": 0.264022233451238, + "grad_norm": 0.3192022740840912, + "learning_rate": 3.0211371724063097e-05, + "loss": 1.2668, + "num_input_tokens_seen": 198295456, + "step": 2090 + }, + { + "epoch": 0.2652854977261243, + "grad_norm": 0.250468373298645, + "learning_rate": 3.016205101531925e-05, + "loss": 1.2951, + "num_input_tokens_seen": 199239264, + "step": 2100 + }, + { + "epoch": 0.26654876200101063, + "grad_norm": 0.2620362639427185, + "learning_rate": 3.0112518320671694e-05, + "loss": 1.2826, + "num_input_tokens_seen": 200166720, + "step": 2110 + }, + { + "epoch": 0.2678120262758969, + "grad_norm": 0.2919938862323761, + "learning_rate": 3.0062774469388378e-05, + "loss": 1.3001, + "num_input_tokens_seen": 201163456, + "step": 2120 + }, + { + "epoch": 0.2690752905507832, + "grad_norm": 0.26850852370262146, + "learning_rate": 3.0012820294272402e-05, + "loss": 1.3118, + "num_input_tokens_seen": 202055360, + "step": 2130 + }, + { + "epoch": 0.27033855482566954, + "grad_norm": 0.2463986724615097, + "learning_rate": 2.9962656631648068e-05, + "loss": 1.2797, + "num_input_tokens_seen": 202973376, + "step": 2140 + }, + { + "epoch": 0.27160181910055586, + "grad_norm": 0.3001090884208679, + "learning_rate": 2.991228432134687e-05, + "loss": 1.2917, + "num_input_tokens_seen": 203918208, + "step": 2150 + }, + { + "epoch": 0.27286508337544213, + "grad_norm": 0.2551255524158478, + "learning_rate": 2.9861704206693464e-05, + "loss": 1.299, + "num_input_tokens_seen": 204934080, + "step": 2160 + }, + { + "epoch": 0.27412834765032845, + "grad_norm": 0.26097556948661804, + "learning_rate": 2.9810917134491515e-05, + "loss": 1.2935, + "num_input_tokens_seen": 205865376, + "step": 2170 + }, + { + "epoch": 0.2753916119252148, + "grad_norm": 0.2827478051185608, + "learning_rate": 2.975992395500956e-05, + "loss": 1.3006, + "num_input_tokens_seen": 206770144, + "step": 2180 + }, + { + "epoch": 0.27665487620010104, + "grad_norm": 0.28954237699508667, + "learning_rate": 2.9708725521966717e-05, + "loss": 1.3424, + "num_input_tokens_seen": 207706784, + "step": 2190 + }, + { + "epoch": 0.27791814047498736, + "grad_norm": 0.2639777660369873, + "learning_rate": 2.9657322692518452e-05, + "loss": 1.231, + "num_input_tokens_seen": 208641184, + "step": 2200 + }, + { + "epoch": 0.2791814047498737, + "grad_norm": 0.24287603795528412, + "learning_rate": 2.9605716327242188e-05, + "loss": 1.297, + "num_input_tokens_seen": 209596512, + "step": 2210 + }, + { + "epoch": 0.28044466902476, + "grad_norm": 0.2651768624782562, + "learning_rate": 2.9553907290122907e-05, + "loss": 1.3049, + "num_input_tokens_seen": 210586464, + "step": 2220 + }, + { + "epoch": 0.2817079332996463, + "grad_norm": 0.2656504809856415, + "learning_rate": 2.9501896448538696e-05, + "loss": 1.3497, + "num_input_tokens_seen": 211556992, + "step": 2230 + }, + { + "epoch": 0.2829711975745326, + "grad_norm": 0.26418015360832214, + "learning_rate": 2.9449684673246218e-05, + "loss": 1.2702, + "num_input_tokens_seen": 212522560, + "step": 2240 + }, + { + "epoch": 0.2842344618494189, + "grad_norm": 0.2586632966995239, + "learning_rate": 2.9397272838366127e-05, + "loss": 1.3232, + "num_input_tokens_seen": 213488448, + "step": 2250 + }, + { + "epoch": 0.2854977261243052, + "grad_norm": 0.28703370690345764, + "learning_rate": 2.934466182136845e-05, + "loss": 1.3158, + "num_input_tokens_seen": 214453408, + "step": 2260 + }, + { + "epoch": 0.2867609903991915, + "grad_norm": 0.2626774311065674, + "learning_rate": 2.9291852503057874e-05, + "loss": 1.3394, + "num_input_tokens_seen": 215412832, + "step": 2270 + }, + { + "epoch": 0.28802425467407783, + "grad_norm": 0.256173312664032, + "learning_rate": 2.923884576755903e-05, + "loss": 1.3325, + "num_input_tokens_seen": 216335968, + "step": 2280 + }, + { + "epoch": 0.28928751894896415, + "grad_norm": 0.26622363924980164, + "learning_rate": 2.9185642502301656e-05, + "loss": 1.2535, + "num_input_tokens_seen": 217269728, + "step": 2290 + }, + { + "epoch": 0.2905507832238504, + "grad_norm": 0.3084118068218231, + "learning_rate": 2.9132243598005775e-05, + "loss": 1.2808, + "num_input_tokens_seen": 218189440, + "step": 2300 + }, + { + "epoch": 0.29181404749873674, + "grad_norm": 0.32699644565582275, + "learning_rate": 2.9078649948666754e-05, + "loss": 1.3637, + "num_input_tokens_seen": 219151008, + "step": 2310 + }, + { + "epoch": 0.29307731177362306, + "grad_norm": 0.2988159954547882, + "learning_rate": 2.902486245154035e-05, + "loss": 1.2898, + "num_input_tokens_seen": 220065312, + "step": 2320 + }, + { + "epoch": 0.29434057604850933, + "grad_norm": 0.27708715200424194, + "learning_rate": 2.897088200712769e-05, + "loss": 1.2583, + "num_input_tokens_seen": 220958560, + "step": 2330 + }, + { + "epoch": 0.29560384032339565, + "grad_norm": 0.2532431185245514, + "learning_rate": 2.8916709519160187e-05, + "loss": 1.2647, + "num_input_tokens_seen": 221960800, + "step": 2340 + }, + { + "epoch": 0.296867104598282, + "grad_norm": 0.2507975101470947, + "learning_rate": 2.8862345894584418e-05, + "loss": 1.2569, + "num_input_tokens_seen": 222927616, + "step": 2350 + }, + { + "epoch": 0.2981303688731683, + "grad_norm": 0.30082589387893677, + "learning_rate": 2.880779204354694e-05, + "loss": 1.2582, + "num_input_tokens_seen": 223897536, + "step": 2360 + }, + { + "epoch": 0.29939363314805456, + "grad_norm": 0.25084131956100464, + "learning_rate": 2.875304887937904e-05, + "loss": 1.2445, + "num_input_tokens_seen": 224856256, + "step": 2370 + }, + { + "epoch": 0.3006568974229409, + "grad_norm": 0.27553117275238037, + "learning_rate": 2.869811731858146e-05, + "loss": 1.2693, + "num_input_tokens_seen": 225789760, + "step": 2380 + }, + { + "epoch": 0.3019201616978272, + "grad_norm": 0.31296080350875854, + "learning_rate": 2.864299828080905e-05, + "loss": 1.3125, + "num_input_tokens_seen": 226730144, + "step": 2390 + }, + { + "epoch": 0.30318342597271347, + "grad_norm": 0.2597751021385193, + "learning_rate": 2.858769268885535e-05, + "loss": 1.2959, + "num_input_tokens_seen": 227688608, + "step": 2400 + }, + { + "epoch": 0.3044466902475998, + "grad_norm": 0.27299267053604126, + "learning_rate": 2.8532201468637184e-05, + "loss": 1.2932, + "num_input_tokens_seen": 228590528, + "step": 2410 + }, + { + "epoch": 0.3057099545224861, + "grad_norm": 0.2804098129272461, + "learning_rate": 2.8476525549179103e-05, + "loss": 1.3001, + "num_input_tokens_seen": 229560000, + "step": 2420 + }, + { + "epoch": 0.30697321879737244, + "grad_norm": 0.30946534872055054, + "learning_rate": 2.8420665862597894e-05, + "loss": 1.2657, + "num_input_tokens_seen": 230542208, + "step": 2430 + }, + { + "epoch": 0.3082364830722587, + "grad_norm": 0.2868455648422241, + "learning_rate": 2.8364623344086917e-05, + "loss": 1.3603, + "num_input_tokens_seen": 231454912, + "step": 2440 + }, + { + "epoch": 0.309499747347145, + "grad_norm": 0.27222952246665955, + "learning_rate": 2.8308398931900488e-05, + "loss": 1.2796, + "num_input_tokens_seen": 232387808, + "step": 2450 + }, + { + "epoch": 0.31076301162203135, + "grad_norm": 0.29506227374076843, + "learning_rate": 2.825199356733814e-05, + "loss": 1.2863, + "num_input_tokens_seen": 233295584, + "step": 2460 + }, + { + "epoch": 0.3120262758969176, + "grad_norm": 0.25060921907424927, + "learning_rate": 2.8195408194728893e-05, + "loss": 1.2725, + "num_input_tokens_seen": 234308960, + "step": 2470 + }, + { + "epoch": 0.31328954017180394, + "grad_norm": 0.29915860295295715, + "learning_rate": 2.8138643761415432e-05, + "loss": 1.2656, + "num_input_tokens_seen": 235218880, + "step": 2480 + }, + { + "epoch": 0.31455280444669026, + "grad_norm": 0.30492904782295227, + "learning_rate": 2.8081701217738234e-05, + "loss": 1.2962, + "num_input_tokens_seen": 236173888, + "step": 2490 + }, + { + "epoch": 0.3158160687215765, + "grad_norm": 0.2989721894264221, + "learning_rate": 2.8024581517019686e-05, + "loss": 1.272, + "num_input_tokens_seen": 237219584, + "step": 2500 + }, + { + "epoch": 0.31707933299646285, + "grad_norm": 0.2604142725467682, + "learning_rate": 2.7967285615548084e-05, + "loss": 1.2846, + "num_input_tokens_seen": 238150432, + "step": 2510 + }, + { + "epoch": 0.31834259727134917, + "grad_norm": 0.2856138050556183, + "learning_rate": 2.790981447256168e-05, + "loss": 1.309, + "num_input_tokens_seen": 239091040, + "step": 2520 + }, + { + "epoch": 0.3196058615462355, + "grad_norm": 0.26201140880584717, + "learning_rate": 2.785216905023256e-05, + "loss": 1.3273, + "num_input_tokens_seen": 240005152, + "step": 2530 + }, + { + "epoch": 0.32086912582112176, + "grad_norm": 0.2805967628955841, + "learning_rate": 2.7794350313650574e-05, + "loss": 1.3044, + "num_input_tokens_seen": 240957856, + "step": 2540 + }, + { + "epoch": 0.3221323900960081, + "grad_norm": 0.25588178634643555, + "learning_rate": 2.7736359230807183e-05, + "loss": 1.4082, + "num_input_tokens_seen": 241939904, + "step": 2550 + }, + { + "epoch": 0.3233956543708944, + "grad_norm": 0.25974375009536743, + "learning_rate": 2.767819677257922e-05, + "loss": 1.3256, + "num_input_tokens_seen": 242886176, + "step": 2560 + }, + { + "epoch": 0.32465891864578067, + "grad_norm": 0.2552843689918518, + "learning_rate": 2.761986391271267e-05, + "loss": 1.3003, + "num_input_tokens_seen": 243769600, + "step": 2570 + }, + { + "epoch": 0.325922182920667, + "grad_norm": 0.2774961590766907, + "learning_rate": 2.7561361627806343e-05, + "loss": 1.3239, + "num_input_tokens_seen": 244675136, + "step": 2580 + }, + { + "epoch": 0.3271854471955533, + "grad_norm": 0.27106648683547974, + "learning_rate": 2.7502690897295546e-05, + "loss": 1.3087, + "num_input_tokens_seen": 245566400, + "step": 2590 + }, + { + "epoch": 0.32844871147043964, + "grad_norm": 0.253461629152298, + "learning_rate": 2.7443852703435657e-05, + "loss": 1.2503, + "num_input_tokens_seen": 246513216, + "step": 2600 + }, + { + "epoch": 0.3297119757453259, + "grad_norm": 0.290099173784256, + "learning_rate": 2.738484803128571e-05, + "loss": 1.3034, + "num_input_tokens_seen": 247488992, + "step": 2610 + }, + { + "epoch": 0.3309752400202122, + "grad_norm": 0.2331458479166031, + "learning_rate": 2.7325677868691897e-05, + "loss": 1.2443, + "num_input_tokens_seen": 248404800, + "step": 2620 + }, + { + "epoch": 0.33223850429509855, + "grad_norm": 0.2953519821166992, + "learning_rate": 2.7266343206271e-05, + "loss": 1.2703, + "num_input_tokens_seen": 249396800, + "step": 2630 + }, + { + "epoch": 0.3335017685699848, + "grad_norm": 0.2447034865617752, + "learning_rate": 2.7206845037393847e-05, + "loss": 1.2079, + "num_input_tokens_seen": 250344864, + "step": 2640 + }, + { + "epoch": 0.33476503284487114, + "grad_norm": 0.2688887417316437, + "learning_rate": 2.7147184358168654e-05, + "loss": 1.2866, + "num_input_tokens_seen": 251205088, + "step": 2650 + }, + { + "epoch": 0.33602829711975746, + "grad_norm": 0.284983366727829, + "learning_rate": 2.7087362167424363e-05, + "loss": 1.2328, + "num_input_tokens_seen": 252125664, + "step": 2660 + }, + { + "epoch": 0.3372915613946438, + "grad_norm": 0.26568886637687683, + "learning_rate": 2.7027379466693918e-05, + "loss": 1.3343, + "num_input_tokens_seen": 253090112, + "step": 2670 + }, + { + "epoch": 0.33855482566953005, + "grad_norm": 0.2735290229320526, + "learning_rate": 2.6967237260197486e-05, + "loss": 1.3117, + "num_input_tokens_seen": 254002816, + "step": 2680 + }, + { + "epoch": 0.33981808994441637, + "grad_norm": 0.2602190673351288, + "learning_rate": 2.6906936554825652e-05, + "loss": 1.2729, + "num_input_tokens_seen": 254977856, + "step": 2690 + }, + { + "epoch": 0.3410813542193027, + "grad_norm": 0.279680997133255, + "learning_rate": 2.6846478360122567e-05, + "loss": 1.2494, + "num_input_tokens_seen": 255872864, + "step": 2700 + }, + { + "epoch": 0.34234461849418896, + "grad_norm": 0.29687556624412537, + "learning_rate": 2.6785863688269038e-05, + "loss": 1.3039, + "num_input_tokens_seen": 256788352, + "step": 2710 + }, + { + "epoch": 0.3436078827690753, + "grad_norm": 0.24734219908714294, + "learning_rate": 2.6725093554065596e-05, + "loss": 1.2728, + "num_input_tokens_seen": 257691904, + "step": 2720 + }, + { + "epoch": 0.3448711470439616, + "grad_norm": 0.2798856496810913, + "learning_rate": 2.666416897491548e-05, + "loss": 1.2519, + "num_input_tokens_seen": 258613408, + "step": 2730 + }, + { + "epoch": 0.3461344113188479, + "grad_norm": 0.3039948046207428, + "learning_rate": 2.660309097080763e-05, + "loss": 1.354, + "num_input_tokens_seen": 259569248, + "step": 2740 + }, + { + "epoch": 0.3473976755937342, + "grad_norm": 0.25825923681259155, + "learning_rate": 2.6541860564299605e-05, + "loss": 1.265, + "num_input_tokens_seen": 260534624, + "step": 2750 + }, + { + "epoch": 0.3486609398686205, + "grad_norm": 0.2977043390274048, + "learning_rate": 2.6480478780500435e-05, + "loss": 1.3044, + "num_input_tokens_seen": 261467520, + "step": 2760 + }, + { + "epoch": 0.34992420414350683, + "grad_norm": 0.2831237018108368, + "learning_rate": 2.6418946647053525e-05, + "loss": 1.2419, + "num_input_tokens_seen": 262404128, + "step": 2770 + }, + { + "epoch": 0.3511874684183931, + "grad_norm": 0.27858638763427734, + "learning_rate": 2.635726519411936e-05, + "loss": 1.2902, + "num_input_tokens_seen": 263348320, + "step": 2780 + }, + { + "epoch": 0.3524507326932794, + "grad_norm": 0.2645137310028076, + "learning_rate": 2.629543545435835e-05, + "loss": 1.2151, + "num_input_tokens_seen": 264335616, + "step": 2790 + }, + { + "epoch": 0.35371399696816574, + "grad_norm": 0.2533610165119171, + "learning_rate": 2.623345846291347e-05, + "loss": 1.2592, + "num_input_tokens_seen": 265353120, + "step": 2800 + }, + { + "epoch": 0.35497726124305207, + "grad_norm": 0.25733280181884766, + "learning_rate": 2.6171335257392957e-05, + "loss": 1.3101, + "num_input_tokens_seen": 266300480, + "step": 2810 + }, + { + "epoch": 0.35624052551793833, + "grad_norm": 0.2579527199268341, + "learning_rate": 2.610906687785296e-05, + "loss": 1.3144, + "num_input_tokens_seen": 267223328, + "step": 2820 + }, + { + "epoch": 0.35750378979282466, + "grad_norm": 0.2560044527053833, + "learning_rate": 2.6046654366780096e-05, + "loss": 1.2442, + "num_input_tokens_seen": 268154112, + "step": 2830 + }, + { + "epoch": 0.358767054067711, + "grad_norm": 0.24506497383117676, + "learning_rate": 2.5984098769073995e-05, + "loss": 1.3063, + "num_input_tokens_seen": 269044736, + "step": 2840 + }, + { + "epoch": 0.36003031834259724, + "grad_norm": 0.27899622917175293, + "learning_rate": 2.592140113202984e-05, + "loss": 1.2877, + "num_input_tokens_seen": 270024064, + "step": 2850 + }, + { + "epoch": 0.36129358261748357, + "grad_norm": 0.2520020604133606, + "learning_rate": 2.5858562505320787e-05, + "loss": 1.2984, + "num_input_tokens_seen": 270993600, + "step": 2860 + }, + { + "epoch": 0.3625568468923699, + "grad_norm": 0.24186141788959503, + "learning_rate": 2.5795583940980456e-05, + "loss": 1.2663, + "num_input_tokens_seen": 271930176, + "step": 2870 + }, + { + "epoch": 0.3638201111672562, + "grad_norm": 0.28816744685173035, + "learning_rate": 2.5732466493385238e-05, + "loss": 1.281, + "num_input_tokens_seen": 272857216, + "step": 2880 + }, + { + "epoch": 0.3650833754421425, + "grad_norm": 0.29359421133995056, + "learning_rate": 2.566921121923671e-05, + "loss": 1.2804, + "num_input_tokens_seen": 273869376, + "step": 2890 + }, + { + "epoch": 0.3663466397170288, + "grad_norm": 0.2661145329475403, + "learning_rate": 2.5605819177543906e-05, + "loss": 1.3292, + "num_input_tokens_seen": 274802592, + "step": 2900 + }, + { + "epoch": 0.3676099039919151, + "grad_norm": 0.26722949743270874, + "learning_rate": 2.55422914296056e-05, + "loss": 1.3162, + "num_input_tokens_seen": 275777312, + "step": 2910 + }, + { + "epoch": 0.3688731682668014, + "grad_norm": 0.2770121991634369, + "learning_rate": 2.5478629038992545e-05, + "loss": 1.2678, + "num_input_tokens_seen": 276772352, + "step": 2920 + }, + { + "epoch": 0.3701364325416877, + "grad_norm": 0.24549973011016846, + "learning_rate": 2.5414833071529645e-05, + "loss": 1.2787, + "num_input_tokens_seen": 277728896, + "step": 2930 + }, + { + "epoch": 0.37139969681657403, + "grad_norm": 0.25942620635032654, + "learning_rate": 2.5350904595278142e-05, + "loss": 1.2834, + "num_input_tokens_seen": 278658272, + "step": 2940 + }, + { + "epoch": 0.37266296109146035, + "grad_norm": 0.25496846437454224, + "learning_rate": 2.52868446805177e-05, + "loss": 1.2753, + "num_input_tokens_seen": 279635456, + "step": 2950 + }, + { + "epoch": 0.3739262253663466, + "grad_norm": 0.26107245683670044, + "learning_rate": 2.5222654399728518e-05, + "loss": 1.2995, + "num_input_tokens_seen": 280610176, + "step": 2960 + }, + { + "epoch": 0.37518948964123294, + "grad_norm": 0.29526421427726746, + "learning_rate": 2.515833482757335e-05, + "loss": 1.2749, + "num_input_tokens_seen": 281500224, + "step": 2970 + }, + { + "epoch": 0.37645275391611926, + "grad_norm": 0.2750958204269409, + "learning_rate": 2.5093887040879536e-05, + "loss": 1.2654, + "num_input_tokens_seen": 282466240, + "step": 2980 + }, + { + "epoch": 0.37771601819100553, + "grad_norm": 0.26100271940231323, + "learning_rate": 2.502931211862095e-05, + "loss": 1.2777, + "num_input_tokens_seen": 283435136, + "step": 2990 + }, + { + "epoch": 0.37897928246589185, + "grad_norm": 0.29179760813713074, + "learning_rate": 2.4964611141899948e-05, + "loss": 1.258, + "num_input_tokens_seen": 284388960, + "step": 3000 + }, + { + "epoch": 0.3802425467407782, + "grad_norm": 0.2875267565250397, + "learning_rate": 2.489978519392929e-05, + "loss": 1.277, + "num_input_tokens_seen": 285277344, + "step": 3010 + }, + { + "epoch": 0.3815058110156645, + "grad_norm": 0.28722459077835083, + "learning_rate": 2.4834835360013953e-05, + "loss": 1.2274, + "num_input_tokens_seen": 286206112, + "step": 3020 + }, + { + "epoch": 0.38276907529055076, + "grad_norm": 0.2907884418964386, + "learning_rate": 2.476976272753301e-05, + "loss": 1.26, + "num_input_tokens_seen": 287188160, + "step": 3030 + }, + { + "epoch": 0.3840323395654371, + "grad_norm": 0.2554284334182739, + "learning_rate": 2.4704568385921404e-05, + "loss": 1.2949, + "num_input_tokens_seen": 288111200, + "step": 3040 + }, + { + "epoch": 0.3852956038403234, + "grad_norm": 0.24661648273468018, + "learning_rate": 2.4639253426651703e-05, + "loss": 1.2442, + "num_input_tokens_seen": 289071840, + "step": 3050 + }, + { + "epoch": 0.3865588681152097, + "grad_norm": 0.2564159035682678, + "learning_rate": 2.457381894321585e-05, + "loss": 1.2549, + "num_input_tokens_seen": 290037344, + "step": 3060 + }, + { + "epoch": 0.387822132390096, + "grad_norm": 0.24792881309986115, + "learning_rate": 2.4508266031106835e-05, + "loss": 1.2534, + "num_input_tokens_seen": 290963680, + "step": 3070 + }, + { + "epoch": 0.3890853966649823, + "grad_norm": 0.29164549708366394, + "learning_rate": 2.4442595787800345e-05, + "loss": 1.2799, + "num_input_tokens_seen": 291992224, + "step": 3080 + }, + { + "epoch": 0.39034866093986864, + "grad_norm": 0.24966460466384888, + "learning_rate": 2.4376809312736438e-05, + "loss": 1.2712, + "num_input_tokens_seen": 292976480, + "step": 3090 + }, + { + "epoch": 0.3916119252147549, + "grad_norm": 0.28835946321487427, + "learning_rate": 2.431090770730107e-05, + "loss": 1.3135, + "num_input_tokens_seen": 293943776, + "step": 3100 + }, + { + "epoch": 0.39287518948964123, + "grad_norm": 0.25582680106163025, + "learning_rate": 2.4244892074807714e-05, + "loss": 1.1963, + "num_input_tokens_seen": 294860864, + "step": 3110 + }, + { + "epoch": 0.39413845376452755, + "grad_norm": 0.24214211106300354, + "learning_rate": 2.4178763520478864e-05, + "loss": 1.225, + "num_input_tokens_seen": 295732256, + "step": 3120 + }, + { + "epoch": 0.3954017180394138, + "grad_norm": 0.30721724033355713, + "learning_rate": 2.4112523151427515e-05, + "loss": 1.2633, + "num_input_tokens_seen": 296664736, + "step": 3130 + }, + { + "epoch": 0.39666498231430014, + "grad_norm": 0.30337947607040405, + "learning_rate": 2.4046172076638657e-05, + "loss": 1.2676, + "num_input_tokens_seen": 297635488, + "step": 3140 + }, + { + "epoch": 0.39792824658918646, + "grad_norm": 0.28588712215423584, + "learning_rate": 2.3979711406950688e-05, + "loss": 1.2635, + "num_input_tokens_seen": 298546208, + "step": 3150 + }, + { + "epoch": 0.3991915108640728, + "grad_norm": 0.27065521478652954, + "learning_rate": 2.3913142255036848e-05, + "loss": 1.3024, + "num_input_tokens_seen": 299442720, + "step": 3160 + }, + { + "epoch": 0.40045477513895905, + "grad_norm": 0.2623492181301117, + "learning_rate": 2.384646573538654e-05, + "loss": 1.2968, + "num_input_tokens_seen": 300421664, + "step": 3170 + }, + { + "epoch": 0.4017180394138454, + "grad_norm": 0.27391478419303894, + "learning_rate": 2.3779682964286715e-05, + "loss": 1.2181, + "num_input_tokens_seen": 301369824, + "step": 3180 + }, + { + "epoch": 0.4029813036887317, + "grad_norm": 0.2633381187915802, + "learning_rate": 2.3712795059803166e-05, + "loss": 1.2459, + "num_input_tokens_seen": 302411648, + "step": 3190 + }, + { + "epoch": 0.40424456796361796, + "grad_norm": 0.2716757655143738, + "learning_rate": 2.36458031417618e-05, + "loss": 1.2883, + "num_input_tokens_seen": 303342464, + "step": 3200 + }, + { + "epoch": 0.4055078322385043, + "grad_norm": 0.26981112360954285, + "learning_rate": 2.3578708331729927e-05, + "loss": 1.2978, + "num_input_tokens_seen": 304307424, + "step": 3210 + }, + { + "epoch": 0.4067710965133906, + "grad_norm": 0.24773098528385162, + "learning_rate": 2.3511511752997423e-05, + "loss": 1.3291, + "num_input_tokens_seen": 305311648, + "step": 3220 + }, + { + "epoch": 0.40803436078827693, + "grad_norm": 0.2609155774116516, + "learning_rate": 2.3444214530557985e-05, + "loss": 1.2416, + "num_input_tokens_seen": 306299200, + "step": 3230 + }, + { + "epoch": 0.4092976250631632, + "grad_norm": 0.258277028799057, + "learning_rate": 2.3376817791090263e-05, + "loss": 1.2476, + "num_input_tokens_seen": 307199776, + "step": 3240 + }, + { + "epoch": 0.4105608893380495, + "grad_norm": 0.3055669963359833, + "learning_rate": 2.3309322662938994e-05, + "loss": 1.2846, + "num_input_tokens_seen": 308118080, + "step": 3250 + }, + { + "epoch": 0.41182415361293584, + "grad_norm": 0.28719931840896606, + "learning_rate": 2.3241730276096136e-05, + "loss": 1.2432, + "num_input_tokens_seen": 309095584, + "step": 3260 + }, + { + "epoch": 0.4130874178878221, + "grad_norm": 0.2620775103569031, + "learning_rate": 2.3174041762181924e-05, + "loss": 1.3018, + "num_input_tokens_seen": 310052032, + "step": 3270 + }, + { + "epoch": 0.41435068216270843, + "grad_norm": 0.2525536119937897, + "learning_rate": 2.310625825442595e-05, + "loss": 1.2721, + "num_input_tokens_seen": 311011040, + "step": 3280 + }, + { + "epoch": 0.41561394643759475, + "grad_norm": 0.24205638468265533, + "learning_rate": 2.3038380887648158e-05, + "loss": 1.283, + "num_input_tokens_seen": 311953920, + "step": 3290 + }, + { + "epoch": 0.41687721071248107, + "grad_norm": 0.2821497321128845, + "learning_rate": 2.2970410798239875e-05, + "loss": 1.2184, + "num_input_tokens_seen": 312900064, + "step": 3300 + }, + { + "epoch": 0.41814047498736734, + "grad_norm": 0.26797381043434143, + "learning_rate": 2.290234912414478e-05, + "loss": 1.2682, + "num_input_tokens_seen": 313856160, + "step": 3310 + }, + { + "epoch": 0.41940373926225366, + "grad_norm": 0.26029297709465027, + "learning_rate": 2.2834197004839832e-05, + "loss": 1.2241, + "num_input_tokens_seen": 314758112, + "step": 3320 + }, + { + "epoch": 0.42066700353714, + "grad_norm": 0.2785716950893402, + "learning_rate": 2.276595558131622e-05, + "loss": 1.1807, + "num_input_tokens_seen": 315687232, + "step": 3330 + }, + { + "epoch": 0.42193026781202625, + "grad_norm": 0.282991886138916, + "learning_rate": 2.2697625996060242e-05, + "loss": 1.2337, + "num_input_tokens_seen": 316675552, + "step": 3340 + }, + { + "epoch": 0.42319353208691257, + "grad_norm": 0.26791542768478394, + "learning_rate": 2.2629209393034202e-05, + "loss": 1.277, + "num_input_tokens_seen": 317594112, + "step": 3350 + }, + { + "epoch": 0.4244567963617989, + "grad_norm": 0.2645999789237976, + "learning_rate": 2.256070691765721e-05, + "loss": 1.2995, + "num_input_tokens_seen": 318542656, + "step": 3360 + }, + { + "epoch": 0.4257200606366852, + "grad_norm": 0.2621070146560669, + "learning_rate": 2.249211971678606e-05, + "loss": 1.2712, + "num_input_tokens_seen": 319529632, + "step": 3370 + }, + { + "epoch": 0.4269833249115715, + "grad_norm": 0.292126327753067, + "learning_rate": 2.2423448938696008e-05, + "loss": 1.281, + "num_input_tokens_seen": 320495008, + "step": 3380 + }, + { + "epoch": 0.4282465891864578, + "grad_norm": 0.26194462180137634, + "learning_rate": 2.235469573306152e-05, + "loss": 1.2705, + "num_input_tokens_seen": 321386944, + "step": 3390 + }, + { + "epoch": 0.4295098534613441, + "grad_norm": 0.26072680950164795, + "learning_rate": 2.2285861250937078e-05, + "loss": 1.3382, + "num_input_tokens_seen": 322285280, + "step": 3400 + }, + { + "epoch": 0.4307731177362304, + "grad_norm": 0.308788001537323, + "learning_rate": 2.2216946644737867e-05, + "loss": 1.3189, + "num_input_tokens_seen": 323297568, + "step": 3410 + }, + { + "epoch": 0.4320363820111167, + "grad_norm": 0.26922985911369324, + "learning_rate": 2.2147953068220498e-05, + "loss": 1.2132, + "num_input_tokens_seen": 324283360, + "step": 3420 + }, + { + "epoch": 0.43329964628600304, + "grad_norm": 0.27006080746650696, + "learning_rate": 2.207888167646369e-05, + "loss": 1.2268, + "num_input_tokens_seen": 325189760, + "step": 3430 + }, + { + "epoch": 0.43456291056088936, + "grad_norm": 0.26316067576408386, + "learning_rate": 2.2009733625848932e-05, + "loss": 1.2945, + "num_input_tokens_seen": 326144000, + "step": 3440 + }, + { + "epoch": 0.4358261748357756, + "grad_norm": 0.2620113790035248, + "learning_rate": 2.1940510074041124e-05, + "loss": 1.2857, + "num_input_tokens_seen": 327078432, + "step": 3450 + }, + { + "epoch": 0.43708943911066195, + "grad_norm": 0.3018427789211273, + "learning_rate": 2.1871212179969193e-05, + "loss": 1.2732, + "num_input_tokens_seen": 327975328, + "step": 3460 + }, + { + "epoch": 0.43835270338554827, + "grad_norm": 0.3014253079891205, + "learning_rate": 2.180184110380668e-05, + "loss": 1.2944, + "num_input_tokens_seen": 328923296, + "step": 3470 + }, + { + "epoch": 0.43961596766043454, + "grad_norm": 0.26709380745887756, + "learning_rate": 2.173239800695235e-05, + "loss": 1.2801, + "num_input_tokens_seen": 329852576, + "step": 3480 + }, + { + "epoch": 0.44087923193532086, + "grad_norm": 0.26904571056365967, + "learning_rate": 2.1662884052010715e-05, + "loss": 1.3081, + "num_input_tokens_seen": 330887712, + "step": 3490 + }, + { + "epoch": 0.4421424962102072, + "grad_norm": 0.2532831132411957, + "learning_rate": 2.1593300402772578e-05, + "loss": 1.2399, + "num_input_tokens_seen": 331852448, + "step": 3500 + }, + { + "epoch": 0.4434057604850935, + "grad_norm": 0.2727656364440918, + "learning_rate": 2.1523648224195553e-05, + "loss": 1.3334, + "num_input_tokens_seen": 332849824, + "step": 3510 + }, + { + "epoch": 0.44466902475997977, + "grad_norm": 0.2567518353462219, + "learning_rate": 2.1453928682384567e-05, + "loss": 1.2469, + "num_input_tokens_seen": 333796544, + "step": 3520 + }, + { + "epoch": 0.4459322890348661, + "grad_norm": 0.27944666147232056, + "learning_rate": 2.1384142944572327e-05, + "loss": 1.2182, + "num_input_tokens_seen": 334769728, + "step": 3530 + }, + { + "epoch": 0.4471955533097524, + "grad_norm": 0.26202327013015747, + "learning_rate": 2.131429217909978e-05, + "loss": 1.2556, + "num_input_tokens_seen": 335697824, + "step": 3540 + }, + { + "epoch": 0.4484588175846387, + "grad_norm": 0.2528652250766754, + "learning_rate": 2.1244377555396552e-05, + "loss": 1.2889, + "num_input_tokens_seen": 336718816, + "step": 3550 + }, + { + "epoch": 0.449722081859525, + "grad_norm": 0.27603092789649963, + "learning_rate": 2.1174400243961384e-05, + "loss": 1.2786, + "num_input_tokens_seen": 337621120, + "step": 3560 + }, + { + "epoch": 0.4509853461344113, + "grad_norm": 0.2740069627761841, + "learning_rate": 2.1104361416342515e-05, + "loss": 1.2048, + "num_input_tokens_seen": 338654368, + "step": 3570 + }, + { + "epoch": 0.45224861040929765, + "grad_norm": 0.2614036798477173, + "learning_rate": 2.1034262245118083e-05, + "loss": 1.299, + "num_input_tokens_seen": 339635072, + "step": 3580 + }, + { + "epoch": 0.4535118746841839, + "grad_norm": 0.2862122058868408, + "learning_rate": 2.0964103903876478e-05, + "loss": 1.2675, + "num_input_tokens_seen": 340587008, + "step": 3590 + }, + { + "epoch": 0.45477513895907024, + "grad_norm": 0.2503550946712494, + "learning_rate": 2.089388756719672e-05, + "loss": 1.3265, + "num_input_tokens_seen": 341507104, + "step": 3600 + }, + { + "epoch": 0.45603840323395656, + "grad_norm": 0.2760883867740631, + "learning_rate": 2.0823614410628762e-05, + "loss": 1.2568, + "num_input_tokens_seen": 342452832, + "step": 3610 + }, + { + "epoch": 0.4573016675088428, + "grad_norm": 0.25591230392456055, + "learning_rate": 2.075328561067385e-05, + "loss": 1.2854, + "num_input_tokens_seen": 343443968, + "step": 3620 + }, + { + "epoch": 0.45856493178372915, + "grad_norm": 0.247548446059227, + "learning_rate": 2.0682902344764768e-05, + "loss": 1.2427, + "num_input_tokens_seen": 344422112, + "step": 3630 + }, + { + "epoch": 0.45982819605861547, + "grad_norm": 0.2951701879501343, + "learning_rate": 2.0612465791246192e-05, + "loss": 1.2824, + "num_input_tokens_seen": 345312448, + "step": 3640 + }, + { + "epoch": 0.4610914603335018, + "grad_norm": 0.2961169481277466, + "learning_rate": 2.0541977129354912e-05, + "loss": 1.266, + "num_input_tokens_seen": 346277152, + "step": 3650 + }, + { + "epoch": 0.46235472460838806, + "grad_norm": 0.27115508913993835, + "learning_rate": 2.0471437539200107e-05, + "loss": 1.3118, + "num_input_tokens_seen": 347211840, + "step": 3660 + }, + { + "epoch": 0.4636179888832744, + "grad_norm": 0.27469298243522644, + "learning_rate": 2.0400848201743608e-05, + "loss": 1.1801, + "num_input_tokens_seen": 348124992, + "step": 3670 + }, + { + "epoch": 0.4648812531581607, + "grad_norm": 0.26864269375801086, + "learning_rate": 2.033021029878008e-05, + "loss": 1.2319, + "num_input_tokens_seen": 349074176, + "step": 3680 + }, + { + "epoch": 0.46614451743304697, + "grad_norm": 0.2966035008430481, + "learning_rate": 2.0259525012917273e-05, + "loss": 1.3158, + "num_input_tokens_seen": 350022112, + "step": 3690 + }, + { + "epoch": 0.4674077817079333, + "grad_norm": 0.24909211695194244, + "learning_rate": 2.0188793527556226e-05, + "loss": 1.2902, + "num_input_tokens_seen": 350974272, + "step": 3700 + }, + { + "epoch": 0.4686710459828196, + "grad_norm": 0.256197065114975, + "learning_rate": 2.011801702687142e-05, + "loss": 1.2275, + "num_input_tokens_seen": 351958848, + "step": 3710 + }, + { + "epoch": 0.46993431025770593, + "grad_norm": 0.2664201259613037, + "learning_rate": 2.0047196695791006e-05, + "loss": 1.2488, + "num_input_tokens_seen": 352921472, + "step": 3720 + }, + { + "epoch": 0.4711975745325922, + "grad_norm": 0.2655077278614044, + "learning_rate": 1.997633371997689e-05, + "loss": 1.2214, + "num_input_tokens_seen": 353841344, + "step": 3730 + }, + { + "epoch": 0.4724608388074785, + "grad_norm": 0.2981346845626831, + "learning_rate": 1.9905429285804987e-05, + "loss": 1.2257, + "num_input_tokens_seen": 354788480, + "step": 3740 + }, + { + "epoch": 0.47372410308236484, + "grad_norm": 0.3032223880290985, + "learning_rate": 1.9834484580345248e-05, + "loss": 1.2228, + "num_input_tokens_seen": 355683616, + "step": 3750 + }, + { + "epoch": 0.4749873673572511, + "grad_norm": 0.2835098206996918, + "learning_rate": 1.976350079134187e-05, + "loss": 1.2498, + "num_input_tokens_seen": 356653312, + "step": 3760 + }, + { + "epoch": 0.47625063163213743, + "grad_norm": 0.2348804771900177, + "learning_rate": 1.9692479107193365e-05, + "loss": 1.2461, + "num_input_tokens_seen": 357609024, + "step": 3770 + }, + { + "epoch": 0.47751389590702376, + "grad_norm": 0.28105470538139343, + "learning_rate": 1.962142071693269e-05, + "loss": 1.2909, + "num_input_tokens_seen": 358542368, + "step": 3780 + }, + { + "epoch": 0.4787771601819101, + "grad_norm": 0.27118179202079773, + "learning_rate": 1.9550326810207325e-05, + "loss": 1.2809, + "num_input_tokens_seen": 359444576, + "step": 3790 + }, + { + "epoch": 0.48004042445679634, + "grad_norm": 0.2707975506782532, + "learning_rate": 1.9479198577259356e-05, + "loss": 1.2116, + "num_input_tokens_seen": 360334912, + "step": 3800 + }, + { + "epoch": 0.48130368873168267, + "grad_norm": 0.2806662619113922, + "learning_rate": 1.9408037208905558e-05, + "loss": 1.2828, + "num_input_tokens_seen": 361304576, + "step": 3810 + }, + { + "epoch": 0.482566953006569, + "grad_norm": 0.2591959834098816, + "learning_rate": 1.9336843896517458e-05, + "loss": 1.1958, + "num_input_tokens_seen": 362211520, + "step": 3820 + }, + { + "epoch": 0.48383021728145525, + "grad_norm": 0.2818770706653595, + "learning_rate": 1.926561983200137e-05, + "loss": 1.3481, + "num_input_tokens_seen": 363114336, + "step": 3830 + }, + { + "epoch": 0.4850934815563416, + "grad_norm": 0.25823378562927246, + "learning_rate": 1.919436620777847e-05, + "loss": 1.2547, + "num_input_tokens_seen": 364014272, + "step": 3840 + }, + { + "epoch": 0.4863567458312279, + "grad_norm": 0.254759818315506, + "learning_rate": 1.9123084216764807e-05, + "loss": 1.2323, + "num_input_tokens_seen": 364978528, + "step": 3850 + }, + { + "epoch": 0.4876200101061142, + "grad_norm": 0.26032665371894836, + "learning_rate": 1.9051775052351343e-05, + "loss": 1.3204, + "num_input_tokens_seen": 365890720, + "step": 3860 + }, + { + "epoch": 0.4888832743810005, + "grad_norm": 0.26584163308143616, + "learning_rate": 1.8980439908383986e-05, + "loss": 1.2814, + "num_input_tokens_seen": 366818304, + "step": 3870 + }, + { + "epoch": 0.4901465386558868, + "grad_norm": 0.2640645205974579, + "learning_rate": 1.890907997914357e-05, + "loss": 1.2683, + "num_input_tokens_seen": 367770048, + "step": 3880 + }, + { + "epoch": 0.49140980293077313, + "grad_norm": 0.27595484256744385, + "learning_rate": 1.8837696459325896e-05, + "loss": 1.3023, + "num_input_tokens_seen": 368716352, + "step": 3890 + }, + { + "epoch": 0.4926730672056594, + "grad_norm": 0.2723195552825928, + "learning_rate": 1.8766290544021696e-05, + "loss": 1.2429, + "num_input_tokens_seen": 369700736, + "step": 3900 + }, + { + "epoch": 0.4939363314805457, + "grad_norm": 0.2871018052101135, + "learning_rate": 1.869486342869667e-05, + "loss": 1.3019, + "num_input_tokens_seen": 370702016, + "step": 3910 + }, + { + "epoch": 0.49519959575543204, + "grad_norm": 0.299991250038147, + "learning_rate": 1.8623416309171423e-05, + "loss": 1.2597, + "num_input_tokens_seen": 371647904, + "step": 3920 + }, + { + "epoch": 0.49646286003031836, + "grad_norm": 0.29281744360923767, + "learning_rate": 1.8551950381601466e-05, + "loss": 1.2109, + "num_input_tokens_seen": 372649376, + "step": 3930 + }, + { + "epoch": 0.49772612430520463, + "grad_norm": 0.2941571772098541, + "learning_rate": 1.8480466842457208e-05, + "loss": 1.2597, + "num_input_tokens_seen": 373577504, + "step": 3940 + }, + { + "epoch": 0.49898938858009095, + "grad_norm": 0.25515016913414, + "learning_rate": 1.8408966888503894e-05, + "loss": 1.2588, + "num_input_tokens_seen": 374508256, + "step": 3950 + }, + { + "epoch": 0.5002526528549772, + "grad_norm": 0.2905372083187103, + "learning_rate": 1.8337451716781592e-05, + "loss": 1.2734, + "num_input_tokens_seen": 375425088, + "step": 3960 + }, + { + "epoch": 0.5015159171298635, + "grad_norm": 0.27142760157585144, + "learning_rate": 1.8265922524585137e-05, + "loss": 1.2444, + "num_input_tokens_seen": 376367264, + "step": 3970 + }, + { + "epoch": 0.5027791814047499, + "grad_norm": 0.26266419887542725, + "learning_rate": 1.8194380509444095e-05, + "loss": 1.2504, + "num_input_tokens_seen": 377307360, + "step": 3980 + }, + { + "epoch": 0.5040424456796362, + "grad_norm": 0.24885958433151245, + "learning_rate": 1.8122826869102706e-05, + "loss": 1.2403, + "num_input_tokens_seen": 378238624, + "step": 3990 + }, + { + "epoch": 0.5053057099545225, + "grad_norm": 0.2766496241092682, + "learning_rate": 1.8051262801499845e-05, + "loss": 1.2614, + "num_input_tokens_seen": 379241088, + "step": 4000 + }, + { + "epoch": 0.5053057099545225, + "eval_loss": 1.2814823389053345, + "eval_runtime": 12.3847, + "eval_samples_per_second": 12.112, + "eval_steps_per_second": 0.807, + "num_input_tokens_seen": 379241088, + "step": 4000 + }, + { + "epoch": 0.5065689742294088, + "grad_norm": 0.2559678256511688, + "learning_rate": 1.7979689504748963e-05, + "loss": 1.2359, + "num_input_tokens_seen": 380145024, + "step": 4010 + }, + { + "epoch": 0.5078322385042952, + "grad_norm": 0.276067852973938, + "learning_rate": 1.7908108177118005e-05, + "loss": 1.2247, + "num_input_tokens_seen": 381154496, + "step": 4020 + }, + { + "epoch": 0.5090955027791814, + "grad_norm": 0.26673588156700134, + "learning_rate": 1.7836520017009383e-05, + "loss": 1.2377, + "num_input_tokens_seen": 382081728, + "step": 4030 + }, + { + "epoch": 0.5103587670540677, + "grad_norm": 0.2775169014930725, + "learning_rate": 1.7764926222939893e-05, + "loss": 1.2305, + "num_input_tokens_seen": 383040896, + "step": 4040 + }, + { + "epoch": 0.511622031328954, + "grad_norm": 0.2704101502895355, + "learning_rate": 1.7693327993520654e-05, + "loss": 1.2809, + "num_input_tokens_seen": 383997344, + "step": 4050 + }, + { + "epoch": 0.5128852956038403, + "grad_norm": 0.2597109079360962, + "learning_rate": 1.7621726527437044e-05, + "loss": 1.2637, + "num_input_tokens_seen": 384951744, + "step": 4060 + }, + { + "epoch": 0.5141485598787267, + "grad_norm": 0.265578955411911, + "learning_rate": 1.7550123023428622e-05, + "loss": 1.306, + "num_input_tokens_seen": 385818784, + "step": 4070 + }, + { + "epoch": 0.515411824153613, + "grad_norm": 0.2557640075683594, + "learning_rate": 1.7478518680269075e-05, + "loss": 1.2842, + "num_input_tokens_seen": 386759680, + "step": 4080 + }, + { + "epoch": 0.5166750884284993, + "grad_norm": 0.25985798239707947, + "learning_rate": 1.740691469674612e-05, + "loss": 1.2464, + "num_input_tokens_seen": 387730016, + "step": 4090 + }, + { + "epoch": 0.5179383527033855, + "grad_norm": 0.25625666975975037, + "learning_rate": 1.733531227164148e-05, + "loss": 1.2265, + "num_input_tokens_seen": 388693952, + "step": 4100 + }, + { + "epoch": 0.5192016169782718, + "grad_norm": 0.2758398950099945, + "learning_rate": 1.726371260371076e-05, + "loss": 1.2007, + "num_input_tokens_seen": 389669216, + "step": 4110 + }, + { + "epoch": 0.5204648812531582, + "grad_norm": 0.27401378750801086, + "learning_rate": 1.7192116891663433e-05, + "loss": 1.2657, + "num_input_tokens_seen": 390647360, + "step": 4120 + }, + { + "epoch": 0.5217281455280445, + "grad_norm": 0.29113706946372986, + "learning_rate": 1.712052633414272e-05, + "loss": 1.2834, + "num_input_tokens_seen": 391549504, + "step": 4130 + }, + { + "epoch": 0.5229914098029308, + "grad_norm": 0.2795151472091675, + "learning_rate": 1.7048942129705552e-05, + "loss": 1.2343, + "num_input_tokens_seen": 392518208, + "step": 4140 + }, + { + "epoch": 0.5242546740778171, + "grad_norm": 0.3003349006175995, + "learning_rate": 1.6977365476802505e-05, + "loss": 1.28, + "num_input_tokens_seen": 393502048, + "step": 4150 + }, + { + "epoch": 0.5255179383527034, + "grad_norm": 0.28123393654823303, + "learning_rate": 1.690579757375772e-05, + "loss": 1.2696, + "num_input_tokens_seen": 394482816, + "step": 4160 + }, + { + "epoch": 0.5267812026275897, + "grad_norm": 0.25133296847343445, + "learning_rate": 1.6834239618748856e-05, + "loss": 1.2744, + "num_input_tokens_seen": 395421792, + "step": 4170 + }, + { + "epoch": 0.528044466902476, + "grad_norm": 0.2568908631801605, + "learning_rate": 1.6762692809787007e-05, + "loss": 1.2162, + "num_input_tokens_seen": 396370464, + "step": 4180 + }, + { + "epoch": 0.5293077311773623, + "grad_norm": 0.24872644245624542, + "learning_rate": 1.66911583446967e-05, + "loss": 1.2291, + "num_input_tokens_seen": 397275616, + "step": 4190 + }, + { + "epoch": 0.5305709954522486, + "grad_norm": 0.2645767033100128, + "learning_rate": 1.6619637421095762e-05, + "loss": 1.2803, + "num_input_tokens_seen": 398260032, + "step": 4200 + }, + { + "epoch": 0.5318342597271349, + "grad_norm": 0.2733348608016968, + "learning_rate": 1.654813123637533e-05, + "loss": 1.2447, + "num_input_tokens_seen": 399281952, + "step": 4210 + }, + { + "epoch": 0.5330975240020213, + "grad_norm": 0.27618396282196045, + "learning_rate": 1.6476640987679787e-05, + "loss": 1.2296, + "num_input_tokens_seen": 400197792, + "step": 4220 + }, + { + "epoch": 0.5343607882769076, + "grad_norm": 0.2598818242549896, + "learning_rate": 1.64051678718867e-05, + "loss": 1.258, + "num_input_tokens_seen": 401102336, + "step": 4230 + }, + { + "epoch": 0.5356240525517938, + "grad_norm": 0.254782497882843, + "learning_rate": 1.6333713085586823e-05, + "loss": 1.2465, + "num_input_tokens_seen": 402011040, + "step": 4240 + }, + { + "epoch": 0.5368873168266801, + "grad_norm": 0.26978209614753723, + "learning_rate": 1.6262277825064032e-05, + "loss": 1.279, + "num_input_tokens_seen": 402950816, + "step": 4250 + }, + { + "epoch": 0.5381505811015664, + "grad_norm": 0.2889060378074646, + "learning_rate": 1.6190863286275296e-05, + "loss": 1.3152, + "num_input_tokens_seen": 403935136, + "step": 4260 + }, + { + "epoch": 0.5394138453764528, + "grad_norm": 0.3075631856918335, + "learning_rate": 1.611947066483068e-05, + "loss": 1.2845, + "num_input_tokens_seen": 404952864, + "step": 4270 + }, + { + "epoch": 0.5406771096513391, + "grad_norm": 0.27360478043556213, + "learning_rate": 1.6048101155973297e-05, + "loss": 1.2516, + "num_input_tokens_seen": 405957920, + "step": 4280 + }, + { + "epoch": 0.5419403739262254, + "grad_norm": 0.24361246824264526, + "learning_rate": 1.597675595455933e-05, + "loss": 1.2319, + "num_input_tokens_seen": 406898048, + "step": 4290 + }, + { + "epoch": 0.5432036382011117, + "grad_norm": 0.25894516706466675, + "learning_rate": 1.5905436255038e-05, + "loss": 1.3278, + "num_input_tokens_seen": 407848352, + "step": 4300 + }, + { + "epoch": 0.5444669024759979, + "grad_norm": 0.2489163875579834, + "learning_rate": 1.583414325143158e-05, + "loss": 1.2478, + "num_input_tokens_seen": 408813152, + "step": 4310 + }, + { + "epoch": 0.5457301667508843, + "grad_norm": 0.2795446217060089, + "learning_rate": 1.5762878137315406e-05, + "loss": 1.1847, + "num_input_tokens_seen": 409756608, + "step": 4320 + }, + { + "epoch": 0.5469934310257706, + "grad_norm": 0.2824794352054596, + "learning_rate": 1.5691642105797883e-05, + "loss": 1.2562, + "num_input_tokens_seen": 410623968, + "step": 4330 + }, + { + "epoch": 0.5482566953006569, + "grad_norm": 0.2690293788909912, + "learning_rate": 1.5620436349500548e-05, + "loss": 1.2486, + "num_input_tokens_seen": 411572768, + "step": 4340 + }, + { + "epoch": 0.5495199595755432, + "grad_norm": 0.3064996302127838, + "learning_rate": 1.5549262060538054e-05, + "loss": 1.2568, + "num_input_tokens_seen": 412493568, + "step": 4350 + }, + { + "epoch": 0.5507832238504295, + "grad_norm": 0.2691975235939026, + "learning_rate": 1.547812043049823e-05, + "loss": 1.275, + "num_input_tokens_seen": 413427264, + "step": 4360 + }, + { + "epoch": 0.5520464881253159, + "grad_norm": 0.27678680419921875, + "learning_rate": 1.5407012650422146e-05, + "loss": 1.2137, + "num_input_tokens_seen": 414404288, + "step": 4370 + }, + { + "epoch": 0.5533097524002021, + "grad_norm": 0.2862233519554138, + "learning_rate": 1.533593991078415e-05, + "loss": 1.2782, + "num_input_tokens_seen": 415391456, + "step": 4380 + }, + { + "epoch": 0.5545730166750884, + "grad_norm": 0.2569049298763275, + "learning_rate": 1.5264903401471965e-05, + "loss": 1.2294, + "num_input_tokens_seen": 416316512, + "step": 4390 + }, + { + "epoch": 0.5558362809499747, + "grad_norm": 0.291337788105011, + "learning_rate": 1.519390431176674e-05, + "loss": 1.1881, + "num_input_tokens_seen": 417250912, + "step": 4400 + }, + { + "epoch": 0.557099545224861, + "grad_norm": 0.28458911180496216, + "learning_rate": 1.5122943830323157e-05, + "loss": 1.2479, + "num_input_tokens_seen": 418203936, + "step": 4410 + }, + { + "epoch": 0.5583628094997474, + "grad_norm": 0.2543714642524719, + "learning_rate": 1.505202314514952e-05, + "loss": 1.2394, + "num_input_tokens_seen": 419118304, + "step": 4420 + }, + { + "epoch": 0.5596260737746337, + "grad_norm": 0.2531825304031372, + "learning_rate": 1.4981143443587867e-05, + "loss": 1.259, + "num_input_tokens_seen": 420057056, + "step": 4430 + }, + { + "epoch": 0.56088933804952, + "grad_norm": 0.2655525207519531, + "learning_rate": 1.4910305912294114e-05, + "loss": 1.2547, + "num_input_tokens_seen": 421040064, + "step": 4440 + }, + { + "epoch": 0.5621526023244062, + "grad_norm": 0.2566235363483429, + "learning_rate": 1.4839511737218156e-05, + "loss": 1.2314, + "num_input_tokens_seen": 421967616, + "step": 4450 + }, + { + "epoch": 0.5634158665992925, + "grad_norm": 0.2777341306209564, + "learning_rate": 1.476876210358402e-05, + "loss": 1.2543, + "num_input_tokens_seen": 422913952, + "step": 4460 + }, + { + "epoch": 0.5646791308741789, + "grad_norm": 0.26129183173179626, + "learning_rate": 1.4698058195870038e-05, + "loss": 1.247, + "num_input_tokens_seen": 423912288, + "step": 4470 + }, + { + "epoch": 0.5659423951490652, + "grad_norm": 0.2949627637863159, + "learning_rate": 1.462740119778899e-05, + "loss": 1.2653, + "num_input_tokens_seen": 424904672, + "step": 4480 + }, + { + "epoch": 0.5672056594239515, + "grad_norm": 0.2683241367340088, + "learning_rate": 1.4556792292268341e-05, + "loss": 1.2303, + "num_input_tokens_seen": 425895936, + "step": 4490 + }, + { + "epoch": 0.5684689236988378, + "grad_norm": 0.26744595170021057, + "learning_rate": 1.4486232661430359e-05, + "loss": 1.193, + "num_input_tokens_seen": 426778336, + "step": 4500 + }, + { + "epoch": 0.5697321879737242, + "grad_norm": 0.28104472160339355, + "learning_rate": 1.4415723486572379e-05, + "loss": 1.2065, + "num_input_tokens_seen": 427702848, + "step": 4510 + }, + { + "epoch": 0.5709954522486104, + "grad_norm": 0.2564327120780945, + "learning_rate": 1.434526594814701e-05, + "loss": 1.2315, + "num_input_tokens_seen": 428663616, + "step": 4520 + }, + { + "epoch": 0.5722587165234967, + "grad_norm": 0.246286079287529, + "learning_rate": 1.4274861225742369e-05, + "loss": 1.2768, + "num_input_tokens_seen": 429622080, + "step": 4530 + }, + { + "epoch": 0.573521980798383, + "grad_norm": 0.2924240529537201, + "learning_rate": 1.4204510498062347e-05, + "loss": 1.2405, + "num_input_tokens_seen": 430489344, + "step": 4540 + }, + { + "epoch": 0.5747852450732693, + "grad_norm": 0.26321151852607727, + "learning_rate": 1.4134214942906854e-05, + "loss": 1.2082, + "num_input_tokens_seen": 431465248, + "step": 4550 + }, + { + "epoch": 0.5760485093481557, + "grad_norm": 0.2737989127635956, + "learning_rate": 1.4063975737152111e-05, + "loss": 1.2378, + "num_input_tokens_seen": 432344320, + "step": 4560 + }, + { + "epoch": 0.577311773623042, + "grad_norm": 0.23963995277881622, + "learning_rate": 1.3993794056730945e-05, + "loss": 1.2195, + "num_input_tokens_seen": 433296800, + "step": 4570 + }, + { + "epoch": 0.5785750378979283, + "grad_norm": 0.25392717123031616, + "learning_rate": 1.3923671076613121e-05, + "loss": 1.2768, + "num_input_tokens_seen": 434228672, + "step": 4580 + }, + { + "epoch": 0.5798383021728145, + "grad_norm": 0.2499849945306778, + "learning_rate": 1.3853607970785636e-05, + "loss": 1.2608, + "num_input_tokens_seen": 435125376, + "step": 4590 + }, + { + "epoch": 0.5811015664477008, + "grad_norm": 0.2485542893409729, + "learning_rate": 1.3783605912233086e-05, + "loss": 1.3271, + "num_input_tokens_seen": 436060128, + "step": 4600 + }, + { + "epoch": 0.5823648307225872, + "grad_norm": 0.26257503032684326, + "learning_rate": 1.3713666072918025e-05, + "loss": 1.2772, + "num_input_tokens_seen": 437054208, + "step": 4610 + }, + { + "epoch": 0.5836280949974735, + "grad_norm": 0.27504444122314453, + "learning_rate": 1.3643789623761335e-05, + "loss": 1.2807, + "num_input_tokens_seen": 437972832, + "step": 4620 + }, + { + "epoch": 0.5848913592723598, + "grad_norm": 0.2476516216993332, + "learning_rate": 1.3573977734622654e-05, + "loss": 1.2403, + "num_input_tokens_seen": 438912832, + "step": 4630 + }, + { + "epoch": 0.5861546235472461, + "grad_norm": 0.26506373286247253, + "learning_rate": 1.3504231574280742e-05, + "loss": 1.2203, + "num_input_tokens_seen": 439899168, + "step": 4640 + }, + { + "epoch": 0.5874178878221324, + "grad_norm": 0.29639938473701477, + "learning_rate": 1.3434552310413948e-05, + "loss": 1.314, + "num_input_tokens_seen": 440917152, + "step": 4650 + }, + { + "epoch": 0.5886811520970187, + "grad_norm": 0.26634323596954346, + "learning_rate": 1.336494110958066e-05, + "loss": 1.2586, + "num_input_tokens_seen": 441860704, + "step": 4660 + }, + { + "epoch": 0.589944416371905, + "grad_norm": 0.26301464438438416, + "learning_rate": 1.3295399137199744e-05, + "loss": 1.2541, + "num_input_tokens_seen": 442838240, + "step": 4670 + }, + { + "epoch": 0.5912076806467913, + "grad_norm": 0.26125144958496094, + "learning_rate": 1.3225927557531086e-05, + "loss": 1.2743, + "num_input_tokens_seen": 443835552, + "step": 4680 + }, + { + "epoch": 0.5924709449216776, + "grad_norm": 0.2652340829372406, + "learning_rate": 1.3156527533656041e-05, + "loss": 1.2308, + "num_input_tokens_seen": 444788896, + "step": 4690 + }, + { + "epoch": 0.593734209196564, + "grad_norm": 0.2752208411693573, + "learning_rate": 1.3087200227458005e-05, + "loss": 1.2548, + "num_input_tokens_seen": 445779392, + "step": 4700 + }, + { + "epoch": 0.5949974734714503, + "grad_norm": 0.28993070125579834, + "learning_rate": 1.3017946799602943e-05, + "loss": 1.2103, + "num_input_tokens_seen": 446716864, + "step": 4710 + }, + { + "epoch": 0.5962607377463366, + "grad_norm": 0.248098686337471, + "learning_rate": 1.294876840951995e-05, + "loss": 1.2628, + "num_input_tokens_seen": 447604192, + "step": 4720 + }, + { + "epoch": 0.5975240020212228, + "grad_norm": 0.26949024200439453, + "learning_rate": 1.2879666215381881e-05, + "loss": 1.219, + "num_input_tokens_seen": 448549600, + "step": 4730 + }, + { + "epoch": 0.5987872662961091, + "grad_norm": 0.2639176547527313, + "learning_rate": 1.2810641374085904e-05, + "loss": 1.194, + "num_input_tokens_seen": 449481280, + "step": 4740 + }, + { + "epoch": 0.6000505305709954, + "grad_norm": 0.2593153417110443, + "learning_rate": 1.2741695041234165e-05, + "loss": 1.2001, + "num_input_tokens_seen": 450464096, + "step": 4750 + }, + { + "epoch": 0.6013137948458818, + "grad_norm": 0.2578306794166565, + "learning_rate": 1.2672828371114441e-05, + "loss": 1.1945, + "num_input_tokens_seen": 451387360, + "step": 4760 + }, + { + "epoch": 0.6025770591207681, + "grad_norm": 0.2578235864639282, + "learning_rate": 1.2604042516680797e-05, + "loss": 1.2215, + "num_input_tokens_seen": 452345664, + "step": 4770 + }, + { + "epoch": 0.6038403233956544, + "grad_norm": 0.2732868790626526, + "learning_rate": 1.2535338629534321e-05, + "loss": 1.2748, + "num_input_tokens_seen": 453247008, + "step": 4780 + }, + { + "epoch": 0.6051035876705407, + "grad_norm": 0.24936838448047638, + "learning_rate": 1.2466717859903794e-05, + "loss": 1.2132, + "num_input_tokens_seen": 454143616, + "step": 4790 + }, + { + "epoch": 0.6063668519454269, + "grad_norm": 0.2849110960960388, + "learning_rate": 1.2398181356626464e-05, + "loss": 1.2112, + "num_input_tokens_seen": 455058880, + "step": 4800 + }, + { + "epoch": 0.6076301162203133, + "grad_norm": 0.2991189956665039, + "learning_rate": 1.2329730267128808e-05, + "loss": 1.2349, + "num_input_tokens_seen": 456022464, + "step": 4810 + }, + { + "epoch": 0.6088933804951996, + "grad_norm": 0.262685626745224, + "learning_rate": 1.2261365737407316e-05, + "loss": 1.2596, + "num_input_tokens_seen": 457002592, + "step": 4820 + }, + { + "epoch": 0.6101566447700859, + "grad_norm": 0.25802651047706604, + "learning_rate": 1.2193088912009321e-05, + "loss": 1.1975, + "num_input_tokens_seen": 457977152, + "step": 4830 + }, + { + "epoch": 0.6114199090449722, + "grad_norm": 0.25570937991142273, + "learning_rate": 1.2124900934013812e-05, + "loss": 1.2774, + "num_input_tokens_seen": 458946368, + "step": 4840 + }, + { + "epoch": 0.6126831733198586, + "grad_norm": 0.2608765959739685, + "learning_rate": 1.2056802945012316e-05, + "loss": 1.2298, + "num_input_tokens_seen": 459789536, + "step": 4850 + }, + { + "epoch": 0.6139464375947449, + "grad_norm": 0.27471068501472473, + "learning_rate": 1.1988796085089777e-05, + "loss": 1.2663, + "num_input_tokens_seen": 460781856, + "step": 4860 + }, + { + "epoch": 0.6152097018696311, + "grad_norm": 0.30232349038124084, + "learning_rate": 1.1920881492805467e-05, + "loss": 1.2709, + "num_input_tokens_seen": 461735360, + "step": 4870 + }, + { + "epoch": 0.6164729661445174, + "grad_norm": 0.2713924050331116, + "learning_rate": 1.1853060305173947e-05, + "loss": 1.2925, + "num_input_tokens_seen": 462762272, + "step": 4880 + }, + { + "epoch": 0.6177362304194037, + "grad_norm": 0.2612393796443939, + "learning_rate": 1.1785333657645997e-05, + "loss": 1.2671, + "num_input_tokens_seen": 463701440, + "step": 4890 + }, + { + "epoch": 0.61899949469429, + "grad_norm": 0.2994194030761719, + "learning_rate": 1.1717702684089622e-05, + "loss": 1.2685, + "num_input_tokens_seen": 464628288, + "step": 4900 + }, + { + "epoch": 0.6202627589691764, + "grad_norm": 0.27403557300567627, + "learning_rate": 1.1650168516771077e-05, + "loss": 1.2313, + "num_input_tokens_seen": 465563264, + "step": 4910 + }, + { + "epoch": 0.6215260232440627, + "grad_norm": 0.2665519118309021, + "learning_rate": 1.1582732286335892e-05, + "loss": 1.2608, + "num_input_tokens_seen": 466527296, + "step": 4920 + }, + { + "epoch": 0.622789287518949, + "grad_norm": 0.2931445837020874, + "learning_rate": 1.151539512178998e-05, + "loss": 1.1978, + "num_input_tokens_seen": 467422144, + "step": 4930 + }, + { + "epoch": 0.6240525517938352, + "grad_norm": 0.243869349360466, + "learning_rate": 1.1448158150480684e-05, + "loss": 1.2584, + "num_input_tokens_seen": 468346080, + "step": 4940 + }, + { + "epoch": 0.6253158160687216, + "grad_norm": 0.24073927104473114, + "learning_rate": 1.1381022498077936e-05, + "loss": 1.2786, + "num_input_tokens_seen": 469268160, + "step": 4950 + }, + { + "epoch": 0.6265790803436079, + "grad_norm": 0.2580939531326294, + "learning_rate": 1.1313989288555403e-05, + "loss": 1.3028, + "num_input_tokens_seen": 470217248, + "step": 4960 + }, + { + "epoch": 0.6278423446184942, + "grad_norm": 0.27437812089920044, + "learning_rate": 1.1247059644171683e-05, + "loss": 1.1893, + "num_input_tokens_seen": 471134528, + "step": 4970 + }, + { + "epoch": 0.6291056088933805, + "grad_norm": 0.27005961537361145, + "learning_rate": 1.1180234685451485e-05, + "loss": 1.2873, + "num_input_tokens_seen": 472091616, + "step": 4980 + }, + { + "epoch": 0.6303688731682668, + "grad_norm": 0.2728407680988312, + "learning_rate": 1.1113515531166905e-05, + "loss": 1.2812, + "num_input_tokens_seen": 473036928, + "step": 4990 + }, + { + "epoch": 0.631632137443153, + "grad_norm": 0.2591012716293335, + "learning_rate": 1.1046903298318667e-05, + "loss": 1.2289, + "num_input_tokens_seen": 474006976, + "step": 5000 + }, + { + "epoch": 0.6328954017180394, + "grad_norm": 0.23528583347797394, + "learning_rate": 1.0980399102117435e-05, + "loss": 1.2315, + "num_input_tokens_seen": 474996096, + "step": 5010 + }, + { + "epoch": 0.6341586659929257, + "grad_norm": 0.27465859055519104, + "learning_rate": 1.0914004055965161e-05, + "loss": 1.3264, + "num_input_tokens_seen": 475933248, + "step": 5020 + }, + { + "epoch": 0.635421930267812, + "grad_norm": 0.27259302139282227, + "learning_rate": 1.08477192714364e-05, + "loss": 1.2479, + "num_input_tokens_seen": 476921888, + "step": 5030 + }, + { + "epoch": 0.6366851945426983, + "grad_norm": 0.2752089202404022, + "learning_rate": 1.078154585825974e-05, + "loss": 1.1889, + "num_input_tokens_seen": 477911648, + "step": 5040 + }, + { + "epoch": 0.6379484588175847, + "grad_norm": 0.2641167938709259, + "learning_rate": 1.0715484924299207e-05, + "loss": 1.1821, + "num_input_tokens_seen": 478897216, + "step": 5050 + }, + { + "epoch": 0.639211723092471, + "grad_norm": 0.24626615643501282, + "learning_rate": 1.0649537575535706e-05, + "loss": 1.3228, + "num_input_tokens_seen": 479897216, + "step": 5060 + }, + { + "epoch": 0.6404749873673572, + "grad_norm": 0.25866448879241943, + "learning_rate": 1.0583704916048546e-05, + "loss": 1.2286, + "num_input_tokens_seen": 480879104, + "step": 5070 + }, + { + "epoch": 0.6417382516422435, + "grad_norm": 0.2469986230134964, + "learning_rate": 1.05179880479969e-05, + "loss": 1.2382, + "num_input_tokens_seen": 481884800, + "step": 5080 + }, + { + "epoch": 0.6430015159171298, + "grad_norm": 0.26307523250579834, + "learning_rate": 1.0452388071601396e-05, + "loss": 1.2541, + "num_input_tokens_seen": 482806624, + "step": 5090 + }, + { + "epoch": 0.6442647801920162, + "grad_norm": 0.2624097168445587, + "learning_rate": 1.0386906085125676e-05, + "loss": 1.2405, + "num_input_tokens_seen": 483727232, + "step": 5100 + }, + { + "epoch": 0.6455280444669025, + "grad_norm": 0.25804755091667175, + "learning_rate": 1.0321543184858012e-05, + "loss": 1.2258, + "num_input_tokens_seen": 484757024, + "step": 5110 + }, + { + "epoch": 0.6467913087417888, + "grad_norm": 0.26082345843315125, + "learning_rate": 1.0256300465092968e-05, + "loss": 1.2453, + "num_input_tokens_seen": 485694944, + "step": 5120 + }, + { + "epoch": 0.6480545730166751, + "grad_norm": 0.26765161752700806, + "learning_rate": 1.0191179018113052e-05, + "loss": 1.2447, + "num_input_tokens_seen": 486613664, + "step": 5130 + }, + { + "epoch": 0.6493178372915613, + "grad_norm": 0.2676701545715332, + "learning_rate": 1.0126179934170446e-05, + "loss": 1.3095, + "num_input_tokens_seen": 487574816, + "step": 5140 + }, + { + "epoch": 0.6505811015664477, + "grad_norm": 0.2636936604976654, + "learning_rate": 1.0061304301468766e-05, + "loss": 1.2053, + "num_input_tokens_seen": 488516544, + "step": 5150 + }, + { + "epoch": 0.651844365841334, + "grad_norm": 0.2662390172481537, + "learning_rate": 9.996553206144797e-06, + "loss": 1.2751, + "num_input_tokens_seen": 489412608, + "step": 5160 + }, + { + "epoch": 0.6531076301162203, + "grad_norm": 0.26386016607284546, + "learning_rate": 9.931927732250374e-06, + "loss": 1.2631, + "num_input_tokens_seen": 490374624, + "step": 5170 + }, + { + "epoch": 0.6543708943911066, + "grad_norm": 0.27195560932159424, + "learning_rate": 9.867428961734188e-06, + "loss": 1.2587, + "num_input_tokens_seen": 491366592, + "step": 5180 + }, + { + "epoch": 0.655634158665993, + "grad_norm": 0.2867816686630249, + "learning_rate": 9.803057974423667e-06, + "loss": 1.2609, + "num_input_tokens_seen": 492314912, + "step": 5190 + }, + { + "epoch": 0.6568974229408793, + "grad_norm": 0.28000280261039734, + "learning_rate": 9.738815848006945e-06, + "loss": 1.2562, + "num_input_tokens_seen": 493215136, + "step": 5200 + }, + { + "epoch": 0.6581606872157655, + "grad_norm": 0.27017146348953247, + "learning_rate": 9.674703658014749e-06, + "loss": 1.2261, + "num_input_tokens_seen": 494146080, + "step": 5210 + }, + { + "epoch": 0.6594239514906518, + "grad_norm": 0.2675604522228241, + "learning_rate": 9.610722477802483e-06, + "loss": 1.292, + "num_input_tokens_seen": 495103840, + "step": 5220 + }, + { + "epoch": 0.6606872157655381, + "grad_norm": 0.2377164214849472, + "learning_rate": 9.546873378532158e-06, + "loss": 1.2278, + "num_input_tokens_seen": 496014752, + "step": 5230 + }, + { + "epoch": 0.6619504800404244, + "grad_norm": 0.2551622688770294, + "learning_rate": 9.483157429154547e-06, + "loss": 1.247, + "num_input_tokens_seen": 496955936, + "step": 5240 + }, + { + "epoch": 0.6632137443153108, + "grad_norm": 0.2615555226802826, + "learning_rate": 9.419575696391218e-06, + "loss": 1.2705, + "num_input_tokens_seen": 497881920, + "step": 5250 + }, + { + "epoch": 0.6644770085901971, + "grad_norm": 0.2722395956516266, + "learning_rate": 9.356129244716729e-06, + "loss": 1.2736, + "num_input_tokens_seen": 498859040, + "step": 5260 + }, + { + "epoch": 0.6657402728650834, + "grad_norm": 0.2843475639820099, + "learning_rate": 9.29281913634078e-06, + "loss": 1.2112, + "num_input_tokens_seen": 499848032, + "step": 5270 + }, + { + "epoch": 0.6670035371399696, + "grad_norm": 0.260781466960907, + "learning_rate": 9.22964643119044e-06, + "loss": 1.2301, + "num_input_tokens_seen": 500782656, + "step": 5280 + }, + { + "epoch": 0.668266801414856, + "grad_norm": 0.28937065601348877, + "learning_rate": 9.166612186892376e-06, + "loss": 1.2573, + "num_input_tokens_seen": 501775328, + "step": 5290 + }, + { + "epoch": 0.6695300656897423, + "grad_norm": 0.24364541471004486, + "learning_rate": 9.103717458755188e-06, + "loss": 1.2888, + "num_input_tokens_seen": 502721632, + "step": 5300 + }, + { + "epoch": 0.6707933299646286, + "grad_norm": 0.32249847054481506, + "learning_rate": 9.040963299751722e-06, + "loss": 1.2103, + "num_input_tokens_seen": 503649088, + "step": 5310 + }, + { + "epoch": 0.6720565942395149, + "grad_norm": 0.274586945772171, + "learning_rate": 8.978350760501413e-06, + "loss": 1.2604, + "num_input_tokens_seen": 504589696, + "step": 5320 + }, + { + "epoch": 0.6733198585144012, + "grad_norm": 0.25306662917137146, + "learning_rate": 8.915880889252758e-06, + "loss": 1.212, + "num_input_tokens_seen": 505495648, + "step": 5330 + }, + { + "epoch": 0.6745831227892876, + "grad_norm": 0.2675648629665375, + "learning_rate": 8.853554731865696e-06, + "loss": 1.2735, + "num_input_tokens_seen": 506399776, + "step": 5340 + }, + { + "epoch": 0.6758463870641738, + "grad_norm": 0.25868740677833557, + "learning_rate": 8.791373331794155e-06, + "loss": 1.2346, + "num_input_tokens_seen": 507369920, + "step": 5350 + }, + { + "epoch": 0.6771096513390601, + "grad_norm": 0.26915502548217773, + "learning_rate": 8.729337730068559e-06, + "loss": 1.2514, + "num_input_tokens_seen": 508312480, + "step": 5360 + }, + { + "epoch": 0.6783729156139464, + "grad_norm": 0.27946212887763977, + "learning_rate": 8.667448965278404e-06, + "loss": 1.2084, + "num_input_tokens_seen": 509257024, + "step": 5370 + }, + { + "epoch": 0.6796361798888327, + "grad_norm": 0.2765122950077057, + "learning_rate": 8.60570807355484e-06, + "loss": 1.2396, + "num_input_tokens_seen": 510240480, + "step": 5380 + }, + { + "epoch": 0.6808994441637191, + "grad_norm": 0.24776999652385712, + "learning_rate": 8.54411608855339e-06, + "loss": 1.1789, + "num_input_tokens_seen": 511188832, + "step": 5390 + }, + { + "epoch": 0.6821627084386054, + "grad_norm": 0.2991964519023895, + "learning_rate": 8.482674041436567e-06, + "loss": 1.2665, + "num_input_tokens_seen": 512158368, + "step": 5400 + }, + { + "epoch": 0.6834259727134917, + "grad_norm": 0.28031983971595764, + "learning_rate": 8.421382960856695e-06, + "loss": 1.2297, + "num_input_tokens_seen": 513132704, + "step": 5410 + }, + { + "epoch": 0.6846892369883779, + "grad_norm": 0.2627319395542145, + "learning_rate": 8.360243872938599e-06, + "loss": 1.2734, + "num_input_tokens_seen": 514124160, + "step": 5420 + }, + { + "epoch": 0.6859525012632642, + "grad_norm": 0.2459687888622284, + "learning_rate": 8.299257801262496e-06, + "loss": 1.2091, + "num_input_tokens_seen": 515011840, + "step": 5430 + }, + { + "epoch": 0.6872157655381506, + "grad_norm": 0.26756593585014343, + "learning_rate": 8.238425766846812e-06, + "loss": 1.2104, + "num_input_tokens_seen": 515957856, + "step": 5440 + }, + { + "epoch": 0.6884790298130369, + "grad_norm": 0.293277382850647, + "learning_rate": 8.177748788131119e-06, + "loss": 1.2523, + "num_input_tokens_seen": 516907040, + "step": 5450 + }, + { + "epoch": 0.6897422940879232, + "grad_norm": 0.2430182844400406, + "learning_rate": 8.117227880959081e-06, + "loss": 1.2209, + "num_input_tokens_seen": 517874624, + "step": 5460 + }, + { + "epoch": 0.6910055583628095, + "grad_norm": 0.26824715733528137, + "learning_rate": 8.056864058561416e-06, + "loss": 1.2237, + "num_input_tokens_seen": 518780064, + "step": 5470 + }, + { + "epoch": 0.6922688226376958, + "grad_norm": 0.2571701407432556, + "learning_rate": 7.996658331538978e-06, + "loss": 1.2251, + "num_input_tokens_seen": 519746560, + "step": 5480 + }, + { + "epoch": 0.6935320869125821, + "grad_norm": 0.25399723649024963, + "learning_rate": 7.936611707845793e-06, + "loss": 1.2448, + "num_input_tokens_seen": 520710432, + "step": 5490 + }, + { + "epoch": 0.6947953511874684, + "grad_norm": 0.24103257060050964, + "learning_rate": 7.876725192772224e-06, + "loss": 1.1599, + "num_input_tokens_seen": 521672128, + "step": 5500 + }, + { + "epoch": 0.6960586154623547, + "grad_norm": 0.2598767876625061, + "learning_rate": 7.816999788928119e-06, + "loss": 1.2595, + "num_input_tokens_seen": 522644576, + "step": 5510 + }, + { + "epoch": 0.697321879737241, + "grad_norm": 0.28568968176841736, + "learning_rate": 7.757436496226034e-06, + "loss": 1.2672, + "num_input_tokens_seen": 523695168, + "step": 5520 + }, + { + "epoch": 0.6985851440121273, + "grad_norm": 0.264839768409729, + "learning_rate": 7.698036311864467e-06, + "loss": 1.2521, + "num_input_tokens_seen": 524620992, + "step": 5530 + }, + { + "epoch": 0.6998484082870137, + "grad_norm": 0.27619093656539917, + "learning_rate": 7.638800230311206e-06, + "loss": 1.1977, + "num_input_tokens_seen": 525573280, + "step": 5540 + }, + { + "epoch": 0.7011116725619, + "grad_norm": 0.2585349380970001, + "learning_rate": 7.579729243286638e-06, + "loss": 1.2956, + "num_input_tokens_seen": 526491552, + "step": 5550 + }, + { + "epoch": 0.7023749368367862, + "grad_norm": 0.26802536845207214, + "learning_rate": 7.5208243397471995e-06, + "loss": 1.2719, + "num_input_tokens_seen": 527423648, + "step": 5560 + }, + { + "epoch": 0.7036382011116725, + "grad_norm": 0.2632644474506378, + "learning_rate": 7.462086505868744e-06, + "loss": 1.208, + "num_input_tokens_seen": 528368960, + "step": 5570 + }, + { + "epoch": 0.7049014653865588, + "grad_norm": 0.25977852940559387, + "learning_rate": 7.4035167250301035e-06, + "loss": 1.1928, + "num_input_tokens_seen": 529333984, + "step": 5580 + }, + { + "epoch": 0.7061647296614452, + "grad_norm": 0.2557479739189148, + "learning_rate": 7.345115977796573e-06, + "loss": 1.1766, + "num_input_tokens_seen": 530305760, + "step": 5590 + }, + { + "epoch": 0.7074279939363315, + "grad_norm": 0.2768225073814392, + "learning_rate": 7.286885241903531e-06, + "loss": 1.2209, + "num_input_tokens_seen": 531239232, + "step": 5600 + }, + { + "epoch": 0.7086912582112178, + "grad_norm": 0.27175867557525635, + "learning_rate": 7.2288254922400575e-06, + "loss": 1.2839, + "num_input_tokens_seen": 532124640, + "step": 5610 + }, + { + "epoch": 0.7099545224861041, + "grad_norm": 0.28098565340042114, + "learning_rate": 7.1709377008325895e-06, + "loss": 1.2523, + "num_input_tokens_seen": 533148320, + "step": 5620 + }, + { + "epoch": 0.7112177867609903, + "grad_norm": 0.2613276541233063, + "learning_rate": 7.113222836828695e-06, + "loss": 1.1796, + "num_input_tokens_seen": 534125856, + "step": 5630 + }, + { + "epoch": 0.7124810510358767, + "grad_norm": 0.24941375851631165, + "learning_rate": 7.055681866480792e-06, + "loss": 1.2102, + "num_input_tokens_seen": 535057408, + "step": 5640 + }, + { + "epoch": 0.713744315310763, + "grad_norm": 0.28444018959999084, + "learning_rate": 6.998315753130024e-06, + "loss": 1.1713, + "num_input_tokens_seen": 536041280, + "step": 5650 + }, + { + "epoch": 0.7150075795856493, + "grad_norm": 0.2781004309654236, + "learning_rate": 6.9411254571901e-06, + "loss": 1.2121, + "num_input_tokens_seen": 536970048, + "step": 5660 + }, + { + "epoch": 0.7162708438605356, + "grad_norm": 0.2684124708175659, + "learning_rate": 6.884111936131231e-06, + "loss": 1.2733, + "num_input_tokens_seen": 537863008, + "step": 5670 + }, + { + "epoch": 0.717534108135422, + "grad_norm": 0.27960875630378723, + "learning_rate": 6.82727614446407e-06, + "loss": 1.1975, + "num_input_tokens_seen": 538773152, + "step": 5680 + }, + { + "epoch": 0.7187973724103083, + "grad_norm": 0.24374781548976898, + "learning_rate": 6.770619033723783e-06, + "loss": 1.2273, + "num_input_tokens_seen": 539793088, + "step": 5690 + }, + { + "epoch": 0.7200606366851945, + "grad_norm": 0.2838081121444702, + "learning_rate": 6.714141552454072e-06, + "loss": 1.2066, + "num_input_tokens_seen": 540656768, + "step": 5700 + }, + { + "epoch": 0.7213239009600808, + "grad_norm": 0.24478621780872345, + "learning_rate": 6.657844646191328e-06, + "loss": 1.2102, + "num_input_tokens_seen": 541561248, + "step": 5710 + }, + { + "epoch": 0.7225871652349671, + "grad_norm": 0.2654918432235718, + "learning_rate": 6.6017292574487635e-06, + "loss": 1.2756, + "num_input_tokens_seen": 542457408, + "step": 5720 + }, + { + "epoch": 0.7238504295098535, + "grad_norm": 0.24361199140548706, + "learning_rate": 6.545796325700683e-06, + "loss": 1.1843, + "num_input_tokens_seen": 543394112, + "step": 5730 + }, + { + "epoch": 0.7251136937847398, + "grad_norm": 0.27256685495376587, + "learning_rate": 6.4900467873667e-06, + "loss": 1.2305, + "num_input_tokens_seen": 544360768, + "step": 5740 + }, + { + "epoch": 0.7263769580596261, + "grad_norm": 0.24635472893714905, + "learning_rate": 6.434481575796107e-06, + "loss": 1.243, + "num_input_tokens_seen": 545282080, + "step": 5750 + }, + { + "epoch": 0.7276402223345124, + "grad_norm": 0.306068480014801, + "learning_rate": 6.3791016212522256e-06, + "loss": 1.2045, + "num_input_tokens_seen": 546234848, + "step": 5760 + }, + { + "epoch": 0.7289034866093986, + "grad_norm": 0.26721495389938354, + "learning_rate": 6.32390785089682e-06, + "loss": 1.2897, + "num_input_tokens_seen": 547182400, + "step": 5770 + }, + { + "epoch": 0.730166750884285, + "grad_norm": 0.25117790699005127, + "learning_rate": 6.268901188774617e-06, + "loss": 1.2824, + "num_input_tokens_seen": 548096000, + "step": 5780 + }, + { + "epoch": 0.7314300151591713, + "grad_norm": 0.2862393260002136, + "learning_rate": 6.2140825557977745e-06, + "loss": 1.2498, + "num_input_tokens_seen": 549029216, + "step": 5790 + }, + { + "epoch": 0.7326932794340576, + "grad_norm": 0.25375497341156006, + "learning_rate": 6.159452869730546e-06, + "loss": 1.2498, + "num_input_tokens_seen": 550029152, + "step": 5800 + }, + { + "epoch": 0.7339565437089439, + "grad_norm": 0.2733435034751892, + "learning_rate": 6.1050130451738186e-06, + "loss": 1.1756, + "num_input_tokens_seen": 551018848, + "step": 5810 + }, + { + "epoch": 0.7352198079838302, + "grad_norm": 0.25357958674430847, + "learning_rate": 6.050763993549884e-06, + "loss": 1.1967, + "num_input_tokens_seen": 551936608, + "step": 5820 + }, + { + "epoch": 0.7364830722587166, + "grad_norm": 0.2535962760448456, + "learning_rate": 5.996706623087126e-06, + "loss": 1.251, + "num_input_tokens_seen": 552928192, + "step": 5830 + }, + { + "epoch": 0.7377463365336028, + "grad_norm": 0.26090991497039795, + "learning_rate": 5.942841838804848e-06, + "loss": 1.2385, + "num_input_tokens_seen": 553912960, + "step": 5840 + }, + { + "epoch": 0.7390096008084891, + "grad_norm": 0.2640230357646942, + "learning_rate": 5.889170542498102e-06, + "loss": 1.2426, + "num_input_tokens_seen": 554837248, + "step": 5850 + }, + { + "epoch": 0.7402728650833754, + "grad_norm": 0.24669994413852692, + "learning_rate": 5.835693632722607e-06, + "loss": 1.1978, + "num_input_tokens_seen": 555733696, + "step": 5860 + }, + { + "epoch": 0.7415361293582617, + "grad_norm": 0.2583445608615875, + "learning_rate": 5.7824120047796725e-06, + "loss": 1.2602, + "num_input_tokens_seen": 556739392, + "step": 5870 + }, + { + "epoch": 0.7427993936331481, + "grad_norm": 0.24428869783878326, + "learning_rate": 5.729326550701263e-06, + "loss": 1.2476, + "num_input_tokens_seen": 557767840, + "step": 5880 + }, + { + "epoch": 0.7440626579080344, + "grad_norm": 0.26555436849594116, + "learning_rate": 5.676438159235005e-06, + "loss": 1.265, + "num_input_tokens_seen": 558685312, + "step": 5890 + }, + { + "epoch": 0.7453259221829207, + "grad_norm": 0.29612812399864197, + "learning_rate": 5.623747715829356e-06, + "loss": 1.2436, + "num_input_tokens_seen": 559607904, + "step": 5900 + }, + { + "epoch": 0.7465891864578069, + "grad_norm": 0.26325854659080505, + "learning_rate": 5.571256102618758e-06, + "loss": 1.2447, + "num_input_tokens_seen": 560536256, + "step": 5910 + }, + { + "epoch": 0.7478524507326932, + "grad_norm": 0.2596051096916199, + "learning_rate": 5.518964198408862e-06, + "loss": 1.2401, + "num_input_tokens_seen": 561426784, + "step": 5920 + }, + { + "epoch": 0.7491157150075796, + "grad_norm": 0.28517597913742065, + "learning_rate": 5.466872878661839e-06, + "loss": 1.2213, + "num_input_tokens_seen": 562311360, + "step": 5930 + }, + { + "epoch": 0.7503789792824659, + "grad_norm": 0.24300004541873932, + "learning_rate": 5.414983015481682e-06, + "loss": 1.2828, + "num_input_tokens_seen": 563216640, + "step": 5940 + }, + { + "epoch": 0.7516422435573522, + "grad_norm": 0.26081758737564087, + "learning_rate": 5.363295477599677e-06, + "loss": 1.2356, + "num_input_tokens_seen": 564140992, + "step": 5950 + }, + { + "epoch": 0.7529055078322385, + "grad_norm": 0.30684077739715576, + "learning_rate": 5.311811130359772e-06, + "loss": 1.2487, + "num_input_tokens_seen": 565051296, + "step": 5960 + }, + { + "epoch": 0.7541687721071249, + "grad_norm": 0.243248850107193, + "learning_rate": 5.260530835704159e-06, + "loss": 1.2313, + "num_input_tokens_seen": 566038848, + "step": 5970 + }, + { + "epoch": 0.7554320363820111, + "grad_norm": 0.2502289116382599, + "learning_rate": 5.209455452158796e-06, + "loss": 1.2092, + "num_input_tokens_seen": 567044608, + "step": 5980 + }, + { + "epoch": 0.7566953006568974, + "grad_norm": 0.26396942138671875, + "learning_rate": 5.1585858348190666e-06, + "loss": 1.2309, + "num_input_tokens_seen": 567994848, + "step": 5990 + }, + { + "epoch": 0.7579585649317837, + "grad_norm": 0.2504906952381134, + "learning_rate": 5.107922835335452e-06, + "loss": 1.2367, + "num_input_tokens_seen": 568955808, + "step": 6000 + }, + { + "epoch": 0.7579585649317837, + "eval_loss": 1.2595031261444092, + "eval_runtime": 13.0677, + "eval_samples_per_second": 11.479, + "eval_steps_per_second": 0.765, + "num_input_tokens_seen": 568955808, + "step": 6000 + }, + { + "epoch": 0.75922182920667, + "grad_norm": 0.2684820592403412, + "learning_rate": 5.057467301899274e-06, + "loss": 1.1746, + "num_input_tokens_seen": 569895776, + "step": 6010 + }, + { + "epoch": 0.7604850934815564, + "grad_norm": 0.2721717655658722, + "learning_rate": 5.007220079228478e-06, + "loss": 1.2066, + "num_input_tokens_seen": 570859552, + "step": 6020 + }, + { + "epoch": 0.7617483577564427, + "grad_norm": 0.25938835740089417, + "learning_rate": 4.957182008553527e-06, + "loss": 1.2192, + "num_input_tokens_seen": 571787136, + "step": 6030 + }, + { + "epoch": 0.763011622031329, + "grad_norm": 0.25528407096862793, + "learning_rate": 4.9073539276032756e-06, + "loss": 1.2433, + "num_input_tokens_seen": 572685056, + "step": 6040 + }, + { + "epoch": 0.7642748863062152, + "grad_norm": 0.22747959196567535, + "learning_rate": 4.857736670590982e-06, + "loss": 1.2425, + "num_input_tokens_seen": 573630944, + "step": 6050 + }, + { + "epoch": 0.7655381505811015, + "grad_norm": 0.23501618206501007, + "learning_rate": 4.808331068200329e-06, + "loss": 1.3179, + "num_input_tokens_seen": 574504000, + "step": 6060 + }, + { + "epoch": 0.7668014148559879, + "grad_norm": 0.2590336203575134, + "learning_rate": 4.759137947571491e-06, + "loss": 1.2479, + "num_input_tokens_seen": 575465184, + "step": 6070 + }, + { + "epoch": 0.7680646791308742, + "grad_norm": 0.2563855051994324, + "learning_rate": 4.710158132287332e-06, + "loss": 1.2028, + "num_input_tokens_seen": 576397088, + "step": 6080 + }, + { + "epoch": 0.7693279434057605, + "grad_norm": 0.29565200209617615, + "learning_rate": 4.661392442359582e-06, + "loss": 1.2799, + "num_input_tokens_seen": 577387744, + "step": 6090 + }, + { + "epoch": 0.7705912076806468, + "grad_norm": 0.26293325424194336, + "learning_rate": 4.612841694215136e-06, + "loss": 1.2272, + "num_input_tokens_seen": 578310496, + "step": 6100 + }, + { + "epoch": 0.7718544719555331, + "grad_norm": 0.2616961598396301, + "learning_rate": 4.56450670068234e-06, + "loss": 1.2489, + "num_input_tokens_seen": 579258496, + "step": 6110 + }, + { + "epoch": 0.7731177362304194, + "grad_norm": 0.24685987830162048, + "learning_rate": 4.51638827097745e-06, + "loss": 1.2588, + "num_input_tokens_seen": 580197760, + "step": 6120 + }, + { + "epoch": 0.7743810005053057, + "grad_norm": 0.2490658164024353, + "learning_rate": 4.46848721069101e-06, + "loss": 1.293, + "num_input_tokens_seen": 581108448, + "step": 6130 + }, + { + "epoch": 0.775644264780192, + "grad_norm": 0.24475279450416565, + "learning_rate": 4.420804321774441e-06, + "loss": 1.287, + "num_input_tokens_seen": 582039072, + "step": 6140 + }, + { + "epoch": 0.7769075290550783, + "grad_norm": 0.2623221278190613, + "learning_rate": 4.373340402526543e-06, + "loss": 1.2117, + "num_input_tokens_seen": 582932992, + "step": 6150 + }, + { + "epoch": 0.7781707933299646, + "grad_norm": 0.27465909719467163, + "learning_rate": 4.326096247580186e-06, + "loss": 1.2135, + "num_input_tokens_seen": 583861568, + "step": 6160 + }, + { + "epoch": 0.779434057604851, + "grad_norm": 0.28181222081184387, + "learning_rate": 4.27907264788896e-06, + "loss": 1.2537, + "num_input_tokens_seen": 584843136, + "step": 6170 + }, + { + "epoch": 0.7806973218797373, + "grad_norm": 0.2493135631084442, + "learning_rate": 4.23227039071398e-06, + "loss": 1.2263, + "num_input_tokens_seen": 585837664, + "step": 6180 + }, + { + "epoch": 0.7819605861546235, + "grad_norm": 0.26791173219680786, + "learning_rate": 4.1856902596106726e-06, + "loss": 1.2273, + "num_input_tokens_seen": 586797536, + "step": 6190 + }, + { + "epoch": 0.7832238504295098, + "grad_norm": 0.26550182700157166, + "learning_rate": 4.139333034415663e-06, + "loss": 1.2031, + "num_input_tokens_seen": 587734880, + "step": 6200 + }, + { + "epoch": 0.7844871147043961, + "grad_norm": 0.27607518434524536, + "learning_rate": 4.0931994912337345e-06, + "loss": 1.2426, + "num_input_tokens_seen": 588659360, + "step": 6210 + }, + { + "epoch": 0.7857503789792825, + "grad_norm": 0.2891901433467865, + "learning_rate": 4.047290402424806e-06, + "loss": 1.2864, + "num_input_tokens_seen": 589628256, + "step": 6220 + }, + { + "epoch": 0.7870136432541688, + "grad_norm": 0.2835799753665924, + "learning_rate": 4.001606536591042e-06, + "loss": 1.2634, + "num_input_tokens_seen": 590567904, + "step": 6230 + }, + { + "epoch": 0.7882769075290551, + "grad_norm": 0.2466340959072113, + "learning_rate": 3.956148658563945e-06, + "loss": 1.1893, + "num_input_tokens_seen": 591514912, + "step": 6240 + }, + { + "epoch": 0.7895401718039414, + "grad_norm": 0.2408566027879715, + "learning_rate": 3.910917529391582e-06, + "loss": 1.1672, + "num_input_tokens_seen": 592500416, + "step": 6250 + }, + { + "epoch": 0.7908034360788276, + "grad_norm": 0.28810036182403564, + "learning_rate": 3.8659139063258146e-06, + "loss": 1.2376, + "num_input_tokens_seen": 593538144, + "step": 6260 + }, + { + "epoch": 0.792066700353714, + "grad_norm": 0.26853030920028687, + "learning_rate": 3.8211385428096474e-06, + "loss": 1.2726, + "num_input_tokens_seen": 594506272, + "step": 6270 + }, + { + "epoch": 0.7933299646286003, + "grad_norm": 0.2816145122051239, + "learning_rate": 3.7765921884645917e-06, + "loss": 1.3003, + "num_input_tokens_seen": 595431904, + "step": 6280 + }, + { + "epoch": 0.7945932289034866, + "grad_norm": 0.26149782538414, + "learning_rate": 3.7322755890781368e-06, + "loss": 1.2477, + "num_input_tokens_seen": 596461440, + "step": 6290 + }, + { + "epoch": 0.7958564931783729, + "grad_norm": 0.260708749294281, + "learning_rate": 3.68818948659125e-06, + "loss": 1.256, + "num_input_tokens_seen": 597473312, + "step": 6300 + }, + { + "epoch": 0.7971197574532592, + "grad_norm": 0.27105608582496643, + "learning_rate": 3.6443346190859598e-06, + "loss": 1.2488, + "num_input_tokens_seen": 598412000, + "step": 6310 + }, + { + "epoch": 0.7983830217281456, + "grad_norm": 0.24419113993644714, + "learning_rate": 3.600711720772991e-06, + "loss": 1.2774, + "num_input_tokens_seen": 599430656, + "step": 6320 + }, + { + "epoch": 0.7996462860030318, + "grad_norm": 0.25261548161506653, + "learning_rate": 3.557321521979489e-06, + "loss": 1.2279, + "num_input_tokens_seen": 600412224, + "step": 6330 + }, + { + "epoch": 0.8009095502779181, + "grad_norm": 0.25508007407188416, + "learning_rate": 3.51416474913678e-06, + "loss": 1.251, + "num_input_tokens_seen": 601375968, + "step": 6340 + }, + { + "epoch": 0.8021728145528044, + "grad_norm": 0.2806225121021271, + "learning_rate": 3.471242124768207e-06, + "loss": 1.2055, + "num_input_tokens_seen": 602286496, + "step": 6350 + }, + { + "epoch": 0.8034360788276907, + "grad_norm": 0.32982784509658813, + "learning_rate": 3.42855436747705e-06, + "loss": 1.2309, + "num_input_tokens_seen": 603281216, + "step": 6360 + }, + { + "epoch": 0.8046993431025771, + "grad_norm": 0.27231696248054504, + "learning_rate": 3.3861021919344735e-06, + "loss": 1.1807, + "num_input_tokens_seen": 604231360, + "step": 6370 + }, + { + "epoch": 0.8059626073774634, + "grad_norm": 0.2853865325450897, + "learning_rate": 3.3438863088675783e-06, + "loss": 1.2638, + "num_input_tokens_seen": 605138944, + "step": 6380 + }, + { + "epoch": 0.8072258716523497, + "grad_norm": 0.2520991563796997, + "learning_rate": 3.301907425047496e-06, + "loss": 1.2291, + "num_input_tokens_seen": 606092896, + "step": 6390 + }, + { + "epoch": 0.8084891359272359, + "grad_norm": 0.2628813683986664, + "learning_rate": 3.260166243277564e-06, + "loss": 1.2588, + "num_input_tokens_seen": 607004512, + "step": 6400 + }, + { + "epoch": 0.8097524002021222, + "grad_norm": 0.24886657297611237, + "learning_rate": 3.2186634623815337e-06, + "loss": 1.2636, + "num_input_tokens_seen": 607919360, + "step": 6410 + }, + { + "epoch": 0.8110156644770086, + "grad_norm": 0.2556428909301758, + "learning_rate": 3.177399777191912e-06, + "loss": 1.2427, + "num_input_tokens_seen": 608921984, + "step": 6420 + }, + { + "epoch": 0.8122789287518949, + "grad_norm": 0.24436554312705994, + "learning_rate": 3.1363758785382866e-06, + "loss": 1.2667, + "num_input_tokens_seen": 609854816, + "step": 6430 + }, + { + "epoch": 0.8135421930267812, + "grad_norm": 0.26374685764312744, + "learning_rate": 3.0955924532357908e-06, + "loss": 1.2398, + "num_input_tokens_seen": 610815712, + "step": 6440 + }, + { + "epoch": 0.8148054573016675, + "grad_norm": 0.28322839736938477, + "learning_rate": 3.055050184073599e-06, + "loss": 1.2552, + "num_input_tokens_seen": 611770144, + "step": 6450 + }, + { + "epoch": 0.8160687215765539, + "grad_norm": 0.2539218068122864, + "learning_rate": 3.0147497498034735e-06, + "loss": 1.202, + "num_input_tokens_seen": 612729024, + "step": 6460 + }, + { + "epoch": 0.8173319858514401, + "grad_norm": 0.27928316593170166, + "learning_rate": 2.974691825128433e-06, + "loss": 1.2777, + "num_input_tokens_seen": 613643488, + "step": 6470 + }, + { + "epoch": 0.8185952501263264, + "grad_norm": 0.26042285561561584, + "learning_rate": 2.934877080691438e-06, + "loss": 1.2077, + "num_input_tokens_seen": 614610560, + "step": 6480 + }, + { + "epoch": 0.8198585144012127, + "grad_norm": 0.24354539811611176, + "learning_rate": 2.8953061830641663e-06, + "loss": 1.191, + "num_input_tokens_seen": 615577216, + "step": 6490 + }, + { + "epoch": 0.821121778676099, + "grad_norm": 0.2690410912036896, + "learning_rate": 2.8559797947358463e-06, + "loss": 1.1872, + "num_input_tokens_seen": 616548384, + "step": 6500 + }, + { + "epoch": 0.8223850429509854, + "grad_norm": 0.2414551079273224, + "learning_rate": 2.8168985741021875e-06, + "loss": 1.2318, + "num_input_tokens_seen": 617543904, + "step": 6510 + }, + { + "epoch": 0.8236483072258717, + "grad_norm": 0.23589564859867096, + "learning_rate": 2.7780631754543265e-06, + "loss": 1.2087, + "num_input_tokens_seen": 618540128, + "step": 6520 + }, + { + "epoch": 0.824911571500758, + "grad_norm": 0.25712019205093384, + "learning_rate": 2.739474248967916e-06, + "loss": 1.1912, + "num_input_tokens_seen": 619500352, + "step": 6530 + }, + { + "epoch": 0.8261748357756442, + "grad_norm": 0.26267293095588684, + "learning_rate": 2.7011324406921816e-06, + "loss": 1.2882, + "num_input_tokens_seen": 620453920, + "step": 6540 + }, + { + "epoch": 0.8274381000505305, + "grad_norm": 0.2525344789028168, + "learning_rate": 2.6630383925391654e-06, + "loss": 1.2602, + "num_input_tokens_seen": 621427552, + "step": 6550 + }, + { + "epoch": 0.8287013643254169, + "grad_norm": 0.25016433000564575, + "learning_rate": 2.6251927422729305e-06, + "loss": 1.2071, + "num_input_tokens_seen": 622454432, + "step": 6560 + }, + { + "epoch": 0.8299646286003032, + "grad_norm": 0.24579358100891113, + "learning_rate": 2.5875961234989185e-06, + "loss": 1.2262, + "num_input_tokens_seen": 623389792, + "step": 6570 + }, + { + "epoch": 0.8312278928751895, + "grad_norm": 0.24960210919380188, + "learning_rate": 2.5502491656533293e-06, + "loss": 1.1894, + "num_input_tokens_seen": 624352928, + "step": 6580 + }, + { + "epoch": 0.8324911571500758, + "grad_norm": 0.2529809772968292, + "learning_rate": 2.513152493992568e-06, + "loss": 1.2355, + "num_input_tokens_seen": 625237472, + "step": 6590 + }, + { + "epoch": 0.8337544214249621, + "grad_norm": 0.2756924331188202, + "learning_rate": 2.4763067295828053e-06, + "loss": 1.1959, + "num_input_tokens_seen": 626200416, + "step": 6600 + }, + { + "epoch": 0.8350176856998484, + "grad_norm": 0.2560481131076813, + "learning_rate": 2.439712489289555e-06, + "loss": 1.1686, + "num_input_tokens_seen": 627085760, + "step": 6610 + }, + { + "epoch": 0.8362809499747347, + "grad_norm": 0.2564622461795807, + "learning_rate": 2.403370385767364e-06, + "loss": 1.2475, + "num_input_tokens_seen": 628078240, + "step": 6620 + }, + { + "epoch": 0.837544214249621, + "grad_norm": 0.2827485203742981, + "learning_rate": 2.367281027449548e-06, + "loss": 1.1958, + "num_input_tokens_seen": 629016384, + "step": 6630 + }, + { + "epoch": 0.8388074785245073, + "grad_norm": 0.2654615342617035, + "learning_rate": 2.3314450185380047e-06, + "loss": 1.278, + "num_input_tokens_seen": 629963040, + "step": 6640 + }, + { + "epoch": 0.8400707427993936, + "grad_norm": 0.26686492562294006, + "learning_rate": 2.295862958993091e-06, + "loss": 1.2544, + "num_input_tokens_seen": 630921504, + "step": 6650 + }, + { + "epoch": 0.84133400707428, + "grad_norm": 0.2568102180957794, + "learning_rate": 2.2605354445236036e-06, + "loss": 1.1788, + "num_input_tokens_seen": 631837184, + "step": 6660 + }, + { + "epoch": 0.8425972713491663, + "grad_norm": 0.2527879476547241, + "learning_rate": 2.2254630665767636e-06, + "loss": 1.2889, + "num_input_tokens_seen": 632828288, + "step": 6670 + }, + { + "epoch": 0.8438605356240525, + "grad_norm": 0.26815953850746155, + "learning_rate": 2.1906464123283744e-06, + "loss": 1.2576, + "num_input_tokens_seen": 633815520, + "step": 6680 + }, + { + "epoch": 0.8451237998989388, + "grad_norm": 0.2878230810165405, + "learning_rate": 2.156086064672924e-06, + "loss": 1.2808, + "num_input_tokens_seen": 634722208, + "step": 6690 + }, + { + "epoch": 0.8463870641738251, + "grad_norm": 0.2378537356853485, + "learning_rate": 2.1217826022138783e-06, + "loss": 1.1683, + "num_input_tokens_seen": 635706144, + "step": 6700 + }, + { + "epoch": 0.8476503284487115, + "grad_norm": 0.25701719522476196, + "learning_rate": 2.0877365992539653e-06, + "loss": 1.2215, + "num_input_tokens_seen": 636619104, + "step": 6710 + }, + { + "epoch": 0.8489135927235978, + "grad_norm": 0.24454209208488464, + "learning_rate": 2.0539486257855774e-06, + "loss": 1.262, + "num_input_tokens_seen": 637517568, + "step": 6720 + }, + { + "epoch": 0.8501768569984841, + "grad_norm": 0.2640119791030884, + "learning_rate": 2.0204192474812166e-06, + "loss": 1.2826, + "num_input_tokens_seen": 638479936, + "step": 6730 + }, + { + "epoch": 0.8514401212733704, + "grad_norm": 0.2534317076206207, + "learning_rate": 1.987149025684028e-06, + "loss": 1.2236, + "num_input_tokens_seen": 639357088, + "step": 6740 + }, + { + "epoch": 0.8527033855482566, + "grad_norm": 0.2551516890525818, + "learning_rate": 1.9541385173984074e-06, + "loss": 1.1855, + "num_input_tokens_seen": 640362912, + "step": 6750 + }, + { + "epoch": 0.853966649823143, + "grad_norm": 0.257917582988739, + "learning_rate": 1.921388275280664e-06, + "loss": 1.2111, + "num_input_tokens_seen": 641336448, + "step": 6760 + }, + { + "epoch": 0.8552299140980293, + "grad_norm": 0.2687523663043976, + "learning_rate": 1.888898847629779e-06, + "loss": 1.2092, + "num_input_tokens_seen": 642348704, + "step": 6770 + }, + { + "epoch": 0.8564931783729156, + "grad_norm": 0.27500104904174805, + "learning_rate": 1.8566707783782231e-06, + "loss": 1.2022, + "num_input_tokens_seen": 643290272, + "step": 6780 + }, + { + "epoch": 0.8577564426478019, + "grad_norm": 0.27554988861083984, + "learning_rate": 1.8247046070828535e-06, + "loss": 1.1901, + "num_input_tokens_seen": 644221792, + "step": 6790 + }, + { + "epoch": 0.8590197069226883, + "grad_norm": 0.2787459194660187, + "learning_rate": 1.7930008689158637e-06, + "loss": 1.2127, + "num_input_tokens_seen": 645176224, + "step": 6800 + }, + { + "epoch": 0.8602829711975746, + "grad_norm": 0.23403003811836243, + "learning_rate": 1.761560094655851e-06, + "loss": 1.2688, + "num_input_tokens_seen": 646193152, + "step": 6810 + }, + { + "epoch": 0.8615462354724608, + "grad_norm": 0.2776746451854706, + "learning_rate": 1.730382810678895e-06, + "loss": 1.2174, + "num_input_tokens_seen": 647194528, + "step": 6820 + }, + { + "epoch": 0.8628094997473471, + "grad_norm": 0.2932538092136383, + "learning_rate": 1.6994695389497982e-06, + "loss": 1.1361, + "num_input_tokens_seen": 648208224, + "step": 6830 + }, + { + "epoch": 0.8640727640222334, + "grad_norm": 0.26842474937438965, + "learning_rate": 1.6688207970132808e-06, + "loss": 1.2041, + "num_input_tokens_seen": 649171072, + "step": 6840 + }, + { + "epoch": 0.8653360282971198, + "grad_norm": 0.2833315134048462, + "learning_rate": 1.6384370979853776e-06, + "loss": 1.27, + "num_input_tokens_seen": 650172224, + "step": 6850 + }, + { + "epoch": 0.8665992925720061, + "grad_norm": 0.26029422879219055, + "learning_rate": 1.6083189505447964e-06, + "loss": 1.2732, + "num_input_tokens_seen": 651096864, + "step": 6860 + }, + { + "epoch": 0.8678625568468924, + "grad_norm": 0.2853679060935974, + "learning_rate": 1.578466858924442e-06, + "loss": 1.1936, + "num_input_tokens_seen": 652020192, + "step": 6870 + }, + { + "epoch": 0.8691258211217787, + "grad_norm": 0.28354784846305847, + "learning_rate": 1.548881322902959e-06, + "loss": 1.2461, + "num_input_tokens_seen": 652919488, + "step": 6880 + }, + { + "epoch": 0.8703890853966649, + "grad_norm": 0.2513621747493744, + "learning_rate": 1.5195628377963493e-06, + "loss": 1.2352, + "num_input_tokens_seen": 653868192, + "step": 6890 + }, + { + "epoch": 0.8716523496715513, + "grad_norm": 0.2537190616130829, + "learning_rate": 1.4905118944497058e-06, + "loss": 1.1954, + "num_input_tokens_seen": 654866304, + "step": 6900 + }, + { + "epoch": 0.8729156139464376, + "grad_norm": 0.26647478342056274, + "learning_rate": 1.4617289792289743e-06, + "loss": 1.2386, + "num_input_tokens_seen": 655850752, + "step": 6910 + }, + { + "epoch": 0.8741788782213239, + "grad_norm": 0.2586477994918823, + "learning_rate": 1.4332145740128345e-06, + "loss": 1.256, + "num_input_tokens_seen": 656778176, + "step": 6920 + }, + { + "epoch": 0.8754421424962102, + "grad_norm": 0.2705184817314148, + "learning_rate": 1.4049691561845975e-06, + "loss": 1.2329, + "num_input_tokens_seen": 657784128, + "step": 6930 + }, + { + "epoch": 0.8767054067710965, + "grad_norm": 0.2453477680683136, + "learning_rate": 1.376993198624248e-06, + "loss": 1.1833, + "num_input_tokens_seen": 658703168, + "step": 6940 + }, + { + "epoch": 0.8779686710459829, + "grad_norm": 0.25567731261253357, + "learning_rate": 1.3492871697005042e-06, + "loss": 1.2284, + "num_input_tokens_seen": 659688864, + "step": 6950 + }, + { + "epoch": 0.8792319353208691, + "grad_norm": 0.29871034622192383, + "learning_rate": 1.3218515332629892e-06, + "loss": 1.2664, + "num_input_tokens_seen": 660603104, + "step": 6960 + }, + { + "epoch": 0.8804951995957554, + "grad_norm": 0.25376957654953003, + "learning_rate": 1.2946867486344597e-06, + "loss": 1.2197, + "num_input_tokens_seen": 661552704, + "step": 6970 + }, + { + "epoch": 0.8817584638706417, + "grad_norm": 0.3075960874557495, + "learning_rate": 1.267793270603122e-06, + "loss": 1.1982, + "num_input_tokens_seen": 662524096, + "step": 6980 + }, + { + "epoch": 0.883021728145528, + "grad_norm": 0.2471645623445511, + "learning_rate": 1.2411715494150024e-06, + "loss": 1.1913, + "num_input_tokens_seen": 663442336, + "step": 6990 + }, + { + "epoch": 0.8842849924204144, + "grad_norm": 0.2692629098892212, + "learning_rate": 1.214822030766437e-06, + "loss": 1.2643, + "num_input_tokens_seen": 664365344, + "step": 7000 + }, + { + "epoch": 0.8855482566953007, + "grad_norm": 0.2840708792209625, + "learning_rate": 1.1887451557965732e-06, + "loss": 1.1826, + "num_input_tokens_seen": 665290880, + "step": 7010 + }, + { + "epoch": 0.886811520970187, + "grad_norm": 0.2730172574520111, + "learning_rate": 1.1629413610800198e-06, + "loss": 1.2738, + "num_input_tokens_seen": 666231392, + "step": 7020 + }, + { + "epoch": 0.8880747852450732, + "grad_norm": 0.28216251730918884, + "learning_rate": 1.1374110786195212e-06, + "loss": 1.1925, + "num_input_tokens_seen": 667211072, + "step": 7030 + }, + { + "epoch": 0.8893380495199595, + "grad_norm": 0.25766119360923767, + "learning_rate": 1.1121547358387154e-06, + "loss": 1.2013, + "num_input_tokens_seen": 668144320, + "step": 7040 + }, + { + "epoch": 0.8906013137948459, + "grad_norm": 0.24992607533931732, + "learning_rate": 1.087172755575001e-06, + "loss": 1.1939, + "num_input_tokens_seen": 669092064, + "step": 7050 + }, + { + "epoch": 0.8918645780697322, + "grad_norm": 0.26488760113716125, + "learning_rate": 1.0624655560724363e-06, + "loss": 1.2276, + "num_input_tokens_seen": 670011840, + "step": 7060 + }, + { + "epoch": 0.8931278423446185, + "grad_norm": 0.25586891174316406, + "learning_rate": 1.0380335509747583e-06, + "loss": 1.2528, + "num_input_tokens_seen": 670906560, + "step": 7070 + }, + { + "epoch": 0.8943911066195048, + "grad_norm": 0.2638219892978668, + "learning_rate": 1.0138771493184352e-06, + "loss": 1.2721, + "num_input_tokens_seen": 671885760, + "step": 7080 + }, + { + "epoch": 0.8956543708943911, + "grad_norm": 0.25774410367012024, + "learning_rate": 9.899967555258347e-07, + "loss": 1.2788, + "num_input_tokens_seen": 672838336, + "step": 7090 + }, + { + "epoch": 0.8969176351692774, + "grad_norm": 0.24537810683250427, + "learning_rate": 9.663927693984438e-07, + "loss": 1.2218, + "num_input_tokens_seen": 673773728, + "step": 7100 + }, + { + "epoch": 0.8981808994441637, + "grad_norm": 0.269209623336792, + "learning_rate": 9.430655861101829e-07, + "loss": 1.1914, + "num_input_tokens_seen": 674686496, + "step": 7110 + }, + { + "epoch": 0.89944416371905, + "grad_norm": 0.2713133692741394, + "learning_rate": 9.200155962007868e-07, + "loss": 1.221, + "num_input_tokens_seen": 675659040, + "step": 7120 + }, + { + "epoch": 0.9007074279939363, + "grad_norm": 0.2782800793647766, + "learning_rate": 8.972431855692685e-07, + "loss": 1.2197, + "num_input_tokens_seen": 676523936, + "step": 7130 + }, + { + "epoch": 0.9019706922688226, + "grad_norm": 0.28656941652297974, + "learning_rate": 8.747487354674457e-07, + "loss": 1.2924, + "num_input_tokens_seen": 677481408, + "step": 7140 + }, + { + "epoch": 0.903233956543709, + "grad_norm": 0.2603612542152405, + "learning_rate": 8.525326224935794e-07, + "loss": 1.2418, + "num_input_tokens_seen": 678461056, + "step": 7150 + }, + { + "epoch": 0.9044972208185953, + "grad_norm": 0.2789015471935272, + "learning_rate": 8.305952185860484e-07, + "loss": 1.1934, + "num_input_tokens_seen": 679452256, + "step": 7160 + }, + { + "epoch": 0.9057604850934815, + "grad_norm": 0.29948341846466064, + "learning_rate": 8.089368910171396e-07, + "loss": 1.2467, + "num_input_tokens_seen": 680371648, + "step": 7170 + }, + { + "epoch": 0.9070237493683678, + "grad_norm": 0.26572108268737793, + "learning_rate": 7.875580023868885e-07, + "loss": 1.1925, + "num_input_tokens_seen": 681355648, + "step": 7180 + }, + { + "epoch": 0.9082870136432541, + "grad_norm": 0.24899084866046906, + "learning_rate": 7.664589106170069e-07, + "loss": 1.252, + "num_input_tokens_seen": 682361344, + "step": 7190 + }, + { + "epoch": 0.9095502779181405, + "grad_norm": 0.24572855234146118, + "learning_rate": 7.456399689449052e-07, + "loss": 1.2339, + "num_input_tokens_seen": 683316896, + "step": 7200 + }, + { + "epoch": 0.9108135421930268, + "grad_norm": 0.2785273492336273, + "learning_rate": 7.251015259177561e-07, + "loss": 1.2259, + "num_input_tokens_seen": 684286528, + "step": 7210 + }, + { + "epoch": 0.9120768064679131, + "grad_norm": 0.24116089940071106, + "learning_rate": 7.048439253866866e-07, + "loss": 1.1971, + "num_input_tokens_seen": 685241440, + "step": 7220 + }, + { + "epoch": 0.9133400707427994, + "grad_norm": 0.25249651074409485, + "learning_rate": 6.848675065009904e-07, + "loss": 1.1883, + "num_input_tokens_seen": 686179008, + "step": 7230 + }, + { + "epoch": 0.9146033350176856, + "grad_norm": 0.24898767471313477, + "learning_rate": 6.651726037024796e-07, + "loss": 1.2214, + "num_input_tokens_seen": 687148992, + "step": 7240 + }, + { + "epoch": 0.915866599292572, + "grad_norm": 0.2656947672367096, + "learning_rate": 6.457595467198567e-07, + "loss": 1.1936, + "num_input_tokens_seen": 688136000, + "step": 7250 + }, + { + "epoch": 0.9171298635674583, + "grad_norm": 0.2621888816356659, + "learning_rate": 6.266286605632295e-07, + "loss": 1.2068, + "num_input_tokens_seen": 689067328, + "step": 7260 + }, + { + "epoch": 0.9183931278423446, + "grad_norm": 0.2367779016494751, + "learning_rate": 6.07780265518632e-07, + "loss": 1.2581, + "num_input_tokens_seen": 690001664, + "step": 7270 + }, + { + "epoch": 0.9196563921172309, + "grad_norm": 0.24973830580711365, + "learning_rate": 5.892146771426915e-07, + "loss": 1.2381, + "num_input_tokens_seen": 690943648, + "step": 7280 + }, + { + "epoch": 0.9209196563921173, + "grad_norm": 0.2687539756298065, + "learning_rate": 5.70932206257326e-07, + "loss": 1.2386, + "num_input_tokens_seen": 691864224, + "step": 7290 + }, + { + "epoch": 0.9221829206670036, + "grad_norm": 0.25320330262184143, + "learning_rate": 5.529331589445516e-07, + "loss": 1.2678, + "num_input_tokens_seen": 692833472, + "step": 7300 + }, + { + "epoch": 0.9234461849418898, + "grad_norm": 0.2584136426448822, + "learning_rate": 5.35217836541362e-07, + "loss": 1.2621, + "num_input_tokens_seen": 693706112, + "step": 7310 + }, + { + "epoch": 0.9247094492167761, + "grad_norm": 0.2527817487716675, + "learning_rate": 5.177865356346644e-07, + "loss": 1.2521, + "num_input_tokens_seen": 694636736, + "step": 7320 + }, + { + "epoch": 0.9259727134916624, + "grad_norm": 0.24299506843090057, + "learning_rate": 5.00639548056338e-07, + "loss": 1.2517, + "num_input_tokens_seen": 695631264, + "step": 7330 + }, + { + "epoch": 0.9272359777665488, + "grad_norm": 0.24970118701457977, + "learning_rate": 4.837771608783264e-07, + "loss": 1.2364, + "num_input_tokens_seen": 696587872, + "step": 7340 + }, + { + "epoch": 0.9284992420414351, + "grad_norm": 0.2587854564189911, + "learning_rate": 4.6719965640784676e-07, + "loss": 1.2376, + "num_input_tokens_seen": 697601376, + "step": 7350 + }, + { + "epoch": 0.9297625063163214, + "grad_norm": 0.26746806502342224, + "learning_rate": 4.509073121826623e-07, + "loss": 1.2466, + "num_input_tokens_seen": 698550432, + "step": 7360 + }, + { + "epoch": 0.9310257705912077, + "grad_norm": 0.269715815782547, + "learning_rate": 4.349004009664275e-07, + "loss": 1.2421, + "num_input_tokens_seen": 699511744, + "step": 7370 + }, + { + "epoch": 0.9322890348660939, + "grad_norm": 0.24946600198745728, + "learning_rate": 4.1917919074412416e-07, + "loss": 1.1982, + "num_input_tokens_seen": 700446176, + "step": 7380 + }, + { + "epoch": 0.9335522991409803, + "grad_norm": 0.281342089176178, + "learning_rate": 4.037439447175789e-07, + "loss": 1.2408, + "num_input_tokens_seen": 701373568, + "step": 7390 + }, + { + "epoch": 0.9348155634158666, + "grad_norm": 0.2512856125831604, + "learning_rate": 3.88594921301055e-07, + "loss": 1.2414, + "num_input_tokens_seen": 702294016, + "step": 7400 + }, + { + "epoch": 0.9360788276907529, + "grad_norm": 0.2601119577884674, + "learning_rate": 3.737323741169257e-07, + "loss": 1.2491, + "num_input_tokens_seen": 703232672, + "step": 7410 + }, + { + "epoch": 0.9373420919656392, + "grad_norm": 0.270298033952713, + "learning_rate": 3.5915655199142663e-07, + "loss": 1.2174, + "num_input_tokens_seen": 704175744, + "step": 7420 + }, + { + "epoch": 0.9386053562405255, + "grad_norm": 0.23530983924865723, + "learning_rate": 3.448676989504925e-07, + "loss": 1.2368, + "num_input_tokens_seen": 705141664, + "step": 7430 + }, + { + "epoch": 0.9398686205154119, + "grad_norm": 0.2633696496486664, + "learning_rate": 3.308660542156694e-07, + "loss": 1.2018, + "num_input_tokens_seen": 706067200, + "step": 7440 + }, + { + "epoch": 0.9411318847902981, + "grad_norm": 0.26215797662734985, + "learning_rate": 3.1715185220010984e-07, + "loss": 1.2193, + "num_input_tokens_seen": 706966304, + "step": 7450 + }, + { + "epoch": 0.9423951490651844, + "grad_norm": 0.27117466926574707, + "learning_rate": 3.037253225046529e-07, + "loss": 1.2907, + "num_input_tokens_seen": 707921440, + "step": 7460 + }, + { + "epoch": 0.9436584133400707, + "grad_norm": 0.27227288484573364, + "learning_rate": 2.905866899139708e-07, + "loss": 1.251, + "num_input_tokens_seen": 708838784, + "step": 7470 + }, + { + "epoch": 0.944921677614957, + "grad_norm": 0.26309284567832947, + "learning_rate": 2.777361743928194e-07, + "loss": 1.2574, + "num_input_tokens_seen": 709754176, + "step": 7480 + }, + { + "epoch": 0.9461849418898434, + "grad_norm": 0.24601784348487854, + "learning_rate": 2.6517399108233886e-07, + "loss": 1.1808, + "num_input_tokens_seen": 710722944, + "step": 7490 + }, + { + "epoch": 0.9474482061647297, + "grad_norm": 0.28660014271736145, + "learning_rate": 2.5290035029646523e-07, + "loss": 1.2572, + "num_input_tokens_seen": 711716256, + "step": 7500 + }, + { + "epoch": 0.948711470439616, + "grad_norm": 0.2446954995393753, + "learning_rate": 2.409154575184077e-07, + "loss": 1.1996, + "num_input_tokens_seen": 712625856, + "step": 7510 + }, + { + "epoch": 0.9499747347145022, + "grad_norm": 0.2447938770055771, + "learning_rate": 2.2921951339720053e-07, + "loss": 1.2414, + "num_input_tokens_seen": 713581728, + "step": 7520 + }, + { + "epoch": 0.9512379989893885, + "grad_norm": 0.2409149706363678, + "learning_rate": 2.178127137443489e-07, + "loss": 1.1916, + "num_input_tokens_seen": 714471360, + "step": 7530 + }, + { + "epoch": 0.9525012632642749, + "grad_norm": 0.25430941581726074, + "learning_rate": 2.0669524953055377e-07, + "loss": 1.2343, + "num_input_tokens_seen": 715391488, + "step": 7540 + }, + { + "epoch": 0.9537645275391612, + "grad_norm": 0.27573850750923157, + "learning_rate": 1.9586730688250395e-07, + "loss": 1.2559, + "num_input_tokens_seen": 716352896, + "step": 7550 + }, + { + "epoch": 0.9550277918140475, + "grad_norm": 0.2683832347393036, + "learning_rate": 1.8532906707978106e-07, + "loss": 1.2169, + "num_input_tokens_seen": 717298784, + "step": 7560 + }, + { + "epoch": 0.9562910560889338, + "grad_norm": 0.28321197628974915, + "learning_rate": 1.7508070655179757e-07, + "loss": 1.2796, + "num_input_tokens_seen": 718316000, + "step": 7570 + }, + { + "epoch": 0.9575543203638202, + "grad_norm": 0.25757691264152527, + "learning_rate": 1.65122396874863e-07, + "loss": 1.2222, + "num_input_tokens_seen": 719217248, + "step": 7580 + }, + { + "epoch": 0.9588175846387064, + "grad_norm": 0.2687084972858429, + "learning_rate": 1.5545430476930465e-07, + "loss": 1.1853, + "num_input_tokens_seen": 720198464, + "step": 7590 + }, + { + "epoch": 0.9600808489135927, + "grad_norm": 0.2586497664451599, + "learning_rate": 1.4607659209667165e-07, + "loss": 1.2438, + "num_input_tokens_seen": 721068160, + "step": 7600 + }, + { + "epoch": 0.961344113188479, + "grad_norm": 0.24861587584018707, + "learning_rate": 1.3698941585704033e-07, + "loss": 1.2712, + "num_input_tokens_seen": 722061472, + "step": 7610 + }, + { + "epoch": 0.9626073774633653, + "grad_norm": 0.244459331035614, + "learning_rate": 1.281929281863639e-07, + "loss": 1.1897, + "num_input_tokens_seen": 723015232, + "step": 7620 + }, + { + "epoch": 0.9638706417382517, + "grad_norm": 0.225861594080925, + "learning_rate": 1.1968727635394497e-07, + "loss": 1.2689, + "num_input_tokens_seen": 724000384, + "step": 7630 + }, + { + "epoch": 0.965133906013138, + "grad_norm": 0.246552512049675, + "learning_rate": 1.1147260275995634e-07, + "loss": 1.1784, + "num_input_tokens_seen": 724964992, + "step": 7640 + }, + { + "epoch": 0.9663971702880243, + "grad_norm": 0.2584232687950134, + "learning_rate": 1.0354904493306865e-07, + "loss": 1.2263, + "num_input_tokens_seen": 725923104, + "step": 7650 + }, + { + "epoch": 0.9676604345629105, + "grad_norm": 0.25840452313423157, + "learning_rate": 9.591673552813844e-08, + "loss": 1.2081, + "num_input_tokens_seen": 726876224, + "step": 7660 + }, + { + "epoch": 0.9689236988377968, + "grad_norm": 0.28871768712997437, + "learning_rate": 8.85758023239913e-08, + "loss": 1.2545, + "num_input_tokens_seen": 727721568, + "step": 7670 + }, + { + "epoch": 0.9701869631126832, + "grad_norm": 0.29037731885910034, + "learning_rate": 8.152636822127883e-08, + "loss": 1.2221, + "num_input_tokens_seen": 728634912, + "step": 7680 + }, + { + "epoch": 0.9714502273875695, + "grad_norm": 0.2691645324230194, + "learning_rate": 7.476855124043086e-08, + "loss": 1.2158, + "num_input_tokens_seen": 729574464, + "step": 7690 + }, + { + "epoch": 0.9727134916624558, + "grad_norm": 0.2742849290370941, + "learning_rate": 6.830246451966975e-08, + "loss": 1.2089, + "num_input_tokens_seen": 730499136, + "step": 7700 + }, + { + "epoch": 0.9739767559373421, + "grad_norm": 0.26165613532066345, + "learning_rate": 6.212821631311621e-08, + "loss": 1.2314, + "num_input_tokens_seen": 731461280, + "step": 7710 + }, + { + "epoch": 0.9752400202122284, + "grad_norm": 0.24117015302181244, + "learning_rate": 5.624590998898615e-08, + "loss": 1.2055, + "num_input_tokens_seen": 732374848, + "step": 7720 + }, + { + "epoch": 0.9765032844871147, + "grad_norm": 0.2643440365791321, + "learning_rate": 5.0655644027847994e-08, + "loss": 1.2044, + "num_input_tokens_seen": 733271648, + "step": 7730 + }, + { + "epoch": 0.977766548762001, + "grad_norm": 0.24681268632411957, + "learning_rate": 4.5357512020986755e-08, + "loss": 1.1749, + "num_input_tokens_seen": 734233312, + "step": 7740 + }, + { + "epoch": 0.9790298130368873, + "grad_norm": 0.28687500953674316, + "learning_rate": 4.0351602668824423e-08, + "loss": 1.2237, + "num_input_tokens_seen": 735189120, + "step": 7750 + }, + { + "epoch": 0.9802930773117736, + "grad_norm": 0.2667155861854553, + "learning_rate": 3.563799977944537e-08, + "loss": 1.2138, + "num_input_tokens_seen": 736120128, + "step": 7760 + }, + { + "epoch": 0.9815563415866599, + "grad_norm": 0.25432640314102173, + "learning_rate": 3.121678226718577e-08, + "loss": 1.1976, + "num_input_tokens_seen": 737063456, + "step": 7770 + }, + { + "epoch": 0.9828196058615463, + "grad_norm": 0.2468518167734146, + "learning_rate": 2.708802415131828e-08, + "loss": 1.2268, + "num_input_tokens_seen": 738004096, + "step": 7780 + }, + { + "epoch": 0.9840828701364326, + "grad_norm": 0.27853333950042725, + "learning_rate": 2.3251794554806636e-08, + "loss": 1.2074, + "num_input_tokens_seen": 739017440, + "step": 7790 + }, + { + "epoch": 0.9853461344113188, + "grad_norm": 0.26621630787849426, + "learning_rate": 1.9708157703157424e-08, + "loss": 1.213, + "num_input_tokens_seen": 740034656, + "step": 7800 + }, + { + "epoch": 0.9866093986862051, + "grad_norm": 0.2626071572303772, + "learning_rate": 1.645717292333204e-08, + "loss": 1.2604, + "num_input_tokens_seen": 741063104, + "step": 7810 + }, + { + "epoch": 0.9878726629610914, + "grad_norm": 0.26386693120002747, + "learning_rate": 1.3498894642769432e-08, + "loss": 1.2779, + "num_input_tokens_seen": 742014688, + "step": 7820 + }, + { + "epoch": 0.9891359272359778, + "grad_norm": 0.2615217864513397, + "learning_rate": 1.0833372388455442e-08, + "loss": 1.2108, + "num_input_tokens_seen": 742960160, + "step": 7830 + }, + { + "epoch": 0.9903991915108641, + "grad_norm": 0.2661604881286621, + "learning_rate": 8.460650786114576e-09, + "loss": 1.1899, + "num_input_tokens_seen": 743845760, + "step": 7840 + }, + { + "epoch": 0.9916624557857504, + "grad_norm": 0.26591452956199646, + "learning_rate": 6.380769559444499e-09, + "loss": 1.2474, + "num_input_tokens_seen": 744760672, + "step": 7850 + }, + { + "epoch": 0.9929257200606367, + "grad_norm": 0.27036914229393005, + "learning_rate": 4.5937635294671094e-09, + "loss": 1.2709, + "num_input_tokens_seen": 745728352, + "step": 7860 + }, + { + "epoch": 0.9941889843355229, + "grad_norm": 0.24849487841129303, + "learning_rate": 3.099662613930132e-09, + "loss": 1.2096, + "num_input_tokens_seen": 746640928, + "step": 7870 + }, + { + "epoch": 0.9954522486104093, + "grad_norm": 0.2538692057132721, + "learning_rate": 1.8984918268175055e-09, + "loss": 1.2464, + "num_input_tokens_seen": 747588896, + "step": 7880 + }, + { + "epoch": 0.9967155128852956, + "grad_norm": 0.26595503091812134, + "learning_rate": 9.902712779277788e-10, + "loss": 1.2883, + "num_input_tokens_seen": 748464864, + "step": 7890 + }, + { + "epoch": 0.9979787771601819, + "grad_norm": 0.27239322662353516, + "learning_rate": 3.7501617253216096e-10, + "loss": 1.1961, + "num_input_tokens_seen": 749490752, + "step": 7900 + }, + { + "epoch": 0.9992420414350682, + "grad_norm": 0.2784164249897003, + "learning_rate": 5.2736811129716613e-11, + "loss": 1.2785, + "num_input_tokens_seen": 750395392, + "step": 7910 + }, + { + "epoch": 1.0, + "num_input_tokens_seen": 750938410, + "step": 7916, + "total_flos": 3.6248418467253043e+18, + "train_loss": 1.2696220230851407, + "train_runtime": 79988.0702, + "train_samples_per_second": 12.667, + "train_steps_per_second": 0.099 + } + ], + "logging_steps": 10, + "max_steps": 7916, + "num_input_tokens_seen": 750938410, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.6248418467253043e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}