{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50000, "global_step": 255210, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007836683515536225, "grad_norm": 0.5945625901222229, "learning_rate": 4.99612084165981e-05, "loss": 0.3949, "step": 200 }, { "epoch": 0.001567336703107245, "grad_norm": 0.3452136516571045, "learning_rate": 4.992202499902042e-05, "loss": 0.391, "step": 400 }, { "epoch": 0.0023510050546608676, "grad_norm": 0.33119624853134155, "learning_rate": 4.988284158144273e-05, "loss": 0.3661, "step": 600 }, { "epoch": 0.00313467340621449, "grad_norm": 0.13218963146209717, "learning_rate": 4.984365816386505e-05, "loss": 0.4386, "step": 800 }, { "epoch": 0.003918341757768113, "grad_norm": 0.569341778755188, "learning_rate": 4.980447474628737e-05, "loss": 0.3566, "step": 1000 }, { "epoch": 0.004702010109321735, "grad_norm": 0.29856380820274353, "learning_rate": 4.976529132870969e-05, "loss": 0.3683, "step": 1200 }, { "epoch": 0.005485678460875358, "grad_norm": 0.24387845396995544, "learning_rate": 4.9726107911132014e-05, "loss": 0.3514, "step": 1400 }, { "epoch": 0.00626934681242898, "grad_norm": 0.34159696102142334, "learning_rate": 4.9686924493554334e-05, "loss": 0.3873, "step": 1600 }, { "epoch": 0.007053015163982603, "grad_norm": 0.27332696318626404, "learning_rate": 4.964774107597665e-05, "loss": 0.3593, "step": 1800 }, { "epoch": 0.007836683515536225, "grad_norm": 0.3903495669364929, "learning_rate": 4.960855765839897e-05, "loss": 0.3818, "step": 2000 }, { "epoch": 0.008620351867089848, "grad_norm": 0.5951546430587769, "learning_rate": 4.956937424082129e-05, "loss": 0.3865, "step": 2200 }, { "epoch": 0.00940402021864347, "grad_norm": 0.8191711902618408, "learning_rate": 4.953038674033149e-05, "loss": 0.4202, "step": 2400 }, { "epoch": 0.010187688570197093, "grad_norm": 1.3128159046173096, "learning_rate": 4.949120332275381e-05, "loss": 0.4125, "step": 2600 }, { "epoch": 0.010971356921750716, "grad_norm": 0.5406262278556824, "learning_rate": 4.9452019905176134e-05, "loss": 0.3776, "step": 2800 }, { "epoch": 0.011755025273304338, "grad_norm": 0.2904791533946991, "learning_rate": 4.9412836487598454e-05, "loss": 0.4015, "step": 3000 }, { "epoch": 0.01253869362485796, "grad_norm": 0.3807118833065033, "learning_rate": 4.937365307002077e-05, "loss": 0.374, "step": 3200 }, { "epoch": 0.013322361976411583, "grad_norm": 0.24626630544662476, "learning_rate": 4.933446965244309e-05, "loss": 0.3905, "step": 3400 }, { "epoch": 0.014106030327965206, "grad_norm": 0.3328797519207001, "learning_rate": 4.929528623486541e-05, "loss": 0.4208, "step": 3600 }, { "epoch": 0.014889698679518828, "grad_norm": 0.545254647731781, "learning_rate": 4.925610281728772e-05, "loss": 0.4045, "step": 3800 }, { "epoch": 0.01567336703107245, "grad_norm": 0.29221510887145996, "learning_rate": 4.921691939971004e-05, "loss": 0.3758, "step": 4000 }, { "epoch": 0.01645703538262607, "grad_norm": 0.516792893409729, "learning_rate": 4.917773598213236e-05, "loss": 0.3715, "step": 4200 }, { "epoch": 0.017240703734179696, "grad_norm": 0.2908971905708313, "learning_rate": 4.913855256455468e-05, "loss": 0.4018, "step": 4400 }, { "epoch": 0.018024372085733317, "grad_norm": 0.71240234375, "learning_rate": 4.9099369146977e-05, "loss": 0.3969, "step": 4600 }, { "epoch": 0.01880804043728694, "grad_norm": 0.6078171730041504, "learning_rate": 4.906038164648721e-05, "loss": 0.3703, "step": 4800 }, { "epoch": 0.01959170878884056, "grad_norm": 0.44275930523872375, "learning_rate": 4.902119822890953e-05, "loss": 0.3918, "step": 5000 }, { "epoch": 0.020375377140394186, "grad_norm": 0.4709361791610718, "learning_rate": 4.898201481133184e-05, "loss": 0.3677, "step": 5200 }, { "epoch": 0.021159045491947807, "grad_norm": 0.25307971239089966, "learning_rate": 4.894283139375417e-05, "loss": 0.3935, "step": 5400 }, { "epoch": 0.02194271384350143, "grad_norm": 0.13995467126369476, "learning_rate": 4.890364797617649e-05, "loss": 0.3515, "step": 5600 }, { "epoch": 0.022726382195055052, "grad_norm": 0.44970160722732544, "learning_rate": 4.88644645585988e-05, "loss": 0.3742, "step": 5800 }, { "epoch": 0.023510050546608676, "grad_norm": 0.22797733545303345, "learning_rate": 4.882528114102112e-05, "loss": 0.3772, "step": 6000 }, { "epoch": 0.024293718898162297, "grad_norm": 0.17690172791481018, "learning_rate": 4.878609772344344e-05, "loss": 0.367, "step": 6200 }, { "epoch": 0.02507738724971592, "grad_norm": 0.29871538281440735, "learning_rate": 4.8746914305865756e-05, "loss": 0.3923, "step": 6400 }, { "epoch": 0.025861055601269542, "grad_norm": 0.6097844839096069, "learning_rate": 4.8707730888288077e-05, "loss": 0.3593, "step": 6600 }, { "epoch": 0.026644723952823166, "grad_norm": 0.1999012678861618, "learning_rate": 4.866874338779829e-05, "loss": 0.3938, "step": 6800 }, { "epoch": 0.027428392304376787, "grad_norm": 0.5505908131599426, "learning_rate": 4.862955997022061e-05, "loss": 0.4115, "step": 7000 }, { "epoch": 0.02821206065593041, "grad_norm": 0.1734248548746109, "learning_rate": 4.859037655264292e-05, "loss": 0.3701, "step": 7200 }, { "epoch": 0.028995729007484032, "grad_norm": 0.276464581489563, "learning_rate": 4.855119313506524e-05, "loss": 0.3624, "step": 7400 }, { "epoch": 0.029779397359037656, "grad_norm": 0.3009423017501831, "learning_rate": 4.851200971748756e-05, "loss": 0.4136, "step": 7600 }, { "epoch": 0.030563065710591277, "grad_norm": 0.6675565838813782, "learning_rate": 4.8472826299909876e-05, "loss": 0.4103, "step": 7800 }, { "epoch": 0.0313467340621449, "grad_norm": 0.4492146372795105, "learning_rate": 4.8433642882332196e-05, "loss": 0.3836, "step": 8000 }, { "epoch": 0.03213040241369852, "grad_norm": 0.15747825801372528, "learning_rate": 4.839445946475452e-05, "loss": 0.4112, "step": 8200 }, { "epoch": 0.03291407076525214, "grad_norm": 0.8615152835845947, "learning_rate": 4.835527604717684e-05, "loss": 0.3957, "step": 8400 }, { "epoch": 0.03369773911680577, "grad_norm": 0.28233879804611206, "learning_rate": 4.831609262959916e-05, "loss": 0.3608, "step": 8600 }, { "epoch": 0.03448140746835939, "grad_norm": 0.8058581948280334, "learning_rate": 4.827710512910936e-05, "loss": 0.3757, "step": 8800 }, { "epoch": 0.03526507581991301, "grad_norm": 0.45508599281311035, "learning_rate": 4.823792171153168e-05, "loss": 0.3622, "step": 9000 }, { "epoch": 0.03604874417146663, "grad_norm": 0.4726542532444, "learning_rate": 4.8198738293953996e-05, "loss": 0.3638, "step": 9200 }, { "epoch": 0.03683241252302026, "grad_norm": 0.26873549818992615, "learning_rate": 4.8159554876376316e-05, "loss": 0.3748, "step": 9400 }, { "epoch": 0.03761608087457388, "grad_norm": 0.5933759212493896, "learning_rate": 4.812037145879864e-05, "loss": 0.352, "step": 9600 }, { "epoch": 0.0383997492261275, "grad_norm": 0.40285825729370117, "learning_rate": 4.808118804122096e-05, "loss": 0.4177, "step": 9800 }, { "epoch": 0.03918341757768112, "grad_norm": 1.7145395278930664, "learning_rate": 4.804200462364328e-05, "loss": 0.3758, "step": 10000 }, { "epoch": 0.03996708592923475, "grad_norm": 0.743889570236206, "learning_rate": 4.80028212060656e-05, "loss": 0.3642, "step": 10200 }, { "epoch": 0.04075075428078837, "grad_norm": 0.6327961087226868, "learning_rate": 4.796363778848791e-05, "loss": 0.4351, "step": 10400 }, { "epoch": 0.04153442263234199, "grad_norm": 0.14780865609645844, "learning_rate": 4.792445437091023e-05, "loss": 0.3693, "step": 10600 }, { "epoch": 0.04231809098389561, "grad_norm": 0.5431840419769287, "learning_rate": 4.788527095333255e-05, "loss": 0.3743, "step": 10800 }, { "epoch": 0.04310175933544924, "grad_norm": 0.5209939479827881, "learning_rate": 4.784608753575487e-05, "loss": 0.4316, "step": 11000 }, { "epoch": 0.04388542768700286, "grad_norm": 0.4877757728099823, "learning_rate": 4.7806904118177185e-05, "loss": 0.3879, "step": 11200 }, { "epoch": 0.04466909603855648, "grad_norm": 0.45312076807022095, "learning_rate": 4.776772070059951e-05, "loss": 0.3924, "step": 11400 }, { "epoch": 0.045452764390110104, "grad_norm": 0.5892153382301331, "learning_rate": 4.7728537283021826e-05, "loss": 0.3826, "step": 11600 }, { "epoch": 0.04623643274166373, "grad_norm": 0.5837638974189758, "learning_rate": 4.7689353865444146e-05, "loss": 0.4123, "step": 11800 }, { "epoch": 0.04702010109321735, "grad_norm": 0.306490421295166, "learning_rate": 4.7650170447866466e-05, "loss": 0.3811, "step": 12000 }, { "epoch": 0.04780376944477097, "grad_norm": 0.715175211429596, "learning_rate": 4.761098703028879e-05, "loss": 0.4019, "step": 12200 }, { "epoch": 0.048587437796324594, "grad_norm": 0.419628381729126, "learning_rate": 4.757199952979899e-05, "loss": 0.3907, "step": 12400 }, { "epoch": 0.049371106147878215, "grad_norm": 0.40230727195739746, "learning_rate": 4.753281611222131e-05, "loss": 0.4054, "step": 12600 }, { "epoch": 0.05015477449943184, "grad_norm": 0.38158226013183594, "learning_rate": 4.749363269464363e-05, "loss": 0.3897, "step": 12800 }, { "epoch": 0.05093844285098546, "grad_norm": 0.17908357083797455, "learning_rate": 4.7454449277065946e-05, "loss": 0.3799, "step": 13000 }, { "epoch": 0.051722111202539084, "grad_norm": 2.7669689655303955, "learning_rate": 4.7415265859488266e-05, "loss": 0.3638, "step": 13200 }, { "epoch": 0.052505779554092705, "grad_norm": 0.9569264054298401, "learning_rate": 4.7376082441910586e-05, "loss": 0.3669, "step": 13400 }, { "epoch": 0.05328944790564633, "grad_norm": 0.36103129386901855, "learning_rate": 4.7336899024332907e-05, "loss": 0.4151, "step": 13600 }, { "epoch": 0.05407311625719995, "grad_norm": 0.24037201702594757, "learning_rate": 4.729771560675522e-05, "loss": 0.4038, "step": 13800 }, { "epoch": 0.054856784608753574, "grad_norm": 0.31338945031166077, "learning_rate": 4.725853218917754e-05, "loss": 0.3909, "step": 14000 }, { "epoch": 0.055640452960307195, "grad_norm": 0.21747402846813202, "learning_rate": 4.721934877159986e-05, "loss": 0.3733, "step": 14200 }, { "epoch": 0.05642412131186082, "grad_norm": 0.44257447123527527, "learning_rate": 4.718016535402218e-05, "loss": 0.4058, "step": 14400 }, { "epoch": 0.05720778966341444, "grad_norm": 0.5667888522148132, "learning_rate": 4.71409819364445e-05, "loss": 0.434, "step": 14600 }, { "epoch": 0.057991458014968064, "grad_norm": 0.4107826054096222, "learning_rate": 4.710179851886682e-05, "loss": 0.3837, "step": 14800 }, { "epoch": 0.058775126366521685, "grad_norm": 0.48772132396698, "learning_rate": 4.7062615101289135e-05, "loss": 0.3768, "step": 15000 }, { "epoch": 0.05955879471807531, "grad_norm": 1.0837293863296509, "learning_rate": 4.7023431683711455e-05, "loss": 0.3605, "step": 15200 }, { "epoch": 0.060342463069628934, "grad_norm": 0.1726633906364441, "learning_rate": 4.6984248266133776e-05, "loss": 0.3611, "step": 15400 }, { "epoch": 0.061126131421182554, "grad_norm": 0.4770794212818146, "learning_rate": 4.694506484855609e-05, "loss": 0.3852, "step": 15600 }, { "epoch": 0.061909799772736175, "grad_norm": 0.4889805316925049, "learning_rate": 4.690588143097841e-05, "loss": 0.4082, "step": 15800 }, { "epoch": 0.0626934681242898, "grad_norm": 0.2940739691257477, "learning_rate": 4.686669801340073e-05, "loss": 0.3619, "step": 16000 }, { "epoch": 0.06347713647584342, "grad_norm": 0.18743853271007538, "learning_rate": 4.682751459582305e-05, "loss": 0.3515, "step": 16200 }, { "epoch": 0.06426080482739704, "grad_norm": 0.5064111351966858, "learning_rate": 4.678833117824537e-05, "loss": 0.3118, "step": 16400 }, { "epoch": 0.06504447317895067, "grad_norm": 0.33767470717430115, "learning_rate": 4.6749343677755575e-05, "loss": 0.3918, "step": 16600 }, { "epoch": 0.06582814153050429, "grad_norm": 0.3339788615703583, "learning_rate": 4.6710160260177895e-05, "loss": 0.3936, "step": 16800 }, { "epoch": 0.0666118098820579, "grad_norm": 0.3252772092819214, "learning_rate": 4.667097684260021e-05, "loss": 0.3962, "step": 17000 }, { "epoch": 0.06739547823361154, "grad_norm": 0.2177598625421524, "learning_rate": 4.6631793425022536e-05, "loss": 0.3857, "step": 17200 }, { "epoch": 0.06817914658516516, "grad_norm": 0.4870295226573944, "learning_rate": 4.6592610007444856e-05, "loss": 0.3392, "step": 17400 }, { "epoch": 0.06896281493671878, "grad_norm": 0.3281099796295166, "learning_rate": 4.655342658986717e-05, "loss": 0.4033, "step": 17600 }, { "epoch": 0.0697464832882724, "grad_norm": 0.34649309515953064, "learning_rate": 4.651424317228949e-05, "loss": 0.3762, "step": 17800 }, { "epoch": 0.07053015163982602, "grad_norm": 0.422496497631073, "learning_rate": 4.647505975471181e-05, "loss": 0.3741, "step": 18000 }, { "epoch": 0.07131381999137965, "grad_norm": 0.3017028570175171, "learning_rate": 4.6435876337134124e-05, "loss": 0.3918, "step": 18200 }, { "epoch": 0.07209748834293327, "grad_norm": 0.5374196767807007, "learning_rate": 4.6396692919556444e-05, "loss": 0.3872, "step": 18400 }, { "epoch": 0.07288115669448689, "grad_norm": 0.4020719826221466, "learning_rate": 4.6357509501978764e-05, "loss": 0.3394, "step": 18600 }, { "epoch": 0.07366482504604052, "grad_norm": 0.2771855294704437, "learning_rate": 4.6318326084401085e-05, "loss": 0.3846, "step": 18800 }, { "epoch": 0.07444849339759414, "grad_norm": 0.6207616329193115, "learning_rate": 4.627933858391129e-05, "loss": 0.4045, "step": 19000 }, { "epoch": 0.07523216174914776, "grad_norm": 0.480933278799057, "learning_rate": 4.624015516633361e-05, "loss": 0.3964, "step": 19200 }, { "epoch": 0.07601583010070138, "grad_norm": 0.1714453101158142, "learning_rate": 4.620097174875593e-05, "loss": 0.3908, "step": 19400 }, { "epoch": 0.076799498452255, "grad_norm": 0.8989331722259521, "learning_rate": 4.6161788331178244e-05, "loss": 0.4001, "step": 19600 }, { "epoch": 0.07758316680380863, "grad_norm": 0.2786279022693634, "learning_rate": 4.6122604913600564e-05, "loss": 0.3784, "step": 19800 }, { "epoch": 0.07836683515536225, "grad_norm": 0.11766529083251953, "learning_rate": 4.6083421496022884e-05, "loss": 0.3971, "step": 20000 }, { "epoch": 0.07915050350691587, "grad_norm": 0.4076923727989197, "learning_rate": 4.6044238078445205e-05, "loss": 0.3895, "step": 20200 }, { "epoch": 0.0799341718584695, "grad_norm": 0.09145388752222061, "learning_rate": 4.6005054660867525e-05, "loss": 0.3889, "step": 20400 }, { "epoch": 0.08071784021002312, "grad_norm": 0.4803103804588318, "learning_rate": 4.5965871243289845e-05, "loss": 0.3908, "step": 20600 }, { "epoch": 0.08150150856157674, "grad_norm": 0.3921396732330322, "learning_rate": 4.592668782571216e-05, "loss": 0.3435, "step": 20800 }, { "epoch": 0.08228517691313036, "grad_norm": 0.21081089973449707, "learning_rate": 4.5887700325222364e-05, "loss": 0.3728, "step": 21000 }, { "epoch": 0.08306884526468399, "grad_norm": 0.7610578536987305, "learning_rate": 4.584851690764469e-05, "loss": 0.4136, "step": 21200 }, { "epoch": 0.0838525136162376, "grad_norm": 0.4941267967224121, "learning_rate": 4.580933349006701e-05, "loss": 0.3886, "step": 21400 }, { "epoch": 0.08463618196779123, "grad_norm": 0.17048139870166779, "learning_rate": 4.5770150072489324e-05, "loss": 0.3685, "step": 21600 }, { "epoch": 0.08541985031934485, "grad_norm": 0.2399785816669464, "learning_rate": 4.5730966654911645e-05, "loss": 0.4047, "step": 21800 }, { "epoch": 0.08620351867089848, "grad_norm": 0.2618391811847687, "learning_rate": 4.5691783237333965e-05, "loss": 0.3593, "step": 22000 }, { "epoch": 0.0869871870224521, "grad_norm": 0.2619889974594116, "learning_rate": 4.565259981975628e-05, "loss": 0.4062, "step": 22200 }, { "epoch": 0.08777085537400572, "grad_norm": 0.15558627247810364, "learning_rate": 4.56134164021786e-05, "loss": 0.4025, "step": 22400 }, { "epoch": 0.08855452372555934, "grad_norm": 1.6381131410598755, "learning_rate": 4.557423298460092e-05, "loss": 0.351, "step": 22600 }, { "epoch": 0.08933819207711297, "grad_norm": 0.32529759407043457, "learning_rate": 4.553504956702323e-05, "loss": 0.3703, "step": 22800 }, { "epoch": 0.09012186042866659, "grad_norm": 0.5802072882652283, "learning_rate": 4.549586614944556e-05, "loss": 0.3789, "step": 23000 }, { "epoch": 0.09090552878022021, "grad_norm": 0.3980289399623871, "learning_rate": 4.5456878648955765e-05, "loss": 0.3776, "step": 23200 }, { "epoch": 0.09168919713177383, "grad_norm": 0.43737566471099854, "learning_rate": 4.5417695231378085e-05, "loss": 0.4388, "step": 23400 }, { "epoch": 0.09247286548332746, "grad_norm": 0.5042510628700256, "learning_rate": 4.53785118138004e-05, "loss": 0.4253, "step": 23600 }, { "epoch": 0.09325653383488108, "grad_norm": 0.16933372616767883, "learning_rate": 4.533932839622272e-05, "loss": 0.4064, "step": 23800 }, { "epoch": 0.0940402021864347, "grad_norm": 0.4473813772201538, "learning_rate": 4.530014497864504e-05, "loss": 0.385, "step": 24000 }, { "epoch": 0.09482387053798833, "grad_norm": 0.35813969373703003, "learning_rate": 4.526096156106736e-05, "loss": 0.3774, "step": 24200 }, { "epoch": 0.09560753888954195, "grad_norm": 0.17471584677696228, "learning_rate": 4.522177814348968e-05, "loss": 0.3554, "step": 24400 }, { "epoch": 0.09639120724109557, "grad_norm": 0.6126578450202942, "learning_rate": 4.5182594725912e-05, "loss": 0.4104, "step": 24600 }, { "epoch": 0.09717487559264919, "grad_norm": 0.42057156562805176, "learning_rate": 4.514341130833431e-05, "loss": 0.4014, "step": 24800 }, { "epoch": 0.09795854394420281, "grad_norm": 0.2032351940870285, "learning_rate": 4.5104227890756634e-05, "loss": 0.3886, "step": 25000 }, { "epoch": 0.09874221229575643, "grad_norm": 0.792407751083374, "learning_rate": 4.5065044473178954e-05, "loss": 0.4, "step": 25200 }, { "epoch": 0.09952588064731006, "grad_norm": 0.2855859100818634, "learning_rate": 4.502586105560127e-05, "loss": 0.4081, "step": 25400 }, { "epoch": 0.10030954899886368, "grad_norm": 0.3872545659542084, "learning_rate": 4.498667763802359e-05, "loss": 0.3587, "step": 25600 }, { "epoch": 0.1010932173504173, "grad_norm": 0.3444635272026062, "learning_rate": 4.49476901375338e-05, "loss": 0.3754, "step": 25800 }, { "epoch": 0.10187688570197093, "grad_norm": 0.23079633712768555, "learning_rate": 4.4908702637044004e-05, "loss": 0.3847, "step": 26000 }, { "epoch": 0.10266055405352455, "grad_norm": 0.1402076780796051, "learning_rate": 4.4869519219466325e-05, "loss": 0.3682, "step": 26200 }, { "epoch": 0.10344422240507817, "grad_norm": 0.41062939167022705, "learning_rate": 4.4830335801888645e-05, "loss": 0.3899, "step": 26400 }, { "epoch": 0.10422789075663179, "grad_norm": 0.3105030059814453, "learning_rate": 4.4791152384310965e-05, "loss": 0.3724, "step": 26600 }, { "epoch": 0.10501155910818541, "grad_norm": 0.7441399693489075, "learning_rate": 4.475196896673328e-05, "loss": 0.3619, "step": 26800 }, { "epoch": 0.10579522745973904, "grad_norm": 0.582595944404602, "learning_rate": 4.47127855491556e-05, "loss": 0.3884, "step": 27000 }, { "epoch": 0.10657889581129266, "grad_norm": 0.3682032823562622, "learning_rate": 4.467360213157792e-05, "loss": 0.413, "step": 27200 }, { "epoch": 0.10736256416284629, "grad_norm": 0.19573107361793518, "learning_rate": 4.463441871400024e-05, "loss": 0.3817, "step": 27400 }, { "epoch": 0.1081462325143999, "grad_norm": 0.8091766238212585, "learning_rate": 4.459523529642255e-05, "loss": 0.3872, "step": 27600 }, { "epoch": 0.10892990086595353, "grad_norm": 0.3995172381401062, "learning_rate": 4.455605187884487e-05, "loss": 0.3579, "step": 27800 }, { "epoch": 0.10971356921750715, "grad_norm": 0.3911612331867218, "learning_rate": 4.4516868461267194e-05, "loss": 0.3708, "step": 28000 }, { "epoch": 0.11049723756906077, "grad_norm": 0.10565478354692459, "learning_rate": 4.4477685043689514e-05, "loss": 0.3541, "step": 28200 }, { "epoch": 0.11128090592061439, "grad_norm": 0.44396963715553284, "learning_rate": 4.4438501626111834e-05, "loss": 0.3943, "step": 28400 }, { "epoch": 0.11206457427216802, "grad_norm": 2.1696653366088867, "learning_rate": 4.4399318208534154e-05, "loss": 0.3619, "step": 28600 }, { "epoch": 0.11284824262372165, "grad_norm": 0.3560933768749237, "learning_rate": 4.436013479095647e-05, "loss": 0.3488, "step": 28800 }, { "epoch": 0.11363191097527527, "grad_norm": 0.5040505528450012, "learning_rate": 4.432095137337879e-05, "loss": 0.4071, "step": 29000 }, { "epoch": 0.11441557932682889, "grad_norm": 0.16824057698249817, "learning_rate": 4.428176795580111e-05, "loss": 0.386, "step": 29200 }, { "epoch": 0.11519924767838251, "grad_norm": 0.8319584131240845, "learning_rate": 4.424258453822342e-05, "loss": 0.4021, "step": 29400 }, { "epoch": 0.11598291602993613, "grad_norm": 0.5349191427230835, "learning_rate": 4.4203597037733634e-05, "loss": 0.3899, "step": 29600 }, { "epoch": 0.11676658438148975, "grad_norm": 0.3066231310367584, "learning_rate": 4.4164413620155954e-05, "loss": 0.4211, "step": 29800 }, { "epoch": 0.11755025273304337, "grad_norm": 0.27127805352211, "learning_rate": 4.4125230202578274e-05, "loss": 0.4116, "step": 30000 }, { "epoch": 0.118333921084597, "grad_norm": 0.12929223477840424, "learning_rate": 4.408604678500059e-05, "loss": 0.3689, "step": 30200 }, { "epoch": 0.11911758943615063, "grad_norm": 0.3684684932231903, "learning_rate": 4.404686336742291e-05, "loss": 0.3669, "step": 30400 }, { "epoch": 0.11990125778770425, "grad_norm": 0.29743534326553345, "learning_rate": 4.400767994984523e-05, "loss": 0.3697, "step": 30600 }, { "epoch": 0.12068492613925787, "grad_norm": 0.24909648299217224, "learning_rate": 4.396849653226754e-05, "loss": 0.3841, "step": 30800 }, { "epoch": 0.12146859449081149, "grad_norm": 0.34889253973960876, "learning_rate": 4.392931311468986e-05, "loss": 0.3898, "step": 31000 }, { "epoch": 0.12225226284236511, "grad_norm": 0.29384544491767883, "learning_rate": 4.389012969711219e-05, "loss": 0.3578, "step": 31200 }, { "epoch": 0.12303593119391873, "grad_norm": 0.7216317653656006, "learning_rate": 4.38509462795345e-05, "loss": 0.3525, "step": 31400 }, { "epoch": 0.12381959954547235, "grad_norm": 0.19061321020126343, "learning_rate": 4.381176286195682e-05, "loss": 0.3811, "step": 31600 }, { "epoch": 0.12460326789702598, "grad_norm": 0.11437461525201797, "learning_rate": 4.377257944437914e-05, "loss": 0.3489, "step": 31800 }, { "epoch": 0.1253869362485796, "grad_norm": 0.8204576373100281, "learning_rate": 4.373339602680146e-05, "loss": 0.4186, "step": 32000 }, { "epoch": 0.1261706046001332, "grad_norm": 0.5701767206192017, "learning_rate": 4.369421260922378e-05, "loss": 0.3865, "step": 32200 }, { "epoch": 0.12695427295168685, "grad_norm": 0.10129725933074951, "learning_rate": 4.36550291916461e-05, "loss": 0.3769, "step": 32400 }, { "epoch": 0.12773794130324048, "grad_norm": 0.3074261248111725, "learning_rate": 4.361584577406842e-05, "loss": 0.3989, "step": 32600 }, { "epoch": 0.1285216096547941, "grad_norm": 0.2859583795070648, "learning_rate": 4.357666235649073e-05, "loss": 0.3511, "step": 32800 }, { "epoch": 0.12930527800634772, "grad_norm": 0.2709607481956482, "learning_rate": 4.353767485600094e-05, "loss": 0.3977, "step": 33000 }, { "epoch": 0.13008894635790133, "grad_norm": 0.5966998338699341, "learning_rate": 4.349849143842326e-05, "loss": 0.3686, "step": 33200 }, { "epoch": 0.13087261470945497, "grad_norm": 0.36328116059303284, "learning_rate": 4.345930802084558e-05, "loss": 0.397, "step": 33400 }, { "epoch": 0.13165628306100857, "grad_norm": 0.44545626640319824, "learning_rate": 4.34201246032679e-05, "loss": 0.3967, "step": 33600 }, { "epoch": 0.1324399514125622, "grad_norm": 0.49635568261146545, "learning_rate": 4.338094118569022e-05, "loss": 0.3877, "step": 33800 }, { "epoch": 0.1332236197641158, "grad_norm": 0.33987563848495483, "learning_rate": 4.334175776811254e-05, "loss": 0.4172, "step": 34000 }, { "epoch": 0.13400728811566945, "grad_norm": 0.818518877029419, "learning_rate": 4.330257435053486e-05, "loss": 0.3539, "step": 34200 }, { "epoch": 0.13479095646722308, "grad_norm": 0.43743157386779785, "learning_rate": 4.326339093295718e-05, "loss": 0.3675, "step": 34400 }, { "epoch": 0.1355746248187767, "grad_norm": 0.595392107963562, "learning_rate": 4.322420751537949e-05, "loss": 0.3696, "step": 34600 }, { "epoch": 0.13635829317033032, "grad_norm": 0.5163072347640991, "learning_rate": 4.318502409780181e-05, "loss": 0.4279, "step": 34800 }, { "epoch": 0.13714196152188393, "grad_norm": 0.3196135461330414, "learning_rate": 4.314584068022413e-05, "loss": 0.3663, "step": 35000 }, { "epoch": 0.13792562987343757, "grad_norm": 0.2517015039920807, "learning_rate": 4.310665726264645e-05, "loss": 0.3798, "step": 35200 }, { "epoch": 0.13870929822499117, "grad_norm": 0.25268664956092834, "learning_rate": 4.306766976215666e-05, "loss": 0.4006, "step": 35400 }, { "epoch": 0.1394929665765448, "grad_norm": 0.2162117213010788, "learning_rate": 4.302848634457898e-05, "loss": 0.393, "step": 35600 }, { "epoch": 0.14027663492809844, "grad_norm": 0.207589253783226, "learning_rate": 4.29893029270013e-05, "loss": 0.354, "step": 35800 }, { "epoch": 0.14106030327965205, "grad_norm": 0.380723237991333, "learning_rate": 4.295011950942361e-05, "loss": 0.3776, "step": 36000 }, { "epoch": 0.14184397163120568, "grad_norm": 0.16514042019844055, "learning_rate": 4.291093609184593e-05, "loss": 0.4212, "step": 36200 }, { "epoch": 0.1426276399827593, "grad_norm": 0.6130224466323853, "learning_rate": 4.287175267426825e-05, "loss": 0.4312, "step": 36400 }, { "epoch": 0.14341130833431293, "grad_norm": 0.1590093970298767, "learning_rate": 4.2832569256690566e-05, "loss": 0.4055, "step": 36600 }, { "epoch": 0.14419497668586653, "grad_norm": 0.39541569352149963, "learning_rate": 4.2793385839112886e-05, "loss": 0.3631, "step": 36800 }, { "epoch": 0.14497864503742017, "grad_norm": 0.5678555369377136, "learning_rate": 4.275420242153521e-05, "loss": 0.3534, "step": 37000 }, { "epoch": 0.14576231338897377, "grad_norm": 0.7044442892074585, "learning_rate": 4.2715019003957526e-05, "loss": 0.382, "step": 37200 }, { "epoch": 0.1465459817405274, "grad_norm": 0.22131168842315674, "learning_rate": 4.267583558637985e-05, "loss": 0.3702, "step": 37400 }, { "epoch": 0.14732965009208104, "grad_norm": 0.414786159992218, "learning_rate": 4.263665216880217e-05, "loss": 0.4217, "step": 37600 }, { "epoch": 0.14811331844363465, "grad_norm": 0.4424242079257965, "learning_rate": 4.259746875122449e-05, "loss": 0.3484, "step": 37800 }, { "epoch": 0.14889698679518829, "grad_norm": 1.2707728147506714, "learning_rate": 4.25582853336468e-05, "loss": 0.3674, "step": 38000 }, { "epoch": 0.1496806551467419, "grad_norm": 0.508650541305542, "learning_rate": 4.251910191606912e-05, "loss": 0.3787, "step": 38200 }, { "epoch": 0.15046432349829553, "grad_norm": 0.4704892635345459, "learning_rate": 4.247991849849144e-05, "loss": 0.3968, "step": 38400 }, { "epoch": 0.15124799184984913, "grad_norm": 0.5690171718597412, "learning_rate": 4.2440735080913755e-05, "loss": 0.3541, "step": 38600 }, { "epoch": 0.15203166020140277, "grad_norm": 0.37451255321502686, "learning_rate": 4.2401747580423967e-05, "loss": 0.3912, "step": 38800 }, { "epoch": 0.15281532855295638, "grad_norm": 0.45569321513175964, "learning_rate": 4.236256416284629e-05, "loss": 0.3792, "step": 39000 }, { "epoch": 0.15359899690451, "grad_norm": 0.2282724827528, "learning_rate": 4.23233807452686e-05, "loss": 0.3611, "step": 39200 }, { "epoch": 0.15438266525606364, "grad_norm": 0.12927883863449097, "learning_rate": 4.228419732769092e-05, "loss": 0.3634, "step": 39400 }, { "epoch": 0.15516633360761725, "grad_norm": 0.10790091007947922, "learning_rate": 4.224501391011324e-05, "loss": 0.3735, "step": 39600 }, { "epoch": 0.1559500019591709, "grad_norm": 0.4959287643432617, "learning_rate": 4.220583049253556e-05, "loss": 0.397, "step": 39800 }, { "epoch": 0.1567336703107245, "grad_norm": 0.508669912815094, "learning_rate": 4.216664707495788e-05, "loss": 0.3648, "step": 40000 }, { "epoch": 0.15751733866227813, "grad_norm": 0.12706239521503448, "learning_rate": 4.21274636573802e-05, "loss": 0.3847, "step": 40200 }, { "epoch": 0.15830100701383173, "grad_norm": 0.34408318996429443, "learning_rate": 4.208828023980252e-05, "loss": 0.3878, "step": 40400 }, { "epoch": 0.15908467536538537, "grad_norm": 0.7723474502563477, "learning_rate": 4.2049096822224836e-05, "loss": 0.3864, "step": 40600 }, { "epoch": 0.159868343716939, "grad_norm": 0.5163781046867371, "learning_rate": 4.2009913404647156e-05, "loss": 0.4009, "step": 40800 }, { "epoch": 0.1606520120684926, "grad_norm": 0.5977711081504822, "learning_rate": 4.1970729987069476e-05, "loss": 0.3636, "step": 41000 }, { "epoch": 0.16143568042004625, "grad_norm": 0.45771196484565735, "learning_rate": 4.193154656949179e-05, "loss": 0.4214, "step": 41200 }, { "epoch": 0.16221934877159985, "grad_norm": 0.419515460729599, "learning_rate": 4.1892559069002e-05, "loss": 0.3806, "step": 41400 }, { "epoch": 0.1630030171231535, "grad_norm": 0.24074016511440277, "learning_rate": 4.185337565142432e-05, "loss": 0.4009, "step": 41600 }, { "epoch": 0.1637866854747071, "grad_norm": 0.2404531091451645, "learning_rate": 4.1814192233846635e-05, "loss": 0.414, "step": 41800 }, { "epoch": 0.16457035382626073, "grad_norm": 0.4593123495578766, "learning_rate": 4.1775008816268955e-05, "loss": 0.3933, "step": 42000 }, { "epoch": 0.16535402217781434, "grad_norm": 1.2924460172653198, "learning_rate": 4.1735825398691276e-05, "loss": 0.4005, "step": 42200 }, { "epoch": 0.16613769052936797, "grad_norm": 0.9045054912567139, "learning_rate": 4.1696641981113596e-05, "loss": 0.3948, "step": 42400 }, { "epoch": 0.1669213588809216, "grad_norm": 0.3707156479358673, "learning_rate": 4.165745856353591e-05, "loss": 0.4123, "step": 42600 }, { "epoch": 0.1677050272324752, "grad_norm": 0.6375682950019836, "learning_rate": 4.161827514595823e-05, "loss": 0.391, "step": 42800 }, { "epoch": 0.16848869558402885, "grad_norm": 0.5584186315536499, "learning_rate": 4.157909172838056e-05, "loss": 0.3916, "step": 43000 }, { "epoch": 0.16927236393558245, "grad_norm": 0.6738666296005249, "learning_rate": 4.153990831080287e-05, "loss": 0.3617, "step": 43200 }, { "epoch": 0.1700560322871361, "grad_norm": 0.13204245269298553, "learning_rate": 4.150072489322519e-05, "loss": 0.366, "step": 43400 }, { "epoch": 0.1708397006386897, "grad_norm": 0.39881256222724915, "learning_rate": 4.146154147564751e-05, "loss": 0.3682, "step": 43600 }, { "epoch": 0.17162336899024333, "grad_norm": 0.41980597376823425, "learning_rate": 4.1422553975157716e-05, "loss": 0.4205, "step": 43800 }, { "epoch": 0.17240703734179696, "grad_norm": 0.1891050487756729, "learning_rate": 4.1383370557580036e-05, "loss": 0.3426, "step": 44000 }, { "epoch": 0.17319070569335057, "grad_norm": 0.4376268684864044, "learning_rate": 4.1344187140002356e-05, "loss": 0.3869, "step": 44200 }, { "epoch": 0.1739743740449042, "grad_norm": 0.24275416135787964, "learning_rate": 4.130500372242467e-05, "loss": 0.3622, "step": 44400 }, { "epoch": 0.1747580423964578, "grad_norm": 0.6265236139297485, "learning_rate": 4.126582030484699e-05, "loss": 0.4394, "step": 44600 }, { "epoch": 0.17554171074801145, "grad_norm": 0.3068256676197052, "learning_rate": 4.122663688726931e-05, "loss": 0.3369, "step": 44800 }, { "epoch": 0.17632537909956505, "grad_norm": 0.22054247558116913, "learning_rate": 4.118745346969163e-05, "loss": 0.3804, "step": 45000 }, { "epoch": 0.1771090474511187, "grad_norm": 0.2504449486732483, "learning_rate": 4.1148270052113944e-05, "loss": 0.3651, "step": 45200 }, { "epoch": 0.1778927158026723, "grad_norm": 0.2043372392654419, "learning_rate": 4.1109086634536265e-05, "loss": 0.3724, "step": 45400 }, { "epoch": 0.17867638415422593, "grad_norm": 0.13071323931217194, "learning_rate": 4.1069903216958585e-05, "loss": 0.3941, "step": 45600 }, { "epoch": 0.17946005250577957, "grad_norm": 0.312150239944458, "learning_rate": 4.1030719799380905e-05, "loss": 0.3999, "step": 45800 }, { "epoch": 0.18024372085733317, "grad_norm": 0.18172216415405273, "learning_rate": 4.0991536381803225e-05, "loss": 0.4072, "step": 46000 }, { "epoch": 0.1810273892088868, "grad_norm": 0.22473308444023132, "learning_rate": 4.0952352964225546e-05, "loss": 0.3615, "step": 46200 }, { "epoch": 0.18181105756044041, "grad_norm": 0.9685620665550232, "learning_rate": 4.091316954664786e-05, "loss": 0.3721, "step": 46400 }, { "epoch": 0.18259472591199405, "grad_norm": 0.33434680104255676, "learning_rate": 4.087398612907018e-05, "loss": 0.3939, "step": 46600 }, { "epoch": 0.18337839426354766, "grad_norm": 0.516543984413147, "learning_rate": 4.08348027114925e-05, "loss": 0.336, "step": 46800 }, { "epoch": 0.1841620626151013, "grad_norm": 0.5582794547080994, "learning_rate": 4.079561929391481e-05, "loss": 0.411, "step": 47000 }, { "epoch": 0.18494573096665493, "grad_norm": 0.26512426137924194, "learning_rate": 4.0756435876337134e-05, "loss": 0.4075, "step": 47200 }, { "epoch": 0.18572939931820853, "grad_norm": 0.525921106338501, "learning_rate": 4.0717252458759454e-05, "loss": 0.3643, "step": 47400 }, { "epoch": 0.18651306766976217, "grad_norm": 0.8760446310043335, "learning_rate": 4.0678264958269666e-05, "loss": 0.4108, "step": 47600 }, { "epoch": 0.18729673602131577, "grad_norm": 0.40823042392730713, "learning_rate": 4.063908154069198e-05, "loss": 0.3425, "step": 47800 }, { "epoch": 0.1880804043728694, "grad_norm": 0.23763743042945862, "learning_rate": 4.05998981231143e-05, "loss": 0.3863, "step": 48000 }, { "epoch": 0.18886407272442302, "grad_norm": 0.36821314692497253, "learning_rate": 4.056071470553662e-05, "loss": 0.3791, "step": 48200 }, { "epoch": 0.18964774107597665, "grad_norm": 0.43061232566833496, "learning_rate": 4.052153128795893e-05, "loss": 0.3465, "step": 48400 }, { "epoch": 0.19043140942753026, "grad_norm": 0.35803717374801636, "learning_rate": 4.0482347870381253e-05, "loss": 0.3761, "step": 48600 }, { "epoch": 0.1912150777790839, "grad_norm": 0.2865389287471771, "learning_rate": 4.044316445280358e-05, "loss": 0.3646, "step": 48800 }, { "epoch": 0.19199874613063753, "grad_norm": 0.43889543414115906, "learning_rate": 4.0403981035225894e-05, "loss": 0.3621, "step": 49000 }, { "epoch": 0.19278241448219113, "grad_norm": 0.35403209924697876, "learning_rate": 4.0364797617648214e-05, "loss": 0.4233, "step": 49200 }, { "epoch": 0.19356608283374477, "grad_norm": 0.5109453201293945, "learning_rate": 4.0325614200070535e-05, "loss": 0.336, "step": 49400 }, { "epoch": 0.19434975118529838, "grad_norm": 0.604751706123352, "learning_rate": 4.028643078249285e-05, "loss": 0.3889, "step": 49600 }, { "epoch": 0.195133419536852, "grad_norm": 0.642396867275238, "learning_rate": 4.024724736491517e-05, "loss": 0.4073, "step": 49800 }, { "epoch": 0.19591708788840562, "grad_norm": 0.34808388352394104, "learning_rate": 4.020806394733749e-05, "loss": 0.3402, "step": 50000 }, { "epoch": 0.19591708788840562, "eval_loss": 0.3966066241264343, "eval_runtime": 194.3445, "eval_samples_per_second": 13.265, "eval_steps_per_second": 13.265, "step": 50000 }, { "epoch": 0.19670075623995925, "grad_norm": 0.6763666868209839, "learning_rate": 4.01690764468477e-05, "loss": 0.4119, "step": 50200 }, { "epoch": 0.19748442459151286, "grad_norm": 0.5871962904930115, "learning_rate": 4.0129893029270014e-05, "loss": 0.3564, "step": 50400 }, { "epoch": 0.1982680929430665, "grad_norm": 0.31573522090911865, "learning_rate": 4.0090709611692334e-05, "loss": 0.4356, "step": 50600 }, { "epoch": 0.19905176129462013, "grad_norm": 0.16465429961681366, "learning_rate": 4.0051526194114654e-05, "loss": 0.3719, "step": 50800 }, { "epoch": 0.19983542964617373, "grad_norm": 1.8124102354049683, "learning_rate": 4.001234277653697e-05, "loss": 0.3632, "step": 51000 }, { "epoch": 0.20061909799772737, "grad_norm": 0.41661161184310913, "learning_rate": 3.997315935895929e-05, "loss": 0.422, "step": 51200 }, { "epoch": 0.20140276634928098, "grad_norm": 0.5656956434249878, "learning_rate": 3.993397594138161e-05, "loss": 0.3747, "step": 51400 }, { "epoch": 0.2021864347008346, "grad_norm": 0.48378095030784607, "learning_rate": 3.989479252380393e-05, "loss": 0.3797, "step": 51600 }, { "epoch": 0.20297010305238822, "grad_norm": 0.13523226976394653, "learning_rate": 3.985560910622625e-05, "loss": 0.3951, "step": 51800 }, { "epoch": 0.20375377140394185, "grad_norm": 0.6183939576148987, "learning_rate": 3.981642568864857e-05, "loss": 0.3591, "step": 52000 }, { "epoch": 0.2045374397554955, "grad_norm": 0.17926977574825287, "learning_rate": 3.9777438188158774e-05, "loss": 0.3869, "step": 52200 }, { "epoch": 0.2053211081070491, "grad_norm": 0.5102852582931519, "learning_rate": 3.973825477058109e-05, "loss": 0.3654, "step": 52400 }, { "epoch": 0.20610477645860273, "grad_norm": 0.2515888512134552, "learning_rate": 3.969907135300341e-05, "loss": 0.3646, "step": 52600 }, { "epoch": 0.20688844481015634, "grad_norm": 1.4775642156600952, "learning_rate": 3.9659887935425735e-05, "loss": 0.4016, "step": 52800 }, { "epoch": 0.20767211316170997, "grad_norm": 0.6207823157310486, "learning_rate": 3.962070451784805e-05, "loss": 0.3738, "step": 53000 }, { "epoch": 0.20845578151326358, "grad_norm": 0.7464879155158997, "learning_rate": 3.958152110027037e-05, "loss": 0.3928, "step": 53200 }, { "epoch": 0.2092394498648172, "grad_norm": 0.40468066930770874, "learning_rate": 3.954233768269269e-05, "loss": 0.3551, "step": 53400 }, { "epoch": 0.21002311821637082, "grad_norm": 0.40929293632507324, "learning_rate": 3.9503154265115e-05, "loss": 0.3851, "step": 53600 }, { "epoch": 0.21080678656792445, "grad_norm": 0.3360099494457245, "learning_rate": 3.946397084753732e-05, "loss": 0.3806, "step": 53800 }, { "epoch": 0.2115904549194781, "grad_norm": 0.6595797538757324, "learning_rate": 3.942478742995964e-05, "loss": 0.3652, "step": 54000 }, { "epoch": 0.2123741232710317, "grad_norm": 0.2698526680469513, "learning_rate": 3.9385604012381964e-05, "loss": 0.3523, "step": 54200 }, { "epoch": 0.21315779162258533, "grad_norm": 0.10673108696937561, "learning_rate": 3.934642059480428e-05, "loss": 0.3383, "step": 54400 }, { "epoch": 0.21394145997413894, "grad_norm": 0.19577933847904205, "learning_rate": 3.93072371772266e-05, "loss": 0.3676, "step": 54600 }, { "epoch": 0.21472512832569257, "grad_norm": 0.40797916054725647, "learning_rate": 3.926805375964892e-05, "loss": 0.4309, "step": 54800 }, { "epoch": 0.21550879667724618, "grad_norm": 0.31533485651016235, "learning_rate": 3.922887034207124e-05, "loss": 0.412, "step": 55000 }, { "epoch": 0.2162924650287998, "grad_norm": 0.34004566073417664, "learning_rate": 3.918968692449356e-05, "loss": 0.3672, "step": 55200 }, { "epoch": 0.21707613338035345, "grad_norm": 0.37092098593711853, "learning_rate": 3.915050350691588e-05, "loss": 0.3981, "step": 55400 }, { "epoch": 0.21785980173190705, "grad_norm": 0.1739439070224762, "learning_rate": 3.911132008933819e-05, "loss": 0.3639, "step": 55600 }, { "epoch": 0.2186434700834607, "grad_norm": 0.6593155264854431, "learning_rate": 3.907213667176051e-05, "loss": 0.3587, "step": 55800 }, { "epoch": 0.2194271384350143, "grad_norm": 0.5279967784881592, "learning_rate": 3.903295325418283e-05, "loss": 0.4134, "step": 56000 }, { "epoch": 0.22021080678656793, "grad_norm": 0.4780954122543335, "learning_rate": 3.8993769836605146e-05, "loss": 0.382, "step": 56200 }, { "epoch": 0.22099447513812154, "grad_norm": 0.47533586621284485, "learning_rate": 3.895478233611536e-05, "loss": 0.3828, "step": 56400 }, { "epoch": 0.22177814348967517, "grad_norm": 0.45172592997550964, "learning_rate": 3.891559891853768e-05, "loss": 0.4104, "step": 56600 }, { "epoch": 0.22256181184122878, "grad_norm": 0.2017403542995453, "learning_rate": 3.887641550096e-05, "loss": 0.3942, "step": 56800 }, { "epoch": 0.22334548019278241, "grad_norm": 0.35245269536972046, "learning_rate": 3.883723208338231e-05, "loss": 0.3478, "step": 57000 }, { "epoch": 0.22412914854433605, "grad_norm": 0.45087093114852905, "learning_rate": 3.879804866580463e-05, "loss": 0.3963, "step": 57200 }, { "epoch": 0.22491281689588966, "grad_norm": 0.2895054221153259, "learning_rate": 3.875886524822695e-05, "loss": 0.3838, "step": 57400 }, { "epoch": 0.2256964852474433, "grad_norm": 0.3846510052680969, "learning_rate": 3.871968183064927e-05, "loss": 0.3923, "step": 57600 }, { "epoch": 0.2264801535989969, "grad_norm": 0.46527042984962463, "learning_rate": 3.868049841307159e-05, "loss": 0.3941, "step": 57800 }, { "epoch": 0.22726382195055053, "grad_norm": 0.634870707988739, "learning_rate": 3.864131499549391e-05, "loss": 0.3296, "step": 58000 }, { "epoch": 0.22804749030210414, "grad_norm": 0.36156928539276123, "learning_rate": 3.860213157791623e-05, "loss": 0.3585, "step": 58200 }, { "epoch": 0.22883115865365777, "grad_norm": 0.5111891031265259, "learning_rate": 3.856294816033855e-05, "loss": 0.3775, "step": 58400 }, { "epoch": 0.2296148270052114, "grad_norm": 0.3419630229473114, "learning_rate": 3.852396065984875e-05, "loss": 0.3315, "step": 58600 }, { "epoch": 0.23039849535676502, "grad_norm": 0.19386570155620575, "learning_rate": 3.848477724227107e-05, "loss": 0.3698, "step": 58800 }, { "epoch": 0.23118216370831865, "grad_norm": 0.3232468366622925, "learning_rate": 3.844559382469339e-05, "loss": 0.3915, "step": 59000 }, { "epoch": 0.23196583205987226, "grad_norm": 0.161575585603714, "learning_rate": 3.840641040711571e-05, "loss": 0.3538, "step": 59200 }, { "epoch": 0.2327495004114259, "grad_norm": 0.4957740604877472, "learning_rate": 3.836722698953803e-05, "loss": 0.3697, "step": 59400 }, { "epoch": 0.2335331687629795, "grad_norm": 0.4044831693172455, "learning_rate": 3.832804357196035e-05, "loss": 0.3581, "step": 59600 }, { "epoch": 0.23431683711453313, "grad_norm": 0.5998848080635071, "learning_rate": 3.828886015438267e-05, "loss": 0.3964, "step": 59800 }, { "epoch": 0.23510050546608674, "grad_norm": 0.813400149345398, "learning_rate": 3.824987265389288e-05, "loss": 0.3812, "step": 60000 }, { "epoch": 0.23588417381764037, "grad_norm": 0.3788783550262451, "learning_rate": 3.821068923631519e-05, "loss": 0.3726, "step": 60200 }, { "epoch": 0.236667842169194, "grad_norm": 0.8946236371994019, "learning_rate": 3.817150581873751e-05, "loss": 0.364, "step": 60400 }, { "epoch": 0.23745151052074762, "grad_norm": 0.38863125443458557, "learning_rate": 3.813232240115983e-05, "loss": 0.3869, "step": 60600 }, { "epoch": 0.23823517887230125, "grad_norm": 0.8564476370811462, "learning_rate": 3.8093138983582146e-05, "loss": 0.3176, "step": 60800 }, { "epoch": 0.23901884722385486, "grad_norm": 0.831457793712616, "learning_rate": 3.8053955566004467e-05, "loss": 0.3709, "step": 61000 }, { "epoch": 0.2398025155754085, "grad_norm": 0.2872166335582733, "learning_rate": 3.801477214842679e-05, "loss": 0.3941, "step": 61200 }, { "epoch": 0.2405861839269621, "grad_norm": 0.2783432602882385, "learning_rate": 3.797558873084911e-05, "loss": 0.4176, "step": 61400 }, { "epoch": 0.24136985227851573, "grad_norm": 0.33174943923950195, "learning_rate": 3.793640531327143e-05, "loss": 0.4079, "step": 61600 }, { "epoch": 0.24215352063006934, "grad_norm": 0.31831586360931396, "learning_rate": 3.789722189569375e-05, "loss": 0.3654, "step": 61800 }, { "epoch": 0.24293718898162298, "grad_norm": 0.8055663108825684, "learning_rate": 3.785803847811607e-05, "loss": 0.3685, "step": 62000 }, { "epoch": 0.2437208573331766, "grad_norm": 0.428229421377182, "learning_rate": 3.781885506053838e-05, "loss": 0.3524, "step": 62200 }, { "epoch": 0.24450452568473022, "grad_norm": 0.28733059763908386, "learning_rate": 3.77796716429607e-05, "loss": 0.3139, "step": 62400 }, { "epoch": 0.24528819403628385, "grad_norm": 0.2965468168258667, "learning_rate": 3.774048822538302e-05, "loss": 0.325, "step": 62600 }, { "epoch": 0.24607186238783746, "grad_norm": 0.4734314978122711, "learning_rate": 3.770150072489323e-05, "loss": 0.4431, "step": 62800 }, { "epoch": 0.2468555307393911, "grad_norm": 0.28449442982673645, "learning_rate": 3.766231730731555e-05, "loss": 0.3698, "step": 63000 }, { "epoch": 0.2476391990909447, "grad_norm": 0.18714368343353271, "learning_rate": 3.762313388973787e-05, "loss": 0.3816, "step": 63200 }, { "epoch": 0.24842286744249834, "grad_norm": 0.33139896392822266, "learning_rate": 3.758395047216018e-05, "loss": 0.363, "step": 63400 }, { "epoch": 0.24920653579405197, "grad_norm": 0.2831484377384186, "learning_rate": 3.75447670545825e-05, "loss": 0.3906, "step": 63600 }, { "epoch": 0.24999020414560558, "grad_norm": 0.55617356300354, "learning_rate": 3.750558363700482e-05, "loss": 0.42, "step": 63800 }, { "epoch": 0.2507738724971592, "grad_norm": 0.3242952525615692, "learning_rate": 3.746640021942714e-05, "loss": 0.411, "step": 64000 }, { "epoch": 0.25155754084871285, "grad_norm": 0.28202465176582336, "learning_rate": 3.7427216801849455e-05, "loss": 0.4155, "step": 64200 }, { "epoch": 0.2523412092002664, "grad_norm": 0.3347418010234833, "learning_rate": 3.7388033384271776e-05, "loss": 0.3819, "step": 64400 }, { "epoch": 0.25312487755182006, "grad_norm": 0.48041480779647827, "learning_rate": 3.73488499666941e-05, "loss": 0.3881, "step": 64600 }, { "epoch": 0.2539085459033737, "grad_norm": 0.8025708794593811, "learning_rate": 3.7309666549116416e-05, "loss": 0.3782, "step": 64800 }, { "epoch": 0.25469221425492733, "grad_norm": 0.6489108800888062, "learning_rate": 3.7270483131538737e-05, "loss": 0.3508, "step": 65000 }, { "epoch": 0.25547588260648096, "grad_norm": 0.6492595672607422, "learning_rate": 3.723129971396106e-05, "loss": 0.4482, "step": 65200 }, { "epoch": 0.25625955095803454, "grad_norm": 0.6477459073066711, "learning_rate": 3.719211629638337e-05, "loss": 0.3872, "step": 65400 }, { "epoch": 0.2570432193095882, "grad_norm": 0.21798567473888397, "learning_rate": 3.715293287880569e-05, "loss": 0.3981, "step": 65600 }, { "epoch": 0.2578268876611418, "grad_norm": 0.3467005789279938, "learning_rate": 3.711374946122801e-05, "loss": 0.3691, "step": 65800 }, { "epoch": 0.25861055601269545, "grad_norm": 0.4128284454345703, "learning_rate": 3.7074761960738216e-05, "loss": 0.4, "step": 66000 }, { "epoch": 0.259394224364249, "grad_norm": 0.573503315448761, "learning_rate": 3.7035578543160536e-05, "loss": 0.4024, "step": 66200 }, { "epoch": 0.26017789271580266, "grad_norm": 0.15045996010303497, "learning_rate": 3.6996395125582856e-05, "loss": 0.3806, "step": 66400 }, { "epoch": 0.2609615610673563, "grad_norm": 0.4502982795238495, "learning_rate": 3.695721170800518e-05, "loss": 0.3855, "step": 66600 }, { "epoch": 0.26174522941890993, "grad_norm": 0.4728052020072937, "learning_rate": 3.691802829042749e-05, "loss": 0.3688, "step": 66800 }, { "epoch": 0.26252889777046357, "grad_norm": 0.25955212116241455, "learning_rate": 3.687884487284981e-05, "loss": 0.3269, "step": 67000 }, { "epoch": 0.26331256612201714, "grad_norm": 0.7298959493637085, "learning_rate": 3.683966145527213e-05, "loss": 0.4002, "step": 67200 }, { "epoch": 0.2640962344735708, "grad_norm": 0.6033191084861755, "learning_rate": 3.6800478037694444e-05, "loss": 0.3835, "step": 67400 }, { "epoch": 0.2648799028251244, "grad_norm": 0.23632152378559113, "learning_rate": 3.676129462011677e-05, "loss": 0.4035, "step": 67600 }, { "epoch": 0.26566357117667805, "grad_norm": 0.42146193981170654, "learning_rate": 3.672211120253909e-05, "loss": 0.3661, "step": 67800 }, { "epoch": 0.2664472395282316, "grad_norm": 0.17430192232131958, "learning_rate": 3.6682927784961405e-05, "loss": 0.3654, "step": 68000 }, { "epoch": 0.26723090787978526, "grad_norm": 0.2119530439376831, "learning_rate": 3.6643744367383725e-05, "loss": 0.4031, "step": 68200 }, { "epoch": 0.2680145762313389, "grad_norm": 0.5437138676643372, "learning_rate": 3.6604560949806046e-05, "loss": 0.3247, "step": 68400 }, { "epoch": 0.26879824458289253, "grad_norm": 0.41820359230041504, "learning_rate": 3.656557344931625e-05, "loss": 0.3574, "step": 68600 }, { "epoch": 0.26958191293444617, "grad_norm": 0.3975052535533905, "learning_rate": 3.652639003173857e-05, "loss": 0.3127, "step": 68800 }, { "epoch": 0.27036558128599975, "grad_norm": 0.43151357769966125, "learning_rate": 3.648720661416089e-05, "loss": 0.3634, "step": 69000 }, { "epoch": 0.2711492496375534, "grad_norm": 0.5540599226951599, "learning_rate": 3.644802319658321e-05, "loss": 0.4269, "step": 69200 }, { "epoch": 0.271932917989107, "grad_norm": 0.5560029745101929, "learning_rate": 3.6408839779005525e-05, "loss": 0.3911, "step": 69400 }, { "epoch": 0.27271658634066065, "grad_norm": 0.23961585760116577, "learning_rate": 3.6369656361427845e-05, "loss": 0.3816, "step": 69600 }, { "epoch": 0.27350025469221423, "grad_norm": 0.21505390107631683, "learning_rate": 3.6330472943850166e-05, "loss": 0.3579, "step": 69800 }, { "epoch": 0.27428392304376786, "grad_norm": 0.2892753481864929, "learning_rate": 3.629128952627248e-05, "loss": 0.3707, "step": 70000 }, { "epoch": 0.2750675913953215, "grad_norm": 0.44339391589164734, "learning_rate": 3.62521061086948e-05, "loss": 0.3645, "step": 70200 }, { "epoch": 0.27585125974687513, "grad_norm": 0.36989185214042664, "learning_rate": 3.621292269111712e-05, "loss": 0.3879, "step": 70400 }, { "epoch": 0.27663492809842877, "grad_norm": 0.40105459094047546, "learning_rate": 3.617373927353944e-05, "loss": 0.3565, "step": 70600 }, { "epoch": 0.27741859644998235, "grad_norm": 0.3460588753223419, "learning_rate": 3.613455585596176e-05, "loss": 0.3808, "step": 70800 }, { "epoch": 0.278202264801536, "grad_norm": 0.3379383683204651, "learning_rate": 3.609537243838408e-05, "loss": 0.3601, "step": 71000 }, { "epoch": 0.2789859331530896, "grad_norm": 0.4138387143611908, "learning_rate": 3.6056189020806394e-05, "loss": 0.3514, "step": 71200 }, { "epoch": 0.27976960150464325, "grad_norm": 0.18285110592842102, "learning_rate": 3.6017005603228714e-05, "loss": 0.3463, "step": 71400 }, { "epoch": 0.2805532698561969, "grad_norm": 0.2968384623527527, "learning_rate": 3.5977822185651035e-05, "loss": 0.3992, "step": 71600 }, { "epoch": 0.28133693820775046, "grad_norm": 0.7367753982543945, "learning_rate": 3.5938638768073355e-05, "loss": 0.3625, "step": 71800 }, { "epoch": 0.2821206065593041, "grad_norm": 0.17875872552394867, "learning_rate": 3.589945535049567e-05, "loss": 0.3566, "step": 72000 }, { "epoch": 0.28290427491085773, "grad_norm": 0.5544998049736023, "learning_rate": 3.586027193291799e-05, "loss": 0.3939, "step": 72200 }, { "epoch": 0.28368794326241137, "grad_norm": 0.32148993015289307, "learning_rate": 3.582108851534031e-05, "loss": 0.3977, "step": 72400 }, { "epoch": 0.28447161161396495, "grad_norm": 0.32182982563972473, "learning_rate": 3.578190509776263e-05, "loss": 0.3355, "step": 72600 }, { "epoch": 0.2852552799655186, "grad_norm": 0.22902780771255493, "learning_rate": 3.574272168018495e-05, "loss": 0.362, "step": 72800 }, { "epoch": 0.2860389483170722, "grad_norm": 0.5000271797180176, "learning_rate": 3.570353826260727e-05, "loss": 0.3682, "step": 73000 }, { "epoch": 0.28682261666862585, "grad_norm": 0.40563029050827026, "learning_rate": 3.566435484502958e-05, "loss": 0.3854, "step": 73200 }, { "epoch": 0.2876062850201795, "grad_norm": 0.3997242748737335, "learning_rate": 3.5625171427451904e-05, "loss": 0.4195, "step": 73400 }, { "epoch": 0.28838995337173307, "grad_norm": 0.675685703754425, "learning_rate": 3.5585988009874224e-05, "loss": 0.3905, "step": 73600 }, { "epoch": 0.2891736217232867, "grad_norm": 0.5017514824867249, "learning_rate": 3.554680459229654e-05, "loss": 0.373, "step": 73800 }, { "epoch": 0.28995729007484033, "grad_norm": 0.40830105543136597, "learning_rate": 3.550762117471886e-05, "loss": 0.3974, "step": 74000 }, { "epoch": 0.29074095842639397, "grad_norm": 0.4665190577507019, "learning_rate": 3.5468437757141185e-05, "loss": 0.357, "step": 74200 }, { "epoch": 0.29152462677794755, "grad_norm": 0.5082120895385742, "learning_rate": 3.54292543395635e-05, "loss": 0.3535, "step": 74400 }, { "epoch": 0.2923082951295012, "grad_norm": 0.3896706700325012, "learning_rate": 3.53902668390737e-05, "loss": 0.3686, "step": 74600 }, { "epoch": 0.2930919634810548, "grad_norm": 0.6185327172279358, "learning_rate": 3.5351083421496023e-05, "loss": 0.3592, "step": 74800 }, { "epoch": 0.29387563183260845, "grad_norm": 0.3137947916984558, "learning_rate": 3.5311900003918344e-05, "loss": 0.3926, "step": 75000 }, { "epoch": 0.2946593001841621, "grad_norm": 0.6183467507362366, "learning_rate": 3.5272716586340664e-05, "loss": 0.3806, "step": 75200 }, { "epoch": 0.29544296853571567, "grad_norm": 0.25096866488456726, "learning_rate": 3.523372908585087e-05, "loss": 0.4003, "step": 75400 }, { "epoch": 0.2962266368872693, "grad_norm": 0.33048325777053833, "learning_rate": 3.519454566827319e-05, "loss": 0.366, "step": 75600 }, { "epoch": 0.29701030523882294, "grad_norm": 0.1668993979692459, "learning_rate": 3.51555581677834e-05, "loss": 0.3759, "step": 75800 }, { "epoch": 0.29779397359037657, "grad_norm": 1.363071084022522, "learning_rate": 3.5116374750205715e-05, "loss": 0.4054, "step": 76000 }, { "epoch": 0.29857764194193015, "grad_norm": 0.19746778905391693, "learning_rate": 3.5077191332628035e-05, "loss": 0.3431, "step": 76200 }, { "epoch": 0.2993613102934838, "grad_norm": 0.5256962180137634, "learning_rate": 3.5038007915050355e-05, "loss": 0.3402, "step": 76400 }, { "epoch": 0.3001449786450374, "grad_norm": 0.2743825912475586, "learning_rate": 3.499882449747267e-05, "loss": 0.3914, "step": 76600 }, { "epoch": 0.30092864699659105, "grad_norm": 0.6069706082344055, "learning_rate": 3.495964107989499e-05, "loss": 0.3516, "step": 76800 }, { "epoch": 0.3017123153481447, "grad_norm": 0.366163969039917, "learning_rate": 3.492045766231731e-05, "loss": 0.3732, "step": 77000 }, { "epoch": 0.30249598369969827, "grad_norm": 0.413269966840744, "learning_rate": 3.488127424473962e-05, "loss": 0.3747, "step": 77200 }, { "epoch": 0.3032796520512519, "grad_norm": 0.21370181441307068, "learning_rate": 3.484209082716195e-05, "loss": 0.3598, "step": 77400 }, { "epoch": 0.30406332040280554, "grad_norm": 0.2713698148727417, "learning_rate": 3.480290740958427e-05, "loss": 0.3718, "step": 77600 }, { "epoch": 0.30484698875435917, "grad_norm": 0.31190505623817444, "learning_rate": 3.4763723992006583e-05, "loss": 0.3864, "step": 77800 }, { "epoch": 0.30563065710591275, "grad_norm": 0.3205302357673645, "learning_rate": 3.4724540574428904e-05, "loss": 0.3465, "step": 78000 }, { "epoch": 0.3064143254574664, "grad_norm": 0.20165614783763885, "learning_rate": 3.4685357156851224e-05, "loss": 0.3701, "step": 78200 }, { "epoch": 0.30719799380902, "grad_norm": 0.15387390553951263, "learning_rate": 3.464617373927354e-05, "loss": 0.3553, "step": 78400 }, { "epoch": 0.30798166216057365, "grad_norm": 0.5583301186561584, "learning_rate": 3.460699032169586e-05, "loss": 0.3621, "step": 78600 }, { "epoch": 0.3087653305121273, "grad_norm": 0.5134016871452332, "learning_rate": 3.456780690411818e-05, "loss": 0.3998, "step": 78800 }, { "epoch": 0.30954899886368087, "grad_norm": 0.22689929604530334, "learning_rate": 3.45286234865405e-05, "loss": 0.3522, "step": 79000 }, { "epoch": 0.3103326672152345, "grad_norm": 0.4166206419467926, "learning_rate": 3.448944006896281e-05, "loss": 0.3562, "step": 79200 }, { "epoch": 0.31111633556678814, "grad_norm": 0.25005531311035156, "learning_rate": 3.445025665138514e-05, "loss": 0.3689, "step": 79400 }, { "epoch": 0.3119000039183418, "grad_norm": 0.12845709919929504, "learning_rate": 3.441107323380746e-05, "loss": 0.4182, "step": 79600 }, { "epoch": 0.3126836722698954, "grad_norm": 0.23673836886882782, "learning_rate": 3.437188981622977e-05, "loss": 0.4, "step": 79800 }, { "epoch": 0.313467340621449, "grad_norm": 0.7517299652099609, "learning_rate": 3.433270639865209e-05, "loss": 0.3738, "step": 80000 }, { "epoch": 0.3142510089730026, "grad_norm": 0.5294409990310669, "learning_rate": 3.42937188981623e-05, "loss": 0.3929, "step": 80200 }, { "epoch": 0.31503467732455626, "grad_norm": 0.42163851857185364, "learning_rate": 3.425473139767251e-05, "loss": 0.3925, "step": 80400 }, { "epoch": 0.3158183456761099, "grad_norm": 0.2595706880092621, "learning_rate": 3.421554798009482e-05, "loss": 0.3865, "step": 80600 }, { "epoch": 0.31660201402766347, "grad_norm": 0.4245240390300751, "learning_rate": 3.4176364562517144e-05, "loss": 0.3807, "step": 80800 }, { "epoch": 0.3173856823792171, "grad_norm": 0.31517165899276733, "learning_rate": 3.4137181144939464e-05, "loss": 0.3854, "step": 81000 }, { "epoch": 0.31816935073077074, "grad_norm": 0.5753512978553772, "learning_rate": 3.409799772736178e-05, "loss": 0.3481, "step": 81200 }, { "epoch": 0.3189530190823244, "grad_norm": 0.15293245017528534, "learning_rate": 3.40588143097841e-05, "loss": 0.3778, "step": 81400 }, { "epoch": 0.319736687433878, "grad_norm": 0.558912992477417, "learning_rate": 3.4019630892206425e-05, "loss": 0.3969, "step": 81600 }, { "epoch": 0.3205203557854316, "grad_norm": 0.25161799788475037, "learning_rate": 3.398044747462874e-05, "loss": 0.3561, "step": 81800 }, { "epoch": 0.3213040241369852, "grad_norm": 0.2745450735092163, "learning_rate": 3.394126405705106e-05, "loss": 0.3595, "step": 82000 }, { "epoch": 0.32208769248853886, "grad_norm": 0.12316953390836716, "learning_rate": 3.390208063947338e-05, "loss": 0.373, "step": 82200 }, { "epoch": 0.3228713608400925, "grad_norm": 0.21542522311210632, "learning_rate": 3.386289722189569e-05, "loss": 0.3658, "step": 82400 }, { "epoch": 0.32365502919164607, "grad_norm": 0.29948458075523376, "learning_rate": 3.382371380431801e-05, "loss": 0.3697, "step": 82600 }, { "epoch": 0.3244386975431997, "grad_norm": 0.5049545764923096, "learning_rate": 3.3784726303828224e-05, "loss": 0.384, "step": 82800 }, { "epoch": 0.32522236589475334, "grad_norm": 0.4615326225757599, "learning_rate": 3.3745542886250545e-05, "loss": 0.3409, "step": 83000 }, { "epoch": 0.326006034246307, "grad_norm": 0.5401926040649414, "learning_rate": 3.370635946867286e-05, "loss": 0.3604, "step": 83200 }, { "epoch": 0.3267897025978606, "grad_norm": 0.1781325489282608, "learning_rate": 3.366717605109518e-05, "loss": 0.3659, "step": 83400 }, { "epoch": 0.3275733709494142, "grad_norm": 0.2073005884885788, "learning_rate": 3.36279926335175e-05, "loss": 0.3962, "step": 83600 }, { "epoch": 0.3283570393009678, "grad_norm": 0.519901692867279, "learning_rate": 3.358880921593981e-05, "loss": 0.3965, "step": 83800 }, { "epoch": 0.32914070765252146, "grad_norm": 0.553053081035614, "learning_rate": 3.354962579836213e-05, "loss": 0.3834, "step": 84000 }, { "epoch": 0.3299243760040751, "grad_norm": 0.43088486790657043, "learning_rate": 3.351044238078445e-05, "loss": 0.3583, "step": 84200 }, { "epoch": 0.33070804435562867, "grad_norm": 0.18893325328826904, "learning_rate": 3.347125896320677e-05, "loss": 0.363, "step": 84400 }, { "epoch": 0.3314917127071823, "grad_norm": 0.1969502866268158, "learning_rate": 3.343207554562909e-05, "loss": 0.3614, "step": 84600 }, { "epoch": 0.33227538105873594, "grad_norm": 0.16047035157680511, "learning_rate": 3.3392892128051414e-05, "loss": 0.3861, "step": 84800 }, { "epoch": 0.3330590494102896, "grad_norm": 0.6296816468238831, "learning_rate": 3.335370871047373e-05, "loss": 0.4161, "step": 85000 }, { "epoch": 0.3338427177618432, "grad_norm": 0.4492548108100891, "learning_rate": 3.331452529289605e-05, "loss": 0.3869, "step": 85200 }, { "epoch": 0.3346263861133968, "grad_norm": 0.44898709654808044, "learning_rate": 3.327534187531837e-05, "loss": 0.3999, "step": 85400 }, { "epoch": 0.3354100544649504, "grad_norm": 0.20912408828735352, "learning_rate": 3.323615845774069e-05, "loss": 0.401, "step": 85600 }, { "epoch": 0.33619372281650406, "grad_norm": 0.36975884437561035, "learning_rate": 3.3196975040163e-05, "loss": 0.3661, "step": 85800 }, { "epoch": 0.3369773911680577, "grad_norm": 0.22726765275001526, "learning_rate": 3.315779162258532e-05, "loss": 0.3716, "step": 86000 }, { "epoch": 0.3377610595196113, "grad_norm": 0.24941909313201904, "learning_rate": 3.311860820500764e-05, "loss": 0.3602, "step": 86200 }, { "epoch": 0.3385447278711649, "grad_norm": 0.4467964172363281, "learning_rate": 3.307942478742996e-05, "loss": 0.3733, "step": 86400 }, { "epoch": 0.33932839622271854, "grad_norm": 0.7117724418640137, "learning_rate": 3.304024136985228e-05, "loss": 0.394, "step": 86600 }, { "epoch": 0.3401120645742722, "grad_norm": 0.1879279613494873, "learning_rate": 3.30010579522746e-05, "loss": 0.3867, "step": 86800 }, { "epoch": 0.3408957329258258, "grad_norm": 1.0636439323425293, "learning_rate": 3.2961874534696916e-05, "loss": 0.3813, "step": 87000 }, { "epoch": 0.3416794012773794, "grad_norm": 0.8648964762687683, "learning_rate": 3.292288703420712e-05, "loss": 0.3454, "step": 87200 }, { "epoch": 0.342463069628933, "grad_norm": 0.49916282296180725, "learning_rate": 3.288370361662945e-05, "loss": 0.365, "step": 87400 }, { "epoch": 0.34324673798048666, "grad_norm": 0.43505415320396423, "learning_rate": 3.284452019905176e-05, "loss": 0.3491, "step": 87600 }, { "epoch": 0.3440304063320403, "grad_norm": 0.2587812542915344, "learning_rate": 3.280533678147408e-05, "loss": 0.3529, "step": 87800 }, { "epoch": 0.34481407468359393, "grad_norm": 0.42354780435562134, "learning_rate": 3.27661533638964e-05, "loss": 0.3496, "step": 88000 }, { "epoch": 0.3455977430351475, "grad_norm": 0.17272061109542847, "learning_rate": 3.272696994631872e-05, "loss": 0.3952, "step": 88200 }, { "epoch": 0.34638141138670114, "grad_norm": 0.37294986844062805, "learning_rate": 3.2687786528741036e-05, "loss": 0.3774, "step": 88400 }, { "epoch": 0.3471650797382548, "grad_norm": 0.5656327605247498, "learning_rate": 3.2648603111163356e-05, "loss": 0.3644, "step": 88600 }, { "epoch": 0.3479487480898084, "grad_norm": 0.5601153373718262, "learning_rate": 3.260941969358568e-05, "loss": 0.3543, "step": 88800 }, { "epoch": 0.348732416441362, "grad_norm": 0.4643417298793793, "learning_rate": 3.257043219309588e-05, "loss": 0.3658, "step": 89000 }, { "epoch": 0.3495160847929156, "grad_norm": 0.29950958490371704, "learning_rate": 3.25312487755182e-05, "loss": 0.376, "step": 89200 }, { "epoch": 0.35029975314446926, "grad_norm": 0.2659519910812378, "learning_rate": 3.249206535794052e-05, "loss": 0.3835, "step": 89400 }, { "epoch": 0.3510834214960229, "grad_norm": 0.14695997536182404, "learning_rate": 3.245288194036284e-05, "loss": 0.3409, "step": 89600 }, { "epoch": 0.35186708984757653, "grad_norm": 0.28257864713668823, "learning_rate": 3.2413698522785156e-05, "loss": 0.3467, "step": 89800 }, { "epoch": 0.3526507581991301, "grad_norm": 0.29032102227211, "learning_rate": 3.2374515105207476e-05, "loss": 0.406, "step": 90000 }, { "epoch": 0.35343442655068374, "grad_norm": 0.2921280562877655, "learning_rate": 3.23353316876298e-05, "loss": 0.3421, "step": 90200 }, { "epoch": 0.3542180949022374, "grad_norm": 0.436246782541275, "learning_rate": 3.229614827005212e-05, "loss": 0.3923, "step": 90400 }, { "epoch": 0.355001763253791, "grad_norm": 0.3646532893180847, "learning_rate": 3.225696485247444e-05, "loss": 0.3627, "step": 90600 }, { "epoch": 0.3557854316053446, "grad_norm": 0.29675033688545227, "learning_rate": 3.221778143489676e-05, "loss": 0.3613, "step": 90800 }, { "epoch": 0.35656909995689823, "grad_norm": 0.3070316016674042, "learning_rate": 3.217859801731907e-05, "loss": 0.4035, "step": 91000 }, { "epoch": 0.35735276830845186, "grad_norm": 0.222697913646698, "learning_rate": 3.213941459974139e-05, "loss": 0.397, "step": 91200 }, { "epoch": 0.3581364366600055, "grad_norm": 0.6626273393630981, "learning_rate": 3.210023118216371e-05, "loss": 0.3437, "step": 91400 }, { "epoch": 0.35892010501155913, "grad_norm": 0.5618545413017273, "learning_rate": 3.2061243681673916e-05, "loss": 0.4135, "step": 91600 }, { "epoch": 0.3597037733631127, "grad_norm": 0.5214632153511047, "learning_rate": 3.202206026409624e-05, "loss": 0.361, "step": 91800 }, { "epoch": 0.36048744171466635, "grad_norm": 0.23996460437774658, "learning_rate": 3.198287684651856e-05, "loss": 0.3595, "step": 92000 }, { "epoch": 0.36127111006622, "grad_norm": 0.3504426181316376, "learning_rate": 3.194369342894088e-05, "loss": 0.4077, "step": 92200 }, { "epoch": 0.3620547784177736, "grad_norm": 0.23541449010372162, "learning_rate": 3.190451001136319e-05, "loss": 0.3631, "step": 92400 }, { "epoch": 0.3628384467693272, "grad_norm": 0.31811484694480896, "learning_rate": 3.186532659378551e-05, "loss": 0.3475, "step": 92600 }, { "epoch": 0.36362211512088083, "grad_norm": 0.46106281876564026, "learning_rate": 3.182614317620783e-05, "loss": 0.3107, "step": 92800 }, { "epoch": 0.36440578347243446, "grad_norm": 0.26417216658592224, "learning_rate": 3.1786959758630145e-05, "loss": 0.3534, "step": 93000 }, { "epoch": 0.3651894518239881, "grad_norm": 0.5394189953804016, "learning_rate": 3.1747776341052465e-05, "loss": 0.3872, "step": 93200 }, { "epoch": 0.36597312017554173, "grad_norm": 0.37850356101989746, "learning_rate": 3.170859292347479e-05, "loss": 0.3842, "step": 93400 }, { "epoch": 0.3667567885270953, "grad_norm": 0.2986568510532379, "learning_rate": 3.1669409505897106e-05, "loss": 0.3739, "step": 93600 }, { "epoch": 0.36754045687864895, "grad_norm": 0.18436530232429504, "learning_rate": 3.1630226088319426e-05, "loss": 0.3686, "step": 93800 }, { "epoch": 0.3683241252302026, "grad_norm": 0.09537161886692047, "learning_rate": 3.159123858782963e-05, "loss": 0.3794, "step": 94000 }, { "epoch": 0.3691077935817562, "grad_norm": 0.4849044680595398, "learning_rate": 3.155205517025195e-05, "loss": 0.375, "step": 94200 }, { "epoch": 0.36989146193330985, "grad_norm": 0.29660823941230774, "learning_rate": 3.151287175267427e-05, "loss": 0.3601, "step": 94400 }, { "epoch": 0.37067513028486343, "grad_norm": 0.5260242819786072, "learning_rate": 3.147368833509659e-05, "loss": 0.3828, "step": 94600 }, { "epoch": 0.37145879863641706, "grad_norm": 1.3941904306411743, "learning_rate": 3.143450491751891e-05, "loss": 0.3227, "step": 94800 }, { "epoch": 0.3722424669879707, "grad_norm": 0.22299116849899292, "learning_rate": 3.1395321499941226e-05, "loss": 0.3402, "step": 95000 }, { "epoch": 0.37302613533952433, "grad_norm": 0.10729559510946274, "learning_rate": 3.1356138082363546e-05, "loss": 0.3663, "step": 95200 }, { "epoch": 0.3738098036910779, "grad_norm": 0.32809117436408997, "learning_rate": 3.1316954664785866e-05, "loss": 0.4377, "step": 95400 }, { "epoch": 0.37459347204263155, "grad_norm": 0.46985819935798645, "learning_rate": 3.127777124720818e-05, "loss": 0.3762, "step": 95600 }, { "epoch": 0.3753771403941852, "grad_norm": 0.5396414995193481, "learning_rate": 3.12385878296305e-05, "loss": 0.3607, "step": 95800 }, { "epoch": 0.3761608087457388, "grad_norm": 0.17952068150043488, "learning_rate": 3.119940441205282e-05, "loss": 0.3672, "step": 96000 }, { "epoch": 0.37694447709729245, "grad_norm": 0.13644246757030487, "learning_rate": 3.116022099447514e-05, "loss": 0.3622, "step": 96200 }, { "epoch": 0.37772814544884603, "grad_norm": 0.49535325169563293, "learning_rate": 3.112103757689746e-05, "loss": 0.4016, "step": 96400 }, { "epoch": 0.37851181380039967, "grad_norm": 0.4390897750854492, "learning_rate": 3.1082050076407666e-05, "loss": 0.3592, "step": 96600 }, { "epoch": 0.3792954821519533, "grad_norm": 0.37850648164749146, "learning_rate": 3.1042866658829986e-05, "loss": 0.3116, "step": 96800 }, { "epoch": 0.38007915050350694, "grad_norm": 0.40341299772262573, "learning_rate": 3.10036832412523e-05, "loss": 0.4011, "step": 97000 }, { "epoch": 0.3808628188550605, "grad_norm": 0.1462254822254181, "learning_rate": 3.096449982367462e-05, "loss": 0.3363, "step": 97200 }, { "epoch": 0.38164648720661415, "grad_norm": 0.17707198858261108, "learning_rate": 3.092531640609695e-05, "loss": 0.3666, "step": 97400 }, { "epoch": 0.3824301555581678, "grad_norm": 0.2892052233219147, "learning_rate": 3.088613298851926e-05, "loss": 0.3411, "step": 97600 }, { "epoch": 0.3832138239097214, "grad_norm": 0.3313768208026886, "learning_rate": 3.084694957094158e-05, "loss": 0.3918, "step": 97800 }, { "epoch": 0.38399749226127505, "grad_norm": 0.26118066906929016, "learning_rate": 3.08077661533639e-05, "loss": 0.3674, "step": 98000 }, { "epoch": 0.38478116061282863, "grad_norm": 0.22374743223190308, "learning_rate": 3.0768582735786215e-05, "loss": 0.3412, "step": 98200 }, { "epoch": 0.38556482896438227, "grad_norm": 0.4654456079006195, "learning_rate": 3.0729399318208535e-05, "loss": 0.378, "step": 98400 }, { "epoch": 0.3863484973159359, "grad_norm": 0.3268471658229828, "learning_rate": 3.0690215900630855e-05, "loss": 0.3528, "step": 98600 }, { "epoch": 0.38713216566748954, "grad_norm": 0.1550745964050293, "learning_rate": 3.065103248305317e-05, "loss": 0.3457, "step": 98800 }, { "epoch": 0.3879158340190431, "grad_norm": 0.15858714282512665, "learning_rate": 3.061184906547549e-05, "loss": 0.3632, "step": 99000 }, { "epoch": 0.38869950237059675, "grad_norm": 0.21531552076339722, "learning_rate": 3.0572665647897816e-05, "loss": 0.3702, "step": 99200 }, { "epoch": 0.3894831707221504, "grad_norm": 0.3402751088142395, "learning_rate": 3.053348223032013e-05, "loss": 0.363, "step": 99400 }, { "epoch": 0.390266839073704, "grad_norm": 0.6650770306587219, "learning_rate": 3.049429881274245e-05, "loss": 0.3916, "step": 99600 }, { "epoch": 0.39105050742525765, "grad_norm": 0.14699183404445648, "learning_rate": 3.045511539516477e-05, "loss": 0.3837, "step": 99800 }, { "epoch": 0.39183417577681123, "grad_norm": 0.4576098918914795, "learning_rate": 3.0415931977587087e-05, "loss": 0.3838, "step": 100000 }, { "epoch": 0.39183417577681123, "eval_loss": 0.38670605421066284, "eval_runtime": 194.0092, "eval_samples_per_second": 13.288, "eval_steps_per_second": 13.288, "step": 100000 }, { "epoch": 0.39261784412836487, "grad_norm": 0.09350486099720001, "learning_rate": 3.0376748560009404e-05, "loss": 0.3232, "step": 100200 }, { "epoch": 0.3934015124799185, "grad_norm": 0.08104929327964783, "learning_rate": 3.0337565142431724e-05, "loss": 0.3905, "step": 100400 }, { "epoch": 0.39418518083147214, "grad_norm": 0.8304306864738464, "learning_rate": 3.029838172485404e-05, "loss": 0.333, "step": 100600 }, { "epoch": 0.3949688491830257, "grad_norm": 0.2307712733745575, "learning_rate": 3.025919830727636e-05, "loss": 0.3929, "step": 100800 }, { "epoch": 0.39575251753457935, "grad_norm": 0.4819060266017914, "learning_rate": 3.0220014889698685e-05, "loss": 0.3924, "step": 101000 }, { "epoch": 0.396536185886133, "grad_norm": 0.9390903115272522, "learning_rate": 3.0180831472121002e-05, "loss": 0.3232, "step": 101200 }, { "epoch": 0.3973198542376866, "grad_norm": 0.29699474573135376, "learning_rate": 3.014164805454332e-05, "loss": 0.3682, "step": 101400 }, { "epoch": 0.39810352258924026, "grad_norm": 0.1447882503271103, "learning_rate": 3.010246463696564e-05, "loss": 0.3677, "step": 101600 }, { "epoch": 0.39888719094079383, "grad_norm": 0.523335874080658, "learning_rate": 3.0063477136475844e-05, "loss": 0.37, "step": 101800 }, { "epoch": 0.39967085929234747, "grad_norm": 0.29856398701667786, "learning_rate": 3.002429371889816e-05, "loss": 0.3785, "step": 102000 }, { "epoch": 0.4004545276439011, "grad_norm": 0.5382548570632935, "learning_rate": 2.9985110301320485e-05, "loss": 0.4065, "step": 102200 }, { "epoch": 0.40123819599545474, "grad_norm": 0.30009403824806213, "learning_rate": 2.99459268837428e-05, "loss": 0.3726, "step": 102400 }, { "epoch": 0.4020218643470084, "grad_norm": 0.4505676329135895, "learning_rate": 2.990674346616512e-05, "loss": 0.3493, "step": 102600 }, { "epoch": 0.40280553269856195, "grad_norm": 0.27638620138168335, "learning_rate": 2.986756004858744e-05, "loss": 0.373, "step": 102800 }, { "epoch": 0.4035892010501156, "grad_norm": 0.38812577724456787, "learning_rate": 2.982837663100976e-05, "loss": 0.3578, "step": 103000 }, { "epoch": 0.4043728694016692, "grad_norm": 0.3605404496192932, "learning_rate": 2.9789389130519967e-05, "loss": 0.3788, "step": 103200 }, { "epoch": 0.40515653775322286, "grad_norm": 0.3588046431541443, "learning_rate": 2.9750205712942287e-05, "loss": 0.371, "step": 103400 }, { "epoch": 0.40594020610477644, "grad_norm": 0.16250497102737427, "learning_rate": 2.9711022295364604e-05, "loss": 0.3504, "step": 103600 }, { "epoch": 0.40672387445633007, "grad_norm": 0.24008536338806152, "learning_rate": 2.967183887778692e-05, "loss": 0.3565, "step": 103800 }, { "epoch": 0.4075075428078837, "grad_norm": 0.5740640759468079, "learning_rate": 2.963265546020924e-05, "loss": 0.3698, "step": 104000 }, { "epoch": 0.40829121115943734, "grad_norm": 0.20550820231437683, "learning_rate": 2.959347204263156e-05, "loss": 0.3675, "step": 104200 }, { "epoch": 0.409074879510991, "grad_norm": 0.3275391161441803, "learning_rate": 2.955428862505388e-05, "loss": 0.3806, "step": 104400 }, { "epoch": 0.40985854786254455, "grad_norm": 0.26002252101898193, "learning_rate": 2.9515105207476196e-05, "loss": 0.3923, "step": 104600 }, { "epoch": 0.4106422162140982, "grad_norm": 0.5443273186683655, "learning_rate": 2.9475921789898513e-05, "loss": 0.3766, "step": 104800 }, { "epoch": 0.4114258845656518, "grad_norm": 0.6181606650352478, "learning_rate": 2.9436738372320836e-05, "loss": 0.3713, "step": 105000 }, { "epoch": 0.41220955291720546, "grad_norm": 1.991551160812378, "learning_rate": 2.9397554954743156e-05, "loss": 0.3874, "step": 105200 }, { "epoch": 0.41299322126875904, "grad_norm": 0.6074121594429016, "learning_rate": 2.9358371537165473e-05, "loss": 0.3917, "step": 105400 }, { "epoch": 0.41377688962031267, "grad_norm": 0.5187014937400818, "learning_rate": 2.9319188119587794e-05, "loss": 0.3749, "step": 105600 }, { "epoch": 0.4145605579718663, "grad_norm": 0.3932857811450958, "learning_rate": 2.928000470201011e-05, "loss": 0.3472, "step": 105800 }, { "epoch": 0.41534422632341994, "grad_norm": 0.234763965010643, "learning_rate": 2.924082128443243e-05, "loss": 0.334, "step": 106000 }, { "epoch": 0.4161278946749736, "grad_norm": 0.093149833381176, "learning_rate": 2.9201637866854748e-05, "loss": 0.3768, "step": 106200 }, { "epoch": 0.41691156302652715, "grad_norm": 0.5720223188400269, "learning_rate": 2.9162650366364956e-05, "loss": 0.405, "step": 106400 }, { "epoch": 0.4176952313780808, "grad_norm": 0.30845069885253906, "learning_rate": 2.9123466948787276e-05, "loss": 0.3652, "step": 106600 }, { "epoch": 0.4184788997296344, "grad_norm": 0.4574078619480133, "learning_rate": 2.9084283531209593e-05, "loss": 0.3982, "step": 106800 }, { "epoch": 0.41926256808118806, "grad_norm": 0.18100954592227936, "learning_rate": 2.9045100113631914e-05, "loss": 0.3215, "step": 107000 }, { "epoch": 0.42004623643274164, "grad_norm": 0.26479223370552063, "learning_rate": 2.900591669605423e-05, "loss": 0.3522, "step": 107200 }, { "epoch": 0.4208299047842953, "grad_norm": 0.6265758872032166, "learning_rate": 2.8966733278476547e-05, "loss": 0.3641, "step": 107400 }, { "epoch": 0.4216135731358489, "grad_norm": 0.2577565908432007, "learning_rate": 2.8927549860898868e-05, "loss": 0.3631, "step": 107600 }, { "epoch": 0.42239724148740254, "grad_norm": 0.22920626401901245, "learning_rate": 2.8888366443321184e-05, "loss": 0.3971, "step": 107800 }, { "epoch": 0.4231809098389562, "grad_norm": 0.28307777643203735, "learning_rate": 2.8849183025743508e-05, "loss": 0.3856, "step": 108000 }, { "epoch": 0.42396457819050976, "grad_norm": 0.29488345980644226, "learning_rate": 2.880999960816583e-05, "loss": 0.3651, "step": 108200 }, { "epoch": 0.4247482465420634, "grad_norm": 0.3332146406173706, "learning_rate": 2.8771012107676033e-05, "loss": 0.3595, "step": 108400 }, { "epoch": 0.425531914893617, "grad_norm": 0.34239307045936584, "learning_rate": 2.873182869009835e-05, "loss": 0.3824, "step": 108600 }, { "epoch": 0.42631558324517066, "grad_norm": 0.3574167788028717, "learning_rate": 2.8692645272520667e-05, "loss": 0.363, "step": 108800 }, { "epoch": 0.42709925159672424, "grad_norm": 0.2348683625459671, "learning_rate": 2.8653461854942987e-05, "loss": 0.3364, "step": 109000 }, { "epoch": 0.4278829199482779, "grad_norm": 0.6222290396690369, "learning_rate": 2.861427843736531e-05, "loss": 0.3863, "step": 109200 }, { "epoch": 0.4286665882998315, "grad_norm": 0.7743702530860901, "learning_rate": 2.8575095019787628e-05, "loss": 0.3702, "step": 109400 }, { "epoch": 0.42945025665138514, "grad_norm": 0.3957701623439789, "learning_rate": 2.853591160220995e-05, "loss": 0.3667, "step": 109600 }, { "epoch": 0.4302339250029388, "grad_norm": 0.243615984916687, "learning_rate": 2.8496728184632265e-05, "loss": 0.3523, "step": 109800 }, { "epoch": 0.43101759335449236, "grad_norm": 0.32763805985450745, "learning_rate": 2.8457544767054582e-05, "loss": 0.3363, "step": 110000 }, { "epoch": 0.431801261706046, "grad_norm": 0.3364832401275635, "learning_rate": 2.8418361349476902e-05, "loss": 0.3415, "step": 110200 }, { "epoch": 0.4325849300575996, "grad_norm": 0.37985387444496155, "learning_rate": 2.837917793189922e-05, "loss": 0.3844, "step": 110400 }, { "epoch": 0.43336859840915326, "grad_norm": 0.3277980387210846, "learning_rate": 2.834019043140943e-05, "loss": 0.3699, "step": 110600 }, { "epoch": 0.4341522667607069, "grad_norm": 0.9344809651374817, "learning_rate": 2.8301007013831748e-05, "loss": 0.3666, "step": 110800 }, { "epoch": 0.4349359351122605, "grad_norm": 0.2958358824253082, "learning_rate": 2.8261823596254068e-05, "loss": 0.3413, "step": 111000 }, { "epoch": 0.4357196034638141, "grad_norm": 0.15358564257621765, "learning_rate": 2.8222640178676385e-05, "loss": 0.3553, "step": 111200 }, { "epoch": 0.43650327181536774, "grad_norm": 0.3972684144973755, "learning_rate": 2.8183456761098702e-05, "loss": 0.3536, "step": 111400 }, { "epoch": 0.4372869401669214, "grad_norm": 0.2892283797264099, "learning_rate": 2.8144273343521022e-05, "loss": 0.3831, "step": 111600 }, { "epoch": 0.43807060851847496, "grad_norm": 0.7200210094451904, "learning_rate": 2.810508992594334e-05, "loss": 0.3705, "step": 111800 }, { "epoch": 0.4388542768700286, "grad_norm": 0.7221992611885071, "learning_rate": 2.8065906508365663e-05, "loss": 0.3888, "step": 112000 }, { "epoch": 0.4396379452215822, "grad_norm": 0.5464593172073364, "learning_rate": 2.8026723090787983e-05, "loss": 0.3683, "step": 112200 }, { "epoch": 0.44042161357313586, "grad_norm": 0.26548904180526733, "learning_rate": 2.79875396732103e-05, "loss": 0.3792, "step": 112400 }, { "epoch": 0.4412052819246895, "grad_norm": 0.40648555755615234, "learning_rate": 2.7948356255632617e-05, "loss": 0.4111, "step": 112600 }, { "epoch": 0.4419889502762431, "grad_norm": 0.5338069796562195, "learning_rate": 2.7909172838054937e-05, "loss": 0.4011, "step": 112800 }, { "epoch": 0.4427726186277967, "grad_norm": 0.42862260341644287, "learning_rate": 2.7869989420477254e-05, "loss": 0.3704, "step": 113000 }, { "epoch": 0.44355628697935034, "grad_norm": 0.3932804763317108, "learning_rate": 2.7830806002899574e-05, "loss": 0.3481, "step": 113200 }, { "epoch": 0.444339955330904, "grad_norm": 0.44323670864105225, "learning_rate": 2.779162258532189e-05, "loss": 0.3601, "step": 113400 }, { "epoch": 0.44512362368245756, "grad_norm": 0.1578771471977234, "learning_rate": 2.7752635084832103e-05, "loss": 0.361, "step": 113600 }, { "epoch": 0.4459072920340112, "grad_norm": 0.26360565423965454, "learning_rate": 2.771345166725442e-05, "loss": 0.3324, "step": 113800 }, { "epoch": 0.44669096038556483, "grad_norm": 0.1808992475271225, "learning_rate": 2.7674268249676737e-05, "loss": 0.3725, "step": 114000 }, { "epoch": 0.44747462873711846, "grad_norm": 0.3972775936126709, "learning_rate": 2.7635084832099057e-05, "loss": 0.3728, "step": 114200 }, { "epoch": 0.4482582970886721, "grad_norm": 0.31185370683670044, "learning_rate": 2.7595901414521374e-05, "loss": 0.3711, "step": 114400 }, { "epoch": 0.4490419654402257, "grad_norm": 0.2995752990245819, "learning_rate": 2.7556717996943694e-05, "loss": 0.3867, "step": 114600 }, { "epoch": 0.4498256337917793, "grad_norm": 0.3175097703933716, "learning_rate": 2.751753457936601e-05, "loss": 0.3928, "step": 114800 }, { "epoch": 0.45060930214333295, "grad_norm": 0.2543286979198456, "learning_rate": 2.7478351161788335e-05, "loss": 0.3293, "step": 115000 }, { "epoch": 0.4513929704948866, "grad_norm": 0.6012951135635376, "learning_rate": 2.743916774421065e-05, "loss": 0.3735, "step": 115200 }, { "epoch": 0.45217663884644016, "grad_norm": 0.4118979871273041, "learning_rate": 2.7399984326632972e-05, "loss": 0.3301, "step": 115400 }, { "epoch": 0.4529603071979938, "grad_norm": 0.46776869893074036, "learning_rate": 2.736080090905529e-05, "loss": 0.3857, "step": 115600 }, { "epoch": 0.45374397554954743, "grad_norm": 0.5272917747497559, "learning_rate": 2.732161749147761e-05, "loss": 0.3445, "step": 115800 }, { "epoch": 0.45452764390110106, "grad_norm": 0.573877215385437, "learning_rate": 2.7282434073899926e-05, "loss": 0.3904, "step": 116000 }, { "epoch": 0.4553113122526547, "grad_norm": 0.18885576725006104, "learning_rate": 2.7243250656322243e-05, "loss": 0.4034, "step": 116200 }, { "epoch": 0.4560949806042083, "grad_norm": 0.2104359120130539, "learning_rate": 2.7204067238744563e-05, "loss": 0.3675, "step": 116400 }, { "epoch": 0.4568786489557619, "grad_norm": 0.5629342794418335, "learning_rate": 2.716488382116688e-05, "loss": 0.3695, "step": 116600 }, { "epoch": 0.45766231730731555, "grad_norm": 0.27658262848854065, "learning_rate": 2.7125700403589204e-05, "loss": 0.3515, "step": 116800 }, { "epoch": 0.4584459856588692, "grad_norm": 0.2814624607563019, "learning_rate": 2.7086516986011524e-05, "loss": 0.4002, "step": 117000 }, { "epoch": 0.4592296540104228, "grad_norm": 0.7413595914840698, "learning_rate": 2.704733356843384e-05, "loss": 0.3678, "step": 117200 }, { "epoch": 0.4600133223619764, "grad_norm": 0.21935655176639557, "learning_rate": 2.700815015085616e-05, "loss": 0.3575, "step": 117400 }, { "epoch": 0.46079699071353003, "grad_norm": 0.28784772753715515, "learning_rate": 2.6968966733278478e-05, "loss": 0.3537, "step": 117600 }, { "epoch": 0.46158065906508366, "grad_norm": 0.5971498489379883, "learning_rate": 2.6929783315700795e-05, "loss": 0.3755, "step": 117800 }, { "epoch": 0.4623643274166373, "grad_norm": 0.3393053412437439, "learning_rate": 2.6890795815211007e-05, "loss": 0.3934, "step": 118000 }, { "epoch": 0.4631479957681909, "grad_norm": 0.24666576087474823, "learning_rate": 2.6851612397633324e-05, "loss": 0.3787, "step": 118200 }, { "epoch": 0.4639316641197445, "grad_norm": 0.527527391910553, "learning_rate": 2.6812428980055644e-05, "loss": 0.4005, "step": 118400 }, { "epoch": 0.46471533247129815, "grad_norm": 0.26503968238830566, "learning_rate": 2.677324556247796e-05, "loss": 0.3758, "step": 118600 }, { "epoch": 0.4654990008228518, "grad_norm": 0.187395378947258, "learning_rate": 2.6734062144900278e-05, "loss": 0.388, "step": 118800 }, { "epoch": 0.4662826691744054, "grad_norm": 0.21370276808738708, "learning_rate": 2.6694878727322598e-05, "loss": 0.3851, "step": 119000 }, { "epoch": 0.467066337525959, "grad_norm": 0.5603575706481934, "learning_rate": 2.6655695309744915e-05, "loss": 0.351, "step": 119200 }, { "epoch": 0.46785000587751263, "grad_norm": 0.3410462439060211, "learning_rate": 2.6616511892167235e-05, "loss": 0.3767, "step": 119400 }, { "epoch": 0.46863367422906627, "grad_norm": 0.18788377940654755, "learning_rate": 2.6577328474589552e-05, "loss": 0.3726, "step": 119600 }, { "epoch": 0.4694173425806199, "grad_norm": 0.28751447796821594, "learning_rate": 2.6538145057011876e-05, "loss": 0.3259, "step": 119800 }, { "epoch": 0.4702010109321735, "grad_norm": 0.33174562454223633, "learning_rate": 2.649915755652208e-05, "loss": 0.3599, "step": 120000 }, { "epoch": 0.4709846792837271, "grad_norm": 0.4742797911167145, "learning_rate": 2.6459974138944398e-05, "loss": 0.3543, "step": 120200 }, { "epoch": 0.47176834763528075, "grad_norm": 0.299096018075943, "learning_rate": 2.6420790721366718e-05, "loss": 0.3898, "step": 120400 }, { "epoch": 0.4725520159868344, "grad_norm": 0.4215034246444702, "learning_rate": 2.6381607303789035e-05, "loss": 0.3391, "step": 120600 }, { "epoch": 0.473335684338388, "grad_norm": 0.15233637392520905, "learning_rate": 2.634242388621136e-05, "loss": 0.3715, "step": 120800 }, { "epoch": 0.4741193526899416, "grad_norm": 0.35297247767448425, "learning_rate": 2.630324046863368e-05, "loss": 0.3805, "step": 121000 }, { "epoch": 0.47490302104149523, "grad_norm": 0.33719316124916077, "learning_rate": 2.6264057051055996e-05, "loss": 0.4085, "step": 121200 }, { "epoch": 0.47568668939304887, "grad_norm": 0.21381162106990814, "learning_rate": 2.6224873633478313e-05, "loss": 0.3539, "step": 121400 }, { "epoch": 0.4764703577446025, "grad_norm": 0.49517229199409485, "learning_rate": 2.6185690215900633e-05, "loss": 0.3512, "step": 121600 }, { "epoch": 0.4772540260961561, "grad_norm": 0.15626825392246246, "learning_rate": 2.6146702715410838e-05, "loss": 0.3831, "step": 121800 }, { "epoch": 0.4780376944477097, "grad_norm": 0.7291291356086731, "learning_rate": 2.610751929783316e-05, "loss": 0.349, "step": 122000 }, { "epoch": 0.47882136279926335, "grad_norm": 0.28941255807876587, "learning_rate": 2.606833588025548e-05, "loss": 0.3902, "step": 122200 }, { "epoch": 0.479605031150817, "grad_norm": 0.22206926345825195, "learning_rate": 2.60291524626778e-05, "loss": 0.362, "step": 122400 }, { "epoch": 0.4803886995023706, "grad_norm": 0.17998214066028595, "learning_rate": 2.5989969045100116e-05, "loss": 0.3558, "step": 122600 }, { "epoch": 0.4811723678539242, "grad_norm": 0.7033812999725342, "learning_rate": 2.5950785627522432e-05, "loss": 0.3674, "step": 122800 }, { "epoch": 0.48195603620547783, "grad_norm": 0.2934001684188843, "learning_rate": 2.5911602209944753e-05, "loss": 0.3697, "step": 123000 }, { "epoch": 0.48273970455703147, "grad_norm": 0.649713397026062, "learning_rate": 2.587241879236707e-05, "loss": 0.3532, "step": 123200 }, { "epoch": 0.4835233729085851, "grad_norm": 2.5628743171691895, "learning_rate": 2.583323537478939e-05, "loss": 0.361, "step": 123400 }, { "epoch": 0.4843070412601387, "grad_norm": 0.491916686296463, "learning_rate": 2.5794051957211707e-05, "loss": 0.3323, "step": 123600 }, { "epoch": 0.4850907096116923, "grad_norm": 0.20333188772201538, "learning_rate": 2.575486853963403e-05, "loss": 0.3544, "step": 123800 }, { "epoch": 0.48587437796324595, "grad_norm": 0.5699710249900818, "learning_rate": 2.5715685122056347e-05, "loss": 0.3818, "step": 124000 }, { "epoch": 0.4866580463147996, "grad_norm": 0.3595122694969177, "learning_rate": 2.5676501704478668e-05, "loss": 0.3509, "step": 124200 }, { "epoch": 0.4874417146663532, "grad_norm": 0.6123571991920471, "learning_rate": 2.5637318286900985e-05, "loss": 0.3498, "step": 124400 }, { "epoch": 0.4882253830179068, "grad_norm": 0.29732608795166016, "learning_rate": 2.5598134869323305e-05, "loss": 0.3556, "step": 124600 }, { "epoch": 0.48900905136946043, "grad_norm": 0.2616589665412903, "learning_rate": 2.555895145174562e-05, "loss": 0.358, "step": 124800 }, { "epoch": 0.48979271972101407, "grad_norm": 0.3030960261821747, "learning_rate": 2.551976803416794e-05, "loss": 0.3904, "step": 125000 }, { "epoch": 0.4905763880725677, "grad_norm": 0.5628107786178589, "learning_rate": 2.548058461659026e-05, "loss": 0.4407, "step": 125200 }, { "epoch": 0.49136005642412134, "grad_norm": 0.3667828440666199, "learning_rate": 2.5441401199012576e-05, "loss": 0.3534, "step": 125400 }, { "epoch": 0.4921437247756749, "grad_norm": 0.4125637710094452, "learning_rate": 2.54022177814349e-05, "loss": 0.3501, "step": 125600 }, { "epoch": 0.49292739312722855, "grad_norm": 0.519904375076294, "learning_rate": 2.5363230280945104e-05, "loss": 0.3816, "step": 125800 }, { "epoch": 0.4937110614787822, "grad_norm": 0.12069497257471085, "learning_rate": 2.5324046863367425e-05, "loss": 0.3729, "step": 126000 }, { "epoch": 0.4944947298303358, "grad_norm": 0.22725090384483337, "learning_rate": 2.528486344578974e-05, "loss": 0.3878, "step": 126200 }, { "epoch": 0.4952783981818894, "grad_norm": 0.26038530468940735, "learning_rate": 2.524568002821206e-05, "loss": 0.356, "step": 126400 }, { "epoch": 0.49606206653344304, "grad_norm": 0.26929038763046265, "learning_rate": 2.520649661063438e-05, "loss": 0.3836, "step": 126600 }, { "epoch": 0.49684573488499667, "grad_norm": 0.6038582921028137, "learning_rate": 2.5167313193056702e-05, "loss": 0.352, "step": 126800 }, { "epoch": 0.4976294032365503, "grad_norm": 0.2038988471031189, "learning_rate": 2.5128325692566907e-05, "loss": 0.3889, "step": 127000 }, { "epoch": 0.49841307158810394, "grad_norm": 0.40812844038009644, "learning_rate": 2.5089142274989224e-05, "loss": 0.3554, "step": 127200 }, { "epoch": 0.4991967399396575, "grad_norm": 0.3789123594760895, "learning_rate": 2.5049958857411545e-05, "loss": 0.3474, "step": 127400 }, { "epoch": 0.49998040829121115, "grad_norm": 0.12067434936761856, "learning_rate": 2.501077543983386e-05, "loss": 0.3832, "step": 127600 }, { "epoch": 0.5007640766427648, "grad_norm": 0.3177630305290222, "learning_rate": 2.4971592022256182e-05, "loss": 0.3327, "step": 127800 }, { "epoch": 0.5015477449943184, "grad_norm": 0.48288750648498535, "learning_rate": 2.4932408604678502e-05, "loss": 0.3577, "step": 128000 }, { "epoch": 0.5023314133458721, "grad_norm": 0.5807780027389526, "learning_rate": 2.489322518710082e-05, "loss": 0.3329, "step": 128200 }, { "epoch": 0.5031150816974257, "grad_norm": 0.28707030415534973, "learning_rate": 2.485404176952314e-05, "loss": 0.3371, "step": 128400 }, { "epoch": 0.5038987500489793, "grad_norm": 0.6914198994636536, "learning_rate": 2.481485835194546e-05, "loss": 0.3817, "step": 128600 }, { "epoch": 0.5046824184005329, "grad_norm": 0.5673906803131104, "learning_rate": 2.4775674934367776e-05, "loss": 0.3884, "step": 128800 }, { "epoch": 0.5054660867520865, "grad_norm": 0.15600450336933136, "learning_rate": 2.4736491516790093e-05, "loss": 0.3744, "step": 129000 }, { "epoch": 0.5062497551036401, "grad_norm": 0.398987740278244, "learning_rate": 2.4697308099212417e-05, "loss": 0.3811, "step": 129200 }, { "epoch": 0.5070334234551938, "grad_norm": 0.20268848538398743, "learning_rate": 2.4658320598722622e-05, "loss": 0.3549, "step": 129400 }, { "epoch": 0.5078170918067474, "grad_norm": 0.24596478044986725, "learning_rate": 2.4619137181144942e-05, "loss": 0.3475, "step": 129600 }, { "epoch": 0.508600760158301, "grad_norm": 0.24833951890468597, "learning_rate": 2.457995376356726e-05, "loss": 0.3727, "step": 129800 }, { "epoch": 0.5093844285098547, "grad_norm": 0.22542235255241394, "learning_rate": 2.454077034598958e-05, "loss": 0.411, "step": 130000 }, { "epoch": 0.5101680968614083, "grad_norm": 0.3879716396331787, "learning_rate": 2.4501586928411896e-05, "loss": 0.3871, "step": 130200 }, { "epoch": 0.5109517652129619, "grad_norm": 0.6856550574302673, "learning_rate": 2.4462403510834217e-05, "loss": 0.3698, "step": 130400 }, { "epoch": 0.5117354335645155, "grad_norm": 0.14749617874622345, "learning_rate": 2.4423220093256537e-05, "loss": 0.3669, "step": 130600 }, { "epoch": 0.5125191019160691, "grad_norm": 0.5325919389724731, "learning_rate": 2.4384036675678854e-05, "loss": 0.339, "step": 130800 }, { "epoch": 0.5133027702676227, "grad_norm": 0.541504979133606, "learning_rate": 2.434485325810117e-05, "loss": 0.3807, "step": 131000 }, { "epoch": 0.5140864386191764, "grad_norm": 0.6320985555648804, "learning_rate": 2.4305669840523494e-05, "loss": 0.3451, "step": 131200 }, { "epoch": 0.51487010697073, "grad_norm": 0.7686489224433899, "learning_rate": 2.426648642294581e-05, "loss": 0.3896, "step": 131400 }, { "epoch": 0.5156537753222836, "grad_norm": 0.4429371953010559, "learning_rate": 2.422749892245602e-05, "loss": 0.3271, "step": 131600 }, { "epoch": 0.5164374436738373, "grad_norm": 0.379732221364975, "learning_rate": 2.4188315504878336e-05, "loss": 0.4014, "step": 131800 }, { "epoch": 0.5172211120253909, "grad_norm": 0.375230997800827, "learning_rate": 2.4149132087300653e-05, "loss": 0.3476, "step": 132000 }, { "epoch": 0.5180047803769445, "grad_norm": 0.6194652318954468, "learning_rate": 2.4109948669722974e-05, "loss": 0.3376, "step": 132200 }, { "epoch": 0.518788448728498, "grad_norm": 0.36911606788635254, "learning_rate": 2.4070765252145294e-05, "loss": 0.3797, "step": 132400 }, { "epoch": 0.5195721170800517, "grad_norm": 0.3317911922931671, "learning_rate": 2.4031581834567614e-05, "loss": 0.3573, "step": 132600 }, { "epoch": 0.5203557854316053, "grad_norm": 0.5983933806419373, "learning_rate": 2.399239841698993e-05, "loss": 0.3661, "step": 132800 }, { "epoch": 0.521139453783159, "grad_norm": 0.2932760715484619, "learning_rate": 2.3953214999412248e-05, "loss": 0.3946, "step": 133000 }, { "epoch": 0.5219231221347126, "grad_norm": 0.5710874795913696, "learning_rate": 2.391403158183457e-05, "loss": 0.4224, "step": 133200 }, { "epoch": 0.5227067904862662, "grad_norm": 0.3369152545928955, "learning_rate": 2.387484816425689e-05, "loss": 0.3943, "step": 133400 }, { "epoch": 0.5234904588378199, "grad_norm": 0.5752618908882141, "learning_rate": 2.3835664746679205e-05, "loss": 0.3313, "step": 133600 }, { "epoch": 0.5242741271893735, "grad_norm": 0.2937524616718292, "learning_rate": 2.3796481329101526e-05, "loss": 0.3886, "step": 133800 }, { "epoch": 0.5250577955409271, "grad_norm": 0.6479809284210205, "learning_rate": 2.3757297911523843e-05, "loss": 0.3579, "step": 134000 }, { "epoch": 0.5258414638924807, "grad_norm": 0.3808928430080414, "learning_rate": 2.3718114493946163e-05, "loss": 0.4038, "step": 134200 }, { "epoch": 0.5266251322440343, "grad_norm": 0.35928645730018616, "learning_rate": 2.367912699345637e-05, "loss": 0.3535, "step": 134400 }, { "epoch": 0.5274088005955879, "grad_norm": 0.4035384953022003, "learning_rate": 2.3639943575878688e-05, "loss": 0.3287, "step": 134600 }, { "epoch": 0.5281924689471416, "grad_norm": 0.17447680234909058, "learning_rate": 2.360076015830101e-05, "loss": 0.3703, "step": 134800 }, { "epoch": 0.5289761372986952, "grad_norm": 0.34512630105018616, "learning_rate": 2.3561576740723325e-05, "loss": 0.3781, "step": 135000 }, { "epoch": 0.5297598056502488, "grad_norm": 0.39121803641319275, "learning_rate": 2.352239332314565e-05, "loss": 0.3089, "step": 135200 }, { "epoch": 0.5305434740018025, "grad_norm": 1.4505184888839722, "learning_rate": 2.3483209905567966e-05, "loss": 0.3507, "step": 135400 }, { "epoch": 0.5313271423533561, "grad_norm": 0.37115374207496643, "learning_rate": 2.3444026487990283e-05, "loss": 0.3356, "step": 135600 }, { "epoch": 0.5321108107049097, "grad_norm": 0.37171250581741333, "learning_rate": 2.3404843070412603e-05, "loss": 0.3828, "step": 135800 }, { "epoch": 0.5328944790564633, "grad_norm": 0.17461098730564117, "learning_rate": 2.336565965283492e-05, "loss": 0.355, "step": 136000 }, { "epoch": 0.5336781474080169, "grad_norm": 0.40963634848594666, "learning_rate": 2.332647623525724e-05, "loss": 0.3905, "step": 136200 }, { "epoch": 0.5344618157595705, "grad_norm": 0.3249402642250061, "learning_rate": 2.328729281767956e-05, "loss": 0.3507, "step": 136400 }, { "epoch": 0.5352454841111242, "grad_norm": 0.6207261085510254, "learning_rate": 2.3248109400101877e-05, "loss": 0.387, "step": 136600 }, { "epoch": 0.5360291524626778, "grad_norm": 0.31390905380249023, "learning_rate": 2.3208925982524198e-05, "loss": 0.4063, "step": 136800 }, { "epoch": 0.5368128208142314, "grad_norm": 0.3568958640098572, "learning_rate": 2.3169938482034403e-05, "loss": 0.3361, "step": 137000 }, { "epoch": 0.5375964891657851, "grad_norm": 0.26150256395339966, "learning_rate": 2.3130755064456723e-05, "loss": 0.371, "step": 137200 }, { "epoch": 0.5383801575173387, "grad_norm": 0.46840909123420715, "learning_rate": 2.3091571646879043e-05, "loss": 0.3709, "step": 137400 }, { "epoch": 0.5391638258688923, "grad_norm": 0.15887069702148438, "learning_rate": 2.305238822930136e-05, "loss": 0.3796, "step": 137600 }, { "epoch": 0.5399474942204459, "grad_norm": 0.5783007740974426, "learning_rate": 2.301320481172368e-05, "loss": 0.3647, "step": 137800 }, { "epoch": 0.5407311625719995, "grad_norm": 0.42835474014282227, "learning_rate": 2.2974021394145997e-05, "loss": 0.3786, "step": 138000 }, { "epoch": 0.5415148309235531, "grad_norm": 0.13530884683132172, "learning_rate": 2.2934837976568317e-05, "loss": 0.3663, "step": 138200 }, { "epoch": 0.5422984992751068, "grad_norm": 0.6681059002876282, "learning_rate": 2.2895654558990638e-05, "loss": 0.3603, "step": 138400 }, { "epoch": 0.5430821676266604, "grad_norm": 0.44055140018463135, "learning_rate": 2.2856471141412955e-05, "loss": 0.3475, "step": 138600 }, { "epoch": 0.543865835978214, "grad_norm": 0.5066186189651489, "learning_rate": 2.2817287723835275e-05, "loss": 0.3722, "step": 138800 }, { "epoch": 0.5446495043297677, "grad_norm": 0.5097247958183289, "learning_rate": 2.2778104306257592e-05, "loss": 0.4018, "step": 139000 }, { "epoch": 0.5454331726813213, "grad_norm": 0.3178964853286743, "learning_rate": 2.2738920888679912e-05, "loss": 0.3567, "step": 139200 }, { "epoch": 0.5462168410328749, "grad_norm": 0.49194690585136414, "learning_rate": 2.2699737471102232e-05, "loss": 0.3509, "step": 139400 }, { "epoch": 0.5470005093844285, "grad_norm": 0.13328629732131958, "learning_rate": 2.266055405352455e-05, "loss": 0.3793, "step": 139600 }, { "epoch": 0.5477841777359821, "grad_norm": 0.36072656512260437, "learning_rate": 2.2621370635946866e-05, "loss": 0.3874, "step": 139800 }, { "epoch": 0.5485678460875357, "grad_norm": 0.49009180068969727, "learning_rate": 2.258218721836919e-05, "loss": 0.3512, "step": 140000 }, { "epoch": 0.5493515144390894, "grad_norm": 0.44969332218170166, "learning_rate": 2.2543003800791507e-05, "loss": 0.3814, "step": 140200 }, { "epoch": 0.550135182790643, "grad_norm": 0.48316630721092224, "learning_rate": 2.2504016300301715e-05, "loss": 0.3409, "step": 140400 }, { "epoch": 0.5509188511421966, "grad_norm": 0.17039939761161804, "learning_rate": 2.2464832882724032e-05, "loss": 0.3516, "step": 140600 }, { "epoch": 0.5517025194937503, "grad_norm": 0.32851117849349976, "learning_rate": 2.242564946514635e-05, "loss": 0.3733, "step": 140800 }, { "epoch": 0.5524861878453039, "grad_norm": 0.29249897599220276, "learning_rate": 2.238646604756867e-05, "loss": 0.3381, "step": 141000 }, { "epoch": 0.5532698561968575, "grad_norm": 1.3945118188858032, "learning_rate": 2.234728262999099e-05, "loss": 0.3429, "step": 141200 }, { "epoch": 0.5540535245484111, "grad_norm": 0.47640150785446167, "learning_rate": 2.230809921241331e-05, "loss": 0.3683, "step": 141400 }, { "epoch": 0.5548371928999647, "grad_norm": 0.3301154673099518, "learning_rate": 2.2268915794835627e-05, "loss": 0.3437, "step": 141600 }, { "epoch": 0.5556208612515183, "grad_norm": 0.22982585430145264, "learning_rate": 2.2229732377257944e-05, "loss": 0.3696, "step": 141800 }, { "epoch": 0.556404529603072, "grad_norm": 0.22164678573608398, "learning_rate": 2.2190548959680267e-05, "loss": 0.3299, "step": 142000 }, { "epoch": 0.5571881979546256, "grad_norm": 0.36188364028930664, "learning_rate": 2.2151365542102584e-05, "loss": 0.3586, "step": 142200 }, { "epoch": 0.5579718663061792, "grad_norm": 0.2797524631023407, "learning_rate": 2.21121821245249e-05, "loss": 0.3747, "step": 142400 }, { "epoch": 0.5587555346577329, "grad_norm": 0.4067152738571167, "learning_rate": 2.207299870694722e-05, "loss": 0.3421, "step": 142600 }, { "epoch": 0.5595392030092865, "grad_norm": 0.5025527477264404, "learning_rate": 2.2034011206457426e-05, "loss": 0.3508, "step": 142800 }, { "epoch": 0.5603228713608401, "grad_norm": 0.5966392755508423, "learning_rate": 2.1994827788879747e-05, "loss": 0.3717, "step": 143000 }, { "epoch": 0.5611065397123938, "grad_norm": 0.3739721179008484, "learning_rate": 2.1955644371302067e-05, "loss": 0.3705, "step": 143200 }, { "epoch": 0.5618902080639473, "grad_norm": 0.34258928894996643, "learning_rate": 2.1916460953724384e-05, "loss": 0.3527, "step": 143400 }, { "epoch": 0.5626738764155009, "grad_norm": 0.43817439675331116, "learning_rate": 2.1877277536146704e-05, "loss": 0.3607, "step": 143600 }, { "epoch": 0.5634575447670546, "grad_norm": 0.34139204025268555, "learning_rate": 2.183809411856902e-05, "loss": 0.3255, "step": 143800 }, { "epoch": 0.5642412131186082, "grad_norm": 0.4315531849861145, "learning_rate": 2.179891070099134e-05, "loss": 0.3339, "step": 144000 }, { "epoch": 0.5650248814701618, "grad_norm": 0.531743586063385, "learning_rate": 2.175972728341366e-05, "loss": 0.3295, "step": 144200 }, { "epoch": 0.5658085498217155, "grad_norm": 0.15134507417678833, "learning_rate": 2.172054386583598e-05, "loss": 0.3786, "step": 144400 }, { "epoch": 0.5665922181732691, "grad_norm": 0.13769565522670746, "learning_rate": 2.16813604482583e-05, "loss": 0.3968, "step": 144600 }, { "epoch": 0.5673758865248227, "grad_norm": 0.09113066643476486, "learning_rate": 2.1642177030680616e-05, "loss": 0.3741, "step": 144800 }, { "epoch": 0.5681595548763764, "grad_norm": 0.4379032254219055, "learning_rate": 2.1602993613102936e-05, "loss": 0.3511, "step": 145000 }, { "epoch": 0.5689432232279299, "grad_norm": 0.4613747298717499, "learning_rate": 2.1563810195525256e-05, "loss": 0.3208, "step": 145200 }, { "epoch": 0.5697268915794835, "grad_norm": 0.27241915464401245, "learning_rate": 2.1524626777947573e-05, "loss": 0.3796, "step": 145400 }, { "epoch": 0.5705105599310372, "grad_norm": 0.25275343656539917, "learning_rate": 2.1485443360369893e-05, "loss": 0.3376, "step": 145600 }, { "epoch": 0.5712942282825908, "grad_norm": 0.46639978885650635, "learning_rate": 2.1446455859880098e-05, "loss": 0.3244, "step": 145800 }, { "epoch": 0.5720778966341444, "grad_norm": 0.5711578726768494, "learning_rate": 2.140727244230242e-05, "loss": 0.3556, "step": 146000 }, { "epoch": 0.5728615649856981, "grad_norm": 0.27420786023139954, "learning_rate": 2.136808902472474e-05, "loss": 0.3714, "step": 146200 }, { "epoch": 0.5736452333372517, "grad_norm": 0.4411211907863617, "learning_rate": 2.1328905607147056e-05, "loss": 0.3712, "step": 146400 }, { "epoch": 0.5744289016888053, "grad_norm": 0.41663551330566406, "learning_rate": 2.1289722189569376e-05, "loss": 0.3589, "step": 146600 }, { "epoch": 0.575212570040359, "grad_norm": 0.3276519179344177, "learning_rate": 2.1250538771991693e-05, "loss": 0.35, "step": 146800 }, { "epoch": 0.5759962383919125, "grad_norm": 0.24392451345920563, "learning_rate": 2.1211355354414013e-05, "loss": 0.3789, "step": 147000 }, { "epoch": 0.5767799067434661, "grad_norm": 0.34200170636177063, "learning_rate": 2.1172171936836333e-05, "loss": 0.3619, "step": 147200 }, { "epoch": 0.5775635750950198, "grad_norm": 0.10846410691738129, "learning_rate": 2.113298851925865e-05, "loss": 0.3601, "step": 147400 }, { "epoch": 0.5783472434465734, "grad_norm": 0.9371908903121948, "learning_rate": 2.109380510168097e-05, "loss": 0.3304, "step": 147600 }, { "epoch": 0.579130911798127, "grad_norm": 0.6960780024528503, "learning_rate": 2.1054621684103287e-05, "loss": 0.3688, "step": 147800 }, { "epoch": 0.5799145801496807, "grad_norm": 0.2847283184528351, "learning_rate": 2.1015438266525608e-05, "loss": 0.3544, "step": 148000 }, { "epoch": 0.5806982485012343, "grad_norm": 0.4167712926864624, "learning_rate": 2.0976254848947928e-05, "loss": 0.3536, "step": 148200 }, { "epoch": 0.5814819168527879, "grad_norm": 0.3233075737953186, "learning_rate": 2.0937267348458133e-05, "loss": 0.3928, "step": 148400 }, { "epoch": 0.5822655852043416, "grad_norm": 0.20192258059978485, "learning_rate": 2.0898083930880453e-05, "loss": 0.3417, "step": 148600 }, { "epoch": 0.5830492535558951, "grad_norm": 0.28048059344291687, "learning_rate": 2.085890051330277e-05, "loss": 0.3899, "step": 148800 }, { "epoch": 0.5838329219074487, "grad_norm": 0.2952977418899536, "learning_rate": 2.081971709572509e-05, "loss": 0.3277, "step": 149000 }, { "epoch": 0.5846165902590024, "grad_norm": 0.4648447334766388, "learning_rate": 2.078053367814741e-05, "loss": 0.3828, "step": 149200 }, { "epoch": 0.585400258610556, "grad_norm": 0.21418029069900513, "learning_rate": 2.0741350260569728e-05, "loss": 0.3565, "step": 149400 }, { "epoch": 0.5861839269621096, "grad_norm": 0.5714257955551147, "learning_rate": 2.0702166842992048e-05, "loss": 0.3261, "step": 149600 }, { "epoch": 0.5869675953136633, "grad_norm": 0.20218569040298462, "learning_rate": 2.0662983425414365e-05, "loss": 0.37, "step": 149800 }, { "epoch": 0.5877512636652169, "grad_norm": 0.2833838164806366, "learning_rate": 2.0623800007836685e-05, "loss": 0.3215, "step": 150000 }, { "epoch": 0.5877512636652169, "eval_loss": 0.37760129570961, "eval_runtime": 194.768, "eval_samples_per_second": 13.236, "eval_steps_per_second": 13.236, "step": 150000 }, { "epoch": 0.5885349320167705, "grad_norm": 0.5451735854148865, "learning_rate": 2.0584616590259005e-05, "loss": 0.343, "step": 150200 }, { "epoch": 0.5893186003683242, "grad_norm": 0.24212311208248138, "learning_rate": 2.0545433172681322e-05, "loss": 0.3624, "step": 150400 }, { "epoch": 0.5901022687198777, "grad_norm": 0.45798975229263306, "learning_rate": 2.050624975510364e-05, "loss": 0.3747, "step": 150600 }, { "epoch": 0.5908859370714313, "grad_norm": 0.16003598272800446, "learning_rate": 2.0467066337525963e-05, "loss": 0.3374, "step": 150800 }, { "epoch": 0.591669605422985, "grad_norm": 0.18254567682743073, "learning_rate": 2.042788291994828e-05, "loss": 0.4032, "step": 151000 }, { "epoch": 0.5924532737745386, "grad_norm": 0.34588566422462463, "learning_rate": 2.0388699502370597e-05, "loss": 0.3869, "step": 151200 }, { "epoch": 0.5932369421260922, "grad_norm": 0.7721492648124695, "learning_rate": 2.0349516084792917e-05, "loss": 0.3519, "step": 151400 }, { "epoch": 0.5940206104776459, "grad_norm": 0.39813682436943054, "learning_rate": 2.0310332667215234e-05, "loss": 0.3363, "step": 151600 }, { "epoch": 0.5948042788291995, "grad_norm": 0.35051488876342773, "learning_rate": 2.0271149249637554e-05, "loss": 0.3686, "step": 151800 }, { "epoch": 0.5955879471807531, "grad_norm": 0.3599979281425476, "learning_rate": 2.0231965832059874e-05, "loss": 0.4051, "step": 152000 }, { "epoch": 0.5963716155323068, "grad_norm": 0.6114223003387451, "learning_rate": 2.019278241448219e-05, "loss": 0.3645, "step": 152200 }, { "epoch": 0.5971552838838603, "grad_norm": 0.4051482379436493, "learning_rate": 2.015359899690451e-05, "loss": 0.341, "step": 152400 }, { "epoch": 0.5979389522354139, "grad_norm": 1.0628719329833984, "learning_rate": 2.011441557932683e-05, "loss": 0.3318, "step": 152600 }, { "epoch": 0.5987226205869676, "grad_norm": 0.24264752864837646, "learning_rate": 2.007523216174915e-05, "loss": 0.3597, "step": 152800 }, { "epoch": 0.5995062889385212, "grad_norm": 0.5559861063957214, "learning_rate": 2.003604874417147e-05, "loss": 0.3434, "step": 153000 }, { "epoch": 0.6002899572900748, "grad_norm": 0.28309759497642517, "learning_rate": 1.9996865326593786e-05, "loss": 0.3344, "step": 153200 }, { "epoch": 0.6010736256416285, "grad_norm": 0.2588461637496948, "learning_rate": 1.9957681909016103e-05, "loss": 0.3923, "step": 153400 }, { "epoch": 0.6018572939931821, "grad_norm": 0.4383498430252075, "learning_rate": 1.9918498491438426e-05, "loss": 0.3595, "step": 153600 }, { "epoch": 0.6026409623447357, "grad_norm": 0.48799148201942444, "learning_rate": 1.9879315073860743e-05, "loss": 0.367, "step": 153800 }, { "epoch": 0.6034246306962894, "grad_norm": 0.4515002369880676, "learning_rate": 1.984013165628306e-05, "loss": 0.3464, "step": 154000 }, { "epoch": 0.6042082990478429, "grad_norm": 0.5179967880249023, "learning_rate": 1.980094823870538e-05, "loss": 0.3504, "step": 154200 }, { "epoch": 0.6049919673993965, "grad_norm": 0.11176899820566177, "learning_rate": 1.976196073821559e-05, "loss": 0.3539, "step": 154400 }, { "epoch": 0.6057756357509502, "grad_norm": 0.38047921657562256, "learning_rate": 1.9722777320637906e-05, "loss": 0.3932, "step": 154600 }, { "epoch": 0.6065593041025038, "grad_norm": 0.7975618243217468, "learning_rate": 1.9683593903060226e-05, "loss": 0.3088, "step": 154800 }, { "epoch": 0.6073429724540574, "grad_norm": 0.24565903842449188, "learning_rate": 1.9644606402570434e-05, "loss": 0.3632, "step": 155000 }, { "epoch": 0.6081266408056111, "grad_norm": 0.3727683126926422, "learning_rate": 1.960542298499275e-05, "loss": 0.3594, "step": 155200 }, { "epoch": 0.6089103091571647, "grad_norm": 0.19599901139736176, "learning_rate": 1.956623956741507e-05, "loss": 0.3675, "step": 155400 }, { "epoch": 0.6096939775087183, "grad_norm": 0.8321279287338257, "learning_rate": 1.952705614983739e-05, "loss": 0.3272, "step": 155600 }, { "epoch": 0.610477645860272, "grad_norm": 0.5361796021461487, "learning_rate": 1.948787273225971e-05, "loss": 0.3346, "step": 155800 }, { "epoch": 0.6112613142118255, "grad_norm": 0.48975545167922974, "learning_rate": 1.944868931468203e-05, "loss": 0.3677, "step": 156000 }, { "epoch": 0.6120449825633791, "grad_norm": 0.15650728344917297, "learning_rate": 1.9409505897104346e-05, "loss": 0.3098, "step": 156200 }, { "epoch": 0.6128286509149328, "grad_norm": 0.3466033935546875, "learning_rate": 1.9370322479526666e-05, "loss": 0.3553, "step": 156400 }, { "epoch": 0.6136123192664864, "grad_norm": 0.6844311356544495, "learning_rate": 1.9331139061948983e-05, "loss": 0.3756, "step": 156600 }, { "epoch": 0.61439598761804, "grad_norm": 0.3475654423236847, "learning_rate": 1.9291955644371303e-05, "loss": 0.3922, "step": 156800 }, { "epoch": 0.6151796559695937, "grad_norm": 0.4712473750114441, "learning_rate": 1.9252772226793624e-05, "loss": 0.3908, "step": 157000 }, { "epoch": 0.6159633243211473, "grad_norm": 0.34038087725639343, "learning_rate": 1.921358880921594e-05, "loss": 0.346, "step": 157200 }, { "epoch": 0.6167469926727009, "grad_norm": 0.0849527046084404, "learning_rate": 1.9174405391638257e-05, "loss": 0.3178, "step": 157400 }, { "epoch": 0.6175306610242546, "grad_norm": 0.5310103297233582, "learning_rate": 1.913522197406058e-05, "loss": 0.3783, "step": 157600 }, { "epoch": 0.6183143293758081, "grad_norm": 0.39207419753074646, "learning_rate": 1.9096038556482898e-05, "loss": 0.3475, "step": 157800 }, { "epoch": 0.6190979977273617, "grad_norm": 0.757610559463501, "learning_rate": 1.9056855138905215e-05, "loss": 0.3521, "step": 158000 }, { "epoch": 0.6198816660789154, "grad_norm": 0.10942260175943375, "learning_rate": 1.9017671721327535e-05, "loss": 0.3328, "step": 158200 }, { "epoch": 0.620665334430469, "grad_norm": 0.3848426043987274, "learning_rate": 1.8978488303749852e-05, "loss": 0.3473, "step": 158400 }, { "epoch": 0.6214490027820226, "grad_norm": 0.18226224184036255, "learning_rate": 1.8939304886172172e-05, "loss": 0.3375, "step": 158600 }, { "epoch": 0.6222326711335763, "grad_norm": 0.509005069732666, "learning_rate": 1.8900121468594493e-05, "loss": 0.3489, "step": 158800 }, { "epoch": 0.6230163394851299, "grad_norm": 0.4185326099395752, "learning_rate": 1.886093805101681e-05, "loss": 0.3681, "step": 159000 }, { "epoch": 0.6238000078366835, "grad_norm": 0.4330194592475891, "learning_rate": 1.882175463343913e-05, "loss": 0.3436, "step": 159200 }, { "epoch": 0.6245836761882372, "grad_norm": 0.5911514759063721, "learning_rate": 1.8782571215861447e-05, "loss": 0.3621, "step": 159400 }, { "epoch": 0.6253673445397908, "grad_norm": 0.2262437492609024, "learning_rate": 1.8743583715371655e-05, "loss": 0.3589, "step": 159600 }, { "epoch": 0.6261510128913443, "grad_norm": 0.3778083920478821, "learning_rate": 1.8704400297793975e-05, "loss": 0.371, "step": 159800 }, { "epoch": 0.626934681242898, "grad_norm": 0.4313370883464813, "learning_rate": 1.8665216880216292e-05, "loss": 0.3688, "step": 160000 }, { "epoch": 0.6277183495944516, "grad_norm": 0.4480289816856384, "learning_rate": 1.86262293797265e-05, "loss": 0.3558, "step": 160200 }, { "epoch": 0.6285020179460052, "grad_norm": 0.2418823391199112, "learning_rate": 1.8587045962148817e-05, "loss": 0.369, "step": 160400 }, { "epoch": 0.6292856862975589, "grad_norm": 0.16812770068645477, "learning_rate": 1.8547862544571138e-05, "loss": 0.3561, "step": 160600 }, { "epoch": 0.6300693546491125, "grad_norm": 0.3258691728115082, "learning_rate": 1.8508679126993458e-05, "loss": 0.3284, "step": 160800 }, { "epoch": 0.6308530230006661, "grad_norm": 0.4535875916481018, "learning_rate": 1.846949570941578e-05, "loss": 0.3492, "step": 161000 }, { "epoch": 0.6316366913522198, "grad_norm": 0.40306776762008667, "learning_rate": 1.8430312291838095e-05, "loss": 0.3749, "step": 161200 }, { "epoch": 0.6324203597037734, "grad_norm": 0.37273484468460083, "learning_rate": 1.8391128874260412e-05, "loss": 0.3563, "step": 161400 }, { "epoch": 0.6332040280553269, "grad_norm": 0.4654099643230438, "learning_rate": 1.8351945456682732e-05, "loss": 0.3416, "step": 161600 }, { "epoch": 0.6339876964068806, "grad_norm": 0.3393149673938751, "learning_rate": 1.8312762039105053e-05, "loss": 0.3583, "step": 161800 }, { "epoch": 0.6347713647584342, "grad_norm": 0.5733456611633301, "learning_rate": 1.827357862152737e-05, "loss": 0.3742, "step": 162000 }, { "epoch": 0.6355550331099878, "grad_norm": 0.4906434416770935, "learning_rate": 1.823439520394969e-05, "loss": 0.3885, "step": 162200 }, { "epoch": 0.6363387014615415, "grad_norm": 0.35643014311790466, "learning_rate": 1.8195211786372007e-05, "loss": 0.3751, "step": 162400 }, { "epoch": 0.6371223698130951, "grad_norm": 0.16521279513835907, "learning_rate": 1.8156028368794327e-05, "loss": 0.3643, "step": 162600 }, { "epoch": 0.6379060381646487, "grad_norm": 0.3319249749183655, "learning_rate": 1.8116844951216647e-05, "loss": 0.4232, "step": 162800 }, { "epoch": 0.6386897065162024, "grad_norm": 0.42640459537506104, "learning_rate": 1.8077661533638964e-05, "loss": 0.3618, "step": 163000 }, { "epoch": 0.639473374867756, "grad_norm": 0.3406376242637634, "learning_rate": 1.8038478116061285e-05, "loss": 0.38, "step": 163200 }, { "epoch": 0.6402570432193095, "grad_norm": 0.769278347492218, "learning_rate": 1.79992946984836e-05, "loss": 0.3545, "step": 163400 }, { "epoch": 0.6410407115708632, "grad_norm": 0.31369972229003906, "learning_rate": 1.796011128090592e-05, "loss": 0.3255, "step": 163600 }, { "epoch": 0.6418243799224168, "grad_norm": 0.29629456996917725, "learning_rate": 1.7920927863328242e-05, "loss": 0.3529, "step": 163800 }, { "epoch": 0.6426080482739704, "grad_norm": 0.15645676851272583, "learning_rate": 1.788174444575056e-05, "loss": 0.3279, "step": 164000 }, { "epoch": 0.6433917166255241, "grad_norm": 0.35391145944595337, "learning_rate": 1.7842561028172876e-05, "loss": 0.3339, "step": 164200 }, { "epoch": 0.6441753849770777, "grad_norm": 0.1588476449251175, "learning_rate": 1.7803573527683084e-05, "loss": 0.3603, "step": 164400 }, { "epoch": 0.6449590533286313, "grad_norm": 0.1144072562456131, "learning_rate": 1.7764390110105404e-05, "loss": 0.4077, "step": 164600 }, { "epoch": 0.645742721680185, "grad_norm": 0.22551771998405457, "learning_rate": 1.7725206692527725e-05, "loss": 0.4069, "step": 164800 }, { "epoch": 0.6465263900317386, "grad_norm": 0.10134784877300262, "learning_rate": 1.768602327495004e-05, "loss": 0.3394, "step": 165000 }, { "epoch": 0.6473100583832921, "grad_norm": 0.21326112747192383, "learning_rate": 1.7646839857372362e-05, "loss": 0.3608, "step": 165200 }, { "epoch": 0.6480937267348458, "grad_norm": 0.2849804759025574, "learning_rate": 1.760765643979468e-05, "loss": 0.3338, "step": 165400 }, { "epoch": 0.6488773950863994, "grad_norm": 0.2652018964290619, "learning_rate": 1.7568668939304887e-05, "loss": 0.3214, "step": 165600 }, { "epoch": 0.649661063437953, "grad_norm": 0.21313534677028656, "learning_rate": 1.7529485521727207e-05, "loss": 0.3919, "step": 165800 }, { "epoch": 0.6504447317895067, "grad_norm": 0.35072752833366394, "learning_rate": 1.7490302104149524e-05, "loss": 0.3319, "step": 166000 }, { "epoch": 0.6512284001410603, "grad_norm": 0.9032460451126099, "learning_rate": 1.7451118686571845e-05, "loss": 0.3514, "step": 166200 }, { "epoch": 0.652012068492614, "grad_norm": 0.27799591422080994, "learning_rate": 1.741193526899416e-05, "loss": 0.3909, "step": 166400 }, { "epoch": 0.6527957368441676, "grad_norm": 0.33158427476882935, "learning_rate": 1.7372751851416482e-05, "loss": 0.3592, "step": 166600 }, { "epoch": 0.6535794051957212, "grad_norm": 0.27624791860580444, "learning_rate": 1.7333568433838802e-05, "loss": 0.3967, "step": 166800 }, { "epoch": 0.6543630735472747, "grad_norm": 0.3143371045589447, "learning_rate": 1.729438501626112e-05, "loss": 0.4242, "step": 167000 }, { "epoch": 0.6551467418988284, "grad_norm": 0.44777911901474, "learning_rate": 1.725520159868344e-05, "loss": 0.3542, "step": 167200 }, { "epoch": 0.655930410250382, "grad_norm": 0.1704530268907547, "learning_rate": 1.7216018181105756e-05, "loss": 0.3924, "step": 167400 }, { "epoch": 0.6567140786019356, "grad_norm": 0.1803852766752243, "learning_rate": 1.7176834763528076e-05, "loss": 0.3525, "step": 167600 }, { "epoch": 0.6574977469534893, "grad_norm": 0.4806746542453766, "learning_rate": 1.7137651345950397e-05, "loss": 0.3775, "step": 167800 }, { "epoch": 0.6582814153050429, "grad_norm": 0.2006748467683792, "learning_rate": 1.7098467928372714e-05, "loss": 0.3625, "step": 168000 }, { "epoch": 0.6590650836565966, "grad_norm": 0.7129150629043579, "learning_rate": 1.705928451079503e-05, "loss": 0.3601, "step": 168200 }, { "epoch": 0.6598487520081502, "grad_norm": 0.23467794060707092, "learning_rate": 1.702010109321735e-05, "loss": 0.3878, "step": 168400 }, { "epoch": 0.6606324203597038, "grad_norm": 0.7448341846466064, "learning_rate": 1.698091767563967e-05, "loss": 0.3302, "step": 168600 }, { "epoch": 0.6614160887112573, "grad_norm": 0.3080410659313202, "learning_rate": 1.6941734258061988e-05, "loss": 0.3532, "step": 168800 }, { "epoch": 0.662199757062811, "grad_norm": 0.41210758686065674, "learning_rate": 1.6902550840484308e-05, "loss": 0.3792, "step": 169000 }, { "epoch": 0.6629834254143646, "grad_norm": 0.18141373991966248, "learning_rate": 1.6863367422906625e-05, "loss": 0.3536, "step": 169200 }, { "epoch": 0.6637670937659182, "grad_norm": 0.31794726848602295, "learning_rate": 1.6824184005328945e-05, "loss": 0.3269, "step": 169400 }, { "epoch": 0.6645507621174719, "grad_norm": 0.34971383213996887, "learning_rate": 1.6785196504839154e-05, "loss": 0.3576, "step": 169600 }, { "epoch": 0.6653344304690255, "grad_norm": 0.6500710248947144, "learning_rate": 1.6746013087261474e-05, "loss": 0.3559, "step": 169800 }, { "epoch": 0.6661180988205792, "grad_norm": 0.519538938999176, "learning_rate": 1.670682966968379e-05, "loss": 0.3484, "step": 170000 }, { "epoch": 0.6669017671721328, "grad_norm": 0.5022698044776917, "learning_rate": 1.6667646252106108e-05, "loss": 0.3249, "step": 170200 }, { "epoch": 0.6676854355236864, "grad_norm": 0.4107462763786316, "learning_rate": 1.6628462834528428e-05, "loss": 0.3771, "step": 170400 }, { "epoch": 0.66846910387524, "grad_norm": 2.687856674194336, "learning_rate": 1.658927941695075e-05, "loss": 0.3688, "step": 170600 }, { "epoch": 0.6692527722267936, "grad_norm": 0.5536699891090393, "learning_rate": 1.6550095999373065e-05, "loss": 0.3749, "step": 170800 }, { "epoch": 0.6700364405783472, "grad_norm": 0.218756303191185, "learning_rate": 1.6510912581795386e-05, "loss": 0.3712, "step": 171000 }, { "epoch": 0.6708201089299008, "grad_norm": 0.35229411721229553, "learning_rate": 1.6471729164217702e-05, "loss": 0.3897, "step": 171200 }, { "epoch": 0.6716037772814545, "grad_norm": 0.48240748047828674, "learning_rate": 1.6432545746640023e-05, "loss": 0.3488, "step": 171400 }, { "epoch": 0.6723874456330081, "grad_norm": 0.24678152799606323, "learning_rate": 1.6393362329062343e-05, "loss": 0.3405, "step": 171600 }, { "epoch": 0.6731711139845618, "grad_norm": 0.25077906250953674, "learning_rate": 1.635417891148466e-05, "loss": 0.3448, "step": 171800 }, { "epoch": 0.6739547823361154, "grad_norm": 0.32912206649780273, "learning_rate": 1.631499549390698e-05, "loss": 0.3831, "step": 172000 }, { "epoch": 0.674738450687669, "grad_norm": 0.20715036988258362, "learning_rate": 1.6276007993417185e-05, "loss": 0.3817, "step": 172200 }, { "epoch": 0.6755221190392225, "grad_norm": 1.022975206375122, "learning_rate": 1.6236824575839505e-05, "loss": 0.3417, "step": 172400 }, { "epoch": 0.6763057873907762, "grad_norm": 0.18375138938426971, "learning_rate": 1.6197641158261826e-05, "loss": 0.3864, "step": 172600 }, { "epoch": 0.6770894557423298, "grad_norm": 0.17378203570842743, "learning_rate": 1.6158457740684143e-05, "loss": 0.3686, "step": 172800 }, { "epoch": 0.6778731240938835, "grad_norm": 0.3172835409641266, "learning_rate": 1.6119274323106463e-05, "loss": 0.348, "step": 173000 }, { "epoch": 0.6786567924454371, "grad_norm": 0.3347463309764862, "learning_rate": 1.608009090552878e-05, "loss": 0.3353, "step": 173200 }, { "epoch": 0.6794404607969907, "grad_norm": 0.48238494992256165, "learning_rate": 1.60409074879511e-05, "loss": 0.3266, "step": 173400 }, { "epoch": 0.6802241291485444, "grad_norm": 0.3434430658817291, "learning_rate": 1.600172407037342e-05, "loss": 0.3881, "step": 173600 }, { "epoch": 0.681007797500098, "grad_norm": 0.570183277130127, "learning_rate": 1.5962540652795737e-05, "loss": 0.39, "step": 173800 }, { "epoch": 0.6817914658516516, "grad_norm": 0.3380807340145111, "learning_rate": 1.5923357235218057e-05, "loss": 0.3485, "step": 174000 }, { "epoch": 0.6825751342032053, "grad_norm": 0.40217986702919006, "learning_rate": 1.5884173817640374e-05, "loss": 0.309, "step": 174200 }, { "epoch": 0.6833588025547588, "grad_norm": 0.6215846538543701, "learning_rate": 1.5845186317150583e-05, "loss": 0.4265, "step": 174400 }, { "epoch": 0.6841424709063124, "grad_norm": 0.1687578558921814, "learning_rate": 1.5806002899572903e-05, "loss": 0.3682, "step": 174600 }, { "epoch": 0.684926139257866, "grad_norm": 0.24993237853050232, "learning_rate": 1.576681948199522e-05, "loss": 0.3282, "step": 174800 }, { "epoch": 0.6857098076094197, "grad_norm": 0.39133015275001526, "learning_rate": 1.572763606441754e-05, "loss": 0.3211, "step": 175000 }, { "epoch": 0.6864934759609733, "grad_norm": 0.17297115921974182, "learning_rate": 1.5688452646839857e-05, "loss": 0.3706, "step": 175200 }, { "epoch": 0.687277144312527, "grad_norm": 0.32759934663772583, "learning_rate": 1.5649269229262174e-05, "loss": 0.3458, "step": 175400 }, { "epoch": 0.6880608126640806, "grad_norm": 0.5024855732917786, "learning_rate": 1.5610085811684498e-05, "loss": 0.2964, "step": 175600 }, { "epoch": 0.6888444810156342, "grad_norm": 0.150591179728508, "learning_rate": 1.5571098311194703e-05, "loss": 0.3431, "step": 175800 }, { "epoch": 0.6896281493671879, "grad_norm": 0.27106091380119324, "learning_rate": 1.5531914893617023e-05, "loss": 0.3523, "step": 176000 }, { "epoch": 0.6904118177187414, "grad_norm": 0.29715707898139954, "learning_rate": 1.549273147603934e-05, "loss": 0.3641, "step": 176200 }, { "epoch": 0.691195486070295, "grad_norm": 0.457747220993042, "learning_rate": 1.545354805846166e-05, "loss": 0.3852, "step": 176400 }, { "epoch": 0.6919791544218487, "grad_norm": 0.280843585729599, "learning_rate": 1.541436464088398e-05, "loss": 0.3267, "step": 176600 }, { "epoch": 0.6927628227734023, "grad_norm": 0.18428704142570496, "learning_rate": 1.5375181223306297e-05, "loss": 0.4184, "step": 176800 }, { "epoch": 0.6935464911249559, "grad_norm": 0.6016808152198792, "learning_rate": 1.5335997805728618e-05, "loss": 0.3222, "step": 177000 }, { "epoch": 0.6943301594765096, "grad_norm": 0.2656881809234619, "learning_rate": 1.5296814388150934e-05, "loss": 0.3282, "step": 177200 }, { "epoch": 0.6951138278280632, "grad_norm": 0.41041797399520874, "learning_rate": 1.5257630970573253e-05, "loss": 0.3626, "step": 177400 }, { "epoch": 0.6958974961796168, "grad_norm": 0.358990877866745, "learning_rate": 1.5218447552995573e-05, "loss": 0.3374, "step": 177600 }, { "epoch": 0.6966811645311705, "grad_norm": 0.16006579995155334, "learning_rate": 1.5179264135417892e-05, "loss": 0.334, "step": 177800 }, { "epoch": 0.697464832882724, "grad_norm": 0.5395707488059998, "learning_rate": 1.514008071784021e-05, "loss": 0.3863, "step": 178000 }, { "epoch": 0.6982485012342776, "grad_norm": 0.5989649295806885, "learning_rate": 1.5100897300262529e-05, "loss": 0.3752, "step": 178200 }, { "epoch": 0.6990321695858313, "grad_norm": 0.851452648639679, "learning_rate": 1.5061909799772736e-05, "loss": 0.3659, "step": 178400 }, { "epoch": 0.6998158379373849, "grad_norm": 0.31787481904029846, "learning_rate": 1.5022726382195058e-05, "loss": 0.3523, "step": 178600 }, { "epoch": 0.7005995062889385, "grad_norm": 0.16660988330841064, "learning_rate": 1.4983542964617375e-05, "loss": 0.3819, "step": 178800 }, { "epoch": 0.7013831746404922, "grad_norm": 0.6440271139144897, "learning_rate": 1.4944359547039693e-05, "loss": 0.3654, "step": 179000 }, { "epoch": 0.7021668429920458, "grad_norm": 0.4141835570335388, "learning_rate": 1.4905176129462012e-05, "loss": 0.3047, "step": 179200 }, { "epoch": 0.7029505113435994, "grad_norm": 0.23103243112564087, "learning_rate": 1.486599271188433e-05, "loss": 0.3703, "step": 179400 }, { "epoch": 0.7037341796951531, "grad_norm": 0.1522352248430252, "learning_rate": 1.482680929430665e-05, "loss": 0.3393, "step": 179600 }, { "epoch": 0.7045178480467066, "grad_norm": 0.38634273409843445, "learning_rate": 1.478762587672897e-05, "loss": 0.3341, "step": 179800 }, { "epoch": 0.7053015163982602, "grad_norm": 0.4919366240501404, "learning_rate": 1.4748442459151288e-05, "loss": 0.3579, "step": 180000 }, { "epoch": 0.7060851847498139, "grad_norm": 0.5656337141990662, "learning_rate": 1.4709259041573606e-05, "loss": 0.3707, "step": 180200 }, { "epoch": 0.7068688531013675, "grad_norm": 0.128086656332016, "learning_rate": 1.4670075623995927e-05, "loss": 0.3254, "step": 180400 }, { "epoch": 0.7076525214529211, "grad_norm": 0.4238952696323395, "learning_rate": 1.4630892206418245e-05, "loss": 0.378, "step": 180600 }, { "epoch": 0.7084361898044748, "grad_norm": 0.15909014642238617, "learning_rate": 1.4591708788840564e-05, "loss": 0.3135, "step": 180800 }, { "epoch": 0.7092198581560284, "grad_norm": 0.43431320786476135, "learning_rate": 1.4552525371262882e-05, "loss": 0.3635, "step": 181000 }, { "epoch": 0.710003526507582, "grad_norm": 0.2068466693162918, "learning_rate": 1.45133419536852e-05, "loss": 0.3395, "step": 181200 }, { "epoch": 0.7107871948591357, "grad_norm": 0.27643465995788574, "learning_rate": 1.4474158536107521e-05, "loss": 0.3364, "step": 181400 }, { "epoch": 0.7115708632106892, "grad_norm": 0.23885765671730042, "learning_rate": 1.443497511852984e-05, "loss": 0.3406, "step": 181600 }, { "epoch": 0.7123545315622428, "grad_norm": 0.3895440697669983, "learning_rate": 1.4395791700952157e-05, "loss": 0.3474, "step": 181800 }, { "epoch": 0.7131381999137965, "grad_norm": 0.28900033235549927, "learning_rate": 1.4356608283374475e-05, "loss": 0.3839, "step": 182000 }, { "epoch": 0.7139218682653501, "grad_norm": 0.24865594506263733, "learning_rate": 1.4317424865796794e-05, "loss": 0.3509, "step": 182200 }, { "epoch": 0.7147055366169037, "grad_norm": 0.1657281070947647, "learning_rate": 1.4278241448219116e-05, "loss": 0.4043, "step": 182400 }, { "epoch": 0.7154892049684574, "grad_norm": 0.5622673630714417, "learning_rate": 1.4239253947729323e-05, "loss": 0.3351, "step": 182600 }, { "epoch": 0.716272873320011, "grad_norm": 0.29535192251205444, "learning_rate": 1.4200070530151641e-05, "loss": 0.3583, "step": 182800 }, { "epoch": 0.7170565416715646, "grad_norm": 0.4303935170173645, "learning_rate": 1.416088711257396e-05, "loss": 0.3304, "step": 183000 }, { "epoch": 0.7178402100231183, "grad_norm": 0.4135098159313202, "learning_rate": 1.4121703694996277e-05, "loss": 0.4045, "step": 183200 }, { "epoch": 0.7186238783746718, "grad_norm": 0.34102487564086914, "learning_rate": 1.4082520277418599e-05, "loss": 0.3409, "step": 183400 }, { "epoch": 0.7194075467262254, "grad_norm": 0.22567662596702576, "learning_rate": 1.4043336859840917e-05, "loss": 0.3934, "step": 183600 }, { "epoch": 0.7201912150777791, "grad_norm": 0.36801376938819885, "learning_rate": 1.4004153442263234e-05, "loss": 0.3315, "step": 183800 }, { "epoch": 0.7209748834293327, "grad_norm": 0.5870971083641052, "learning_rate": 1.3964970024685553e-05, "loss": 0.3648, "step": 184000 }, { "epoch": 0.7217585517808863, "grad_norm": 0.20883271098136902, "learning_rate": 1.3925786607107871e-05, "loss": 0.3343, "step": 184200 }, { "epoch": 0.72254222013244, "grad_norm": 0.3225955665111542, "learning_rate": 1.3886603189530192e-05, "loss": 0.3484, "step": 184400 }, { "epoch": 0.7233258884839936, "grad_norm": 0.37929660081863403, "learning_rate": 1.384741977195251e-05, "loss": 0.3258, "step": 184600 }, { "epoch": 0.7241095568355472, "grad_norm": 0.2800404727458954, "learning_rate": 1.3808236354374829e-05, "loss": 0.35, "step": 184800 }, { "epoch": 0.7248932251871009, "grad_norm": 0.22596155107021332, "learning_rate": 1.3769052936797147e-05, "loss": 0.3942, "step": 185000 }, { "epoch": 0.7256768935386544, "grad_norm": 0.32030895352363586, "learning_rate": 1.3729869519219468e-05, "loss": 0.3554, "step": 185200 }, { "epoch": 0.726460561890208, "grad_norm": 0.5121368765830994, "learning_rate": 1.3690882018729676e-05, "loss": 0.3841, "step": 185400 }, { "epoch": 0.7272442302417617, "grad_norm": 0.21764332056045532, "learning_rate": 1.3651698601151995e-05, "loss": 0.3687, "step": 185600 }, { "epoch": 0.7280278985933153, "grad_norm": 0.37256884574890137, "learning_rate": 1.3612515183574311e-05, "loss": 0.3965, "step": 185800 }, { "epoch": 0.7288115669448689, "grad_norm": 0.46709561347961426, "learning_rate": 1.357352768308452e-05, "loss": 0.3773, "step": 186000 }, { "epoch": 0.7295952352964226, "grad_norm": 0.2802875339984894, "learning_rate": 1.3534344265506838e-05, "loss": 0.3407, "step": 186200 }, { "epoch": 0.7303789036479762, "grad_norm": 0.3546951115131378, "learning_rate": 1.3495160847929155e-05, "loss": 0.3382, "step": 186400 }, { "epoch": 0.7311625719995298, "grad_norm": 1.2052944898605347, "learning_rate": 1.3455977430351477e-05, "loss": 0.3998, "step": 186600 }, { "epoch": 0.7319462403510835, "grad_norm": 0.22293418645858765, "learning_rate": 1.3416794012773796e-05, "loss": 0.3613, "step": 186800 }, { "epoch": 0.732729908702637, "grad_norm": 0.20051322877407074, "learning_rate": 1.3377610595196113e-05, "loss": 0.3305, "step": 187000 }, { "epoch": 0.7335135770541906, "grad_norm": 0.23895002901554108, "learning_rate": 1.3338427177618431e-05, "loss": 0.4121, "step": 187200 }, { "epoch": 0.7342972454057443, "grad_norm": 0.3351421058177948, "learning_rate": 1.3299243760040753e-05, "loss": 0.3626, "step": 187400 }, { "epoch": 0.7350809137572979, "grad_norm": 0.387396901845932, "learning_rate": 1.326006034246307e-05, "loss": 0.3654, "step": 187600 }, { "epoch": 0.7358645821088515, "grad_norm": 0.46178901195526123, "learning_rate": 1.3220876924885389e-05, "loss": 0.3919, "step": 187800 }, { "epoch": 0.7366482504604052, "grad_norm": 0.19138361513614655, "learning_rate": 1.3181693507307707e-05, "loss": 0.3785, "step": 188000 }, { "epoch": 0.7374319188119588, "grad_norm": 0.1531708538532257, "learning_rate": 1.3142510089730026e-05, "loss": 0.3024, "step": 188200 }, { "epoch": 0.7382155871635124, "grad_norm": 0.3086858093738556, "learning_rate": 1.3103326672152346e-05, "loss": 0.3848, "step": 188400 }, { "epoch": 0.7389992555150661, "grad_norm": 0.5864688158035278, "learning_rate": 1.3064143254574665e-05, "loss": 0.3473, "step": 188600 }, { "epoch": 0.7397829238666197, "grad_norm": 0.5110514760017395, "learning_rate": 1.3024959836996983e-05, "loss": 0.333, "step": 188800 }, { "epoch": 0.7405665922181732, "grad_norm": 0.30423370003700256, "learning_rate": 1.2985776419419302e-05, "loss": 0.3707, "step": 189000 }, { "epoch": 0.7413502605697269, "grad_norm": 0.27866747975349426, "learning_rate": 1.2946593001841622e-05, "loss": 0.3229, "step": 189200 }, { "epoch": 0.7421339289212805, "grad_norm": 1.556573748588562, "learning_rate": 1.2907409584263941e-05, "loss": 0.3617, "step": 189400 }, { "epoch": 0.7429175972728341, "grad_norm": 0.33649295568466187, "learning_rate": 1.286822616668626e-05, "loss": 0.3764, "step": 189600 }, { "epoch": 0.7437012656243878, "grad_norm": 0.5861026644706726, "learning_rate": 1.2829238666196466e-05, "loss": 0.3493, "step": 189800 }, { "epoch": 0.7444849339759414, "grad_norm": 0.33105725049972534, "learning_rate": 1.2790055248618785e-05, "loss": 0.3372, "step": 190000 }, { "epoch": 0.745268602327495, "grad_norm": 0.388429194688797, "learning_rate": 1.2750871831041103e-05, "loss": 0.3531, "step": 190200 }, { "epoch": 0.7460522706790487, "grad_norm": 0.3028870224952698, "learning_rate": 1.2711688413463424e-05, "loss": 0.3762, "step": 190400 }, { "epoch": 0.7468359390306023, "grad_norm": 0.26430070400238037, "learning_rate": 1.2672504995885742e-05, "loss": 0.3552, "step": 190600 }, { "epoch": 0.7476196073821558, "grad_norm": 0.47712862491607666, "learning_rate": 1.263332157830806e-05, "loss": 0.3777, "step": 190800 }, { "epoch": 0.7484032757337095, "grad_norm": 3.1485884189605713, "learning_rate": 1.259413816073038e-05, "loss": 0.3642, "step": 191000 }, { "epoch": 0.7491869440852631, "grad_norm": 0.5391570925712585, "learning_rate": 1.2554954743152696e-05, "loss": 0.4023, "step": 191200 }, { "epoch": 0.7499706124368167, "grad_norm": 0.5413140058517456, "learning_rate": 1.2515771325575018e-05, "loss": 0.38, "step": 191400 }, { "epoch": 0.7507542807883704, "grad_norm": 0.13696108758449554, "learning_rate": 1.2476587907997337e-05, "loss": 0.3271, "step": 191600 }, { "epoch": 0.751537949139924, "grad_norm": 0.19886599481105804, "learning_rate": 1.2437404490419655e-05, "loss": 0.3836, "step": 191800 }, { "epoch": 0.7523216174914776, "grad_norm": 0.17873993515968323, "learning_rate": 1.2398221072841974e-05, "loss": 0.3383, "step": 192000 }, { "epoch": 0.7531052858430313, "grad_norm": 0.6531501412391663, "learning_rate": 1.2359037655264293e-05, "loss": 0.3409, "step": 192200 }, { "epoch": 0.7538889541945849, "grad_norm": 2.3798701763153076, "learning_rate": 1.2319854237686613e-05, "loss": 0.3503, "step": 192400 }, { "epoch": 0.7546726225461384, "grad_norm": 0.4060080647468567, "learning_rate": 1.228067082010893e-05, "loss": 0.3339, "step": 192600 }, { "epoch": 0.7554562908976921, "grad_norm": 0.32076671719551086, "learning_rate": 1.2241487402531248e-05, "loss": 0.361, "step": 192800 }, { "epoch": 0.7562399592492457, "grad_norm": 0.4069419801235199, "learning_rate": 1.2202303984953569e-05, "loss": 0.3565, "step": 193000 }, { "epoch": 0.7570236276007993, "grad_norm": 0.2776513695716858, "learning_rate": 1.2163120567375887e-05, "loss": 0.388, "step": 193200 }, { "epoch": 0.757807295952353, "grad_norm": 0.24962277710437775, "learning_rate": 1.2123937149798206e-05, "loss": 0.3535, "step": 193400 }, { "epoch": 0.7585909643039066, "grad_norm": 0.29066240787506104, "learning_rate": 1.2084753732220524e-05, "loss": 0.3469, "step": 193600 }, { "epoch": 0.7593746326554602, "grad_norm": 0.32002654671669006, "learning_rate": 1.2045766231730731e-05, "loss": 0.3424, "step": 193800 }, { "epoch": 0.7601583010070139, "grad_norm": 0.2010965794324875, "learning_rate": 1.2006582814153051e-05, "loss": 0.3629, "step": 194000 }, { "epoch": 0.7609419693585675, "grad_norm": 0.37930095195770264, "learning_rate": 1.1967595313663258e-05, "loss": 0.3053, "step": 194200 }, { "epoch": 0.761725637710121, "grad_norm": 0.22433385252952576, "learning_rate": 1.1928411896085577e-05, "loss": 0.3433, "step": 194400 }, { "epoch": 0.7625093060616747, "grad_norm": 0.6634377241134644, "learning_rate": 1.1889228478507897e-05, "loss": 0.3533, "step": 194600 }, { "epoch": 0.7632929744132283, "grad_norm": 0.5401524901390076, "learning_rate": 1.1850045060930215e-05, "loss": 0.3476, "step": 194800 }, { "epoch": 0.7640766427647819, "grad_norm": 0.276132196187973, "learning_rate": 1.1810861643352534e-05, "loss": 0.3454, "step": 195000 }, { "epoch": 0.7648603111163356, "grad_norm": 0.3703073263168335, "learning_rate": 1.1771678225774853e-05, "loss": 0.3326, "step": 195200 }, { "epoch": 0.7656439794678892, "grad_norm": 0.4518648386001587, "learning_rate": 1.1732494808197171e-05, "loss": 0.3243, "step": 195400 }, { "epoch": 0.7664276478194428, "grad_norm": 0.47713226079940796, "learning_rate": 1.1693311390619491e-05, "loss": 0.3506, "step": 195600 }, { "epoch": 0.7672113161709965, "grad_norm": 0.2348567694425583, "learning_rate": 1.1654127973041808e-05, "loss": 0.3638, "step": 195800 }, { "epoch": 0.7679949845225501, "grad_norm": 0.4898678958415985, "learning_rate": 1.1614944555464129e-05, "loss": 0.3526, "step": 196000 }, { "epoch": 0.7687786528741036, "grad_norm": 0.28333336114883423, "learning_rate": 1.1575761137886447e-05, "loss": 0.3221, "step": 196200 }, { "epoch": 0.7695623212256573, "grad_norm": 0.262997031211853, "learning_rate": 1.1536577720308766e-05, "loss": 0.3706, "step": 196400 }, { "epoch": 0.7703459895772109, "grad_norm": 0.32209834456443787, "learning_rate": 1.1497394302731084e-05, "loss": 0.3483, "step": 196600 }, { "epoch": 0.7711296579287645, "grad_norm": 0.28576773405075073, "learning_rate": 1.1458210885153403e-05, "loss": 0.3146, "step": 196800 }, { "epoch": 0.7719133262803182, "grad_norm": 0.28298234939575195, "learning_rate": 1.1419027467575723e-05, "loss": 0.3465, "step": 197000 }, { "epoch": 0.7726969946318718, "grad_norm": 0.11571706086397171, "learning_rate": 1.1379844049998042e-05, "loss": 0.3429, "step": 197200 }, { "epoch": 0.7734806629834254, "grad_norm": 0.5682443976402283, "learning_rate": 1.134066063242036e-05, "loss": 0.3613, "step": 197400 }, { "epoch": 0.7742643313349791, "grad_norm": 0.44382941722869873, "learning_rate": 1.1301477214842679e-05, "loss": 0.3273, "step": 197600 }, { "epoch": 0.7750479996865327, "grad_norm": 0.27556684613227844, "learning_rate": 1.1262293797265e-05, "loss": 0.3836, "step": 197800 }, { "epoch": 0.7758316680380862, "grad_norm": 0.6749032735824585, "learning_rate": 1.1223306296775206e-05, "loss": 0.3278, "step": 198000 }, { "epoch": 0.7766153363896399, "grad_norm": 0.31399938464164734, "learning_rate": 1.1184122879197525e-05, "loss": 0.3613, "step": 198200 }, { "epoch": 0.7773990047411935, "grad_norm": 0.429665744304657, "learning_rate": 1.1144939461619843e-05, "loss": 0.3338, "step": 198400 }, { "epoch": 0.7781826730927471, "grad_norm": 0.32820430397987366, "learning_rate": 1.1105756044042162e-05, "loss": 0.3378, "step": 198600 }, { "epoch": 0.7789663414443008, "grad_norm": 0.5159560441970825, "learning_rate": 1.106657262646448e-05, "loss": 0.3727, "step": 198800 }, { "epoch": 0.7797500097958544, "grad_norm": 0.6431983709335327, "learning_rate": 1.10273892088868e-05, "loss": 0.3649, "step": 199000 }, { "epoch": 0.780533678147408, "grad_norm": 0.3942665755748749, "learning_rate": 1.0988205791309118e-05, "loss": 0.3435, "step": 199200 }, { "epoch": 0.7813173464989617, "grad_norm": 0.17049553990364075, "learning_rate": 1.0949022373731438e-05, "loss": 0.3373, "step": 199400 }, { "epoch": 0.7821010148505153, "grad_norm": 0.4546549320220947, "learning_rate": 1.0909838956153756e-05, "loss": 0.32, "step": 199600 }, { "epoch": 0.7828846832020688, "grad_norm": 0.2129913568496704, "learning_rate": 1.0870655538576075e-05, "loss": 0.3253, "step": 199800 }, { "epoch": 0.7836683515536225, "grad_norm": 0.304645299911499, "learning_rate": 1.0831472120998394e-05, "loss": 0.3434, "step": 200000 }, { "epoch": 0.7836683515536225, "eval_loss": 0.36935603618621826, "eval_runtime": 194.9874, "eval_samples_per_second": 13.221, "eval_steps_per_second": 13.221, "step": 200000 }, { "epoch": 0.7844520199051761, "grad_norm": 0.2658133804798126, "learning_rate": 1.0792288703420712e-05, "loss": 0.3964, "step": 200200 }, { "epoch": 0.7852356882567297, "grad_norm": 0.5810640454292297, "learning_rate": 1.0753105285843032e-05, "loss": 0.3409, "step": 200400 }, { "epoch": 0.7860193566082834, "grad_norm": 0.5466582179069519, "learning_rate": 1.0713921868265351e-05, "loss": 0.355, "step": 200600 }, { "epoch": 0.786803024959837, "grad_norm": 0.6995142698287964, "learning_rate": 1.067473845068767e-05, "loss": 0.3603, "step": 200800 }, { "epoch": 0.7875866933113906, "grad_norm": 0.3142918646335602, "learning_rate": 1.0635555033109988e-05, "loss": 0.3622, "step": 201000 }, { "epoch": 0.7883703616629443, "grad_norm": 0.21291528642177582, "learning_rate": 1.0596371615532308e-05, "loss": 0.3815, "step": 201200 }, { "epoch": 0.7891540300144979, "grad_norm": 1.9498099088668823, "learning_rate": 1.0557188197954625e-05, "loss": 0.3465, "step": 201400 }, { "epoch": 0.7899376983660514, "grad_norm": 0.4036538004875183, "learning_rate": 1.0518004780376944e-05, "loss": 0.3442, "step": 201600 }, { "epoch": 0.7907213667176051, "grad_norm": 0.30874505639076233, "learning_rate": 1.0479017279887152e-05, "loss": 0.3597, "step": 201800 }, { "epoch": 0.7915050350691587, "grad_norm": 0.36580735445022583, "learning_rate": 1.0439833862309471e-05, "loss": 0.3572, "step": 202000 }, { "epoch": 0.7922887034207123, "grad_norm": 0.12965187430381775, "learning_rate": 1.040065044473179e-05, "loss": 0.3374, "step": 202200 }, { "epoch": 0.793072371772266, "grad_norm": 0.23761625587940216, "learning_rate": 1.036146702715411e-05, "loss": 0.3406, "step": 202400 }, { "epoch": 0.7938560401238196, "grad_norm": 1.7544664144515991, "learning_rate": 1.0322283609576427e-05, "loss": 0.3533, "step": 202600 }, { "epoch": 0.7946397084753732, "grad_norm": 0.3685474991798401, "learning_rate": 1.0283100191998747e-05, "loss": 0.3514, "step": 202800 }, { "epoch": 0.7954233768269269, "grad_norm": 0.4537036716938019, "learning_rate": 1.0243916774421066e-05, "loss": 0.3271, "step": 203000 }, { "epoch": 0.7962070451784805, "grad_norm": 0.36493510007858276, "learning_rate": 1.0204733356843386e-05, "loss": 0.3325, "step": 203200 }, { "epoch": 0.7969907135300341, "grad_norm": 0.4055664837360382, "learning_rate": 1.0165549939265703e-05, "loss": 0.3633, "step": 203400 }, { "epoch": 0.7977743818815877, "grad_norm": 0.6503295302391052, "learning_rate": 1.0126366521688021e-05, "loss": 0.3459, "step": 203600 }, { "epoch": 0.7985580502331413, "grad_norm": 0.34953197836875916, "learning_rate": 1.0087183104110342e-05, "loss": 0.361, "step": 203800 }, { "epoch": 0.7993417185846949, "grad_norm": 0.7741647958755493, "learning_rate": 1.0048195603620548e-05, "loss": 0.3753, "step": 204000 }, { "epoch": 0.8001253869362486, "grad_norm": 0.4427395761013031, "learning_rate": 1.0009012186042867e-05, "loss": 0.3965, "step": 204200 }, { "epoch": 0.8009090552878022, "grad_norm": 0.2570464015007019, "learning_rate": 9.969828768465187e-06, "loss": 0.3507, "step": 204400 }, { "epoch": 0.8016927236393558, "grad_norm": 0.6625790596008301, "learning_rate": 9.930645350887504e-06, "loss": 0.3441, "step": 204600 }, { "epoch": 0.8024763919909095, "grad_norm": 0.1296575963497162, "learning_rate": 9.891461933309824e-06, "loss": 0.3711, "step": 204800 }, { "epoch": 0.8032600603424631, "grad_norm": 0.3326294720172882, "learning_rate": 9.852278515732143e-06, "loss": 0.3108, "step": 205000 }, { "epoch": 0.8040437286940167, "grad_norm": 0.6194589734077454, "learning_rate": 9.813095098154461e-06, "loss": 0.3337, "step": 205200 }, { "epoch": 0.8048273970455703, "grad_norm": 0.49426934123039246, "learning_rate": 9.77391168057678e-06, "loss": 0.4048, "step": 205400 }, { "epoch": 0.8056110653971239, "grad_norm": 0.19368387758731842, "learning_rate": 9.734728262999099e-06, "loss": 0.3784, "step": 205600 }, { "epoch": 0.8063947337486775, "grad_norm": 0.274065762758255, "learning_rate": 9.695544845421419e-06, "loss": 0.3873, "step": 205800 }, { "epoch": 0.8071784021002312, "grad_norm": 0.28209155797958374, "learning_rate": 9.656361427843737e-06, "loss": 0.2867, "step": 206000 }, { "epoch": 0.8079620704517848, "grad_norm": 0.1206631287932396, "learning_rate": 9.617178010266056e-06, "loss": 0.3666, "step": 206200 }, { "epoch": 0.8087457388033384, "grad_norm": 0.45625948905944824, "learning_rate": 9.577994592688375e-06, "loss": 0.3555, "step": 206400 }, { "epoch": 0.8095294071548921, "grad_norm": 1.6964956521987915, "learning_rate": 9.539007092198581e-06, "loss": 0.3985, "step": 206600 }, { "epoch": 0.8103130755064457, "grad_norm": 0.20946283638477325, "learning_rate": 9.499823674620902e-06, "loss": 0.3536, "step": 206800 }, { "epoch": 0.8110967438579993, "grad_norm": 0.5647934675216675, "learning_rate": 9.46064025704322e-06, "loss": 0.3417, "step": 207000 }, { "epoch": 0.8118804122095529, "grad_norm": 0.46122294664382935, "learning_rate": 9.421456839465539e-06, "loss": 0.3123, "step": 207200 }, { "epoch": 0.8126640805611065, "grad_norm": 0.31902188062667847, "learning_rate": 9.382273421887857e-06, "loss": 0.3466, "step": 207400 }, { "epoch": 0.8134477489126601, "grad_norm": 0.30757972598075867, "learning_rate": 9.343090004310176e-06, "loss": 0.3626, "step": 207600 }, { "epoch": 0.8142314172642138, "grad_norm": 0.4866563379764557, "learning_rate": 9.303906586732496e-06, "loss": 0.3501, "step": 207800 }, { "epoch": 0.8150150856157674, "grad_norm": 0.16957193613052368, "learning_rate": 9.264723169154813e-06, "loss": 0.3421, "step": 208000 }, { "epoch": 0.815798753967321, "grad_norm": 0.15946203470230103, "learning_rate": 9.225539751577133e-06, "loss": 0.3936, "step": 208200 }, { "epoch": 0.8165824223188747, "grad_norm": 0.3646221160888672, "learning_rate": 9.186356333999452e-06, "loss": 0.3567, "step": 208400 }, { "epoch": 0.8173660906704283, "grad_norm": 0.7196604013442993, "learning_rate": 9.14717291642177e-06, "loss": 0.3908, "step": 208600 }, { "epoch": 0.818149759021982, "grad_norm": 0.3158138394355774, "learning_rate": 9.10798949884409e-06, "loss": 0.3856, "step": 208800 }, { "epoch": 0.8189334273735355, "grad_norm": 0.533643901348114, "learning_rate": 9.068806081266408e-06, "loss": 0.3413, "step": 209000 }, { "epoch": 0.8197170957250891, "grad_norm": 0.33527955412864685, "learning_rate": 9.029622663688728e-06, "loss": 0.365, "step": 209200 }, { "epoch": 0.8205007640766427, "grad_norm": 0.3317583203315735, "learning_rate": 8.990439246111047e-06, "loss": 0.3362, "step": 209400 }, { "epoch": 0.8212844324281964, "grad_norm": 0.2337818294763565, "learning_rate": 8.951255828533365e-06, "loss": 0.4277, "step": 209600 }, { "epoch": 0.82206810077975, "grad_norm": 0.20124514400959015, "learning_rate": 8.912072410955684e-06, "loss": 0.3792, "step": 209800 }, { "epoch": 0.8228517691313036, "grad_norm": 0.7244812250137329, "learning_rate": 8.872888993378004e-06, "loss": 0.3501, "step": 210000 }, { "epoch": 0.8236354374828573, "grad_norm": 0.3771028220653534, "learning_rate": 8.833705575800321e-06, "loss": 0.3459, "step": 210200 }, { "epoch": 0.8244191058344109, "grad_norm": 0.3812679052352905, "learning_rate": 8.79452215822264e-06, "loss": 0.3277, "step": 210400 }, { "epoch": 0.8252027741859645, "grad_norm": 0.5025450587272644, "learning_rate": 8.75533874064496e-06, "loss": 0.3529, "step": 210600 }, { "epoch": 0.8259864425375181, "grad_norm": 0.321023166179657, "learning_rate": 8.716351240155167e-06, "loss": 0.3668, "step": 210800 }, { "epoch": 0.8267701108890717, "grad_norm": 0.3340953290462494, "learning_rate": 8.677167822577485e-06, "loss": 0.3398, "step": 211000 }, { "epoch": 0.8275537792406253, "grad_norm": 1.1746923923492432, "learning_rate": 8.637984404999805e-06, "loss": 0.3306, "step": 211200 }, { "epoch": 0.828337447592179, "grad_norm": 0.5042812824249268, "learning_rate": 8.598800987422124e-06, "loss": 0.3864, "step": 211400 }, { "epoch": 0.8291211159437326, "grad_norm": 0.3353957533836365, "learning_rate": 8.559617569844443e-06, "loss": 0.3697, "step": 211600 }, { "epoch": 0.8299047842952862, "grad_norm": 0.2791668772697449, "learning_rate": 8.520434152266761e-06, "loss": 0.3321, "step": 211800 }, { "epoch": 0.8306884526468399, "grad_norm": 0.3713902533054352, "learning_rate": 8.481446651776968e-06, "loss": 0.3665, "step": 212000 }, { "epoch": 0.8314721209983935, "grad_norm": 0.2594499886035919, "learning_rate": 8.442263234199286e-06, "loss": 0.362, "step": 212200 }, { "epoch": 0.8322557893499472, "grad_norm": 0.9900026321411133, "learning_rate": 8.403079816621607e-06, "loss": 0.3636, "step": 212400 }, { "epoch": 0.8330394577015007, "grad_norm": 0.4834081530570984, "learning_rate": 8.363896399043925e-06, "loss": 0.3496, "step": 212600 }, { "epoch": 0.8338231260530543, "grad_norm": 0.8258674144744873, "learning_rate": 8.324712981466244e-06, "loss": 0.3495, "step": 212800 }, { "epoch": 0.8346067944046079, "grad_norm": 0.16623233258724213, "learning_rate": 8.285529563888562e-06, "loss": 0.3648, "step": 213000 }, { "epoch": 0.8353904627561616, "grad_norm": 0.30037203431129456, "learning_rate": 8.246346146310883e-06, "loss": 0.3493, "step": 213200 }, { "epoch": 0.8361741311077152, "grad_norm": 0.28860384225845337, "learning_rate": 8.2071627287332e-06, "loss": 0.3249, "step": 213400 }, { "epoch": 0.8369577994592688, "grad_norm": 0.36268025636672974, "learning_rate": 8.16797931115552e-06, "loss": 0.3488, "step": 213600 }, { "epoch": 0.8377414678108225, "grad_norm": 0.6866934895515442, "learning_rate": 8.128795893577838e-06, "loss": 0.3912, "step": 213800 }, { "epoch": 0.8385251361623761, "grad_norm": 0.6361219882965088, "learning_rate": 8.089612476000157e-06, "loss": 0.3872, "step": 214000 }, { "epoch": 0.8393088045139298, "grad_norm": 0.6985290050506592, "learning_rate": 8.050624975510364e-06, "loss": 0.3906, "step": 214200 }, { "epoch": 0.8400924728654833, "grad_norm": 0.1675785928964615, "learning_rate": 8.011441557932684e-06, "loss": 0.3433, "step": 214400 }, { "epoch": 0.8408761412170369, "grad_norm": 0.44967809319496155, "learning_rate": 7.972258140355001e-06, "loss": 0.3441, "step": 214600 }, { "epoch": 0.8416598095685905, "grad_norm": 0.22128620743751526, "learning_rate": 7.933074722777321e-06, "loss": 0.3798, "step": 214800 }, { "epoch": 0.8424434779201442, "grad_norm": 0.19397297501564026, "learning_rate": 7.89389130519964e-06, "loss": 0.3233, "step": 215000 }, { "epoch": 0.8432271462716978, "grad_norm": 0.38185518980026245, "learning_rate": 7.85470788762196e-06, "loss": 0.3509, "step": 215200 }, { "epoch": 0.8440108146232514, "grad_norm": 0.4878714084625244, "learning_rate": 7.815524470044277e-06, "loss": 0.3295, "step": 215400 }, { "epoch": 0.8447944829748051, "grad_norm": 0.338005393743515, "learning_rate": 7.776341052466596e-06, "loss": 0.3661, "step": 215600 }, { "epoch": 0.8455781513263587, "grad_norm": 0.17256999015808105, "learning_rate": 7.737157634888916e-06, "loss": 0.3624, "step": 215800 }, { "epoch": 0.8463618196779124, "grad_norm": 0.457180380821228, "learning_rate": 7.697974217311234e-06, "loss": 0.3678, "step": 216000 }, { "epoch": 0.8471454880294659, "grad_norm": 0.28088513016700745, "learning_rate": 7.658790799733553e-06, "loss": 0.347, "step": 216200 }, { "epoch": 0.8479291563810195, "grad_norm": 0.29847681522369385, "learning_rate": 7.619607382155872e-06, "loss": 0.3287, "step": 216400 }, { "epoch": 0.8487128247325731, "grad_norm": 0.36706671118736267, "learning_rate": 7.580423964578191e-06, "loss": 0.3239, "step": 216600 }, { "epoch": 0.8494964930841268, "grad_norm": 0.35141023993492126, "learning_rate": 7.54124054700051e-06, "loss": 0.3382, "step": 216800 }, { "epoch": 0.8502801614356804, "grad_norm": 0.1458987295627594, "learning_rate": 7.502057129422829e-06, "loss": 0.3523, "step": 217000 }, { "epoch": 0.851063829787234, "grad_norm": 0.888570249080658, "learning_rate": 7.462873711845148e-06, "loss": 0.3641, "step": 217200 }, { "epoch": 0.8518474981387877, "grad_norm": 0.5985159277915955, "learning_rate": 7.423690294267465e-06, "loss": 0.3109, "step": 217400 }, { "epoch": 0.8526311664903413, "grad_norm": 0.08441504091024399, "learning_rate": 7.384506876689786e-06, "loss": 0.336, "step": 217600 }, { "epoch": 0.853414834841895, "grad_norm": 0.17146113514900208, "learning_rate": 7.345323459112103e-06, "loss": 0.3507, "step": 217800 }, { "epoch": 0.8541985031934485, "grad_norm": 0.15761646628379822, "learning_rate": 7.306140041534424e-06, "loss": 0.3773, "step": 218000 }, { "epoch": 0.8549821715450021, "grad_norm": 0.25699788331985474, "learning_rate": 7.26715254104463e-06, "loss": 0.4354, "step": 218200 }, { "epoch": 0.8557658398965557, "grad_norm": 0.08377000689506531, "learning_rate": 7.227969123466949e-06, "loss": 0.3712, "step": 218400 }, { "epoch": 0.8565495082481094, "grad_norm": 0.30239951610565186, "learning_rate": 7.188785705889268e-06, "loss": 0.4251, "step": 218600 }, { "epoch": 0.857333176599663, "grad_norm": 0.43087834119796753, "learning_rate": 7.149602288311587e-06, "loss": 0.3676, "step": 218800 }, { "epoch": 0.8581168449512167, "grad_norm": 0.29747945070266724, "learning_rate": 7.110418870733906e-06, "loss": 0.3603, "step": 219000 }, { "epoch": 0.8589005133027703, "grad_norm": 0.45779263973236084, "learning_rate": 7.071235453156225e-06, "loss": 0.3768, "step": 219200 }, { "epoch": 0.8596841816543239, "grad_norm": 0.13283923268318176, "learning_rate": 7.032247952666432e-06, "loss": 0.3232, "step": 219400 }, { "epoch": 0.8604678500058776, "grad_norm": 0.3145968019962311, "learning_rate": 6.99306453508875e-06, "loss": 0.361, "step": 219600 }, { "epoch": 0.8612515183574312, "grad_norm": 0.595435619354248, "learning_rate": 6.95388111751107e-06, "loss": 0.3561, "step": 219800 }, { "epoch": 0.8620351867089847, "grad_norm": 0.5451090931892395, "learning_rate": 6.914697699933388e-06, "loss": 0.3735, "step": 220000 }, { "epoch": 0.8628188550605383, "grad_norm": 0.3953447639942169, "learning_rate": 6.875514282355708e-06, "loss": 0.3526, "step": 220200 }, { "epoch": 0.863602523412092, "grad_norm": 0.14726021885871887, "learning_rate": 6.836330864778026e-06, "loss": 0.3335, "step": 220400 }, { "epoch": 0.8643861917636456, "grad_norm": 0.44425076246261597, "learning_rate": 6.797147447200346e-06, "loss": 0.3516, "step": 220600 }, { "epoch": 0.8651698601151993, "grad_norm": 0.3331049680709839, "learning_rate": 6.757964029622664e-06, "loss": 0.3452, "step": 220800 }, { "epoch": 0.8659535284667529, "grad_norm": 0.5187826156616211, "learning_rate": 6.718780612044982e-06, "loss": 0.3187, "step": 221000 }, { "epoch": 0.8667371968183065, "grad_norm": 0.41904640197753906, "learning_rate": 6.679597194467302e-06, "loss": 0.3454, "step": 221200 }, { "epoch": 0.8675208651698602, "grad_norm": 0.3603901267051697, "learning_rate": 6.64041377688962e-06, "loss": 0.3427, "step": 221400 }, { "epoch": 0.8683045335214138, "grad_norm": 0.10734283924102783, "learning_rate": 6.6012303593119395e-06, "loss": 0.3126, "step": 221600 }, { "epoch": 0.8690882018729673, "grad_norm": 0.2561802566051483, "learning_rate": 6.562046941734258e-06, "loss": 0.3589, "step": 221800 }, { "epoch": 0.869871870224521, "grad_norm": 0.35452401638031006, "learning_rate": 6.5228635241565775e-06, "loss": 0.3171, "step": 222000 }, { "epoch": 0.8706555385760746, "grad_norm": 0.4760279953479767, "learning_rate": 6.483680106578896e-06, "loss": 0.3537, "step": 222200 }, { "epoch": 0.8714392069276282, "grad_norm": 0.3154491186141968, "learning_rate": 6.4444966890012155e-06, "loss": 0.3428, "step": 222400 }, { "epoch": 0.8722228752791819, "grad_norm": 0.22634199261665344, "learning_rate": 6.405313271423534e-06, "loss": 0.3618, "step": 222600 }, { "epoch": 0.8730065436307355, "grad_norm": 0.18168601393699646, "learning_rate": 6.366129853845852e-06, "loss": 0.3344, "step": 222800 }, { "epoch": 0.8737902119822891, "grad_norm": 0.21857967972755432, "learning_rate": 6.326946436268172e-06, "loss": 0.3265, "step": 223000 }, { "epoch": 0.8745738803338428, "grad_norm": 0.25472521781921387, "learning_rate": 6.28776301869049e-06, "loss": 0.3324, "step": 223200 }, { "epoch": 0.8753575486853964, "grad_norm": 0.260189950466156, "learning_rate": 6.248579601112809e-06, "loss": 0.3139, "step": 223400 }, { "epoch": 0.8761412170369499, "grad_norm": 0.4272352159023285, "learning_rate": 6.209592100623016e-06, "loss": 0.3256, "step": 223600 }, { "epoch": 0.8769248853885035, "grad_norm": 0.297317773103714, "learning_rate": 6.170408683045335e-06, "loss": 0.3248, "step": 223800 }, { "epoch": 0.8777085537400572, "grad_norm": 0.500162661075592, "learning_rate": 6.131225265467654e-06, "loss": 0.39, "step": 224000 }, { "epoch": 0.8784922220916108, "grad_norm": 0.4258497357368469, "learning_rate": 6.0920418478899734e-06, "loss": 0.3482, "step": 224200 }, { "epoch": 0.8792758904431645, "grad_norm": 0.3500688076019287, "learning_rate": 6.052858430312292e-06, "loss": 0.3121, "step": 224400 }, { "epoch": 0.8800595587947181, "grad_norm": 0.4041314721107483, "learning_rate": 6.0136750127346115e-06, "loss": 0.3943, "step": 224600 }, { "epoch": 0.8808432271462717, "grad_norm": 0.5618583559989929, "learning_rate": 5.97449159515693e-06, "loss": 0.3564, "step": 224800 }, { "epoch": 0.8816268954978254, "grad_norm": 0.2575647532939911, "learning_rate": 5.9355040946671375e-06, "loss": 0.3903, "step": 225000 }, { "epoch": 0.882410563849379, "grad_norm": 0.21139074862003326, "learning_rate": 5.896320677089456e-06, "loss": 0.3509, "step": 225200 }, { "epoch": 0.8831942322009325, "grad_norm": 0.1804710477590561, "learning_rate": 5.857137259511775e-06, "loss": 0.3844, "step": 225400 }, { "epoch": 0.8839779005524862, "grad_norm": 0.34338244795799255, "learning_rate": 5.817953841934093e-06, "loss": 0.3449, "step": 225600 }, { "epoch": 0.8847615689040398, "grad_norm": 0.4105754792690277, "learning_rate": 5.778770424356413e-06, "loss": 0.3593, "step": 225800 }, { "epoch": 0.8855452372555934, "grad_norm": 0.6113290786743164, "learning_rate": 5.739587006778731e-06, "loss": 0.3527, "step": 226000 }, { "epoch": 0.886328905607147, "grad_norm": 0.4298894703388214, "learning_rate": 5.700403589201051e-06, "loss": 0.3182, "step": 226200 }, { "epoch": 0.8871125739587007, "grad_norm": 0.2320249229669571, "learning_rate": 5.661220171623369e-06, "loss": 0.3344, "step": 226400 }, { "epoch": 0.8878962423102543, "grad_norm": 0.16917462646961212, "learning_rate": 5.622036754045688e-06, "loss": 0.3625, "step": 226600 }, { "epoch": 0.888679910661808, "grad_norm": 0.16281074285507202, "learning_rate": 5.582853336468007e-06, "loss": 0.3656, "step": 226800 }, { "epoch": 0.8894635790133616, "grad_norm": 0.232764333486557, "learning_rate": 5.543669918890326e-06, "loss": 0.3108, "step": 227000 }, { "epoch": 0.8902472473649151, "grad_norm": 0.3530287444591522, "learning_rate": 5.5044865013126446e-06, "loss": 0.3296, "step": 227200 }, { "epoch": 0.8910309157164688, "grad_norm": 0.16222508251667023, "learning_rate": 5.465303083734963e-06, "loss": 0.3498, "step": 227400 }, { "epoch": 0.8918145840680224, "grad_norm": 0.5519393682479858, "learning_rate": 5.426119666157283e-06, "loss": 0.3276, "step": 227600 }, { "epoch": 0.892598252419576, "grad_norm": 0.2971680164337158, "learning_rate": 5.386936248579601e-06, "loss": 0.3468, "step": 227800 }, { "epoch": 0.8933819207711297, "grad_norm": 0.27840396761894226, "learning_rate": 5.347752831001921e-06, "loss": 0.3387, "step": 228000 }, { "epoch": 0.8941655891226833, "grad_norm": 0.12611983716487885, "learning_rate": 5.308569413424239e-06, "loss": 0.3743, "step": 228200 }, { "epoch": 0.8949492574742369, "grad_norm": 0.3788227140903473, "learning_rate": 5.269385995846559e-06, "loss": 0.3452, "step": 228400 }, { "epoch": 0.8957329258257906, "grad_norm": 0.2808510363101959, "learning_rate": 5.230202578268876e-06, "loss": 0.3477, "step": 228600 }, { "epoch": 0.8965165941773442, "grad_norm": 0.6040523648262024, "learning_rate": 5.191019160691196e-06, "loss": 0.3723, "step": 228800 }, { "epoch": 0.8973002625288977, "grad_norm": 0.24068951606750488, "learning_rate": 5.1520316602014025e-06, "loss": 0.3703, "step": 229000 }, { "epoch": 0.8980839308804514, "grad_norm": 0.40527355670928955, "learning_rate": 5.112848242623722e-06, "loss": 0.3744, "step": 229200 }, { "epoch": 0.898867599232005, "grad_norm": 0.4473119080066681, "learning_rate": 5.0736648250460405e-06, "loss": 0.3042, "step": 229400 }, { "epoch": 0.8996512675835586, "grad_norm": 0.18631766736507416, "learning_rate": 5.03448140746836e-06, "loss": 0.3682, "step": 229600 }, { "epoch": 0.9004349359351123, "grad_norm": 0.46415069699287415, "learning_rate": 4.9952979898906785e-06, "loss": 0.3549, "step": 229800 }, { "epoch": 0.9012186042866659, "grad_norm": 0.07150289416313171, "learning_rate": 4.956114572312998e-06, "loss": 0.4329, "step": 230000 }, { "epoch": 0.9020022726382195, "grad_norm": 0.3655785024166107, "learning_rate": 4.9169311547353165e-06, "loss": 0.3621, "step": 230200 }, { "epoch": 0.9027859409897732, "grad_norm": 0.3118968904018402, "learning_rate": 4.877747737157635e-06, "loss": 0.3298, "step": 230400 }, { "epoch": 0.9035696093413268, "grad_norm": 0.339590460062027, "learning_rate": 4.838564319579954e-06, "loss": 0.3348, "step": 230600 }, { "epoch": 0.9043532776928803, "grad_norm": 0.17156149446964264, "learning_rate": 4.799380902002273e-06, "loss": 0.3857, "step": 230800 }, { "epoch": 0.905136946044434, "grad_norm": 0.3096294403076172, "learning_rate": 4.76039340151248e-06, "loss": 0.3566, "step": 231000 }, { "epoch": 0.9059206143959876, "grad_norm": 0.479358434677124, "learning_rate": 4.721209983934799e-06, "loss": 0.3702, "step": 231200 }, { "epoch": 0.9067042827475412, "grad_norm": 0.2651727497577667, "learning_rate": 4.682026566357118e-06, "loss": 0.3489, "step": 231400 }, { "epoch": 0.9074879510990949, "grad_norm": 0.37970855832099915, "learning_rate": 4.642843148779437e-06, "loss": 0.3945, "step": 231600 }, { "epoch": 0.9082716194506485, "grad_norm": 0.13563969731330872, "learning_rate": 4.603659731201756e-06, "loss": 0.3314, "step": 231800 }, { "epoch": 0.9090552878022021, "grad_norm": 0.33950814604759216, "learning_rate": 4.5644763136240744e-06, "loss": 0.3185, "step": 232000 }, { "epoch": 0.9098389561537558, "grad_norm": 0.4426833987236023, "learning_rate": 4.525292896046393e-06, "loss": 0.3468, "step": 232200 }, { "epoch": 0.9106226245053094, "grad_norm": 0.3494137227535248, "learning_rate": 4.486109478468712e-06, "loss": 0.3234, "step": 232400 }, { "epoch": 0.9114062928568629, "grad_norm": 0.47928714752197266, "learning_rate": 4.446926060891031e-06, "loss": 0.3357, "step": 232600 }, { "epoch": 0.9121899612084166, "grad_norm": 0.15988366305828094, "learning_rate": 4.40774264331335e-06, "loss": 0.348, "step": 232800 }, { "epoch": 0.9129736295599702, "grad_norm": 0.41838353872299194, "learning_rate": 4.368559225735669e-06, "loss": 0.3542, "step": 233000 }, { "epoch": 0.9137572979115238, "grad_norm": 0.5815839171409607, "learning_rate": 4.3295717252458766e-06, "loss": 0.4135, "step": 233200 }, { "epoch": 0.9145409662630775, "grad_norm": 0.27913644909858704, "learning_rate": 4.290388307668195e-06, "loss": 0.3395, "step": 233400 }, { "epoch": 0.9153246346146311, "grad_norm": 0.19515849649906158, "learning_rate": 4.251204890090514e-06, "loss": 0.3717, "step": 233600 }, { "epoch": 0.9161083029661847, "grad_norm": 0.3227376639842987, "learning_rate": 4.212021472512833e-06, "loss": 0.3687, "step": 233800 }, { "epoch": 0.9168919713177384, "grad_norm": 0.13047832250595093, "learning_rate": 4.172838054935151e-06, "loss": 0.3417, "step": 234000 }, { "epoch": 0.917675639669292, "grad_norm": 0.6230217814445496, "learning_rate": 4.13365463735747e-06, "loss": 0.3295, "step": 234200 }, { "epoch": 0.9184593080208456, "grad_norm": 0.3608349561691284, "learning_rate": 4.094471219779789e-06, "loss": 0.3337, "step": 234400 }, { "epoch": 0.9192429763723992, "grad_norm": 0.33231377601623535, "learning_rate": 4.055287802202108e-06, "loss": 0.342, "step": 234600 }, { "epoch": 0.9200266447239528, "grad_norm": 0.6240096092224121, "learning_rate": 4.016104384624427e-06, "loss": 0.3418, "step": 234800 }, { "epoch": 0.9208103130755064, "grad_norm": 0.3697316646575928, "learning_rate": 3.976920967046746e-06, "loss": 0.3373, "step": 235000 }, { "epoch": 0.9215939814270601, "grad_norm": 0.5216108560562134, "learning_rate": 3.937737549469065e-06, "loss": 0.3549, "step": 235200 }, { "epoch": 0.9223776497786137, "grad_norm": 0.14060427248477936, "learning_rate": 3.8987500489792725e-06, "loss": 0.374, "step": 235400 }, { "epoch": 0.9231613181301673, "grad_norm": 0.5167615413665771, "learning_rate": 3.85976254848948e-06, "loss": 0.3677, "step": 235600 }, { "epoch": 0.923944986481721, "grad_norm": 0.3430474102497101, "learning_rate": 3.8205791309117986e-06, "loss": 0.3583, "step": 235800 }, { "epoch": 0.9247286548332746, "grad_norm": 0.527511477470398, "learning_rate": 3.7813957133341167e-06, "loss": 0.3681, "step": 236000 }, { "epoch": 0.9255123231848282, "grad_norm": 0.1632586270570755, "learning_rate": 3.7422122957564357e-06, "loss": 0.3654, "step": 236200 }, { "epoch": 0.9262959915363818, "grad_norm": 0.30265602469444275, "learning_rate": 3.7030288781787548e-06, "loss": 0.3568, "step": 236400 }, { "epoch": 0.9270796598879354, "grad_norm": 0.4781269431114197, "learning_rate": 3.6638454606010738e-06, "loss": 0.3705, "step": 236600 }, { "epoch": 0.927863328239489, "grad_norm": 0.5035874843597412, "learning_rate": 3.6246620430233928e-06, "loss": 0.3568, "step": 236800 }, { "epoch": 0.9286469965910427, "grad_norm": 0.12536077201366425, "learning_rate": 3.585478625445712e-06, "loss": 0.3554, "step": 237000 }, { "epoch": 0.9294306649425963, "grad_norm": 0.22718684375286102, "learning_rate": 3.546295207868031e-06, "loss": 0.365, "step": 237200 }, { "epoch": 0.9302143332941499, "grad_norm": 0.3493526875972748, "learning_rate": 3.5071117902903494e-06, "loss": 0.3537, "step": 237400 }, { "epoch": 0.9309980016457036, "grad_norm": 0.3632163405418396, "learning_rate": 3.467928372712668e-06, "loss": 0.3847, "step": 237600 }, { "epoch": 0.9317816699972572, "grad_norm": 0.4137295186519623, "learning_rate": 3.4287449551349866e-06, "loss": 0.3678, "step": 237800 }, { "epoch": 0.9325653383488108, "grad_norm": 0.3593025207519531, "learning_rate": 3.3895615375573056e-06, "loss": 0.3354, "step": 238000 }, { "epoch": 0.9333490067003644, "grad_norm": 0.1468370258808136, "learning_rate": 3.3503781199796246e-06, "loss": 0.3538, "step": 238200 }, { "epoch": 0.934132675051918, "grad_norm": 0.35879406332969666, "learning_rate": 3.3111947024019436e-06, "loss": 0.3046, "step": 238400 }, { "epoch": 0.9349163434034716, "grad_norm": 0.12804114818572998, "learning_rate": 3.2720112848242626e-06, "loss": 0.3077, "step": 238600 }, { "epoch": 0.9357000117550253, "grad_norm": 0.19002677500247955, "learning_rate": 3.2328278672465816e-06, "loss": 0.3858, "step": 238800 }, { "epoch": 0.9364836801065789, "grad_norm": 0.28931474685668945, "learning_rate": 3.1936444496689006e-06, "loss": 0.4012, "step": 239000 }, { "epoch": 0.9372673484581325, "grad_norm": 0.3442515730857849, "learning_rate": 3.1544610320912197e-06, "loss": 0.3243, "step": 239200 }, { "epoch": 0.9380510168096862, "grad_norm": 0.20496311783790588, "learning_rate": 3.1152776145135382e-06, "loss": 0.3584, "step": 239400 }, { "epoch": 0.9388346851612398, "grad_norm": 0.464964359998703, "learning_rate": 3.0762901140237453e-06, "loss": 0.3529, "step": 239600 }, { "epoch": 0.9396183535127934, "grad_norm": 0.4943079948425293, "learning_rate": 3.0371066964460643e-06, "loss": 0.3651, "step": 239800 }, { "epoch": 0.940402021864347, "grad_norm": 0.6036735773086548, "learning_rate": 2.997923278868383e-06, "loss": 0.3468, "step": 240000 }, { "epoch": 0.9411856902159006, "grad_norm": 0.19112864136695862, "learning_rate": 2.958739861290702e-06, "loss": 0.3485, "step": 240200 }, { "epoch": 0.9419693585674542, "grad_norm": 1.0933749675750732, "learning_rate": 2.919556443713021e-06, "loss": 0.3304, "step": 240400 }, { "epoch": 0.9427530269190079, "grad_norm": 0.4347619414329529, "learning_rate": 2.88037302613534e-06, "loss": 0.348, "step": 240600 }, { "epoch": 0.9435366952705615, "grad_norm": 0.5953877568244934, "learning_rate": 2.8411896085576585e-06, "loss": 0.3485, "step": 240800 }, { "epoch": 0.9443203636221151, "grad_norm": 0.31309637427330017, "learning_rate": 2.8020061909799776e-06, "loss": 0.34, "step": 241000 }, { "epoch": 0.9451040319736688, "grad_norm": 0.2075364738702774, "learning_rate": 2.762822773402296e-06, "loss": 0.3744, "step": 241200 }, { "epoch": 0.9458877003252224, "grad_norm": 0.44796204566955566, "learning_rate": 2.723639355824615e-06, "loss": 0.3809, "step": 241400 }, { "epoch": 0.946671368676776, "grad_norm": 0.27432700991630554, "learning_rate": 2.6844559382469337e-06, "loss": 0.3625, "step": 241600 }, { "epoch": 0.9474550370283296, "grad_norm": 0.30352187156677246, "learning_rate": 2.6454684377571412e-06, "loss": 0.3401, "step": 241800 }, { "epoch": 0.9482387053798832, "grad_norm": 0.26174160838127136, "learning_rate": 2.6062850201794602e-06, "loss": 0.349, "step": 242000 }, { "epoch": 0.9490223737314368, "grad_norm": 0.3089654743671417, "learning_rate": 2.5671016026017793e-06, "loss": 0.3432, "step": 242200 }, { "epoch": 0.9498060420829905, "grad_norm": 0.4106723368167877, "learning_rate": 2.5279181850240983e-06, "loss": 0.3792, "step": 242400 }, { "epoch": 0.9505897104345441, "grad_norm": 0.36030468344688416, "learning_rate": 2.488734767446417e-06, "loss": 0.332, "step": 242600 }, { "epoch": 0.9513733787860977, "grad_norm": 0.2495603710412979, "learning_rate": 2.4495513498687355e-06, "loss": 0.331, "step": 242800 }, { "epoch": 0.9521570471376514, "grad_norm": 0.350875586271286, "learning_rate": 2.4103679322910545e-06, "loss": 0.373, "step": 243000 }, { "epoch": 0.952940715489205, "grad_norm": 0.4643670916557312, "learning_rate": 2.3711845147133735e-06, "loss": 0.3585, "step": 243200 }, { "epoch": 0.9537243838407586, "grad_norm": 0.469117671251297, "learning_rate": 2.332001097135692e-06, "loss": 0.3419, "step": 243400 }, { "epoch": 0.9545080521923122, "grad_norm": 0.4283735156059265, "learning_rate": 2.2930135966458996e-06, "loss": 0.3472, "step": 243600 }, { "epoch": 0.9552917205438658, "grad_norm": 0.1776352971792221, "learning_rate": 2.2538301790682186e-06, "loss": 0.3552, "step": 243800 }, { "epoch": 0.9560753888954194, "grad_norm": 0.37668749690055847, "learning_rate": 2.2146467614905376e-06, "loss": 0.3701, "step": 244000 }, { "epoch": 0.9568590572469731, "grad_norm": 0.4025615155696869, "learning_rate": 2.175463343912856e-06, "loss": 0.3784, "step": 244200 }, { "epoch": 0.9576427255985267, "grad_norm": 0.16067549586296082, "learning_rate": 2.1362799263351748e-06, "loss": 0.3588, "step": 244400 }, { "epoch": 0.9584263939500803, "grad_norm": 0.7287450432777405, "learning_rate": 2.0970965087574938e-06, "loss": 0.3695, "step": 244600 }, { "epoch": 0.959210062301634, "grad_norm": 0.2135716825723648, "learning_rate": 2.0579130911798128e-06, "loss": 0.3505, "step": 244800 }, { "epoch": 0.9599937306531876, "grad_norm": 0.306046724319458, "learning_rate": 2.018729673602132e-06, "loss": 0.3466, "step": 245000 }, { "epoch": 0.9607773990047412, "grad_norm": 0.2013571411371231, "learning_rate": 1.9795462560244504e-06, "loss": 0.3462, "step": 245200 }, { "epoch": 0.9615610673562948, "grad_norm": 0.5998982787132263, "learning_rate": 1.9403628384467694e-06, "loss": 0.3194, "step": 245400 }, { "epoch": 0.9623447357078484, "grad_norm": 0.5098769068717957, "learning_rate": 1.9011794208690884e-06, "loss": 0.3245, "step": 245600 }, { "epoch": 0.963128404059402, "grad_norm": 0.24053919315338135, "learning_rate": 1.8619960032914072e-06, "loss": 0.3594, "step": 245800 }, { "epoch": 0.9639120724109557, "grad_norm": 0.5044742226600647, "learning_rate": 1.822812585713726e-06, "loss": 0.3663, "step": 246000 }, { "epoch": 0.9646957407625093, "grad_norm": 0.3860742747783661, "learning_rate": 1.7836291681360448e-06, "loss": 0.3708, "step": 246200 }, { "epoch": 0.9654794091140629, "grad_norm": 0.34997010231018066, "learning_rate": 1.744641667646252e-06, "loss": 0.3642, "step": 246400 }, { "epoch": 0.9662630774656166, "grad_norm": 1.3756405115127563, "learning_rate": 1.7054582500685711e-06, "loss": 0.3271, "step": 246600 }, { "epoch": 0.9670467458171702, "grad_norm": 0.17304442822933197, "learning_rate": 1.6662748324908901e-06, "loss": 0.3399, "step": 246800 }, { "epoch": 0.9678304141687238, "grad_norm": 0.518538236618042, "learning_rate": 1.6270914149132087e-06, "loss": 0.3495, "step": 247000 }, { "epoch": 0.9686140825202774, "grad_norm": 0.5009946227073669, "learning_rate": 1.5879079973355277e-06, "loss": 0.3345, "step": 247200 }, { "epoch": 0.969397750871831, "grad_norm": 0.21448875963687897, "learning_rate": 1.5487245797578465e-06, "loss": 0.3489, "step": 247400 }, { "epoch": 0.9701814192233846, "grad_norm": 0.5044354200363159, "learning_rate": 1.5095411621801653e-06, "loss": 0.384, "step": 247600 }, { "epoch": 0.9709650875749383, "grad_norm": 0.5545394420623779, "learning_rate": 1.4703577446024843e-06, "loss": 0.3526, "step": 247800 }, { "epoch": 0.9717487559264919, "grad_norm": 0.2663050889968872, "learning_rate": 1.4311743270248031e-06, "loss": 0.35, "step": 248000 }, { "epoch": 0.9725324242780455, "grad_norm": 0.4767986536026001, "learning_rate": 1.3919909094471221e-06, "loss": 0.3523, "step": 248200 }, { "epoch": 0.9733160926295992, "grad_norm": 0.37418490648269653, "learning_rate": 1.352807491869441e-06, "loss": 0.3667, "step": 248400 }, { "epoch": 0.9740997609811528, "grad_norm": 0.3749473989009857, "learning_rate": 1.3136240742917597e-06, "loss": 0.3152, "step": 248600 }, { "epoch": 0.9748834293327064, "grad_norm": 0.19025427103042603, "learning_rate": 1.2744406567140785e-06, "loss": 0.3018, "step": 248800 }, { "epoch": 0.9756670976842601, "grad_norm": 0.6537559032440186, "learning_rate": 1.2352572391363976e-06, "loss": 0.3401, "step": 249000 }, { "epoch": 0.9764507660358136, "grad_norm": 0.11564496904611588, "learning_rate": 1.1960738215587164e-06, "loss": 0.3723, "step": 249200 }, { "epoch": 0.9772344343873672, "grad_norm": 0.3918033838272095, "learning_rate": 1.1570863210689236e-06, "loss": 0.3539, "step": 249400 }, { "epoch": 0.9780181027389209, "grad_norm": 0.1482819765806198, "learning_rate": 1.1179029034912427e-06, "loss": 0.3133, "step": 249600 }, { "epoch": 0.9788017710904745, "grad_norm": 0.2993972599506378, "learning_rate": 1.0787194859135615e-06, "loss": 0.3208, "step": 249800 }, { "epoch": 0.9795854394420281, "grad_norm": 0.23439355194568634, "learning_rate": 1.0395360683358803e-06, "loss": 0.367, "step": 250000 }, { "epoch": 0.9795854394420281, "eval_loss": 0.36393865942955017, "eval_runtime": 194.4628, "eval_samples_per_second": 13.257, "eval_steps_per_second": 13.257, "step": 250000 }, { "epoch": 0.9803691077935818, "grad_norm": 0.26800838112831116, "learning_rate": 1.000352650758199e-06, "loss": 0.34, "step": 250200 }, { "epoch": 0.9811527761451354, "grad_norm": 0.42382532358169556, "learning_rate": 9.61169233180518e-07, "loss": 0.3437, "step": 250400 }, { "epoch": 0.981936444496689, "grad_norm": 0.2065437138080597, "learning_rate": 9.219858156028369e-07, "loss": 0.3383, "step": 250600 }, { "epoch": 0.9827201128482427, "grad_norm": 0.394820898771286, "learning_rate": 8.828023980251558e-07, "loss": 0.3701, "step": 250800 }, { "epoch": 0.9835037811997962, "grad_norm": 0.32410669326782227, "learning_rate": 8.436189804474746e-07, "loss": 0.3333, "step": 251000 }, { "epoch": 0.9842874495513498, "grad_norm": 0.327781617641449, "learning_rate": 8.044355628697936e-07, "loss": 0.3027, "step": 251200 }, { "epoch": 0.9850711179029035, "grad_norm": 0.14675971865653992, "learning_rate": 7.652521452921124e-07, "loss": 0.3255, "step": 251400 }, { "epoch": 0.9858547862544571, "grad_norm": 0.34487444162368774, "learning_rate": 7.260687277144313e-07, "loss": 0.3152, "step": 251600 }, { "epoch": 0.9866384546060107, "grad_norm": 0.40237823128700256, "learning_rate": 6.868853101367502e-07, "loss": 0.3954, "step": 251800 }, { "epoch": 0.9874221229575644, "grad_norm": 0.3661724925041199, "learning_rate": 6.47701892559069e-07, "loss": 0.3324, "step": 252000 }, { "epoch": 0.988205791309118, "grad_norm": 0.23157495260238647, "learning_rate": 6.087143920692763e-07, "loss": 0.3822, "step": 252200 }, { "epoch": 0.9889894596606716, "grad_norm": 0.25033727288246155, "learning_rate": 5.695309744915952e-07, "loss": 0.3099, "step": 252400 }, { "epoch": 0.9897731280122253, "grad_norm": 0.5575063824653625, "learning_rate": 5.303475569139141e-07, "loss": 0.3529, "step": 252600 }, { "epoch": 0.9905567963637788, "grad_norm": 0.1661965399980545, "learning_rate": 4.91164139336233e-07, "loss": 0.3337, "step": 252800 }, { "epoch": 0.9913404647153324, "grad_norm": 1.4163107872009277, "learning_rate": 4.519807217585518e-07, "loss": 0.3657, "step": 253000 }, { "epoch": 0.9921241330668861, "grad_norm": 0.44902482628822327, "learning_rate": 4.127973041808707e-07, "loss": 0.3146, "step": 253200 }, { "epoch": 0.9929078014184397, "grad_norm": 0.4699545204639435, "learning_rate": 3.7361388660318956e-07, "loss": 0.3339, "step": 253400 }, { "epoch": 0.9936914697699933, "grad_norm": 0.21585559844970703, "learning_rate": 3.344304690255084e-07, "loss": 0.3482, "step": 253600 }, { "epoch": 0.994475138121547, "grad_norm": 0.3905218243598938, "learning_rate": 2.952470514478273e-07, "loss": 0.3359, "step": 253800 }, { "epoch": 0.9952588064731006, "grad_norm": 0.5387653112411499, "learning_rate": 2.560636338701462e-07, "loss": 0.3717, "step": 254000 }, { "epoch": 0.9960424748246542, "grad_norm": 0.3408672511577606, "learning_rate": 2.1688021629246505e-07, "loss": 0.3407, "step": 254200 }, { "epoch": 0.9968261431762079, "grad_norm": 0.49749669432640076, "learning_rate": 1.778927158026723e-07, "loss": 0.4048, "step": 254400 }, { "epoch": 0.9976098115277614, "grad_norm": 0.2871319651603699, "learning_rate": 1.387092982249912e-07, "loss": 0.356, "step": 254600 }, { "epoch": 0.998393479879315, "grad_norm": 0.15367092192173004, "learning_rate": 9.952588064731006e-08, "loss": 0.3845, "step": 254800 }, { "epoch": 0.9991771482308687, "grad_norm": 0.5039834976196289, "learning_rate": 6.034246306962894e-08, "loss": 0.3597, "step": 255000 }, { "epoch": 0.9999608165824223, "grad_norm": 0.4402804374694824, "learning_rate": 2.115904549194781e-08, "loss": 0.3453, "step": 255200 } ], "logging_steps": 200, "max_steps": 255210, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.934330347734958e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }