diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,47480 @@ +{ + "best_metric": 0.9356, + "best_model_checkpoint": "checkpoint/vit-large/checkpoint-11970", + "epoch": 100.0, + "eval_steps": 500, + "global_step": 66500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 2.7020020484924316, + "learning_rate": 9.998496240601504e-06, + "loss": 4.6119, + "step": 10 + }, + { + "epoch": 0.03, + "grad_norm": 2.892003059387207, + "learning_rate": 9.996992481203008e-06, + "loss": 4.5669, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 2.7313311100006104, + "learning_rate": 9.995488721804511e-06, + "loss": 4.5263, + "step": 30 + }, + { + "epoch": 0.06, + "grad_norm": 2.659421443939209, + "learning_rate": 9.993984962406017e-06, + "loss": 4.4729, + "step": 40 + }, + { + "epoch": 0.08, + "grad_norm": 2.846480131149292, + "learning_rate": 9.992481203007518e-06, + "loss": 4.3985, + "step": 50 + }, + { + "epoch": 0.09, + "grad_norm": 3.0880706310272217, + "learning_rate": 9.990977443609024e-06, + "loss": 4.3247, + "step": 60 + }, + { + "epoch": 0.11, + "grad_norm": 3.202064037322998, + "learning_rate": 9.989473684210527e-06, + "loss": 4.233, + "step": 70 + }, + { + "epoch": 0.12, + "grad_norm": 4.243076324462891, + "learning_rate": 9.98796992481203e-06, + "loss": 4.1598, + "step": 80 + }, + { + "epoch": 0.14, + "grad_norm": 3.47965669631958, + "learning_rate": 9.986466165413534e-06, + "loss": 4.0929, + "step": 90 + }, + { + "epoch": 0.15, + "grad_norm": 3.431342363357544, + "learning_rate": 9.984962406015038e-06, + "loss": 3.9735, + "step": 100 + }, + { + "epoch": 0.17, + "grad_norm": 3.915376901626587, + "learning_rate": 9.983458646616541e-06, + "loss": 3.9051, + "step": 110 + }, + { + "epoch": 0.18, + "grad_norm": 3.6235947608947754, + "learning_rate": 9.981954887218046e-06, + "loss": 3.7795, + "step": 120 + }, + { + "epoch": 0.2, + "grad_norm": 4.0705485343933105, + "learning_rate": 9.98045112781955e-06, + "loss": 3.6517, + "step": 130 + }, + { + "epoch": 0.21, + "grad_norm": 3.916447162628174, + "learning_rate": 9.978947368421053e-06, + "loss": 3.5197, + "step": 140 + }, + { + "epoch": 0.23, + "grad_norm": 4.508981227874756, + "learning_rate": 9.977443609022557e-06, + "loss": 3.4313, + "step": 150 + }, + { + "epoch": 0.24, + "grad_norm": 4.508138179779053, + "learning_rate": 9.97593984962406e-06, + "loss": 3.3823, + "step": 160 + }, + { + "epoch": 0.26, + "grad_norm": 5.037680625915527, + "learning_rate": 9.974436090225564e-06, + "loss": 3.2748, + "step": 170 + }, + { + "epoch": 0.27, + "grad_norm": 10.304414749145508, + "learning_rate": 9.97293233082707e-06, + "loss": 3.1054, + "step": 180 + }, + { + "epoch": 0.29, + "grad_norm": 5.1311421394348145, + "learning_rate": 9.971428571428571e-06, + "loss": 3.0703, + "step": 190 + }, + { + "epoch": 0.3, + "grad_norm": 4.8540120124816895, + "learning_rate": 9.969924812030076e-06, + "loss": 2.9789, + "step": 200 + }, + { + "epoch": 0.32, + "grad_norm": 4.647185802459717, + "learning_rate": 9.96842105263158e-06, + "loss": 2.8568, + "step": 210 + }, + { + "epoch": 0.33, + "grad_norm": 6.943347454071045, + "learning_rate": 9.966917293233083e-06, + "loss": 2.8175, + "step": 220 + }, + { + "epoch": 0.35, + "grad_norm": 17.15534210205078, + "learning_rate": 9.965413533834587e-06, + "loss": 2.7049, + "step": 230 + }, + { + "epoch": 0.36, + "grad_norm": 5.327253341674805, + "learning_rate": 9.963909774436092e-06, + "loss": 2.6185, + "step": 240 + }, + { + "epoch": 0.38, + "grad_norm": 5.56367301940918, + "learning_rate": 9.962406015037594e-06, + "loss": 2.4704, + "step": 250 + }, + { + "epoch": 0.39, + "grad_norm": 6.038745880126953, + "learning_rate": 9.960902255639099e-06, + "loss": 2.4862, + "step": 260 + }, + { + "epoch": 0.41, + "grad_norm": 5.661726474761963, + "learning_rate": 9.959398496240603e-06, + "loss": 2.4405, + "step": 270 + }, + { + "epoch": 0.42, + "grad_norm": 5.565981864929199, + "learning_rate": 9.957894736842106e-06, + "loss": 2.3698, + "step": 280 + }, + { + "epoch": 0.44, + "grad_norm": 7.349733829498291, + "learning_rate": 9.95639097744361e-06, + "loss": 2.3338, + "step": 290 + }, + { + "epoch": 0.45, + "grad_norm": 6.526618957519531, + "learning_rate": 9.954887218045113e-06, + "loss": 2.2445, + "step": 300 + }, + { + "epoch": 0.47, + "grad_norm": 5.558746337890625, + "learning_rate": 9.953383458646617e-06, + "loss": 2.202, + "step": 310 + }, + { + "epoch": 0.48, + "grad_norm": 6.4157633781433105, + "learning_rate": 9.951879699248122e-06, + "loss": 2.1578, + "step": 320 + }, + { + "epoch": 0.5, + "grad_norm": 5.635522842407227, + "learning_rate": 9.950375939849625e-06, + "loss": 2.099, + "step": 330 + }, + { + "epoch": 0.51, + "grad_norm": 8.216004371643066, + "learning_rate": 9.948872180451129e-06, + "loss": 2.0472, + "step": 340 + }, + { + "epoch": 0.53, + "grad_norm": 7.348927021026611, + "learning_rate": 9.947368421052632e-06, + "loss": 2.0272, + "step": 350 + }, + { + "epoch": 0.54, + "grad_norm": 6.218992710113525, + "learning_rate": 9.945864661654136e-06, + "loss": 1.9022, + "step": 360 + }, + { + "epoch": 0.56, + "grad_norm": 12.379638671875, + "learning_rate": 9.94436090225564e-06, + "loss": 1.9707, + "step": 370 + }, + { + "epoch": 0.57, + "grad_norm": 7.454248905181885, + "learning_rate": 9.942857142857145e-06, + "loss": 1.9612, + "step": 380 + }, + { + "epoch": 0.59, + "grad_norm": 8.975961685180664, + "learning_rate": 9.941353383458647e-06, + "loss": 1.8519, + "step": 390 + }, + { + "epoch": 0.6, + "grad_norm": 11.839798927307129, + "learning_rate": 9.939849624060152e-06, + "loss": 1.8246, + "step": 400 + }, + { + "epoch": 0.62, + "grad_norm": 21.542709350585938, + "learning_rate": 9.938345864661655e-06, + "loss": 1.7591, + "step": 410 + }, + { + "epoch": 0.63, + "grad_norm": 6.640402793884277, + "learning_rate": 9.936842105263159e-06, + "loss": 1.7803, + "step": 420 + }, + { + "epoch": 0.65, + "grad_norm": 7.312070369720459, + "learning_rate": 9.935338345864662e-06, + "loss": 1.6758, + "step": 430 + }, + { + "epoch": 0.66, + "grad_norm": 6.633362770080566, + "learning_rate": 9.933834586466168e-06, + "loss": 1.7095, + "step": 440 + }, + { + "epoch": 0.68, + "grad_norm": 9.548731803894043, + "learning_rate": 9.93233082706767e-06, + "loss": 1.6934, + "step": 450 + }, + { + "epoch": 0.69, + "grad_norm": 9.280405044555664, + "learning_rate": 9.930827067669175e-06, + "loss": 1.74, + "step": 460 + }, + { + "epoch": 0.71, + "grad_norm": 4.683008670806885, + "learning_rate": 9.929323308270678e-06, + "loss": 1.5803, + "step": 470 + }, + { + "epoch": 0.72, + "grad_norm": 5.887816905975342, + "learning_rate": 9.927819548872182e-06, + "loss": 1.6039, + "step": 480 + }, + { + "epoch": 0.74, + "grad_norm": 7.674352645874023, + "learning_rate": 9.926315789473685e-06, + "loss": 1.6009, + "step": 490 + }, + { + "epoch": 0.75, + "grad_norm": 5.968412399291992, + "learning_rate": 9.924812030075189e-06, + "loss": 1.5391, + "step": 500 + }, + { + "epoch": 0.77, + "grad_norm": 5.869007587432861, + "learning_rate": 9.923308270676692e-06, + "loss": 1.5226, + "step": 510 + }, + { + "epoch": 0.78, + "grad_norm": 7.70728874206543, + "learning_rate": 9.921804511278196e-06, + "loss": 1.6173, + "step": 520 + }, + { + "epoch": 0.8, + "grad_norm": 7.701499938964844, + "learning_rate": 9.920300751879701e-06, + "loss": 1.5042, + "step": 530 + }, + { + "epoch": 0.81, + "grad_norm": 6.880636692047119, + "learning_rate": 9.918796992481203e-06, + "loss": 1.456, + "step": 540 + }, + { + "epoch": 0.83, + "grad_norm": 7.021149635314941, + "learning_rate": 9.917293233082708e-06, + "loss": 1.5698, + "step": 550 + }, + { + "epoch": 0.84, + "grad_norm": 7.1271138191223145, + "learning_rate": 9.915789473684211e-06, + "loss": 1.3838, + "step": 560 + }, + { + "epoch": 0.86, + "grad_norm": 7.8424482345581055, + "learning_rate": 9.914285714285715e-06, + "loss": 1.3997, + "step": 570 + }, + { + "epoch": 0.87, + "grad_norm": 8.099345207214355, + "learning_rate": 9.912781954887218e-06, + "loss": 1.4359, + "step": 580 + }, + { + "epoch": 0.89, + "grad_norm": 7.261110782623291, + "learning_rate": 9.911278195488722e-06, + "loss": 1.454, + "step": 590 + }, + { + "epoch": 0.9, + "grad_norm": 6.30597448348999, + "learning_rate": 9.909774436090226e-06, + "loss": 1.3907, + "step": 600 + }, + { + "epoch": 0.92, + "grad_norm": 7.466990947723389, + "learning_rate": 9.90827067669173e-06, + "loss": 1.329, + "step": 610 + }, + { + "epoch": 0.93, + "grad_norm": 7.6469316482543945, + "learning_rate": 9.906766917293234e-06, + "loss": 1.2848, + "step": 620 + }, + { + "epoch": 0.95, + "grad_norm": 7.2480244636535645, + "learning_rate": 9.905263157894738e-06, + "loss": 1.4246, + "step": 630 + }, + { + "epoch": 0.96, + "grad_norm": 7.738135814666748, + "learning_rate": 9.903759398496241e-06, + "loss": 1.3669, + "step": 640 + }, + { + "epoch": 0.98, + "grad_norm": 8.298168182373047, + "learning_rate": 9.902255639097745e-06, + "loss": 1.2883, + "step": 650 + }, + { + "epoch": 0.99, + "grad_norm": 6.747694969177246, + "learning_rate": 9.900751879699248e-06, + "loss": 1.2884, + "step": 660 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.8834, + "eval_loss": 0.8751662373542786, + "eval_runtime": 85.9745, + "eval_samples_per_second": 116.314, + "eval_steps_per_second": 0.465, + "step": 665 + }, + { + "epoch": 1.01, + "grad_norm": 7.908563137054443, + "learning_rate": 9.899248120300754e-06, + "loss": 1.2429, + "step": 670 + }, + { + "epoch": 1.02, + "grad_norm": 7.439415454864502, + "learning_rate": 9.897744360902255e-06, + "loss": 1.2716, + "step": 680 + }, + { + "epoch": 1.04, + "grad_norm": 7.2306599617004395, + "learning_rate": 9.89624060150376e-06, + "loss": 1.1676, + "step": 690 + }, + { + "epoch": 1.05, + "grad_norm": 6.287716388702393, + "learning_rate": 9.894736842105264e-06, + "loss": 1.2212, + "step": 700 + }, + { + "epoch": 1.07, + "grad_norm": 10.363208770751953, + "learning_rate": 9.893233082706768e-06, + "loss": 1.259, + "step": 710 + }, + { + "epoch": 1.08, + "grad_norm": 6.556034564971924, + "learning_rate": 9.891729323308271e-06, + "loss": 1.1383, + "step": 720 + }, + { + "epoch": 1.1, + "grad_norm": 5.640949726104736, + "learning_rate": 9.890225563909776e-06, + "loss": 1.2454, + "step": 730 + }, + { + "epoch": 1.11, + "grad_norm": 7.140594959259033, + "learning_rate": 9.888721804511278e-06, + "loss": 1.1363, + "step": 740 + }, + { + "epoch": 1.13, + "grad_norm": 7.780942440032959, + "learning_rate": 9.887218045112783e-06, + "loss": 1.1806, + "step": 750 + }, + { + "epoch": 1.14, + "grad_norm": 6.301698684692383, + "learning_rate": 9.885714285714287e-06, + "loss": 1.1416, + "step": 760 + }, + { + "epoch": 1.16, + "grad_norm": 7.256317138671875, + "learning_rate": 9.88421052631579e-06, + "loss": 1.1947, + "step": 770 + }, + { + "epoch": 1.17, + "grad_norm": 9.818164825439453, + "learning_rate": 9.882706766917294e-06, + "loss": 1.1865, + "step": 780 + }, + { + "epoch": 1.19, + "grad_norm": 11.159587860107422, + "learning_rate": 9.881203007518797e-06, + "loss": 1.0911, + "step": 790 + }, + { + "epoch": 1.2, + "grad_norm": 12.432153701782227, + "learning_rate": 9.879699248120301e-06, + "loss": 1.1486, + "step": 800 + }, + { + "epoch": 1.22, + "grad_norm": 9.031283378601074, + "learning_rate": 9.878195488721806e-06, + "loss": 1.0838, + "step": 810 + }, + { + "epoch": 1.23, + "grad_norm": 7.850508689880371, + "learning_rate": 9.87669172932331e-06, + "loss": 1.0953, + "step": 820 + }, + { + "epoch": 1.25, + "grad_norm": 6.112914562225342, + "learning_rate": 9.875187969924813e-06, + "loss": 1.1159, + "step": 830 + }, + { + "epoch": 1.26, + "grad_norm": 6.945899486541748, + "learning_rate": 9.873684210526317e-06, + "loss": 0.9613, + "step": 840 + }, + { + "epoch": 1.28, + "grad_norm": 7.430254936218262, + "learning_rate": 9.87218045112782e-06, + "loss": 1.086, + "step": 850 + }, + { + "epoch": 1.29, + "grad_norm": 6.00394868850708, + "learning_rate": 9.870676691729324e-06, + "loss": 1.0873, + "step": 860 + }, + { + "epoch": 1.31, + "grad_norm": 5.864083290100098, + "learning_rate": 9.869172932330829e-06, + "loss": 1.055, + "step": 870 + }, + { + "epoch": 1.32, + "grad_norm": 8.37553882598877, + "learning_rate": 9.86766917293233e-06, + "loss": 1.0894, + "step": 880 + }, + { + "epoch": 1.34, + "grad_norm": 8.302824974060059, + "learning_rate": 9.866165413533836e-06, + "loss": 1.1037, + "step": 890 + }, + { + "epoch": 1.35, + "grad_norm": 6.968749046325684, + "learning_rate": 9.86466165413534e-06, + "loss": 1.0071, + "step": 900 + }, + { + "epoch": 1.37, + "grad_norm": 5.924696922302246, + "learning_rate": 9.863157894736843e-06, + "loss": 1.0498, + "step": 910 + }, + { + "epoch": 1.38, + "grad_norm": 7.433680534362793, + "learning_rate": 9.861654135338347e-06, + "loss": 1.0456, + "step": 920 + }, + { + "epoch": 1.4, + "grad_norm": 7.614802360534668, + "learning_rate": 9.86015037593985e-06, + "loss": 0.9659, + "step": 930 + }, + { + "epoch": 1.41, + "grad_norm": 9.205560684204102, + "learning_rate": 9.858646616541354e-06, + "loss": 0.9692, + "step": 940 + }, + { + "epoch": 1.43, + "grad_norm": 6.741930961608887, + "learning_rate": 9.857142857142859e-06, + "loss": 1.0157, + "step": 950 + }, + { + "epoch": 1.44, + "grad_norm": 8.176901817321777, + "learning_rate": 9.855639097744362e-06, + "loss": 1.0014, + "step": 960 + }, + { + "epoch": 1.46, + "grad_norm": 5.335792541503906, + "learning_rate": 9.854135338345866e-06, + "loss": 0.9361, + "step": 970 + }, + { + "epoch": 1.47, + "grad_norm": 6.488396644592285, + "learning_rate": 9.85263157894737e-06, + "loss": 1.0858, + "step": 980 + }, + { + "epoch": 1.49, + "grad_norm": 6.725528240203857, + "learning_rate": 9.851127819548873e-06, + "loss": 1.0027, + "step": 990 + }, + { + "epoch": 1.5, + "grad_norm": 10.740559577941895, + "learning_rate": 9.849624060150376e-06, + "loss": 1.0121, + "step": 1000 + }, + { + "epoch": 1.52, + "grad_norm": 8.048620223999023, + "learning_rate": 9.84812030075188e-06, + "loss": 0.967, + "step": 1010 + }, + { + "epoch": 1.53, + "grad_norm": 7.225861072540283, + "learning_rate": 9.846616541353383e-06, + "loss": 1.0329, + "step": 1020 + }, + { + "epoch": 1.55, + "grad_norm": 7.12366247177124, + "learning_rate": 9.845112781954887e-06, + "loss": 1.0117, + "step": 1030 + }, + { + "epoch": 1.56, + "grad_norm": 7.5486273765563965, + "learning_rate": 9.843609022556392e-06, + "loss": 0.969, + "step": 1040 + }, + { + "epoch": 1.58, + "grad_norm": 9.591785430908203, + "learning_rate": 9.842105263157896e-06, + "loss": 0.9415, + "step": 1050 + }, + { + "epoch": 1.59, + "grad_norm": 7.203570365905762, + "learning_rate": 9.8406015037594e-06, + "loss": 1.0409, + "step": 1060 + }, + { + "epoch": 1.61, + "grad_norm": 9.412242889404297, + "learning_rate": 9.839097744360903e-06, + "loss": 1.0352, + "step": 1070 + }, + { + "epoch": 1.62, + "grad_norm": 9.688934326171875, + "learning_rate": 9.837593984962406e-06, + "loss": 1.0524, + "step": 1080 + }, + { + "epoch": 1.64, + "grad_norm": 6.9523844718933105, + "learning_rate": 9.83609022556391e-06, + "loss": 0.9546, + "step": 1090 + }, + { + "epoch": 1.65, + "grad_norm": 8.400866508483887, + "learning_rate": 9.834586466165415e-06, + "loss": 0.9361, + "step": 1100 + }, + { + "epoch": 1.67, + "grad_norm": 8.09070110321045, + "learning_rate": 9.833082706766917e-06, + "loss": 0.9797, + "step": 1110 + }, + { + "epoch": 1.68, + "grad_norm": 6.927423000335693, + "learning_rate": 9.831578947368422e-06, + "loss": 0.9439, + "step": 1120 + }, + { + "epoch": 1.7, + "grad_norm": 7.291294574737549, + "learning_rate": 9.830075187969926e-06, + "loss": 0.9753, + "step": 1130 + }, + { + "epoch": 1.71, + "grad_norm": 7.919008731842041, + "learning_rate": 9.828571428571429e-06, + "loss": 0.9697, + "step": 1140 + }, + { + "epoch": 1.73, + "grad_norm": 8.660476684570312, + "learning_rate": 9.827067669172933e-06, + "loss": 0.8882, + "step": 1150 + }, + { + "epoch": 1.74, + "grad_norm": 8.102679252624512, + "learning_rate": 9.825563909774438e-06, + "loss": 0.8959, + "step": 1160 + }, + { + "epoch": 1.76, + "grad_norm": 5.902896404266357, + "learning_rate": 9.82406015037594e-06, + "loss": 0.887, + "step": 1170 + }, + { + "epoch": 1.77, + "grad_norm": 6.6904778480529785, + "learning_rate": 9.822556390977445e-06, + "loss": 0.9352, + "step": 1180 + }, + { + "epoch": 1.79, + "grad_norm": 6.770270824432373, + "learning_rate": 9.821052631578948e-06, + "loss": 0.8916, + "step": 1190 + }, + { + "epoch": 1.8, + "grad_norm": 8.353099822998047, + "learning_rate": 9.819548872180452e-06, + "loss": 0.9523, + "step": 1200 + }, + { + "epoch": 1.82, + "grad_norm": 6.385773658752441, + "learning_rate": 9.818045112781955e-06, + "loss": 0.9175, + "step": 1210 + }, + { + "epoch": 1.83, + "grad_norm": 13.28996467590332, + "learning_rate": 9.816541353383459e-06, + "loss": 0.9375, + "step": 1220 + }, + { + "epoch": 1.85, + "grad_norm": 9.252169609069824, + "learning_rate": 9.815037593984962e-06, + "loss": 1.0589, + "step": 1230 + }, + { + "epoch": 1.86, + "grad_norm": 6.009567737579346, + "learning_rate": 9.813533834586468e-06, + "loss": 0.9608, + "step": 1240 + }, + { + "epoch": 1.88, + "grad_norm": 4.863635063171387, + "learning_rate": 9.812030075187971e-06, + "loss": 0.9724, + "step": 1250 + }, + { + "epoch": 1.89, + "grad_norm": 10.548372268676758, + "learning_rate": 9.810526315789475e-06, + "loss": 0.9651, + "step": 1260 + }, + { + "epoch": 1.91, + "grad_norm": 8.277862548828125, + "learning_rate": 9.809022556390978e-06, + "loss": 0.8868, + "step": 1270 + }, + { + "epoch": 1.92, + "grad_norm": 6.657036304473877, + "learning_rate": 9.807518796992482e-06, + "loss": 0.9133, + "step": 1280 + }, + { + "epoch": 1.94, + "grad_norm": 7.065949440002441, + "learning_rate": 9.806015037593985e-06, + "loss": 0.9602, + "step": 1290 + }, + { + "epoch": 1.95, + "grad_norm": 9.187036514282227, + "learning_rate": 9.80451127819549e-06, + "loss": 0.9626, + "step": 1300 + }, + { + "epoch": 1.97, + "grad_norm": 7.014963150024414, + "learning_rate": 9.803007518796992e-06, + "loss": 0.8653, + "step": 1310 + }, + { + "epoch": 1.98, + "grad_norm": 8.665754318237305, + "learning_rate": 9.801503759398498e-06, + "loss": 0.8795, + "step": 1320 + }, + { + "epoch": 2.0, + "grad_norm": 8.92686939239502, + "learning_rate": 9.800000000000001e-06, + "loss": 0.7958, + "step": 1330 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.9142, + "eval_loss": 0.4723776876926422, + "eval_runtime": 84.6568, + "eval_samples_per_second": 118.124, + "eval_steps_per_second": 0.472, + "step": 1330 + }, + { + "epoch": 2.02, + "grad_norm": 6.881080627441406, + "learning_rate": 9.798496240601505e-06, + "loss": 0.9416, + "step": 1340 + }, + { + "epoch": 2.03, + "grad_norm": 5.449582099914551, + "learning_rate": 9.796992481203008e-06, + "loss": 0.8897, + "step": 1350 + }, + { + "epoch": 2.05, + "grad_norm": 6.535237789154053, + "learning_rate": 9.795488721804513e-06, + "loss": 0.8241, + "step": 1360 + }, + { + "epoch": 2.06, + "grad_norm": 7.858671188354492, + "learning_rate": 9.793984962406015e-06, + "loss": 0.7834, + "step": 1370 + }, + { + "epoch": 2.08, + "grad_norm": 6.7106852531433105, + "learning_rate": 9.79248120300752e-06, + "loss": 0.836, + "step": 1380 + }, + { + "epoch": 2.09, + "grad_norm": 6.440729141235352, + "learning_rate": 9.790977443609024e-06, + "loss": 0.775, + "step": 1390 + }, + { + "epoch": 2.11, + "grad_norm": 8.935519218444824, + "learning_rate": 9.789473684210527e-06, + "loss": 0.7826, + "step": 1400 + }, + { + "epoch": 2.12, + "grad_norm": 7.244870662689209, + "learning_rate": 9.787969924812031e-06, + "loss": 0.8917, + "step": 1410 + }, + { + "epoch": 2.14, + "grad_norm": 9.125387191772461, + "learning_rate": 9.786466165413534e-06, + "loss": 0.8167, + "step": 1420 + }, + { + "epoch": 2.15, + "grad_norm": 6.8634114265441895, + "learning_rate": 9.784962406015038e-06, + "loss": 0.8052, + "step": 1430 + }, + { + "epoch": 2.17, + "grad_norm": 7.519056797027588, + "learning_rate": 9.783458646616543e-06, + "loss": 0.8204, + "step": 1440 + }, + { + "epoch": 2.18, + "grad_norm": 7.861953258514404, + "learning_rate": 9.781954887218047e-06, + "loss": 0.8404, + "step": 1450 + }, + { + "epoch": 2.2, + "grad_norm": 10.940001487731934, + "learning_rate": 9.78045112781955e-06, + "loss": 0.7934, + "step": 1460 + }, + { + "epoch": 2.21, + "grad_norm": 5.963690757751465, + "learning_rate": 9.778947368421054e-06, + "loss": 0.7549, + "step": 1470 + }, + { + "epoch": 2.23, + "grad_norm": 6.734865665435791, + "learning_rate": 9.777443609022557e-06, + "loss": 0.8112, + "step": 1480 + }, + { + "epoch": 2.24, + "grad_norm": 7.971401214599609, + "learning_rate": 9.77593984962406e-06, + "loss": 0.8323, + "step": 1490 + }, + { + "epoch": 2.26, + "grad_norm": 9.728713989257812, + "learning_rate": 9.774436090225564e-06, + "loss": 0.7441, + "step": 1500 + }, + { + "epoch": 2.27, + "grad_norm": 9.506553649902344, + "learning_rate": 9.772932330827068e-06, + "loss": 0.8018, + "step": 1510 + }, + { + "epoch": 2.29, + "grad_norm": 7.3224663734436035, + "learning_rate": 9.771428571428571e-06, + "loss": 0.7869, + "step": 1520 + }, + { + "epoch": 2.3, + "grad_norm": 7.251104831695557, + "learning_rate": 9.769924812030077e-06, + "loss": 0.8472, + "step": 1530 + }, + { + "epoch": 2.32, + "grad_norm": 7.3015055656433105, + "learning_rate": 9.76842105263158e-06, + "loss": 0.7224, + "step": 1540 + }, + { + "epoch": 2.33, + "grad_norm": 9.096901893615723, + "learning_rate": 9.766917293233084e-06, + "loss": 0.8195, + "step": 1550 + }, + { + "epoch": 2.35, + "grad_norm": 7.366261959075928, + "learning_rate": 9.765413533834587e-06, + "loss": 0.8322, + "step": 1560 + }, + { + "epoch": 2.36, + "grad_norm": 9.955854415893555, + "learning_rate": 9.76390977443609e-06, + "loss": 0.8315, + "step": 1570 + }, + { + "epoch": 2.38, + "grad_norm": 6.852784633636475, + "learning_rate": 9.762406015037594e-06, + "loss": 0.7113, + "step": 1580 + }, + { + "epoch": 2.39, + "grad_norm": 9.9766263961792, + "learning_rate": 9.7609022556391e-06, + "loss": 0.8024, + "step": 1590 + }, + { + "epoch": 2.41, + "grad_norm": 9.095175743103027, + "learning_rate": 9.759398496240601e-06, + "loss": 0.8774, + "step": 1600 + }, + { + "epoch": 2.42, + "grad_norm": 5.913175106048584, + "learning_rate": 9.757894736842106e-06, + "loss": 0.7608, + "step": 1610 + }, + { + "epoch": 2.44, + "grad_norm": 9.544361114501953, + "learning_rate": 9.75639097744361e-06, + "loss": 0.7984, + "step": 1620 + }, + { + "epoch": 2.45, + "grad_norm": 6.991225242614746, + "learning_rate": 9.754887218045113e-06, + "loss": 0.7942, + "step": 1630 + }, + { + "epoch": 2.47, + "grad_norm": 7.531531810760498, + "learning_rate": 9.753383458646617e-06, + "loss": 0.8005, + "step": 1640 + }, + { + "epoch": 2.48, + "grad_norm": 5.945763111114502, + "learning_rate": 9.751879699248122e-06, + "loss": 0.7673, + "step": 1650 + }, + { + "epoch": 2.5, + "grad_norm": 8.382121086120605, + "learning_rate": 9.750375939849624e-06, + "loss": 0.7966, + "step": 1660 + }, + { + "epoch": 2.51, + "grad_norm": 5.387685775756836, + "learning_rate": 9.74887218045113e-06, + "loss": 0.7892, + "step": 1670 + }, + { + "epoch": 2.53, + "grad_norm": 7.867427349090576, + "learning_rate": 9.747368421052633e-06, + "loss": 0.8002, + "step": 1680 + }, + { + "epoch": 2.54, + "grad_norm": 7.549880027770996, + "learning_rate": 9.745864661654136e-06, + "loss": 0.8879, + "step": 1690 + }, + { + "epoch": 2.56, + "grad_norm": 7.67978572845459, + "learning_rate": 9.74436090225564e-06, + "loss": 0.7849, + "step": 1700 + }, + { + "epoch": 2.57, + "grad_norm": 8.076873779296875, + "learning_rate": 9.742857142857143e-06, + "loss": 0.6503, + "step": 1710 + }, + { + "epoch": 2.59, + "grad_norm": 10.748533248901367, + "learning_rate": 9.741353383458647e-06, + "loss": 0.7868, + "step": 1720 + }, + { + "epoch": 2.6, + "grad_norm": 8.428750991821289, + "learning_rate": 9.739849624060152e-06, + "loss": 0.8195, + "step": 1730 + }, + { + "epoch": 2.62, + "grad_norm": 7.678562164306641, + "learning_rate": 9.738345864661655e-06, + "loss": 0.8428, + "step": 1740 + }, + { + "epoch": 2.63, + "grad_norm": 7.171645164489746, + "learning_rate": 9.736842105263159e-06, + "loss": 0.809, + "step": 1750 + }, + { + "epoch": 2.65, + "grad_norm": 7.041049003601074, + "learning_rate": 9.735338345864663e-06, + "loss": 0.7417, + "step": 1760 + }, + { + "epoch": 2.66, + "grad_norm": 9.66743278503418, + "learning_rate": 9.733834586466166e-06, + "loss": 0.7952, + "step": 1770 + }, + { + "epoch": 2.68, + "grad_norm": 9.864920616149902, + "learning_rate": 9.73233082706767e-06, + "loss": 0.8427, + "step": 1780 + }, + { + "epoch": 2.69, + "grad_norm": 10.242929458618164, + "learning_rate": 9.730827067669175e-06, + "loss": 0.7599, + "step": 1790 + }, + { + "epoch": 2.71, + "grad_norm": 8.138999938964844, + "learning_rate": 9.729323308270677e-06, + "loss": 0.7517, + "step": 1800 + }, + { + "epoch": 2.72, + "grad_norm": 7.668764114379883, + "learning_rate": 9.727819548872182e-06, + "loss": 0.7766, + "step": 1810 + }, + { + "epoch": 2.74, + "grad_norm": 6.978646278381348, + "learning_rate": 9.726315789473685e-06, + "loss": 0.7323, + "step": 1820 + }, + { + "epoch": 2.75, + "grad_norm": 8.794787406921387, + "learning_rate": 9.724812030075189e-06, + "loss": 0.8644, + "step": 1830 + }, + { + "epoch": 2.77, + "grad_norm": 10.154306411743164, + "learning_rate": 9.723308270676692e-06, + "loss": 0.8235, + "step": 1840 + }, + { + "epoch": 2.78, + "grad_norm": 9.513362884521484, + "learning_rate": 9.721804511278196e-06, + "loss": 0.792, + "step": 1850 + }, + { + "epoch": 2.8, + "grad_norm": 4.891651630401611, + "learning_rate": 9.7203007518797e-06, + "loss": 0.7343, + "step": 1860 + }, + { + "epoch": 2.81, + "grad_norm": 6.595260143280029, + "learning_rate": 9.718796992481205e-06, + "loss": 0.6451, + "step": 1870 + }, + { + "epoch": 2.83, + "grad_norm": 11.115670204162598, + "learning_rate": 9.717293233082708e-06, + "loss": 0.7841, + "step": 1880 + }, + { + "epoch": 2.84, + "grad_norm": 7.82785701751709, + "learning_rate": 9.715789473684212e-06, + "loss": 0.8396, + "step": 1890 + }, + { + "epoch": 2.86, + "grad_norm": 5.41937780380249, + "learning_rate": 9.714285714285715e-06, + "loss": 0.772, + "step": 1900 + }, + { + "epoch": 2.87, + "grad_norm": 8.092954635620117, + "learning_rate": 9.712781954887219e-06, + "loss": 0.7124, + "step": 1910 + }, + { + "epoch": 2.89, + "grad_norm": 4.913546562194824, + "learning_rate": 9.711278195488722e-06, + "loss": 0.7824, + "step": 1920 + }, + { + "epoch": 2.9, + "grad_norm": 6.090660572052002, + "learning_rate": 9.709774436090227e-06, + "loss": 0.7911, + "step": 1930 + }, + { + "epoch": 2.92, + "grad_norm": 5.547027111053467, + "learning_rate": 9.70827067669173e-06, + "loss": 0.7818, + "step": 1940 + }, + { + "epoch": 2.93, + "grad_norm": 8.583475112915039, + "learning_rate": 9.706766917293234e-06, + "loss": 0.7272, + "step": 1950 + }, + { + "epoch": 2.95, + "grad_norm": 8.129578590393066, + "learning_rate": 9.705263157894738e-06, + "loss": 0.7352, + "step": 1960 + }, + { + "epoch": 2.96, + "grad_norm": 9.513014793395996, + "learning_rate": 9.703759398496242e-06, + "loss": 0.8126, + "step": 1970 + }, + { + "epoch": 2.98, + "grad_norm": 5.819597244262695, + "learning_rate": 9.702255639097745e-06, + "loss": 0.7599, + "step": 1980 + }, + { + "epoch": 2.99, + "grad_norm": 7.391184329986572, + "learning_rate": 9.700751879699249e-06, + "loss": 0.743, + "step": 1990 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.9207, + "eval_loss": 0.3750178813934326, + "eval_runtime": 84.5377, + "eval_samples_per_second": 118.29, + "eval_steps_per_second": 0.473, + "step": 1995 + }, + { + "epoch": 3.01, + "grad_norm": 6.347775459289551, + "learning_rate": 9.699248120300752e-06, + "loss": 0.6685, + "step": 2000 + }, + { + "epoch": 3.02, + "grad_norm": 6.109332084655762, + "learning_rate": 9.697744360902256e-06, + "loss": 0.7398, + "step": 2010 + }, + { + "epoch": 3.04, + "grad_norm": 4.770040512084961, + "learning_rate": 9.69624060150376e-06, + "loss": 0.7587, + "step": 2020 + }, + { + "epoch": 3.05, + "grad_norm": 7.322962760925293, + "learning_rate": 9.694736842105263e-06, + "loss": 0.7917, + "step": 2030 + }, + { + "epoch": 3.07, + "grad_norm": 9.097600936889648, + "learning_rate": 9.693233082706768e-06, + "loss": 0.6967, + "step": 2040 + }, + { + "epoch": 3.08, + "grad_norm": 10.507075309753418, + "learning_rate": 9.691729323308271e-06, + "loss": 0.7229, + "step": 2050 + }, + { + "epoch": 3.1, + "grad_norm": 6.249164581298828, + "learning_rate": 9.690225563909775e-06, + "loss": 0.6491, + "step": 2060 + }, + { + "epoch": 3.11, + "grad_norm": 7.525278568267822, + "learning_rate": 9.688721804511278e-06, + "loss": 0.6476, + "step": 2070 + }, + { + "epoch": 3.13, + "grad_norm": 9.564391136169434, + "learning_rate": 9.687218045112784e-06, + "loss": 0.7133, + "step": 2080 + }, + { + "epoch": 3.14, + "grad_norm": 6.4053955078125, + "learning_rate": 9.685714285714285e-06, + "loss": 0.8012, + "step": 2090 + }, + { + "epoch": 3.16, + "grad_norm": 6.6741251945495605, + "learning_rate": 9.68421052631579e-06, + "loss": 0.7507, + "step": 2100 + }, + { + "epoch": 3.17, + "grad_norm": 12.398811340332031, + "learning_rate": 9.682706766917294e-06, + "loss": 0.7138, + "step": 2110 + }, + { + "epoch": 3.19, + "grad_norm": 8.505881309509277, + "learning_rate": 9.681203007518798e-06, + "loss": 0.7136, + "step": 2120 + }, + { + "epoch": 3.2, + "grad_norm": 5.51025915145874, + "learning_rate": 9.679699248120301e-06, + "loss": 0.6037, + "step": 2130 + }, + { + "epoch": 3.22, + "grad_norm": 6.39398193359375, + "learning_rate": 9.678195488721805e-06, + "loss": 0.6799, + "step": 2140 + }, + { + "epoch": 3.23, + "grad_norm": 6.508944511413574, + "learning_rate": 9.676691729323308e-06, + "loss": 0.6587, + "step": 2150 + }, + { + "epoch": 3.25, + "grad_norm": 7.027959823608398, + "learning_rate": 9.675187969924813e-06, + "loss": 0.7096, + "step": 2160 + }, + { + "epoch": 3.26, + "grad_norm": 8.690469741821289, + "learning_rate": 9.673684210526317e-06, + "loss": 0.6169, + "step": 2170 + }, + { + "epoch": 3.28, + "grad_norm": 9.489269256591797, + "learning_rate": 9.67218045112782e-06, + "loss": 0.6643, + "step": 2180 + }, + { + "epoch": 3.29, + "grad_norm": 12.934528350830078, + "learning_rate": 9.670676691729324e-06, + "loss": 0.7485, + "step": 2190 + }, + { + "epoch": 3.31, + "grad_norm": 9.072815895080566, + "learning_rate": 9.669172932330828e-06, + "loss": 0.704, + "step": 2200 + }, + { + "epoch": 3.32, + "grad_norm": 7.934593200683594, + "learning_rate": 9.667669172932331e-06, + "loss": 0.7134, + "step": 2210 + }, + { + "epoch": 3.34, + "grad_norm": 6.846796989440918, + "learning_rate": 9.666165413533836e-06, + "loss": 0.7372, + "step": 2220 + }, + { + "epoch": 3.35, + "grad_norm": 6.8362250328063965, + "learning_rate": 9.664661654135338e-06, + "loss": 0.6854, + "step": 2230 + }, + { + "epoch": 3.37, + "grad_norm": 8.184903144836426, + "learning_rate": 9.663157894736843e-06, + "loss": 0.6662, + "step": 2240 + }, + { + "epoch": 3.38, + "grad_norm": 8.972626686096191, + "learning_rate": 9.661654135338347e-06, + "loss": 0.7895, + "step": 2250 + }, + { + "epoch": 3.4, + "grad_norm": 6.524502754211426, + "learning_rate": 9.66015037593985e-06, + "loss": 0.8453, + "step": 2260 + }, + { + "epoch": 3.41, + "grad_norm": 6.215096950531006, + "learning_rate": 9.658646616541354e-06, + "loss": 0.716, + "step": 2270 + }, + { + "epoch": 3.43, + "grad_norm": 6.9516401290893555, + "learning_rate": 9.657142857142859e-06, + "loss": 0.7453, + "step": 2280 + }, + { + "epoch": 3.44, + "grad_norm": 9.131119728088379, + "learning_rate": 9.655639097744361e-06, + "loss": 0.6474, + "step": 2290 + }, + { + "epoch": 3.46, + "grad_norm": 7.063914775848389, + "learning_rate": 9.654135338345866e-06, + "loss": 0.6535, + "step": 2300 + }, + { + "epoch": 3.47, + "grad_norm": 9.410021781921387, + "learning_rate": 9.65263157894737e-06, + "loss": 0.708, + "step": 2310 + }, + { + "epoch": 3.49, + "grad_norm": 7.179042816162109, + "learning_rate": 9.651127819548873e-06, + "loss": 0.7098, + "step": 2320 + }, + { + "epoch": 3.5, + "grad_norm": 8.097248077392578, + "learning_rate": 9.649624060150377e-06, + "loss": 0.7052, + "step": 2330 + }, + { + "epoch": 3.52, + "grad_norm": 4.258429050445557, + "learning_rate": 9.64812030075188e-06, + "loss": 0.6803, + "step": 2340 + }, + { + "epoch": 3.53, + "grad_norm": 7.451633930206299, + "learning_rate": 9.646616541353384e-06, + "loss": 0.6569, + "step": 2350 + }, + { + "epoch": 3.55, + "grad_norm": 9.551535606384277, + "learning_rate": 9.645112781954889e-06, + "loss": 0.6411, + "step": 2360 + }, + { + "epoch": 3.56, + "grad_norm": 6.456385612487793, + "learning_rate": 9.643609022556392e-06, + "loss": 0.5938, + "step": 2370 + }, + { + "epoch": 3.58, + "grad_norm": 5.947078704833984, + "learning_rate": 9.642105263157896e-06, + "loss": 0.6175, + "step": 2380 + }, + { + "epoch": 3.59, + "grad_norm": 5.279054164886475, + "learning_rate": 9.6406015037594e-06, + "loss": 0.6788, + "step": 2390 + }, + { + "epoch": 3.61, + "grad_norm": 7.593316555023193, + "learning_rate": 9.639097744360903e-06, + "loss": 0.6255, + "step": 2400 + }, + { + "epoch": 3.62, + "grad_norm": 7.478080749511719, + "learning_rate": 9.637593984962407e-06, + "loss": 0.7908, + "step": 2410 + }, + { + "epoch": 3.64, + "grad_norm": 9.64027214050293, + "learning_rate": 9.636090225563912e-06, + "loss": 0.7241, + "step": 2420 + }, + { + "epoch": 3.65, + "grad_norm": 14.006696701049805, + "learning_rate": 9.634586466165414e-06, + "loss": 0.686, + "step": 2430 + }, + { + "epoch": 3.67, + "grad_norm": 5.0339789390563965, + "learning_rate": 9.633082706766919e-06, + "loss": 0.7082, + "step": 2440 + }, + { + "epoch": 3.68, + "grad_norm": 8.148447036743164, + "learning_rate": 9.631578947368422e-06, + "loss": 0.6859, + "step": 2450 + }, + { + "epoch": 3.7, + "grad_norm": 7.614720344543457, + "learning_rate": 9.630075187969926e-06, + "loss": 0.7117, + "step": 2460 + }, + { + "epoch": 3.71, + "grad_norm": 9.017003059387207, + "learning_rate": 9.62857142857143e-06, + "loss": 0.6505, + "step": 2470 + }, + { + "epoch": 3.73, + "grad_norm": 6.4466986656188965, + "learning_rate": 9.627067669172933e-06, + "loss": 0.71, + "step": 2480 + }, + { + "epoch": 3.74, + "grad_norm": 7.973327159881592, + "learning_rate": 9.625563909774436e-06, + "loss": 0.6466, + "step": 2490 + }, + { + "epoch": 3.76, + "grad_norm": 6.712606906890869, + "learning_rate": 9.62406015037594e-06, + "loss": 0.6767, + "step": 2500 + }, + { + "epoch": 3.77, + "grad_norm": 8.149372100830078, + "learning_rate": 9.622556390977445e-06, + "loss": 0.783, + "step": 2510 + }, + { + "epoch": 3.79, + "grad_norm": 8.645270347595215, + "learning_rate": 9.621052631578947e-06, + "loss": 0.6978, + "step": 2520 + }, + { + "epoch": 3.8, + "grad_norm": 9.347142219543457, + "learning_rate": 9.619548872180452e-06, + "loss": 0.583, + "step": 2530 + }, + { + "epoch": 3.82, + "grad_norm": 7.905392169952393, + "learning_rate": 9.618045112781956e-06, + "loss": 0.6884, + "step": 2540 + }, + { + "epoch": 3.83, + "grad_norm": 8.783331871032715, + "learning_rate": 9.61654135338346e-06, + "loss": 0.691, + "step": 2550 + }, + { + "epoch": 3.85, + "grad_norm": 8.456209182739258, + "learning_rate": 9.615037593984963e-06, + "loss": 0.7281, + "step": 2560 + }, + { + "epoch": 3.86, + "grad_norm": 6.667693138122559, + "learning_rate": 9.613533834586468e-06, + "loss": 0.6912, + "step": 2570 + }, + { + "epoch": 3.88, + "grad_norm": 8.541569709777832, + "learning_rate": 9.61203007518797e-06, + "loss": 0.6853, + "step": 2580 + }, + { + "epoch": 3.89, + "grad_norm": 4.732927322387695, + "learning_rate": 9.610526315789475e-06, + "loss": 0.6647, + "step": 2590 + }, + { + "epoch": 3.91, + "grad_norm": 7.604156017303467, + "learning_rate": 9.609022556390978e-06, + "loss": 0.7526, + "step": 2600 + }, + { + "epoch": 3.92, + "grad_norm": 8.218050956726074, + "learning_rate": 9.607518796992482e-06, + "loss": 0.6828, + "step": 2610 + }, + { + "epoch": 3.94, + "grad_norm": 5.613206386566162, + "learning_rate": 9.606015037593985e-06, + "loss": 0.6964, + "step": 2620 + }, + { + "epoch": 3.95, + "grad_norm": 9.644120216369629, + "learning_rate": 9.604511278195489e-06, + "loss": 0.6912, + "step": 2630 + }, + { + "epoch": 3.97, + "grad_norm": 8.14504337310791, + "learning_rate": 9.603007518796993e-06, + "loss": 0.7527, + "step": 2640 + }, + { + "epoch": 3.98, + "grad_norm": 6.1560468673706055, + "learning_rate": 9.601503759398498e-06, + "loss": 0.6145, + "step": 2650 + }, + { + "epoch": 4.0, + "grad_norm": 20.564706802368164, + "learning_rate": 9.600000000000001e-06, + "loss": 0.6935, + "step": 2660 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.9236, + "eval_loss": 0.319810152053833, + "eval_runtime": 84.5508, + "eval_samples_per_second": 118.272, + "eval_steps_per_second": 0.473, + "step": 2660 + }, + { + "epoch": 4.02, + "grad_norm": 9.448854446411133, + "learning_rate": 9.598496240601505e-06, + "loss": 0.631, + "step": 2670 + }, + { + "epoch": 4.03, + "grad_norm": 10.194000244140625, + "learning_rate": 9.596992481203008e-06, + "loss": 0.6046, + "step": 2680 + }, + { + "epoch": 4.05, + "grad_norm": 8.277205467224121, + "learning_rate": 9.595488721804512e-06, + "loss": 0.5853, + "step": 2690 + }, + { + "epoch": 4.06, + "grad_norm": 7.616865158081055, + "learning_rate": 9.593984962406015e-06, + "loss": 0.6168, + "step": 2700 + }, + { + "epoch": 4.08, + "grad_norm": 5.158208847045898, + "learning_rate": 9.59248120300752e-06, + "loss": 0.5893, + "step": 2710 + }, + { + "epoch": 4.09, + "grad_norm": 10.609253883361816, + "learning_rate": 9.590977443609022e-06, + "loss": 0.5819, + "step": 2720 + }, + { + "epoch": 4.11, + "grad_norm": 7.288332462310791, + "learning_rate": 9.589473684210528e-06, + "loss": 0.6384, + "step": 2730 + }, + { + "epoch": 4.12, + "grad_norm": 6.625866889953613, + "learning_rate": 9.587969924812031e-06, + "loss": 0.6841, + "step": 2740 + }, + { + "epoch": 4.14, + "grad_norm": 8.38702392578125, + "learning_rate": 9.586466165413535e-06, + "loss": 0.5815, + "step": 2750 + }, + { + "epoch": 4.15, + "grad_norm": 6.58852481842041, + "learning_rate": 9.584962406015038e-06, + "loss": 0.7091, + "step": 2760 + }, + { + "epoch": 4.17, + "grad_norm": 5.776881217956543, + "learning_rate": 9.583458646616542e-06, + "loss": 0.6426, + "step": 2770 + }, + { + "epoch": 4.18, + "grad_norm": 7.806540489196777, + "learning_rate": 9.581954887218045e-06, + "loss": 0.6459, + "step": 2780 + }, + { + "epoch": 4.2, + "grad_norm": 7.378940582275391, + "learning_rate": 9.58045112781955e-06, + "loss": 0.6553, + "step": 2790 + }, + { + "epoch": 4.21, + "grad_norm": 8.37366008758545, + "learning_rate": 9.578947368421054e-06, + "loss": 0.6024, + "step": 2800 + }, + { + "epoch": 4.23, + "grad_norm": 5.783264636993408, + "learning_rate": 9.577443609022557e-06, + "loss": 0.5959, + "step": 2810 + }, + { + "epoch": 4.24, + "grad_norm": 6.4687676429748535, + "learning_rate": 9.575939849624061e-06, + "loss": 0.5669, + "step": 2820 + }, + { + "epoch": 4.26, + "grad_norm": 10.575803756713867, + "learning_rate": 9.574436090225564e-06, + "loss": 0.6461, + "step": 2830 + }, + { + "epoch": 4.27, + "grad_norm": 9.703124046325684, + "learning_rate": 9.572932330827068e-06, + "loss": 0.5784, + "step": 2840 + }, + { + "epoch": 4.29, + "grad_norm": 9.454757690429688, + "learning_rate": 9.571428571428573e-06, + "loss": 0.7381, + "step": 2850 + }, + { + "epoch": 4.3, + "grad_norm": 7.034806728363037, + "learning_rate": 9.569924812030075e-06, + "loss": 0.6277, + "step": 2860 + }, + { + "epoch": 4.32, + "grad_norm": 10.1060791015625, + "learning_rate": 9.56842105263158e-06, + "loss": 0.6973, + "step": 2870 + }, + { + "epoch": 4.33, + "grad_norm": 7.225138187408447, + "learning_rate": 9.566917293233084e-06, + "loss": 0.587, + "step": 2880 + }, + { + "epoch": 4.35, + "grad_norm": 6.221525192260742, + "learning_rate": 9.565413533834587e-06, + "loss": 0.6006, + "step": 2890 + }, + { + "epoch": 4.36, + "grad_norm": 6.329552173614502, + "learning_rate": 9.56390977443609e-06, + "loss": 0.6474, + "step": 2900 + }, + { + "epoch": 4.38, + "grad_norm": 8.411649703979492, + "learning_rate": 9.562406015037596e-06, + "loss": 0.5376, + "step": 2910 + }, + { + "epoch": 4.39, + "grad_norm": 8.27790355682373, + "learning_rate": 9.560902255639098e-06, + "loss": 0.6324, + "step": 2920 + }, + { + "epoch": 4.41, + "grad_norm": 6.995235443115234, + "learning_rate": 9.559398496240603e-06, + "loss": 0.6178, + "step": 2930 + }, + { + "epoch": 4.42, + "grad_norm": 8.169748306274414, + "learning_rate": 9.557894736842107e-06, + "loss": 0.5546, + "step": 2940 + }, + { + "epoch": 4.44, + "grad_norm": 7.832982063293457, + "learning_rate": 9.55639097744361e-06, + "loss": 0.5873, + "step": 2950 + }, + { + "epoch": 4.45, + "grad_norm": 7.024545192718506, + "learning_rate": 9.554887218045114e-06, + "loss": 0.5919, + "step": 2960 + }, + { + "epoch": 4.47, + "grad_norm": 8.610920906066895, + "learning_rate": 9.553383458646617e-06, + "loss": 0.6324, + "step": 2970 + }, + { + "epoch": 4.48, + "grad_norm": 8.49885368347168, + "learning_rate": 9.55187969924812e-06, + "loss": 0.6392, + "step": 2980 + }, + { + "epoch": 4.5, + "grad_norm": 6.013737678527832, + "learning_rate": 9.550375939849624e-06, + "loss": 0.6267, + "step": 2990 + }, + { + "epoch": 4.51, + "grad_norm": 9.457529067993164, + "learning_rate": 9.54887218045113e-06, + "loss": 0.633, + "step": 3000 + }, + { + "epoch": 4.53, + "grad_norm": 7.126248359680176, + "learning_rate": 9.547368421052631e-06, + "loss": 0.5527, + "step": 3010 + }, + { + "epoch": 4.54, + "grad_norm": 8.481447219848633, + "learning_rate": 9.545864661654136e-06, + "loss": 0.7163, + "step": 3020 + }, + { + "epoch": 4.56, + "grad_norm": 10.300518035888672, + "learning_rate": 9.54436090225564e-06, + "loss": 0.5921, + "step": 3030 + }, + { + "epoch": 4.57, + "grad_norm": 8.265804290771484, + "learning_rate": 9.542857142857143e-06, + "loss": 0.5952, + "step": 3040 + }, + { + "epoch": 4.59, + "grad_norm": 5.028606414794922, + "learning_rate": 9.541353383458647e-06, + "loss": 0.6269, + "step": 3050 + }, + { + "epoch": 4.6, + "grad_norm": 8.997878074645996, + "learning_rate": 9.53984962406015e-06, + "loss": 0.5761, + "step": 3060 + }, + { + "epoch": 4.62, + "grad_norm": 9.585675239562988, + "learning_rate": 9.538345864661654e-06, + "loss": 0.5851, + "step": 3070 + }, + { + "epoch": 4.63, + "grad_norm": 5.83755350112915, + "learning_rate": 9.53684210526316e-06, + "loss": 0.6105, + "step": 3080 + }, + { + "epoch": 4.65, + "grad_norm": 5.208207607269287, + "learning_rate": 9.535338345864663e-06, + "loss": 0.5786, + "step": 3090 + }, + { + "epoch": 4.66, + "grad_norm": 9.895461082458496, + "learning_rate": 9.533834586466166e-06, + "loss": 0.623, + "step": 3100 + }, + { + "epoch": 4.68, + "grad_norm": 8.958138465881348, + "learning_rate": 9.53233082706767e-06, + "loss": 0.5708, + "step": 3110 + }, + { + "epoch": 4.69, + "grad_norm": 10.452126502990723, + "learning_rate": 9.530827067669173e-06, + "loss": 0.5694, + "step": 3120 + }, + { + "epoch": 4.71, + "grad_norm": 7.20021915435791, + "learning_rate": 9.529323308270677e-06, + "loss": 0.6267, + "step": 3130 + }, + { + "epoch": 4.72, + "grad_norm": 7.995909690856934, + "learning_rate": 9.527819548872182e-06, + "loss": 0.6989, + "step": 3140 + }, + { + "epoch": 4.74, + "grad_norm": 6.9314985275268555, + "learning_rate": 9.526315789473684e-06, + "loss": 0.6093, + "step": 3150 + }, + { + "epoch": 4.75, + "grad_norm": 10.158616065979004, + "learning_rate": 9.524812030075189e-06, + "loss": 0.5696, + "step": 3160 + }, + { + "epoch": 4.77, + "grad_norm": 7.637181758880615, + "learning_rate": 9.523308270676693e-06, + "loss": 0.7494, + "step": 3170 + }, + { + "epoch": 4.78, + "grad_norm": 7.443474769592285, + "learning_rate": 9.521804511278196e-06, + "loss": 0.7636, + "step": 3180 + }, + { + "epoch": 4.8, + "grad_norm": 6.130582809448242, + "learning_rate": 9.5203007518797e-06, + "loss": 0.7397, + "step": 3190 + }, + { + "epoch": 4.81, + "grad_norm": 8.699774742126465, + "learning_rate": 9.518796992481205e-06, + "loss": 0.5461, + "step": 3200 + }, + { + "epoch": 4.83, + "grad_norm": 8.060851097106934, + "learning_rate": 9.517293233082707e-06, + "loss": 0.5424, + "step": 3210 + }, + { + "epoch": 4.84, + "grad_norm": 6.084632396697998, + "learning_rate": 9.515789473684212e-06, + "loss": 0.6181, + "step": 3220 + }, + { + "epoch": 4.86, + "grad_norm": 8.804571151733398, + "learning_rate": 9.514285714285715e-06, + "loss": 0.6696, + "step": 3230 + }, + { + "epoch": 4.87, + "grad_norm": 8.552626609802246, + "learning_rate": 9.512781954887219e-06, + "loss": 0.6237, + "step": 3240 + }, + { + "epoch": 4.89, + "grad_norm": 8.930567741394043, + "learning_rate": 9.511278195488722e-06, + "loss": 0.6427, + "step": 3250 + }, + { + "epoch": 4.9, + "grad_norm": 8.916244506835938, + "learning_rate": 9.509774436090226e-06, + "loss": 0.608, + "step": 3260 + }, + { + "epoch": 4.92, + "grad_norm": 12.679169654846191, + "learning_rate": 9.50827067669173e-06, + "loss": 0.622, + "step": 3270 + }, + { + "epoch": 4.93, + "grad_norm": 9.21071720123291, + "learning_rate": 9.506766917293235e-06, + "loss": 0.6153, + "step": 3280 + }, + { + "epoch": 4.95, + "grad_norm": 8.040297508239746, + "learning_rate": 9.505263157894738e-06, + "loss": 0.654, + "step": 3290 + }, + { + "epoch": 4.96, + "grad_norm": 6.395382404327393, + "learning_rate": 9.503759398496242e-06, + "loss": 0.6729, + "step": 3300 + }, + { + "epoch": 4.98, + "grad_norm": 8.437057495117188, + "learning_rate": 9.502255639097745e-06, + "loss": 0.6457, + "step": 3310 + }, + { + "epoch": 4.99, + "grad_norm": 7.987279415130615, + "learning_rate": 9.500751879699249e-06, + "loss": 0.6159, + "step": 3320 + }, + { + "epoch": 5.0, + "eval_accuracy": 0.9289, + "eval_loss": 0.2945062816143036, + "eval_runtime": 84.9367, + "eval_samples_per_second": 117.735, + "eval_steps_per_second": 0.471, + "step": 3325 + }, + { + "epoch": 5.01, + "grad_norm": 6.12150239944458, + "learning_rate": 9.499248120300752e-06, + "loss": 0.4871, + "step": 3330 + }, + { + "epoch": 5.02, + "grad_norm": 8.20666217803955, + "learning_rate": 9.497744360902257e-06, + "loss": 0.6011, + "step": 3340 + }, + { + "epoch": 5.04, + "grad_norm": 8.818642616271973, + "learning_rate": 9.49624060150376e-06, + "loss": 0.5681, + "step": 3350 + }, + { + "epoch": 5.05, + "grad_norm": 5.606151103973389, + "learning_rate": 9.494736842105265e-06, + "loss": 0.5494, + "step": 3360 + }, + { + "epoch": 5.07, + "grad_norm": 6.230663299560547, + "learning_rate": 9.493233082706768e-06, + "loss": 0.6138, + "step": 3370 + }, + { + "epoch": 5.08, + "grad_norm": 6.923035621643066, + "learning_rate": 9.491729323308272e-06, + "loss": 0.6398, + "step": 3380 + }, + { + "epoch": 5.1, + "grad_norm": 8.464035034179688, + "learning_rate": 9.490225563909775e-06, + "loss": 0.533, + "step": 3390 + }, + { + "epoch": 5.11, + "grad_norm": 5.440852165222168, + "learning_rate": 9.488721804511279e-06, + "loss": 0.5901, + "step": 3400 + }, + { + "epoch": 5.13, + "grad_norm": 6.880829334259033, + "learning_rate": 9.487218045112782e-06, + "loss": 0.5699, + "step": 3410 + }, + { + "epoch": 5.14, + "grad_norm": 6.773617267608643, + "learning_rate": 9.485714285714287e-06, + "loss": 0.5164, + "step": 3420 + }, + { + "epoch": 5.16, + "grad_norm": 6.794729232788086, + "learning_rate": 9.484210526315791e-06, + "loss": 0.4939, + "step": 3430 + }, + { + "epoch": 5.17, + "grad_norm": 8.347722053527832, + "learning_rate": 9.482706766917294e-06, + "loss": 0.6138, + "step": 3440 + }, + { + "epoch": 5.19, + "grad_norm": 6.279055595397949, + "learning_rate": 9.481203007518798e-06, + "loss": 0.4792, + "step": 3450 + }, + { + "epoch": 5.2, + "grad_norm": 7.93798828125, + "learning_rate": 9.479699248120301e-06, + "loss": 0.5945, + "step": 3460 + }, + { + "epoch": 5.22, + "grad_norm": 6.767178535461426, + "learning_rate": 9.478195488721805e-06, + "loss": 0.5878, + "step": 3470 + }, + { + "epoch": 5.23, + "grad_norm": 6.87293004989624, + "learning_rate": 9.476691729323308e-06, + "loss": 0.566, + "step": 3480 + }, + { + "epoch": 5.25, + "grad_norm": 2.407437562942505, + "learning_rate": 9.475187969924814e-06, + "loss": 0.5014, + "step": 3490 + }, + { + "epoch": 5.26, + "grad_norm": 8.233712196350098, + "learning_rate": 9.473684210526315e-06, + "loss": 0.5879, + "step": 3500 + }, + { + "epoch": 5.28, + "grad_norm": 7.905375003814697, + "learning_rate": 9.47218045112782e-06, + "loss": 0.6127, + "step": 3510 + }, + { + "epoch": 5.29, + "grad_norm": 5.8037238121032715, + "learning_rate": 9.470676691729324e-06, + "loss": 0.6048, + "step": 3520 + }, + { + "epoch": 5.31, + "grad_norm": 9.2665433883667, + "learning_rate": 9.469172932330828e-06, + "loss": 0.6233, + "step": 3530 + }, + { + "epoch": 5.32, + "grad_norm": 5.650614261627197, + "learning_rate": 9.467669172932331e-06, + "loss": 0.5703, + "step": 3540 + }, + { + "epoch": 5.34, + "grad_norm": 5.246155738830566, + "learning_rate": 9.466165413533835e-06, + "loss": 0.5108, + "step": 3550 + }, + { + "epoch": 5.35, + "grad_norm": 8.701322555541992, + "learning_rate": 9.464661654135338e-06, + "loss": 0.5783, + "step": 3560 + }, + { + "epoch": 5.37, + "grad_norm": 5.870892524719238, + "learning_rate": 9.463157894736844e-06, + "loss": 0.5546, + "step": 3570 + }, + { + "epoch": 5.38, + "grad_norm": 8.061163902282715, + "learning_rate": 9.461654135338347e-06, + "loss": 0.5973, + "step": 3580 + }, + { + "epoch": 5.4, + "grad_norm": 4.166900157928467, + "learning_rate": 9.46015037593985e-06, + "loss": 0.6042, + "step": 3590 + }, + { + "epoch": 5.41, + "grad_norm": 5.8524346351623535, + "learning_rate": 9.458646616541354e-06, + "loss": 0.5307, + "step": 3600 + }, + { + "epoch": 5.43, + "grad_norm": 7.229081153869629, + "learning_rate": 9.457142857142858e-06, + "loss": 0.6533, + "step": 3610 + }, + { + "epoch": 5.44, + "grad_norm": 5.403026580810547, + "learning_rate": 9.455639097744361e-06, + "loss": 0.5059, + "step": 3620 + }, + { + "epoch": 5.46, + "grad_norm": 7.53814697265625, + "learning_rate": 9.454135338345866e-06, + "loss": 0.6576, + "step": 3630 + }, + { + "epoch": 5.47, + "grad_norm": 8.08530330657959, + "learning_rate": 9.452631578947368e-06, + "loss": 0.5881, + "step": 3640 + }, + { + "epoch": 5.49, + "grad_norm": 7.80808687210083, + "learning_rate": 9.451127819548873e-06, + "loss": 0.5725, + "step": 3650 + }, + { + "epoch": 5.5, + "grad_norm": 4.368475914001465, + "learning_rate": 9.449624060150377e-06, + "loss": 0.5538, + "step": 3660 + }, + { + "epoch": 5.52, + "grad_norm": 7.806415557861328, + "learning_rate": 9.44812030075188e-06, + "loss": 0.6739, + "step": 3670 + }, + { + "epoch": 5.53, + "grad_norm": 8.047362327575684, + "learning_rate": 9.446616541353384e-06, + "loss": 0.5229, + "step": 3680 + }, + { + "epoch": 5.55, + "grad_norm": 5.89243221282959, + "learning_rate": 9.445112781954887e-06, + "loss": 0.599, + "step": 3690 + }, + { + "epoch": 5.56, + "grad_norm": 6.812804222106934, + "learning_rate": 9.443609022556391e-06, + "loss": 0.6213, + "step": 3700 + }, + { + "epoch": 5.58, + "grad_norm": 7.177376747131348, + "learning_rate": 9.442105263157896e-06, + "loss": 0.5568, + "step": 3710 + }, + { + "epoch": 5.59, + "grad_norm": 4.684938907623291, + "learning_rate": 9.4406015037594e-06, + "loss": 0.5266, + "step": 3720 + }, + { + "epoch": 5.61, + "grad_norm": 9.440715789794922, + "learning_rate": 9.439097744360903e-06, + "loss": 0.6431, + "step": 3730 + }, + { + "epoch": 5.62, + "grad_norm": 6.564364910125732, + "learning_rate": 9.437593984962407e-06, + "loss": 0.4702, + "step": 3740 + }, + { + "epoch": 5.64, + "grad_norm": 7.499642372131348, + "learning_rate": 9.43609022556391e-06, + "loss": 0.5249, + "step": 3750 + }, + { + "epoch": 5.65, + "grad_norm": 5.27653169631958, + "learning_rate": 9.434586466165414e-06, + "loss": 0.4517, + "step": 3760 + }, + { + "epoch": 5.67, + "grad_norm": 4.881272792816162, + "learning_rate": 9.433082706766919e-06, + "loss": 0.4682, + "step": 3770 + }, + { + "epoch": 5.68, + "grad_norm": 9.005290985107422, + "learning_rate": 9.43157894736842e-06, + "loss": 0.5642, + "step": 3780 + }, + { + "epoch": 5.7, + "grad_norm": 6.421182155609131, + "learning_rate": 9.430075187969926e-06, + "loss": 0.4855, + "step": 3790 + }, + { + "epoch": 5.71, + "grad_norm": 7.035130023956299, + "learning_rate": 9.42857142857143e-06, + "loss": 0.6362, + "step": 3800 + }, + { + "epoch": 5.73, + "grad_norm": 5.77438497543335, + "learning_rate": 9.427067669172933e-06, + "loss": 0.5876, + "step": 3810 + }, + { + "epoch": 5.74, + "grad_norm": 8.328043937683105, + "learning_rate": 9.425563909774437e-06, + "loss": 0.4768, + "step": 3820 + }, + { + "epoch": 5.76, + "grad_norm": 5.7907586097717285, + "learning_rate": 9.424060150375942e-06, + "loss": 0.5941, + "step": 3830 + }, + { + "epoch": 5.77, + "grad_norm": 8.730267524719238, + "learning_rate": 9.422556390977444e-06, + "loss": 0.6426, + "step": 3840 + }, + { + "epoch": 5.79, + "grad_norm": 8.710532188415527, + "learning_rate": 9.421052631578949e-06, + "loss": 0.5909, + "step": 3850 + }, + { + "epoch": 5.8, + "grad_norm": 8.74202823638916, + "learning_rate": 9.419548872180452e-06, + "loss": 0.5735, + "step": 3860 + }, + { + "epoch": 5.82, + "grad_norm": 7.489967346191406, + "learning_rate": 9.418045112781956e-06, + "loss": 0.6072, + "step": 3870 + }, + { + "epoch": 5.83, + "grad_norm": 6.942547798156738, + "learning_rate": 9.41654135338346e-06, + "loss": 0.5513, + "step": 3880 + }, + { + "epoch": 5.85, + "grad_norm": 5.517817497253418, + "learning_rate": 9.415037593984963e-06, + "loss": 0.5317, + "step": 3890 + }, + { + "epoch": 5.86, + "grad_norm": 6.26224946975708, + "learning_rate": 9.413533834586466e-06, + "loss": 0.6014, + "step": 3900 + }, + { + "epoch": 5.88, + "grad_norm": 4.848892688751221, + "learning_rate": 9.412030075187972e-06, + "loss": 0.5628, + "step": 3910 + }, + { + "epoch": 5.89, + "grad_norm": 7.279343128204346, + "learning_rate": 9.410526315789475e-06, + "loss": 0.631, + "step": 3920 + }, + { + "epoch": 5.91, + "grad_norm": 5.791496753692627, + "learning_rate": 9.409022556390979e-06, + "loss": 0.5312, + "step": 3930 + }, + { + "epoch": 5.92, + "grad_norm": 4.935235977172852, + "learning_rate": 9.407518796992482e-06, + "loss": 0.4694, + "step": 3940 + }, + { + "epoch": 5.94, + "grad_norm": 5.741876125335693, + "learning_rate": 9.406015037593986e-06, + "loss": 0.6073, + "step": 3950 + }, + { + "epoch": 5.95, + "grad_norm": 5.398350715637207, + "learning_rate": 9.40451127819549e-06, + "loss": 0.6009, + "step": 3960 + }, + { + "epoch": 5.97, + "grad_norm": 6.093377590179443, + "learning_rate": 9.403007518796994e-06, + "loss": 0.5845, + "step": 3970 + }, + { + "epoch": 5.98, + "grad_norm": 8.6488676071167, + "learning_rate": 9.401503759398496e-06, + "loss": 0.5932, + "step": 3980 + }, + { + "epoch": 6.0, + "grad_norm": 3.0173494815826416, + "learning_rate": 9.4e-06, + "loss": 0.4423, + "step": 3990 + }, + { + "epoch": 6.0, + "eval_accuracy": 0.925, + "eval_loss": 0.2876322865486145, + "eval_runtime": 85.2695, + "eval_samples_per_second": 117.275, + "eval_steps_per_second": 0.469, + "step": 3990 + }, + { + "epoch": 6.02, + "grad_norm": 4.103921890258789, + "learning_rate": 9.398496240601505e-06, + "loss": 0.5346, + "step": 4000 + }, + { + "epoch": 6.03, + "grad_norm": 4.625704288482666, + "learning_rate": 9.396992481203009e-06, + "loss": 0.5522, + "step": 4010 + }, + { + "epoch": 6.05, + "grad_norm": 7.251491546630859, + "learning_rate": 9.395488721804512e-06, + "loss": 0.6501, + "step": 4020 + }, + { + "epoch": 6.06, + "grad_norm": 8.962389945983887, + "learning_rate": 9.393984962406016e-06, + "loss": 0.5402, + "step": 4030 + }, + { + "epoch": 6.08, + "grad_norm": 6.207771301269531, + "learning_rate": 9.392481203007519e-06, + "loss": 0.5545, + "step": 4040 + }, + { + "epoch": 6.09, + "grad_norm": 5.253688812255859, + "learning_rate": 9.390977443609023e-06, + "loss": 0.5084, + "step": 4050 + }, + { + "epoch": 6.11, + "grad_norm": 7.277046203613281, + "learning_rate": 9.389473684210528e-06, + "loss": 0.5115, + "step": 4060 + }, + { + "epoch": 6.12, + "grad_norm": 5.671750068664551, + "learning_rate": 9.38796992481203e-06, + "loss": 0.5508, + "step": 4070 + }, + { + "epoch": 6.14, + "grad_norm": 3.9672038555145264, + "learning_rate": 9.386466165413535e-06, + "loss": 0.494, + "step": 4080 + }, + { + "epoch": 6.15, + "grad_norm": 6.129919052124023, + "learning_rate": 9.384962406015038e-06, + "loss": 0.5898, + "step": 4090 + }, + { + "epoch": 6.17, + "grad_norm": 6.198451519012451, + "learning_rate": 9.383458646616542e-06, + "loss": 0.5378, + "step": 4100 + }, + { + "epoch": 6.18, + "grad_norm": 9.286908149719238, + "learning_rate": 9.381954887218045e-06, + "loss": 0.5656, + "step": 4110 + }, + { + "epoch": 6.2, + "grad_norm": 6.862420082092285, + "learning_rate": 9.38045112781955e-06, + "loss": 0.5457, + "step": 4120 + }, + { + "epoch": 6.21, + "grad_norm": 5.948605537414551, + "learning_rate": 9.378947368421052e-06, + "loss": 0.5564, + "step": 4130 + }, + { + "epoch": 6.23, + "grad_norm": 7.1652116775512695, + "learning_rate": 9.377443609022558e-06, + "loss": 0.5624, + "step": 4140 + }, + { + "epoch": 6.24, + "grad_norm": 7.091752052307129, + "learning_rate": 9.375939849624061e-06, + "loss": 0.532, + "step": 4150 + }, + { + "epoch": 6.26, + "grad_norm": 4.2119903564453125, + "learning_rate": 9.374436090225565e-06, + "loss": 0.385, + "step": 4160 + }, + { + "epoch": 6.27, + "grad_norm": 9.477155685424805, + "learning_rate": 9.372932330827068e-06, + "loss": 0.5019, + "step": 4170 + }, + { + "epoch": 6.29, + "grad_norm": 7.294814109802246, + "learning_rate": 9.371428571428572e-06, + "loss": 0.556, + "step": 4180 + }, + { + "epoch": 6.3, + "grad_norm": 8.124314308166504, + "learning_rate": 9.369924812030075e-06, + "loss": 0.548, + "step": 4190 + }, + { + "epoch": 6.32, + "grad_norm": 6.1076483726501465, + "learning_rate": 9.36842105263158e-06, + "loss": 0.5228, + "step": 4200 + }, + { + "epoch": 6.33, + "grad_norm": 7.360411643981934, + "learning_rate": 9.366917293233084e-06, + "loss": 0.6335, + "step": 4210 + }, + { + "epoch": 6.35, + "grad_norm": 6.000509738922119, + "learning_rate": 9.365413533834588e-06, + "loss": 0.5727, + "step": 4220 + }, + { + "epoch": 6.36, + "grad_norm": 8.538400650024414, + "learning_rate": 9.363909774436091e-06, + "loss": 0.5135, + "step": 4230 + }, + { + "epoch": 6.38, + "grad_norm": 6.543038845062256, + "learning_rate": 9.362406015037595e-06, + "loss": 0.4859, + "step": 4240 + }, + { + "epoch": 6.39, + "grad_norm": 7.515405178070068, + "learning_rate": 9.360902255639098e-06, + "loss": 0.6172, + "step": 4250 + }, + { + "epoch": 6.41, + "grad_norm": 4.859574317932129, + "learning_rate": 9.359398496240603e-06, + "loss": 0.407, + "step": 4260 + }, + { + "epoch": 6.42, + "grad_norm": 5.684931755065918, + "learning_rate": 9.357894736842105e-06, + "loss": 0.5748, + "step": 4270 + }, + { + "epoch": 6.44, + "grad_norm": 5.063128471374512, + "learning_rate": 9.35639097744361e-06, + "loss": 0.5363, + "step": 4280 + }, + { + "epoch": 6.45, + "grad_norm": 9.34011173248291, + "learning_rate": 9.354887218045114e-06, + "loss": 0.5072, + "step": 4290 + }, + { + "epoch": 6.47, + "grad_norm": 6.302648544311523, + "learning_rate": 9.353383458646617e-06, + "loss": 0.4891, + "step": 4300 + }, + { + "epoch": 6.48, + "grad_norm": 6.268799781799316, + "learning_rate": 9.351879699248121e-06, + "loss": 0.6215, + "step": 4310 + }, + { + "epoch": 6.5, + "grad_norm": 5.54179573059082, + "learning_rate": 9.350375939849624e-06, + "loss": 0.5827, + "step": 4320 + }, + { + "epoch": 6.51, + "grad_norm": 6.042153835296631, + "learning_rate": 9.348872180451128e-06, + "loss": 0.5205, + "step": 4330 + }, + { + "epoch": 6.53, + "grad_norm": 7.558413982391357, + "learning_rate": 9.347368421052633e-06, + "loss": 0.5357, + "step": 4340 + }, + { + "epoch": 6.54, + "grad_norm": 7.838019847869873, + "learning_rate": 9.345864661654137e-06, + "loss": 0.5719, + "step": 4350 + }, + { + "epoch": 6.56, + "grad_norm": 10.056818008422852, + "learning_rate": 9.34436090225564e-06, + "loss": 0.542, + "step": 4360 + }, + { + "epoch": 6.57, + "grad_norm": 7.325047492980957, + "learning_rate": 9.342857142857144e-06, + "loss": 0.5564, + "step": 4370 + }, + { + "epoch": 6.59, + "grad_norm": 8.13595199584961, + "learning_rate": 9.341353383458647e-06, + "loss": 0.5106, + "step": 4380 + }, + { + "epoch": 6.6, + "grad_norm": 7.225549221038818, + "learning_rate": 9.33984962406015e-06, + "loss": 0.5714, + "step": 4390 + }, + { + "epoch": 6.62, + "grad_norm": 9.103632926940918, + "learning_rate": 9.338345864661656e-06, + "loss": 0.559, + "step": 4400 + }, + { + "epoch": 6.63, + "grad_norm": 5.579386234283447, + "learning_rate": 9.336842105263158e-06, + "loss": 0.5789, + "step": 4410 + }, + { + "epoch": 6.65, + "grad_norm": 9.875541687011719, + "learning_rate": 9.335338345864663e-06, + "loss": 0.5435, + "step": 4420 + }, + { + "epoch": 6.66, + "grad_norm": 10.33945083618164, + "learning_rate": 9.333834586466166e-06, + "loss": 0.4678, + "step": 4430 + }, + { + "epoch": 6.68, + "grad_norm": 8.180964469909668, + "learning_rate": 9.33233082706767e-06, + "loss": 0.5476, + "step": 4440 + }, + { + "epoch": 6.69, + "grad_norm": 8.327938079833984, + "learning_rate": 9.330827067669174e-06, + "loss": 0.5605, + "step": 4450 + }, + { + "epoch": 6.71, + "grad_norm": 6.9362311363220215, + "learning_rate": 9.329323308270679e-06, + "loss": 0.6099, + "step": 4460 + }, + { + "epoch": 6.72, + "grad_norm": 4.627447128295898, + "learning_rate": 9.32781954887218e-06, + "loss": 0.4521, + "step": 4470 + }, + { + "epoch": 6.74, + "grad_norm": 5.713562488555908, + "learning_rate": 9.326315789473684e-06, + "loss": 0.4532, + "step": 4480 + }, + { + "epoch": 6.75, + "grad_norm": 6.893897533416748, + "learning_rate": 9.32481203007519e-06, + "loss": 0.5298, + "step": 4490 + }, + { + "epoch": 6.77, + "grad_norm": 6.630578994750977, + "learning_rate": 9.323308270676693e-06, + "loss": 0.5226, + "step": 4500 + }, + { + "epoch": 6.78, + "grad_norm": 7.725119113922119, + "learning_rate": 9.321804511278196e-06, + "loss": 0.4581, + "step": 4510 + }, + { + "epoch": 6.8, + "grad_norm": 8.243720054626465, + "learning_rate": 9.3203007518797e-06, + "loss": 0.4821, + "step": 4520 + }, + { + "epoch": 6.81, + "grad_norm": 7.254865646362305, + "learning_rate": 9.318796992481203e-06, + "loss": 0.4623, + "step": 4530 + }, + { + "epoch": 6.83, + "grad_norm": 6.510406494140625, + "learning_rate": 9.317293233082707e-06, + "loss": 0.5073, + "step": 4540 + }, + { + "epoch": 6.84, + "grad_norm": 5.531012058258057, + "learning_rate": 9.315789473684212e-06, + "loss": 0.4842, + "step": 4550 + }, + { + "epoch": 6.86, + "grad_norm": 9.50185489654541, + "learning_rate": 9.314285714285714e-06, + "loss": 0.5485, + "step": 4560 + }, + { + "epoch": 6.87, + "grad_norm": 5.591551780700684, + "learning_rate": 9.312781954887219e-06, + "loss": 0.5397, + "step": 4570 + }, + { + "epoch": 6.89, + "grad_norm": 3.7708208560943604, + "learning_rate": 9.311278195488723e-06, + "loss": 0.4746, + "step": 4580 + }, + { + "epoch": 6.9, + "grad_norm": 5.826446533203125, + "learning_rate": 9.309774436090226e-06, + "loss": 0.503, + "step": 4590 + }, + { + "epoch": 6.92, + "grad_norm": 14.129280090332031, + "learning_rate": 9.30827067669173e-06, + "loss": 0.5729, + "step": 4600 + }, + { + "epoch": 6.93, + "grad_norm": 5.19706392288208, + "learning_rate": 9.306766917293233e-06, + "loss": 0.4967, + "step": 4610 + }, + { + "epoch": 6.95, + "grad_norm": 6.513811111450195, + "learning_rate": 9.305263157894737e-06, + "loss": 0.587, + "step": 4620 + }, + { + "epoch": 6.96, + "grad_norm": 7.2199506759643555, + "learning_rate": 9.303759398496242e-06, + "loss": 0.5733, + "step": 4630 + }, + { + "epoch": 6.98, + "grad_norm": 6.173489570617676, + "learning_rate": 9.302255639097745e-06, + "loss": 0.6391, + "step": 4640 + }, + { + "epoch": 6.99, + "grad_norm": 4.977587699890137, + "learning_rate": 9.300751879699249e-06, + "loss": 0.5506, + "step": 4650 + }, + { + "epoch": 7.0, + "eval_accuracy": 0.9302, + "eval_loss": 0.2616922855377197, + "eval_runtime": 85.0513, + "eval_samples_per_second": 117.576, + "eval_steps_per_second": 0.47, + "step": 4655 + }, + { + "epoch": 7.01, + "grad_norm": 7.2303266525268555, + "learning_rate": 9.299248120300753e-06, + "loss": 0.4889, + "step": 4660 + }, + { + "epoch": 7.02, + "grad_norm": 5.8191914558410645, + "learning_rate": 9.297744360902256e-06, + "loss": 0.6557, + "step": 4670 + }, + { + "epoch": 7.04, + "grad_norm": 7.453029632568359, + "learning_rate": 9.29624060150376e-06, + "loss": 0.5975, + "step": 4680 + }, + { + "epoch": 7.05, + "grad_norm": 9.434555053710938, + "learning_rate": 9.294736842105265e-06, + "loss": 0.4769, + "step": 4690 + }, + { + "epoch": 7.07, + "grad_norm": 9.096846580505371, + "learning_rate": 9.293233082706767e-06, + "loss": 0.5521, + "step": 4700 + }, + { + "epoch": 7.08, + "grad_norm": 6.146598815917969, + "learning_rate": 9.291729323308272e-06, + "loss": 0.5191, + "step": 4710 + }, + { + "epoch": 7.1, + "grad_norm": 5.138683319091797, + "learning_rate": 9.290225563909775e-06, + "loss": 0.4883, + "step": 4720 + }, + { + "epoch": 7.11, + "grad_norm": 5.840444564819336, + "learning_rate": 9.288721804511279e-06, + "loss": 0.4901, + "step": 4730 + }, + { + "epoch": 7.13, + "grad_norm": 5.589585781097412, + "learning_rate": 9.287218045112782e-06, + "loss": 0.4574, + "step": 4740 + }, + { + "epoch": 7.14, + "grad_norm": 7.447097301483154, + "learning_rate": 9.285714285714288e-06, + "loss": 0.4672, + "step": 4750 + }, + { + "epoch": 7.16, + "grad_norm": 6.8820295333862305, + "learning_rate": 9.28421052631579e-06, + "loss": 0.5641, + "step": 4760 + }, + { + "epoch": 7.17, + "grad_norm": 5.46907901763916, + "learning_rate": 9.282706766917295e-06, + "loss": 0.4948, + "step": 4770 + }, + { + "epoch": 7.19, + "grad_norm": 5.4233527183532715, + "learning_rate": 9.281203007518798e-06, + "loss": 0.5507, + "step": 4780 + }, + { + "epoch": 7.2, + "grad_norm": 6.316089153289795, + "learning_rate": 9.279699248120302e-06, + "loss": 0.5791, + "step": 4790 + }, + { + "epoch": 7.22, + "grad_norm": 3.7618801593780518, + "learning_rate": 9.278195488721805e-06, + "loss": 0.4846, + "step": 4800 + }, + { + "epoch": 7.23, + "grad_norm": 6.426711082458496, + "learning_rate": 9.276691729323309e-06, + "loss": 0.4886, + "step": 4810 + }, + { + "epoch": 7.25, + "grad_norm": 6.98826265335083, + "learning_rate": 9.275187969924812e-06, + "loss": 0.4627, + "step": 4820 + }, + { + "epoch": 7.26, + "grad_norm": 6.147061824798584, + "learning_rate": 9.273684210526317e-06, + "loss": 0.4577, + "step": 4830 + }, + { + "epoch": 7.28, + "grad_norm": 7.308942794799805, + "learning_rate": 9.272180451127821e-06, + "loss": 0.4893, + "step": 4840 + }, + { + "epoch": 7.29, + "grad_norm": 8.406046867370605, + "learning_rate": 9.270676691729324e-06, + "loss": 0.4968, + "step": 4850 + }, + { + "epoch": 7.31, + "grad_norm": 4.631737232208252, + "learning_rate": 9.269172932330828e-06, + "loss": 0.4654, + "step": 4860 + }, + { + "epoch": 7.32, + "grad_norm": 3.802255868911743, + "learning_rate": 9.267669172932331e-06, + "loss": 0.5018, + "step": 4870 + }, + { + "epoch": 7.34, + "grad_norm": 7.958065986633301, + "learning_rate": 9.266165413533835e-06, + "loss": 0.5358, + "step": 4880 + }, + { + "epoch": 7.35, + "grad_norm": 4.825588703155518, + "learning_rate": 9.26466165413534e-06, + "loss": 0.5201, + "step": 4890 + }, + { + "epoch": 7.37, + "grad_norm": 4.964457035064697, + "learning_rate": 9.263157894736842e-06, + "loss": 0.4819, + "step": 4900 + }, + { + "epoch": 7.38, + "grad_norm": 11.642394065856934, + "learning_rate": 9.261654135338347e-06, + "loss": 0.5989, + "step": 4910 + }, + { + "epoch": 7.4, + "grad_norm": 9.31828498840332, + "learning_rate": 9.26015037593985e-06, + "loss": 0.5455, + "step": 4920 + }, + { + "epoch": 7.41, + "grad_norm": 7.8129963874816895, + "learning_rate": 9.258646616541354e-06, + "loss": 0.5542, + "step": 4930 + }, + { + "epoch": 7.43, + "grad_norm": 7.043788909912109, + "learning_rate": 9.257142857142858e-06, + "loss": 0.5153, + "step": 4940 + }, + { + "epoch": 7.44, + "grad_norm": 8.311758995056152, + "learning_rate": 9.255639097744363e-06, + "loss": 0.4802, + "step": 4950 + }, + { + "epoch": 7.46, + "grad_norm": 10.970717430114746, + "learning_rate": 9.254135338345865e-06, + "loss": 0.4495, + "step": 4960 + }, + { + "epoch": 7.47, + "grad_norm": 5.547107219696045, + "learning_rate": 9.252631578947368e-06, + "loss": 0.4824, + "step": 4970 + }, + { + "epoch": 7.49, + "grad_norm": 5.658668518066406, + "learning_rate": 9.251127819548874e-06, + "loss": 0.5722, + "step": 4980 + }, + { + "epoch": 7.5, + "grad_norm": 4.896615982055664, + "learning_rate": 9.249624060150375e-06, + "loss": 0.4936, + "step": 4990 + }, + { + "epoch": 7.52, + "grad_norm": 7.777392864227295, + "learning_rate": 9.24812030075188e-06, + "loss": 0.5078, + "step": 5000 + }, + { + "epoch": 7.53, + "grad_norm": 11.22333812713623, + "learning_rate": 9.246616541353384e-06, + "loss": 0.575, + "step": 5010 + }, + { + "epoch": 7.55, + "grad_norm": 6.031052589416504, + "learning_rate": 9.245112781954888e-06, + "loss": 0.4742, + "step": 5020 + }, + { + "epoch": 7.56, + "grad_norm": 11.427336692810059, + "learning_rate": 9.243609022556391e-06, + "loss": 0.5519, + "step": 5030 + }, + { + "epoch": 7.58, + "grad_norm": 6.76407527923584, + "learning_rate": 9.242105263157896e-06, + "loss": 0.5202, + "step": 5040 + }, + { + "epoch": 7.59, + "grad_norm": 7.091256618499756, + "learning_rate": 9.240601503759398e-06, + "loss": 0.5255, + "step": 5050 + }, + { + "epoch": 7.61, + "grad_norm": 6.818326473236084, + "learning_rate": 9.239097744360903e-06, + "loss": 0.442, + "step": 5060 + }, + { + "epoch": 7.62, + "grad_norm": 7.494906902313232, + "learning_rate": 9.237593984962407e-06, + "loss": 0.5126, + "step": 5070 + }, + { + "epoch": 7.64, + "grad_norm": 5.982577800750732, + "learning_rate": 9.23609022556391e-06, + "loss": 0.5056, + "step": 5080 + }, + { + "epoch": 7.65, + "grad_norm": 4.815781116485596, + "learning_rate": 9.234586466165414e-06, + "loss": 0.4896, + "step": 5090 + }, + { + "epoch": 7.67, + "grad_norm": 10.674721717834473, + "learning_rate": 9.233082706766918e-06, + "loss": 0.5209, + "step": 5100 + }, + { + "epoch": 7.68, + "grad_norm": 5.937568187713623, + "learning_rate": 9.231578947368421e-06, + "loss": 0.4772, + "step": 5110 + }, + { + "epoch": 7.7, + "grad_norm": 5.146367073059082, + "learning_rate": 9.230075187969926e-06, + "loss": 0.5032, + "step": 5120 + }, + { + "epoch": 7.71, + "grad_norm": 8.03272819519043, + "learning_rate": 9.22857142857143e-06, + "loss": 0.5072, + "step": 5130 + }, + { + "epoch": 7.73, + "grad_norm": 5.361180782318115, + "learning_rate": 9.227067669172933e-06, + "loss": 0.5057, + "step": 5140 + }, + { + "epoch": 7.74, + "grad_norm": 5.487973213195801, + "learning_rate": 9.225563909774437e-06, + "loss": 0.5253, + "step": 5150 + }, + { + "epoch": 7.76, + "grad_norm": 6.845251560211182, + "learning_rate": 9.22406015037594e-06, + "loss": 0.5436, + "step": 5160 + }, + { + "epoch": 7.77, + "grad_norm": 4.931974411010742, + "learning_rate": 9.222556390977444e-06, + "loss": 0.4227, + "step": 5170 + }, + { + "epoch": 7.79, + "grad_norm": 7.382147312164307, + "learning_rate": 9.221052631578949e-06, + "loss": 0.5022, + "step": 5180 + }, + { + "epoch": 7.8, + "grad_norm": 8.380685806274414, + "learning_rate": 9.219548872180451e-06, + "loss": 0.5259, + "step": 5190 + }, + { + "epoch": 7.82, + "grad_norm": 6.625802993774414, + "learning_rate": 9.218045112781956e-06, + "loss": 0.5213, + "step": 5200 + }, + { + "epoch": 7.83, + "grad_norm": 6.7276692390441895, + "learning_rate": 9.21654135338346e-06, + "loss": 0.4831, + "step": 5210 + }, + { + "epoch": 7.85, + "grad_norm": 5.930064678192139, + "learning_rate": 9.215037593984963e-06, + "loss": 0.4278, + "step": 5220 + }, + { + "epoch": 7.86, + "grad_norm": 5.958808422088623, + "learning_rate": 9.213533834586467e-06, + "loss": 0.5039, + "step": 5230 + }, + { + "epoch": 7.88, + "grad_norm": 8.592114448547363, + "learning_rate": 9.21203007518797e-06, + "loss": 0.48, + "step": 5240 + }, + { + "epoch": 7.89, + "grad_norm": 7.286666393280029, + "learning_rate": 9.210526315789474e-06, + "loss": 0.5513, + "step": 5250 + }, + { + "epoch": 7.91, + "grad_norm": 3.8542234897613525, + "learning_rate": 9.209022556390979e-06, + "loss": 0.4452, + "step": 5260 + }, + { + "epoch": 7.92, + "grad_norm": 8.812358856201172, + "learning_rate": 9.207518796992482e-06, + "loss": 0.4796, + "step": 5270 + }, + { + "epoch": 7.94, + "grad_norm": 7.729457378387451, + "learning_rate": 9.206015037593986e-06, + "loss": 0.5283, + "step": 5280 + }, + { + "epoch": 7.95, + "grad_norm": 5.370766639709473, + "learning_rate": 9.20451127819549e-06, + "loss": 0.557, + "step": 5290 + }, + { + "epoch": 7.97, + "grad_norm": 7.00390625, + "learning_rate": 9.203007518796993e-06, + "loss": 0.5266, + "step": 5300 + }, + { + "epoch": 7.98, + "grad_norm": 5.945902347564697, + "learning_rate": 9.201503759398496e-06, + "loss": 0.505, + "step": 5310 + }, + { + "epoch": 8.0, + "grad_norm": 39.0330924987793, + "learning_rate": 9.200000000000002e-06, + "loss": 0.5673, + "step": 5320 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.9324, + "eval_loss": 0.2575855553150177, + "eval_runtime": 85.1441, + "eval_samples_per_second": 117.448, + "eval_steps_per_second": 0.47, + "step": 5320 + }, + { + "epoch": 8.02, + "grad_norm": 7.23183012008667, + "learning_rate": 9.198496240601504e-06, + "loss": 0.4209, + "step": 5330 + }, + { + "epoch": 8.03, + "grad_norm": 5.357458591461182, + "learning_rate": 9.196992481203009e-06, + "loss": 0.4701, + "step": 5340 + }, + { + "epoch": 8.05, + "grad_norm": 9.471860885620117, + "learning_rate": 9.195488721804512e-06, + "loss": 0.4924, + "step": 5350 + }, + { + "epoch": 8.06, + "grad_norm": 7.7437214851379395, + "learning_rate": 9.193984962406016e-06, + "loss": 0.4997, + "step": 5360 + }, + { + "epoch": 8.08, + "grad_norm": 6.697991371154785, + "learning_rate": 9.19248120300752e-06, + "loss": 0.4365, + "step": 5370 + }, + { + "epoch": 8.09, + "grad_norm": 9.78630542755127, + "learning_rate": 9.190977443609025e-06, + "loss": 0.5572, + "step": 5380 + }, + { + "epoch": 8.11, + "grad_norm": 6.197582244873047, + "learning_rate": 9.189473684210526e-06, + "loss": 0.4581, + "step": 5390 + }, + { + "epoch": 8.12, + "grad_norm": 5.565506458282471, + "learning_rate": 9.187969924812032e-06, + "loss": 0.5219, + "step": 5400 + }, + { + "epoch": 8.14, + "grad_norm": 5.3856353759765625, + "learning_rate": 9.186466165413535e-06, + "loss": 0.5744, + "step": 5410 + }, + { + "epoch": 8.15, + "grad_norm": 6.119697093963623, + "learning_rate": 9.184962406015039e-06, + "loss": 0.3964, + "step": 5420 + }, + { + "epoch": 8.17, + "grad_norm": 5.344573497772217, + "learning_rate": 9.183458646616542e-06, + "loss": 0.4816, + "step": 5430 + }, + { + "epoch": 8.18, + "grad_norm": 5.624027729034424, + "learning_rate": 9.181954887218046e-06, + "loss": 0.4341, + "step": 5440 + }, + { + "epoch": 8.2, + "grad_norm": 4.847854137420654, + "learning_rate": 9.180451127819549e-06, + "loss": 0.5471, + "step": 5450 + }, + { + "epoch": 8.21, + "grad_norm": 6.051102638244629, + "learning_rate": 9.178947368421053e-06, + "loss": 0.5593, + "step": 5460 + }, + { + "epoch": 8.23, + "grad_norm": 8.345075607299805, + "learning_rate": 9.177443609022558e-06, + "loss": 0.4132, + "step": 5470 + }, + { + "epoch": 8.24, + "grad_norm": 5.369195938110352, + "learning_rate": 9.17593984962406e-06, + "loss": 0.4844, + "step": 5480 + }, + { + "epoch": 8.26, + "grad_norm": 6.0165228843688965, + "learning_rate": 9.174436090225565e-06, + "loss": 0.4712, + "step": 5490 + }, + { + "epoch": 8.27, + "grad_norm": 4.54939079284668, + "learning_rate": 9.172932330827068e-06, + "loss": 0.4956, + "step": 5500 + }, + { + "epoch": 8.29, + "grad_norm": 5.021441459655762, + "learning_rate": 9.171428571428572e-06, + "loss": 0.4428, + "step": 5510 + }, + { + "epoch": 8.3, + "grad_norm": 7.0452446937561035, + "learning_rate": 9.169924812030075e-06, + "loss": 0.5036, + "step": 5520 + }, + { + "epoch": 8.32, + "grad_norm": 8.039512634277344, + "learning_rate": 9.168421052631579e-06, + "loss": 0.478, + "step": 5530 + }, + { + "epoch": 8.33, + "grad_norm": 6.608015537261963, + "learning_rate": 9.166917293233083e-06, + "loss": 0.4338, + "step": 5540 + }, + { + "epoch": 8.35, + "grad_norm": 6.659971714019775, + "learning_rate": 9.165413533834588e-06, + "loss": 0.5033, + "step": 5550 + }, + { + "epoch": 8.36, + "grad_norm": 5.663388252258301, + "learning_rate": 9.163909774436091e-06, + "loss": 0.4779, + "step": 5560 + }, + { + "epoch": 8.38, + "grad_norm": 5.978389739990234, + "learning_rate": 9.162406015037595e-06, + "loss": 0.4615, + "step": 5570 + }, + { + "epoch": 8.39, + "grad_norm": 5.943080425262451, + "learning_rate": 9.160902255639098e-06, + "loss": 0.4282, + "step": 5580 + }, + { + "epoch": 8.41, + "grad_norm": 3.245058536529541, + "learning_rate": 9.159398496240602e-06, + "loss": 0.4423, + "step": 5590 + }, + { + "epoch": 8.42, + "grad_norm": 5.73254919052124, + "learning_rate": 9.157894736842105e-06, + "loss": 0.4777, + "step": 5600 + }, + { + "epoch": 8.44, + "grad_norm": 6.487976551055908, + "learning_rate": 9.15639097744361e-06, + "loss": 0.5132, + "step": 5610 + }, + { + "epoch": 8.45, + "grad_norm": 4.1268463134765625, + "learning_rate": 9.154887218045112e-06, + "loss": 0.3942, + "step": 5620 + }, + { + "epoch": 8.47, + "grad_norm": 7.593535900115967, + "learning_rate": 9.153383458646618e-06, + "loss": 0.5109, + "step": 5630 + }, + { + "epoch": 8.48, + "grad_norm": 4.127936840057373, + "learning_rate": 9.151879699248121e-06, + "loss": 0.4554, + "step": 5640 + }, + { + "epoch": 8.5, + "grad_norm": 12.721508026123047, + "learning_rate": 9.150375939849625e-06, + "loss": 0.5204, + "step": 5650 + }, + { + "epoch": 8.51, + "grad_norm": 7.618612289428711, + "learning_rate": 9.148872180451128e-06, + "loss": 0.5095, + "step": 5660 + }, + { + "epoch": 8.53, + "grad_norm": 5.089692115783691, + "learning_rate": 9.147368421052633e-06, + "loss": 0.478, + "step": 5670 + }, + { + "epoch": 8.54, + "grad_norm": 6.890159606933594, + "learning_rate": 9.145864661654135e-06, + "loss": 0.4634, + "step": 5680 + }, + { + "epoch": 8.56, + "grad_norm": 9.275102615356445, + "learning_rate": 9.14436090225564e-06, + "loss": 0.528, + "step": 5690 + }, + { + "epoch": 8.57, + "grad_norm": 4.839653015136719, + "learning_rate": 9.142857142857144e-06, + "loss": 0.4558, + "step": 5700 + }, + { + "epoch": 8.59, + "grad_norm": 7.7605791091918945, + "learning_rate": 9.141353383458647e-06, + "loss": 0.5086, + "step": 5710 + }, + { + "epoch": 8.6, + "grad_norm": 6.753016948699951, + "learning_rate": 9.139849624060151e-06, + "loss": 0.4953, + "step": 5720 + }, + { + "epoch": 8.62, + "grad_norm": 10.297369003295898, + "learning_rate": 9.138345864661654e-06, + "loss": 0.454, + "step": 5730 + }, + { + "epoch": 8.63, + "grad_norm": 5.704922676086426, + "learning_rate": 9.136842105263158e-06, + "loss": 0.4504, + "step": 5740 + }, + { + "epoch": 8.65, + "grad_norm": 5.4977030754089355, + "learning_rate": 9.135338345864663e-06, + "loss": 0.5203, + "step": 5750 + }, + { + "epoch": 8.66, + "grad_norm": 7.531189918518066, + "learning_rate": 9.133834586466167e-06, + "loss": 0.4534, + "step": 5760 + }, + { + "epoch": 8.68, + "grad_norm": 6.660569667816162, + "learning_rate": 9.13233082706767e-06, + "loss": 0.577, + "step": 5770 + }, + { + "epoch": 8.69, + "grad_norm": 6.752721309661865, + "learning_rate": 9.130827067669174e-06, + "loss": 0.4574, + "step": 5780 + }, + { + "epoch": 8.71, + "grad_norm": 7.526786804199219, + "learning_rate": 9.129323308270677e-06, + "loss": 0.4511, + "step": 5790 + }, + { + "epoch": 8.72, + "grad_norm": 6.5963239669799805, + "learning_rate": 9.12781954887218e-06, + "loss": 0.4917, + "step": 5800 + }, + { + "epoch": 8.74, + "grad_norm": 4.203681468963623, + "learning_rate": 9.126315789473686e-06, + "loss": 0.5218, + "step": 5810 + }, + { + "epoch": 8.75, + "grad_norm": 4.367255210876465, + "learning_rate": 9.124812030075188e-06, + "loss": 0.4162, + "step": 5820 + }, + { + "epoch": 8.77, + "grad_norm": 3.8670730590820312, + "learning_rate": 9.123308270676693e-06, + "loss": 0.3913, + "step": 5830 + }, + { + "epoch": 8.78, + "grad_norm": 9.634267807006836, + "learning_rate": 9.121804511278197e-06, + "loss": 0.5062, + "step": 5840 + }, + { + "epoch": 8.8, + "grad_norm": 2.509295701980591, + "learning_rate": 9.1203007518797e-06, + "loss": 0.4422, + "step": 5850 + }, + { + "epoch": 8.81, + "grad_norm": 7.6244659423828125, + "learning_rate": 9.118796992481204e-06, + "loss": 0.4566, + "step": 5860 + }, + { + "epoch": 8.83, + "grad_norm": 6.837118625640869, + "learning_rate": 9.117293233082709e-06, + "loss": 0.5095, + "step": 5870 + }, + { + "epoch": 8.84, + "grad_norm": 4.819979667663574, + "learning_rate": 9.11578947368421e-06, + "loss": 0.4471, + "step": 5880 + }, + { + "epoch": 8.86, + "grad_norm": 6.350512504577637, + "learning_rate": 9.114285714285716e-06, + "loss": 0.4751, + "step": 5890 + }, + { + "epoch": 8.87, + "grad_norm": 3.4793074131011963, + "learning_rate": 9.11278195488722e-06, + "loss": 0.4089, + "step": 5900 + }, + { + "epoch": 8.89, + "grad_norm": 5.1062774658203125, + "learning_rate": 9.111278195488723e-06, + "loss": 0.5624, + "step": 5910 + }, + { + "epoch": 8.9, + "grad_norm": 8.126543998718262, + "learning_rate": 9.109774436090226e-06, + "loss": 0.5146, + "step": 5920 + }, + { + "epoch": 8.92, + "grad_norm": 7.661808967590332, + "learning_rate": 9.10827067669173e-06, + "loss": 0.4602, + "step": 5930 + }, + { + "epoch": 8.93, + "grad_norm": 6.820888996124268, + "learning_rate": 9.106766917293233e-06, + "loss": 0.4827, + "step": 5940 + }, + { + "epoch": 8.95, + "grad_norm": 24.169485092163086, + "learning_rate": 9.105263157894739e-06, + "loss": 0.5302, + "step": 5950 + }, + { + "epoch": 8.96, + "grad_norm": 5.068043231964111, + "learning_rate": 9.103759398496242e-06, + "loss": 0.5453, + "step": 5960 + }, + { + "epoch": 8.98, + "grad_norm": 5.819450378417969, + "learning_rate": 9.102255639097744e-06, + "loss": 0.4811, + "step": 5970 + }, + { + "epoch": 8.99, + "grad_norm": 4.129781723022461, + "learning_rate": 9.10075187969925e-06, + "loss": 0.4613, + "step": 5980 + }, + { + "epoch": 9.0, + "eval_accuracy": 0.9311, + "eval_loss": 0.25862327218055725, + "eval_runtime": 84.8621, + "eval_samples_per_second": 117.838, + "eval_steps_per_second": 0.471, + "step": 5985 + }, + { + "epoch": 9.01, + "grad_norm": 6.844424247741699, + "learning_rate": 9.099248120300753e-06, + "loss": 0.4154, + "step": 5990 + }, + { + "epoch": 9.02, + "grad_norm": 5.792689323425293, + "learning_rate": 9.097744360902256e-06, + "loss": 0.4345, + "step": 6000 + }, + { + "epoch": 9.04, + "grad_norm": 5.300471305847168, + "learning_rate": 9.09624060150376e-06, + "loss": 0.3986, + "step": 6010 + }, + { + "epoch": 9.05, + "grad_norm": 11.9564208984375, + "learning_rate": 9.094736842105263e-06, + "loss": 0.4255, + "step": 6020 + }, + { + "epoch": 9.07, + "grad_norm": 6.798022270202637, + "learning_rate": 9.093233082706767e-06, + "loss": 0.4296, + "step": 6030 + }, + { + "epoch": 9.08, + "grad_norm": 7.78212308883667, + "learning_rate": 9.091729323308272e-06, + "loss": 0.4373, + "step": 6040 + }, + { + "epoch": 9.1, + "grad_norm": 6.719583988189697, + "learning_rate": 9.090225563909776e-06, + "loss": 0.4943, + "step": 6050 + }, + { + "epoch": 9.11, + "grad_norm": 8.298834800720215, + "learning_rate": 9.088721804511279e-06, + "loss": 0.3937, + "step": 6060 + }, + { + "epoch": 9.13, + "grad_norm": 4.731727600097656, + "learning_rate": 9.087218045112783e-06, + "loss": 0.4711, + "step": 6070 + }, + { + "epoch": 9.14, + "grad_norm": 6.207810878753662, + "learning_rate": 9.085714285714286e-06, + "loss": 0.4565, + "step": 6080 + }, + { + "epoch": 9.16, + "grad_norm": 6.939966678619385, + "learning_rate": 9.08421052631579e-06, + "loss": 0.5276, + "step": 6090 + }, + { + "epoch": 9.17, + "grad_norm": 9.00831127166748, + "learning_rate": 9.082706766917295e-06, + "loss": 0.5558, + "step": 6100 + }, + { + "epoch": 9.19, + "grad_norm": 4.730199813842773, + "learning_rate": 9.081203007518797e-06, + "loss": 0.4826, + "step": 6110 + }, + { + "epoch": 9.2, + "grad_norm": 4.198337078094482, + "learning_rate": 9.079699248120302e-06, + "loss": 0.3978, + "step": 6120 + }, + { + "epoch": 9.22, + "grad_norm": 5.722704887390137, + "learning_rate": 9.078195488721805e-06, + "loss": 0.4622, + "step": 6130 + }, + { + "epoch": 9.23, + "grad_norm": 8.497228622436523, + "learning_rate": 9.076691729323309e-06, + "loss": 0.4726, + "step": 6140 + }, + { + "epoch": 9.25, + "grad_norm": 7.046009063720703, + "learning_rate": 9.075187969924812e-06, + "loss": 0.527, + "step": 6150 + }, + { + "epoch": 9.26, + "grad_norm": 7.972896099090576, + "learning_rate": 9.073684210526316e-06, + "loss": 0.3072, + "step": 6160 + }, + { + "epoch": 9.28, + "grad_norm": 8.850788116455078, + "learning_rate": 9.07218045112782e-06, + "loss": 0.4753, + "step": 6170 + }, + { + "epoch": 9.29, + "grad_norm": 6.064061641693115, + "learning_rate": 9.070676691729325e-06, + "loss": 0.4009, + "step": 6180 + }, + { + "epoch": 9.31, + "grad_norm": 6.12713098526001, + "learning_rate": 9.069172932330828e-06, + "loss": 0.4786, + "step": 6190 + }, + { + "epoch": 9.32, + "grad_norm": 8.643204689025879, + "learning_rate": 9.067669172932332e-06, + "loss": 0.5134, + "step": 6200 + }, + { + "epoch": 9.34, + "grad_norm": 7.257277488708496, + "learning_rate": 9.066165413533835e-06, + "loss": 0.4474, + "step": 6210 + }, + { + "epoch": 9.35, + "grad_norm": 7.62333869934082, + "learning_rate": 9.064661654135339e-06, + "loss": 0.4579, + "step": 6220 + }, + { + "epoch": 9.37, + "grad_norm": 9.346735954284668, + "learning_rate": 9.063157894736842e-06, + "loss": 0.4714, + "step": 6230 + }, + { + "epoch": 9.38, + "grad_norm": 3.8007750511169434, + "learning_rate": 9.061654135338347e-06, + "loss": 0.4334, + "step": 6240 + }, + { + "epoch": 9.4, + "grad_norm": 6.266302108764648, + "learning_rate": 9.06015037593985e-06, + "loss": 0.4704, + "step": 6250 + }, + { + "epoch": 9.41, + "grad_norm": 6.959786891937256, + "learning_rate": 9.058646616541355e-06, + "loss": 0.5353, + "step": 6260 + }, + { + "epoch": 9.43, + "grad_norm": 6.572616100311279, + "learning_rate": 9.057142857142858e-06, + "loss": 0.4629, + "step": 6270 + }, + { + "epoch": 9.44, + "grad_norm": 5.961916446685791, + "learning_rate": 9.055639097744362e-06, + "loss": 0.5119, + "step": 6280 + }, + { + "epoch": 9.46, + "grad_norm": 6.547915935516357, + "learning_rate": 9.054135338345865e-06, + "loss": 0.4576, + "step": 6290 + }, + { + "epoch": 9.47, + "grad_norm": 6.359402179718018, + "learning_rate": 9.05263157894737e-06, + "loss": 0.3692, + "step": 6300 + }, + { + "epoch": 9.49, + "grad_norm": 7.048614501953125, + "learning_rate": 9.051127819548872e-06, + "loss": 0.3911, + "step": 6310 + }, + { + "epoch": 9.5, + "grad_norm": 5.198198318481445, + "learning_rate": 9.049624060150377e-06, + "loss": 0.475, + "step": 6320 + }, + { + "epoch": 9.52, + "grad_norm": 5.797221660614014, + "learning_rate": 9.04812030075188e-06, + "loss": 0.3771, + "step": 6330 + }, + { + "epoch": 9.53, + "grad_norm": 5.751585483551025, + "learning_rate": 9.046616541353384e-06, + "loss": 0.497, + "step": 6340 + }, + { + "epoch": 9.55, + "grad_norm": 9.54306697845459, + "learning_rate": 9.045112781954888e-06, + "loss": 0.3976, + "step": 6350 + }, + { + "epoch": 9.56, + "grad_norm": 8.968032836914062, + "learning_rate": 9.043609022556391e-06, + "loss": 0.4252, + "step": 6360 + }, + { + "epoch": 9.58, + "grad_norm": 13.218304634094238, + "learning_rate": 9.042105263157895e-06, + "loss": 0.5191, + "step": 6370 + }, + { + "epoch": 9.59, + "grad_norm": 4.405686855316162, + "learning_rate": 9.0406015037594e-06, + "loss": 0.4149, + "step": 6380 + }, + { + "epoch": 9.61, + "grad_norm": 4.863158702850342, + "learning_rate": 9.039097744360904e-06, + "loss": 0.4875, + "step": 6390 + }, + { + "epoch": 9.62, + "grad_norm": 6.247385501861572, + "learning_rate": 9.037593984962407e-06, + "loss": 0.4106, + "step": 6400 + }, + { + "epoch": 9.64, + "grad_norm": 6.554888725280762, + "learning_rate": 9.03609022556391e-06, + "loss": 0.4645, + "step": 6410 + }, + { + "epoch": 9.65, + "grad_norm": 7.249465465545654, + "learning_rate": 9.034586466165414e-06, + "loss": 0.4291, + "step": 6420 + }, + { + "epoch": 9.67, + "grad_norm": 6.810882568359375, + "learning_rate": 9.033082706766918e-06, + "loss": 0.4637, + "step": 6430 + }, + { + "epoch": 9.68, + "grad_norm": 2.65733003616333, + "learning_rate": 9.031578947368423e-06, + "loss": 0.4585, + "step": 6440 + }, + { + "epoch": 9.7, + "grad_norm": 8.63343334197998, + "learning_rate": 9.030075187969925e-06, + "loss": 0.5187, + "step": 6450 + }, + { + "epoch": 9.71, + "grad_norm": 5.558303356170654, + "learning_rate": 9.028571428571428e-06, + "loss": 0.4819, + "step": 6460 + }, + { + "epoch": 9.73, + "grad_norm": 8.711833000183105, + "learning_rate": 9.027067669172933e-06, + "loss": 0.4585, + "step": 6470 + }, + { + "epoch": 9.74, + "grad_norm": 7.882017135620117, + "learning_rate": 9.025563909774437e-06, + "loss": 0.5154, + "step": 6480 + }, + { + "epoch": 9.76, + "grad_norm": 5.849830150604248, + "learning_rate": 9.02406015037594e-06, + "loss": 0.5093, + "step": 6490 + }, + { + "epoch": 9.77, + "grad_norm": 7.546263217926025, + "learning_rate": 9.022556390977444e-06, + "loss": 0.3969, + "step": 6500 + }, + { + "epoch": 9.79, + "grad_norm": 7.114614963531494, + "learning_rate": 9.021052631578948e-06, + "loss": 0.4485, + "step": 6510 + }, + { + "epoch": 9.8, + "grad_norm": 7.6169209480285645, + "learning_rate": 9.019548872180451e-06, + "loss": 0.425, + "step": 6520 + }, + { + "epoch": 9.82, + "grad_norm": 5.7843403816223145, + "learning_rate": 9.018045112781956e-06, + "loss": 0.4171, + "step": 6530 + }, + { + "epoch": 9.83, + "grad_norm": 6.0503082275390625, + "learning_rate": 9.016541353383458e-06, + "loss": 0.483, + "step": 6540 + }, + { + "epoch": 9.85, + "grad_norm": 7.677584648132324, + "learning_rate": 9.015037593984963e-06, + "loss": 0.4747, + "step": 6550 + }, + { + "epoch": 9.86, + "grad_norm": 5.793139934539795, + "learning_rate": 9.013533834586467e-06, + "loss": 0.3621, + "step": 6560 + }, + { + "epoch": 9.88, + "grad_norm": 6.399969577789307, + "learning_rate": 9.01203007518797e-06, + "loss": 0.4373, + "step": 6570 + }, + { + "epoch": 9.89, + "grad_norm": 10.296338081359863, + "learning_rate": 9.010526315789474e-06, + "loss": 0.417, + "step": 6580 + }, + { + "epoch": 9.91, + "grad_norm": 6.193917274475098, + "learning_rate": 9.009022556390979e-06, + "loss": 0.4419, + "step": 6590 + }, + { + "epoch": 9.92, + "grad_norm": 3.921016216278076, + "learning_rate": 9.007518796992481e-06, + "loss": 0.3981, + "step": 6600 + }, + { + "epoch": 9.94, + "grad_norm": 6.30132532119751, + "learning_rate": 9.006015037593986e-06, + "loss": 0.4699, + "step": 6610 + }, + { + "epoch": 9.95, + "grad_norm": 8.901771545410156, + "learning_rate": 9.00451127819549e-06, + "loss": 0.4308, + "step": 6620 + }, + { + "epoch": 9.97, + "grad_norm": 5.031552314758301, + "learning_rate": 9.003007518796993e-06, + "loss": 0.48, + "step": 6630 + }, + { + "epoch": 9.98, + "grad_norm": 5.636510372161865, + "learning_rate": 9.001503759398497e-06, + "loss": 0.403, + "step": 6640 + }, + { + "epoch": 10.0, + "grad_norm": 3.5294342041015625, + "learning_rate": 9e-06, + "loss": 0.4179, + "step": 6650 + }, + { + "epoch": 10.0, + "eval_accuracy": 0.9285, + "eval_loss": 0.2555387318134308, + "eval_runtime": 84.7099, + "eval_samples_per_second": 118.05, + "eval_steps_per_second": 0.472, + "step": 6650 + }, + { + "epoch": 10.02, + "grad_norm": 6.522907257080078, + "learning_rate": 8.998496240601504e-06, + "loss": 0.4525, + "step": 6660 + }, + { + "epoch": 10.03, + "grad_norm": 6.142210006713867, + "learning_rate": 8.996992481203009e-06, + "loss": 0.3998, + "step": 6670 + }, + { + "epoch": 10.05, + "grad_norm": 7.781100749969482, + "learning_rate": 8.995488721804512e-06, + "loss": 0.4122, + "step": 6680 + }, + { + "epoch": 10.06, + "grad_norm": 5.448252201080322, + "learning_rate": 8.993984962406016e-06, + "loss": 0.461, + "step": 6690 + }, + { + "epoch": 10.08, + "grad_norm": 7.063671588897705, + "learning_rate": 8.99248120300752e-06, + "loss": 0.432, + "step": 6700 + }, + { + "epoch": 10.09, + "grad_norm": 6.696626663208008, + "learning_rate": 8.990977443609023e-06, + "loss": 0.4208, + "step": 6710 + }, + { + "epoch": 10.11, + "grad_norm": 6.5666656494140625, + "learning_rate": 8.989473684210527e-06, + "loss": 0.4527, + "step": 6720 + }, + { + "epoch": 10.12, + "grad_norm": 8.801324844360352, + "learning_rate": 8.987969924812032e-06, + "loss": 0.442, + "step": 6730 + }, + { + "epoch": 10.14, + "grad_norm": 6.743152141571045, + "learning_rate": 8.986466165413534e-06, + "loss": 0.4424, + "step": 6740 + }, + { + "epoch": 10.15, + "grad_norm": 5.408703327178955, + "learning_rate": 8.984962406015039e-06, + "loss": 0.4548, + "step": 6750 + }, + { + "epoch": 10.17, + "grad_norm": 8.466784477233887, + "learning_rate": 8.983458646616542e-06, + "loss": 0.4355, + "step": 6760 + }, + { + "epoch": 10.18, + "grad_norm": 5.309767723083496, + "learning_rate": 8.981954887218046e-06, + "loss": 0.4023, + "step": 6770 + }, + { + "epoch": 10.2, + "grad_norm": 3.3604421615600586, + "learning_rate": 8.98045112781955e-06, + "loss": 0.4367, + "step": 6780 + }, + { + "epoch": 10.21, + "grad_norm": 6.275347709655762, + "learning_rate": 8.978947368421055e-06, + "loss": 0.4764, + "step": 6790 + }, + { + "epoch": 10.23, + "grad_norm": 6.770579814910889, + "learning_rate": 8.977443609022556e-06, + "loss": 0.4414, + "step": 6800 + }, + { + "epoch": 10.24, + "grad_norm": 8.56733512878418, + "learning_rate": 8.975939849624062e-06, + "loss": 0.4426, + "step": 6810 + }, + { + "epoch": 10.26, + "grad_norm": 6.006712436676025, + "learning_rate": 8.974436090225565e-06, + "loss": 0.3702, + "step": 6820 + }, + { + "epoch": 10.27, + "grad_norm": 4.649052143096924, + "learning_rate": 8.972932330827069e-06, + "loss": 0.5371, + "step": 6830 + }, + { + "epoch": 10.29, + "grad_norm": 9.080769538879395, + "learning_rate": 8.971428571428572e-06, + "loss": 0.4866, + "step": 6840 + }, + { + "epoch": 10.3, + "grad_norm": 5.778624534606934, + "learning_rate": 8.969924812030076e-06, + "loss": 0.381, + "step": 6850 + }, + { + "epoch": 10.32, + "grad_norm": 7.814187049865723, + "learning_rate": 8.96842105263158e-06, + "loss": 0.5162, + "step": 6860 + }, + { + "epoch": 10.33, + "grad_norm": 5.049838542938232, + "learning_rate": 8.966917293233084e-06, + "loss": 0.4879, + "step": 6870 + }, + { + "epoch": 10.35, + "grad_norm": 8.096096992492676, + "learning_rate": 8.965413533834588e-06, + "loss": 0.4726, + "step": 6880 + }, + { + "epoch": 10.36, + "grad_norm": 7.028320789337158, + "learning_rate": 8.963909774436091e-06, + "loss": 0.4424, + "step": 6890 + }, + { + "epoch": 10.38, + "grad_norm": 4.826821804046631, + "learning_rate": 8.962406015037595e-06, + "loss": 0.4552, + "step": 6900 + }, + { + "epoch": 10.39, + "grad_norm": 8.392495155334473, + "learning_rate": 8.960902255639098e-06, + "loss": 0.4378, + "step": 6910 + }, + { + "epoch": 10.41, + "grad_norm": 4.868290424346924, + "learning_rate": 8.959398496240602e-06, + "loss": 0.4151, + "step": 6920 + }, + { + "epoch": 10.42, + "grad_norm": 6.117234230041504, + "learning_rate": 8.957894736842107e-06, + "loss": 0.5149, + "step": 6930 + }, + { + "epoch": 10.44, + "grad_norm": 9.33238697052002, + "learning_rate": 8.956390977443609e-06, + "loss": 0.3984, + "step": 6940 + }, + { + "epoch": 10.45, + "grad_norm": 9.559886932373047, + "learning_rate": 8.954887218045113e-06, + "loss": 0.4171, + "step": 6950 + }, + { + "epoch": 10.47, + "grad_norm": 4.344634056091309, + "learning_rate": 8.953383458646618e-06, + "loss": 0.4419, + "step": 6960 + }, + { + "epoch": 10.48, + "grad_norm": 5.508487701416016, + "learning_rate": 8.951879699248121e-06, + "loss": 0.4605, + "step": 6970 + }, + { + "epoch": 10.5, + "grad_norm": 5.529686450958252, + "learning_rate": 8.950375939849625e-06, + "loss": 0.4004, + "step": 6980 + }, + { + "epoch": 10.51, + "grad_norm": 5.424170970916748, + "learning_rate": 8.948872180451128e-06, + "loss": 0.4351, + "step": 6990 + }, + { + "epoch": 10.53, + "grad_norm": 6.121506690979004, + "learning_rate": 8.947368421052632e-06, + "loss": 0.4198, + "step": 7000 + }, + { + "epoch": 10.54, + "grad_norm": 4.664872169494629, + "learning_rate": 8.945864661654135e-06, + "loss": 0.3917, + "step": 7010 + }, + { + "epoch": 10.56, + "grad_norm": 5.378602027893066, + "learning_rate": 8.94436090225564e-06, + "loss": 0.47, + "step": 7020 + }, + { + "epoch": 10.57, + "grad_norm": 8.281057357788086, + "learning_rate": 8.942857142857142e-06, + "loss": 0.3779, + "step": 7030 + }, + { + "epoch": 10.59, + "grad_norm": 5.378328800201416, + "learning_rate": 8.941353383458648e-06, + "loss": 0.4878, + "step": 7040 + }, + { + "epoch": 10.6, + "grad_norm": 4.809008598327637, + "learning_rate": 8.939849624060151e-06, + "loss": 0.4409, + "step": 7050 + }, + { + "epoch": 10.62, + "grad_norm": 6.703794002532959, + "learning_rate": 8.938345864661655e-06, + "loss": 0.45, + "step": 7060 + }, + { + "epoch": 10.63, + "grad_norm": 10.097111701965332, + "learning_rate": 8.936842105263158e-06, + "loss": 0.4442, + "step": 7070 + }, + { + "epoch": 10.65, + "grad_norm": 5.404522895812988, + "learning_rate": 8.935338345864662e-06, + "loss": 0.4289, + "step": 7080 + }, + { + "epoch": 10.66, + "grad_norm": 2.983161449432373, + "learning_rate": 8.933834586466165e-06, + "loss": 0.4072, + "step": 7090 + }, + { + "epoch": 10.68, + "grad_norm": 6.501340389251709, + "learning_rate": 8.93233082706767e-06, + "loss": 0.4344, + "step": 7100 + }, + { + "epoch": 10.69, + "grad_norm": 7.439212322235107, + "learning_rate": 8.930827067669174e-06, + "loss": 0.4208, + "step": 7110 + }, + { + "epoch": 10.71, + "grad_norm": 6.9180192947387695, + "learning_rate": 8.929323308270677e-06, + "loss": 0.4901, + "step": 7120 + }, + { + "epoch": 10.72, + "grad_norm": 4.9598212242126465, + "learning_rate": 8.927819548872181e-06, + "loss": 0.4551, + "step": 7130 + }, + { + "epoch": 10.74, + "grad_norm": 7.020519256591797, + "learning_rate": 8.926315789473685e-06, + "loss": 0.4469, + "step": 7140 + }, + { + "epoch": 10.75, + "grad_norm": 6.747496604919434, + "learning_rate": 8.924812030075188e-06, + "loss": 0.3626, + "step": 7150 + }, + { + "epoch": 10.77, + "grad_norm": 2.869495153427124, + "learning_rate": 8.923308270676693e-06, + "loss": 0.3794, + "step": 7160 + }, + { + "epoch": 10.78, + "grad_norm": 7.156761169433594, + "learning_rate": 8.921804511278195e-06, + "loss": 0.4909, + "step": 7170 + }, + { + "epoch": 10.8, + "grad_norm": 9.461006164550781, + "learning_rate": 8.9203007518797e-06, + "loss": 0.4487, + "step": 7180 + }, + { + "epoch": 10.81, + "grad_norm": 5.75421142578125, + "learning_rate": 8.918796992481204e-06, + "loss": 0.4953, + "step": 7190 + }, + { + "epoch": 10.83, + "grad_norm": 4.186371326446533, + "learning_rate": 8.917293233082707e-06, + "loss": 0.3788, + "step": 7200 + }, + { + "epoch": 10.84, + "grad_norm": 6.402685165405273, + "learning_rate": 8.915789473684211e-06, + "loss": 0.5009, + "step": 7210 + }, + { + "epoch": 10.86, + "grad_norm": 10.709757804870605, + "learning_rate": 8.914285714285716e-06, + "loss": 0.5308, + "step": 7220 + }, + { + "epoch": 10.87, + "grad_norm": 8.926152229309082, + "learning_rate": 8.912781954887218e-06, + "loss": 0.4461, + "step": 7230 + }, + { + "epoch": 10.89, + "grad_norm": 6.41901969909668, + "learning_rate": 8.911278195488723e-06, + "loss": 0.4188, + "step": 7240 + }, + { + "epoch": 10.9, + "grad_norm": 4.931794166564941, + "learning_rate": 8.909774436090227e-06, + "loss": 0.4089, + "step": 7250 + }, + { + "epoch": 10.92, + "grad_norm": 7.75593376159668, + "learning_rate": 8.90827067669173e-06, + "loss": 0.5261, + "step": 7260 + }, + { + "epoch": 10.93, + "grad_norm": 9.013036727905273, + "learning_rate": 8.906766917293234e-06, + "loss": 0.3274, + "step": 7270 + }, + { + "epoch": 10.95, + "grad_norm": 6.653579235076904, + "learning_rate": 8.905263157894737e-06, + "loss": 0.4646, + "step": 7280 + }, + { + "epoch": 10.96, + "grad_norm": 5.304203987121582, + "learning_rate": 8.90375939849624e-06, + "loss": 0.4399, + "step": 7290 + }, + { + "epoch": 10.98, + "grad_norm": 40.1646842956543, + "learning_rate": 8.902255639097746e-06, + "loss": 0.3007, + "step": 7300 + }, + { + "epoch": 10.99, + "grad_norm": 5.538785934448242, + "learning_rate": 8.90075187969925e-06, + "loss": 0.4438, + "step": 7310 + }, + { + "epoch": 11.0, + "eval_accuracy": 0.9316, + "eval_loss": 0.25541195273399353, + "eval_runtime": 84.8107, + "eval_samples_per_second": 117.91, + "eval_steps_per_second": 0.472, + "step": 7315 + }, + { + "epoch": 11.01, + "grad_norm": 8.498943328857422, + "learning_rate": 8.899248120300753e-06, + "loss": 0.4009, + "step": 7320 + }, + { + "epoch": 11.02, + "grad_norm": 6.2147040367126465, + "learning_rate": 8.897744360902256e-06, + "loss": 0.4283, + "step": 7330 + }, + { + "epoch": 11.04, + "grad_norm": 5.028774261474609, + "learning_rate": 8.89624060150376e-06, + "loss": 0.4145, + "step": 7340 + }, + { + "epoch": 11.05, + "grad_norm": 7.040588855743408, + "learning_rate": 8.894736842105264e-06, + "loss": 0.3753, + "step": 7350 + }, + { + "epoch": 11.07, + "grad_norm": 4.658559322357178, + "learning_rate": 8.893233082706769e-06, + "loss": 0.4139, + "step": 7360 + }, + { + "epoch": 11.08, + "grad_norm": 7.867548942565918, + "learning_rate": 8.89172932330827e-06, + "loss": 0.4854, + "step": 7370 + }, + { + "epoch": 11.1, + "grad_norm": 8.354945182800293, + "learning_rate": 8.890225563909776e-06, + "loss": 0.4186, + "step": 7380 + }, + { + "epoch": 11.11, + "grad_norm": 6.198273658752441, + "learning_rate": 8.88872180451128e-06, + "loss": 0.4486, + "step": 7390 + }, + { + "epoch": 11.13, + "grad_norm": 7.401607990264893, + "learning_rate": 8.887218045112783e-06, + "loss": 0.4015, + "step": 7400 + }, + { + "epoch": 11.14, + "grad_norm": 5.412950038909912, + "learning_rate": 8.885714285714286e-06, + "loss": 0.3654, + "step": 7410 + }, + { + "epoch": 11.16, + "grad_norm": 3.7357654571533203, + "learning_rate": 8.884210526315792e-06, + "loss": 0.4165, + "step": 7420 + }, + { + "epoch": 11.17, + "grad_norm": 7.468185901641846, + "learning_rate": 8.882706766917293e-06, + "loss": 0.3241, + "step": 7430 + }, + { + "epoch": 11.19, + "grad_norm": 5.967494487762451, + "learning_rate": 8.881203007518799e-06, + "loss": 0.4364, + "step": 7440 + }, + { + "epoch": 11.2, + "grad_norm": 8.94781494140625, + "learning_rate": 8.879699248120302e-06, + "loss": 0.4927, + "step": 7450 + }, + { + "epoch": 11.22, + "grad_norm": 7.6195969581604, + "learning_rate": 8.878195488721804e-06, + "loss": 0.3722, + "step": 7460 + }, + { + "epoch": 11.23, + "grad_norm": 9.522473335266113, + "learning_rate": 8.876691729323309e-06, + "loss": 0.3974, + "step": 7470 + }, + { + "epoch": 11.25, + "grad_norm": 9.590860366821289, + "learning_rate": 8.875187969924813e-06, + "loss": 0.3842, + "step": 7480 + }, + { + "epoch": 11.26, + "grad_norm": 6.479350566864014, + "learning_rate": 8.873684210526316e-06, + "loss": 0.4135, + "step": 7490 + }, + { + "epoch": 11.28, + "grad_norm": 8.100231170654297, + "learning_rate": 8.87218045112782e-06, + "loss": 0.4262, + "step": 7500 + }, + { + "epoch": 11.29, + "grad_norm": 9.401702880859375, + "learning_rate": 8.870676691729325e-06, + "loss": 0.3899, + "step": 7510 + }, + { + "epoch": 11.31, + "grad_norm": 7.8885626792907715, + "learning_rate": 8.869172932330827e-06, + "loss": 0.4738, + "step": 7520 + }, + { + "epoch": 11.32, + "grad_norm": 7.2377753257751465, + "learning_rate": 8.867669172932332e-06, + "loss": 0.3686, + "step": 7530 + }, + { + "epoch": 11.34, + "grad_norm": 5.0235209465026855, + "learning_rate": 8.866165413533835e-06, + "loss": 0.3939, + "step": 7540 + }, + { + "epoch": 11.35, + "grad_norm": 6.832250595092773, + "learning_rate": 8.864661654135339e-06, + "loss": 0.4485, + "step": 7550 + }, + { + "epoch": 11.37, + "grad_norm": 8.186062812805176, + "learning_rate": 8.863157894736842e-06, + "loss": 0.4242, + "step": 7560 + }, + { + "epoch": 11.38, + "grad_norm": 5.467780113220215, + "learning_rate": 8.861654135338346e-06, + "loss": 0.4599, + "step": 7570 + }, + { + "epoch": 11.4, + "grad_norm": 6.155720233917236, + "learning_rate": 8.86015037593985e-06, + "loss": 0.5285, + "step": 7580 + }, + { + "epoch": 11.41, + "grad_norm": 6.44677734375, + "learning_rate": 8.858646616541355e-06, + "loss": 0.4931, + "step": 7590 + }, + { + "epoch": 11.43, + "grad_norm": 15.308818817138672, + "learning_rate": 8.857142857142858e-06, + "loss": 0.3899, + "step": 7600 + }, + { + "epoch": 11.44, + "grad_norm": 6.691050052642822, + "learning_rate": 8.855639097744362e-06, + "loss": 0.4313, + "step": 7610 + }, + { + "epoch": 11.46, + "grad_norm": 5.215397357940674, + "learning_rate": 8.854135338345865e-06, + "loss": 0.3528, + "step": 7620 + }, + { + "epoch": 11.47, + "grad_norm": 7.355811595916748, + "learning_rate": 8.852631578947369e-06, + "loss": 0.4402, + "step": 7630 + }, + { + "epoch": 11.49, + "grad_norm": 4.864825248718262, + "learning_rate": 8.851127819548872e-06, + "loss": 0.3485, + "step": 7640 + }, + { + "epoch": 11.5, + "grad_norm": 7.4907755851745605, + "learning_rate": 8.849624060150378e-06, + "loss": 0.4522, + "step": 7650 + }, + { + "epoch": 11.52, + "grad_norm": 6.480433464050293, + "learning_rate": 8.84812030075188e-06, + "loss": 0.4655, + "step": 7660 + }, + { + "epoch": 11.53, + "grad_norm": 5.072092056274414, + "learning_rate": 8.846616541353385e-06, + "loss": 0.3735, + "step": 7670 + }, + { + "epoch": 11.55, + "grad_norm": 10.207109451293945, + "learning_rate": 8.845112781954888e-06, + "loss": 0.3884, + "step": 7680 + }, + { + "epoch": 11.56, + "grad_norm": 5.795559883117676, + "learning_rate": 8.843609022556392e-06, + "loss": 0.4115, + "step": 7690 + }, + { + "epoch": 11.58, + "grad_norm": 7.781355381011963, + "learning_rate": 8.842105263157895e-06, + "loss": 0.4617, + "step": 7700 + }, + { + "epoch": 11.59, + "grad_norm": 6.770030975341797, + "learning_rate": 8.8406015037594e-06, + "loss": 0.4218, + "step": 7710 + }, + { + "epoch": 11.61, + "grad_norm": 7.052707672119141, + "learning_rate": 8.839097744360902e-06, + "loss": 0.3897, + "step": 7720 + }, + { + "epoch": 11.62, + "grad_norm": 8.882899284362793, + "learning_rate": 8.837593984962407e-06, + "loss": 0.4546, + "step": 7730 + }, + { + "epoch": 11.64, + "grad_norm": 7.858944892883301, + "learning_rate": 8.836090225563911e-06, + "loss": 0.4287, + "step": 7740 + }, + { + "epoch": 11.65, + "grad_norm": 6.74614953994751, + "learning_rate": 8.834586466165414e-06, + "loss": 0.3326, + "step": 7750 + }, + { + "epoch": 11.67, + "grad_norm": 8.970141410827637, + "learning_rate": 8.833082706766918e-06, + "loss": 0.4863, + "step": 7760 + }, + { + "epoch": 11.68, + "grad_norm": 6.568352699279785, + "learning_rate": 8.831578947368421e-06, + "loss": 0.4248, + "step": 7770 + }, + { + "epoch": 11.7, + "grad_norm": 6.05830717086792, + "learning_rate": 8.830075187969925e-06, + "loss": 0.4829, + "step": 7780 + }, + { + "epoch": 11.71, + "grad_norm": 7.666469097137451, + "learning_rate": 8.82857142857143e-06, + "loss": 0.5319, + "step": 7790 + }, + { + "epoch": 11.73, + "grad_norm": 5.955508708953857, + "learning_rate": 8.827067669172934e-06, + "loss": 0.4309, + "step": 7800 + }, + { + "epoch": 11.74, + "grad_norm": 6.9883270263671875, + "learning_rate": 8.825563909774437e-06, + "loss": 0.4157, + "step": 7810 + }, + { + "epoch": 11.76, + "grad_norm": 6.703571319580078, + "learning_rate": 8.82406015037594e-06, + "loss": 0.3219, + "step": 7820 + }, + { + "epoch": 11.77, + "grad_norm": 7.131542682647705, + "learning_rate": 8.822556390977444e-06, + "loss": 0.4492, + "step": 7830 + }, + { + "epoch": 11.79, + "grad_norm": 5.014946460723877, + "learning_rate": 8.821052631578948e-06, + "loss": 0.4189, + "step": 7840 + }, + { + "epoch": 11.8, + "grad_norm": 4.254874229431152, + "learning_rate": 8.819548872180453e-06, + "loss": 0.484, + "step": 7850 + }, + { + "epoch": 11.82, + "grad_norm": 4.319407939910889, + "learning_rate": 8.818045112781955e-06, + "loss": 0.3861, + "step": 7860 + }, + { + "epoch": 11.83, + "grad_norm": 7.9686408042907715, + "learning_rate": 8.81654135338346e-06, + "loss": 0.4264, + "step": 7870 + }, + { + "epoch": 11.85, + "grad_norm": 5.5855326652526855, + "learning_rate": 8.815037593984964e-06, + "loss": 0.4532, + "step": 7880 + }, + { + "epoch": 11.86, + "grad_norm": 6.914451599121094, + "learning_rate": 8.813533834586467e-06, + "loss": 0.4355, + "step": 7890 + }, + { + "epoch": 11.88, + "grad_norm": 7.542539596557617, + "learning_rate": 8.81203007518797e-06, + "loss": 0.3671, + "step": 7900 + }, + { + "epoch": 11.89, + "grad_norm": 7.947263717651367, + "learning_rate": 8.810526315789474e-06, + "loss": 0.373, + "step": 7910 + }, + { + "epoch": 11.91, + "grad_norm": 7.884321689605713, + "learning_rate": 8.809022556390978e-06, + "loss": 0.4827, + "step": 7920 + }, + { + "epoch": 11.92, + "grad_norm": 5.361155986785889, + "learning_rate": 8.807518796992483e-06, + "loss": 0.4485, + "step": 7930 + }, + { + "epoch": 11.94, + "grad_norm": 7.507490158081055, + "learning_rate": 8.806015037593986e-06, + "loss": 0.4446, + "step": 7940 + }, + { + "epoch": 11.95, + "grad_norm": 7.053649425506592, + "learning_rate": 8.804511278195488e-06, + "loss": 0.4112, + "step": 7950 + }, + { + "epoch": 11.97, + "grad_norm": 8.394134521484375, + "learning_rate": 8.803007518796993e-06, + "loss": 0.4221, + "step": 7960 + }, + { + "epoch": 11.98, + "grad_norm": 9.852388381958008, + "learning_rate": 8.801503759398497e-06, + "loss": 0.4178, + "step": 7970 + }, + { + "epoch": 12.0, + "grad_norm": 17.5406551361084, + "learning_rate": 8.8e-06, + "loss": 0.4869, + "step": 7980 + }, + { + "epoch": 12.0, + "eval_accuracy": 0.9298, + "eval_loss": 0.2563527822494507, + "eval_runtime": 84.9156, + "eval_samples_per_second": 117.764, + "eval_steps_per_second": 0.471, + "step": 7980 + }, + { + "epoch": 12.02, + "grad_norm": 4.651547908782959, + "learning_rate": 8.798496240601504e-06, + "loss": 0.4307, + "step": 7990 + }, + { + "epoch": 12.03, + "grad_norm": 5.47507905960083, + "learning_rate": 8.796992481203007e-06, + "loss": 0.3957, + "step": 8000 + }, + { + "epoch": 12.05, + "grad_norm": 7.309173583984375, + "learning_rate": 8.795488721804511e-06, + "loss": 0.3848, + "step": 8010 + }, + { + "epoch": 12.06, + "grad_norm": 5.4073591232299805, + "learning_rate": 8.793984962406016e-06, + "loss": 0.347, + "step": 8020 + }, + { + "epoch": 12.08, + "grad_norm": 9.495542526245117, + "learning_rate": 8.79248120300752e-06, + "loss": 0.3963, + "step": 8030 + }, + { + "epoch": 12.09, + "grad_norm": 7.175304412841797, + "learning_rate": 8.790977443609023e-06, + "loss": 0.4028, + "step": 8040 + }, + { + "epoch": 12.11, + "grad_norm": 5.7672624588012695, + "learning_rate": 8.789473684210527e-06, + "loss": 0.4336, + "step": 8050 + }, + { + "epoch": 12.12, + "grad_norm": 5.373271942138672, + "learning_rate": 8.78796992481203e-06, + "loss": 0.4214, + "step": 8060 + }, + { + "epoch": 12.14, + "grad_norm": 7.81503963470459, + "learning_rate": 8.786466165413534e-06, + "loss": 0.3362, + "step": 8070 + }, + { + "epoch": 12.15, + "grad_norm": 5.352240085601807, + "learning_rate": 8.784962406015039e-06, + "loss": 0.438, + "step": 8080 + }, + { + "epoch": 12.17, + "grad_norm": 4.825592994689941, + "learning_rate": 8.783458646616541e-06, + "loss": 0.3996, + "step": 8090 + }, + { + "epoch": 12.18, + "grad_norm": 4.875209808349609, + "learning_rate": 8.781954887218046e-06, + "loss": 0.4056, + "step": 8100 + }, + { + "epoch": 12.2, + "grad_norm": 6.405061721801758, + "learning_rate": 8.78045112781955e-06, + "loss": 0.404, + "step": 8110 + }, + { + "epoch": 12.21, + "grad_norm": 5.762337684631348, + "learning_rate": 8.778947368421053e-06, + "loss": 0.3609, + "step": 8120 + }, + { + "epoch": 12.23, + "grad_norm": 8.700191497802734, + "learning_rate": 8.777443609022557e-06, + "loss": 0.4316, + "step": 8130 + }, + { + "epoch": 12.24, + "grad_norm": 5.509273052215576, + "learning_rate": 8.775939849624062e-06, + "loss": 0.3814, + "step": 8140 + }, + { + "epoch": 12.26, + "grad_norm": 6.949098587036133, + "learning_rate": 8.774436090225564e-06, + "loss": 0.3588, + "step": 8150 + }, + { + "epoch": 12.27, + "grad_norm": 5.564908981323242, + "learning_rate": 8.772932330827069e-06, + "loss": 0.353, + "step": 8160 + }, + { + "epoch": 12.29, + "grad_norm": 6.935297012329102, + "learning_rate": 8.771428571428572e-06, + "loss": 0.4212, + "step": 8170 + }, + { + "epoch": 12.3, + "grad_norm": 4.811358451843262, + "learning_rate": 8.769924812030076e-06, + "loss": 0.3865, + "step": 8180 + }, + { + "epoch": 12.32, + "grad_norm": 6.4804368019104, + "learning_rate": 8.76842105263158e-06, + "loss": 0.3656, + "step": 8190 + }, + { + "epoch": 12.33, + "grad_norm": 3.9236013889312744, + "learning_rate": 8.766917293233083e-06, + "loss": 0.4885, + "step": 8200 + }, + { + "epoch": 12.35, + "grad_norm": 7.50891637802124, + "learning_rate": 8.765413533834586e-06, + "loss": 0.3962, + "step": 8210 + }, + { + "epoch": 12.36, + "grad_norm": 4.313982963562012, + "learning_rate": 8.763909774436092e-06, + "loss": 0.4023, + "step": 8220 + }, + { + "epoch": 12.38, + "grad_norm": 4.385167121887207, + "learning_rate": 8.762406015037595e-06, + "loss": 0.4841, + "step": 8230 + }, + { + "epoch": 12.39, + "grad_norm": 5.977277755737305, + "learning_rate": 8.760902255639099e-06, + "loss": 0.399, + "step": 8240 + }, + { + "epoch": 12.41, + "grad_norm": 8.858118057250977, + "learning_rate": 8.759398496240602e-06, + "loss": 0.451, + "step": 8250 + }, + { + "epoch": 12.42, + "grad_norm": 6.294662952423096, + "learning_rate": 8.757894736842106e-06, + "loss": 0.416, + "step": 8260 + }, + { + "epoch": 12.44, + "grad_norm": 4.536668300628662, + "learning_rate": 8.75639097744361e-06, + "loss": 0.407, + "step": 8270 + }, + { + "epoch": 12.45, + "grad_norm": 5.644812107086182, + "learning_rate": 8.754887218045114e-06, + "loss": 0.3685, + "step": 8280 + }, + { + "epoch": 12.47, + "grad_norm": 5.488842010498047, + "learning_rate": 8.753383458646616e-06, + "loss": 0.4136, + "step": 8290 + }, + { + "epoch": 12.48, + "grad_norm": 4.548142910003662, + "learning_rate": 8.751879699248122e-06, + "loss": 0.4502, + "step": 8300 + }, + { + "epoch": 12.5, + "grad_norm": 3.41457200050354, + "learning_rate": 8.750375939849625e-06, + "loss": 0.3598, + "step": 8310 + }, + { + "epoch": 12.51, + "grad_norm": 6.259812831878662, + "learning_rate": 8.748872180451129e-06, + "loss": 0.3843, + "step": 8320 + }, + { + "epoch": 12.53, + "grad_norm": 5.301551342010498, + "learning_rate": 8.747368421052632e-06, + "loss": 0.4038, + "step": 8330 + }, + { + "epoch": 12.54, + "grad_norm": 14.684255599975586, + "learning_rate": 8.745864661654137e-06, + "loss": 0.4115, + "step": 8340 + }, + { + "epoch": 12.56, + "grad_norm": 6.711531162261963, + "learning_rate": 8.744360902255639e-06, + "loss": 0.4112, + "step": 8350 + }, + { + "epoch": 12.57, + "grad_norm": 8.990388870239258, + "learning_rate": 8.742857142857144e-06, + "loss": 0.3547, + "step": 8360 + }, + { + "epoch": 12.59, + "grad_norm": 4.513948440551758, + "learning_rate": 8.741353383458648e-06, + "loss": 0.3776, + "step": 8370 + }, + { + "epoch": 12.6, + "grad_norm": 6.088433742523193, + "learning_rate": 8.739849624060151e-06, + "loss": 0.4116, + "step": 8380 + }, + { + "epoch": 12.62, + "grad_norm": 7.882970809936523, + "learning_rate": 8.738345864661655e-06, + "loss": 0.3712, + "step": 8390 + }, + { + "epoch": 12.63, + "grad_norm": 6.829627990722656, + "learning_rate": 8.736842105263158e-06, + "loss": 0.3842, + "step": 8400 + }, + { + "epoch": 12.65, + "grad_norm": 6.185722351074219, + "learning_rate": 8.735338345864662e-06, + "loss": 0.424, + "step": 8410 + }, + { + "epoch": 12.66, + "grad_norm": 4.945958137512207, + "learning_rate": 8.733834586466167e-06, + "loss": 0.377, + "step": 8420 + }, + { + "epoch": 12.68, + "grad_norm": 6.356648921966553, + "learning_rate": 8.73233082706767e-06, + "loss": 0.4256, + "step": 8430 + }, + { + "epoch": 12.69, + "grad_norm": 6.276622295379639, + "learning_rate": 8.730827067669172e-06, + "loss": 0.3733, + "step": 8440 + }, + { + "epoch": 12.71, + "grad_norm": 7.50572395324707, + "learning_rate": 8.729323308270678e-06, + "loss": 0.4407, + "step": 8450 + }, + { + "epoch": 12.72, + "grad_norm": 7.089003086090088, + "learning_rate": 8.727819548872181e-06, + "loss": 0.3948, + "step": 8460 + }, + { + "epoch": 12.74, + "grad_norm": 6.90725564956665, + "learning_rate": 8.726315789473685e-06, + "loss": 0.4511, + "step": 8470 + }, + { + "epoch": 12.75, + "grad_norm": 4.369374752044678, + "learning_rate": 8.724812030075188e-06, + "loss": 0.3559, + "step": 8480 + }, + { + "epoch": 12.77, + "grad_norm": 2.895493507385254, + "learning_rate": 8.723308270676692e-06, + "loss": 0.349, + "step": 8490 + }, + { + "epoch": 12.78, + "grad_norm": 8.638984680175781, + "learning_rate": 8.721804511278195e-06, + "loss": 0.3406, + "step": 8500 + }, + { + "epoch": 12.8, + "grad_norm": 7.664207458496094, + "learning_rate": 8.7203007518797e-06, + "loss": 0.3619, + "step": 8510 + }, + { + "epoch": 12.81, + "grad_norm": 4.544347286224365, + "learning_rate": 8.718796992481204e-06, + "loss": 0.3109, + "step": 8520 + }, + { + "epoch": 12.83, + "grad_norm": 6.640614032745361, + "learning_rate": 8.717293233082708e-06, + "loss": 0.4116, + "step": 8530 + }, + { + "epoch": 12.84, + "grad_norm": 7.840051174163818, + "learning_rate": 8.715789473684211e-06, + "loss": 0.4027, + "step": 8540 + }, + { + "epoch": 12.86, + "grad_norm": 10.355204582214355, + "learning_rate": 8.714285714285715e-06, + "loss": 0.4013, + "step": 8550 + }, + { + "epoch": 12.87, + "grad_norm": 7.472030162811279, + "learning_rate": 8.712781954887218e-06, + "loss": 0.4119, + "step": 8560 + }, + { + "epoch": 12.89, + "grad_norm": 7.9360246658325195, + "learning_rate": 8.711278195488723e-06, + "loss": 0.3472, + "step": 8570 + }, + { + "epoch": 12.9, + "grad_norm": 5.889431953430176, + "learning_rate": 8.709774436090225e-06, + "loss": 0.4009, + "step": 8580 + }, + { + "epoch": 12.92, + "grad_norm": 5.548401355743408, + "learning_rate": 8.70827067669173e-06, + "loss": 0.4261, + "step": 8590 + }, + { + "epoch": 12.93, + "grad_norm": 5.590747833251953, + "learning_rate": 8.706766917293234e-06, + "loss": 0.4437, + "step": 8600 + }, + { + "epoch": 12.95, + "grad_norm": 6.401696681976318, + "learning_rate": 8.705263157894737e-06, + "loss": 0.3746, + "step": 8610 + }, + { + "epoch": 12.96, + "grad_norm": 9.315383911132812, + "learning_rate": 8.703759398496241e-06, + "loss": 0.3803, + "step": 8620 + }, + { + "epoch": 12.98, + "grad_norm": 3.9589388370513916, + "learning_rate": 8.702255639097746e-06, + "loss": 0.4448, + "step": 8630 + }, + { + "epoch": 12.99, + "grad_norm": 4.445014953613281, + "learning_rate": 8.700751879699248e-06, + "loss": 0.4289, + "step": 8640 + }, + { + "epoch": 13.0, + "eval_accuracy": 0.9288, + "eval_loss": 0.2712935507297516, + "eval_runtime": 84.8607, + "eval_samples_per_second": 117.84, + "eval_steps_per_second": 0.471, + "step": 8645 + }, + { + "epoch": 13.01, + "grad_norm": 5.444362163543701, + "learning_rate": 8.699248120300753e-06, + "loss": 0.3994, + "step": 8650 + }, + { + "epoch": 13.02, + "grad_norm": 5.8952178955078125, + "learning_rate": 8.697744360902257e-06, + "loss": 0.3819, + "step": 8660 + }, + { + "epoch": 13.04, + "grad_norm": 5.363025188446045, + "learning_rate": 8.69624060150376e-06, + "loss": 0.4251, + "step": 8670 + }, + { + "epoch": 13.05, + "grad_norm": 6.1266961097717285, + "learning_rate": 8.694736842105264e-06, + "loss": 0.4236, + "step": 8680 + }, + { + "epoch": 13.07, + "grad_norm": 6.096094131469727, + "learning_rate": 8.693233082706767e-06, + "loss": 0.4411, + "step": 8690 + }, + { + "epoch": 13.08, + "grad_norm": 6.0483293533325195, + "learning_rate": 8.69172932330827e-06, + "loss": 0.3538, + "step": 8700 + }, + { + "epoch": 13.1, + "grad_norm": 8.619955062866211, + "learning_rate": 8.690225563909776e-06, + "loss": 0.4698, + "step": 8710 + }, + { + "epoch": 13.11, + "grad_norm": 5.028072834014893, + "learning_rate": 8.68872180451128e-06, + "loss": 0.3883, + "step": 8720 + }, + { + "epoch": 13.13, + "grad_norm": 7.43666934967041, + "learning_rate": 8.687218045112783e-06, + "loss": 0.3552, + "step": 8730 + }, + { + "epoch": 13.14, + "grad_norm": 9.520151138305664, + "learning_rate": 8.685714285714287e-06, + "loss": 0.4079, + "step": 8740 + }, + { + "epoch": 13.16, + "grad_norm": 7.852067947387695, + "learning_rate": 8.68421052631579e-06, + "loss": 0.3607, + "step": 8750 + }, + { + "epoch": 13.17, + "grad_norm": 5.92877721786499, + "learning_rate": 8.682706766917294e-06, + "loss": 0.3739, + "step": 8760 + }, + { + "epoch": 13.19, + "grad_norm": 4.25166130065918, + "learning_rate": 8.681203007518799e-06, + "loss": 0.4621, + "step": 8770 + }, + { + "epoch": 13.2, + "grad_norm": 7.073912143707275, + "learning_rate": 8.6796992481203e-06, + "loss": 0.4465, + "step": 8780 + }, + { + "epoch": 13.22, + "grad_norm": 7.39524507522583, + "learning_rate": 8.678195488721806e-06, + "loss": 0.4303, + "step": 8790 + }, + { + "epoch": 13.23, + "grad_norm": 6.938388824462891, + "learning_rate": 8.67669172932331e-06, + "loss": 0.3535, + "step": 8800 + }, + { + "epoch": 13.25, + "grad_norm": 5.0067524909973145, + "learning_rate": 8.675187969924813e-06, + "loss": 0.4399, + "step": 8810 + }, + { + "epoch": 13.26, + "grad_norm": 6.340808391571045, + "learning_rate": 8.673684210526316e-06, + "loss": 0.4199, + "step": 8820 + }, + { + "epoch": 13.28, + "grad_norm": 4.246801853179932, + "learning_rate": 8.67218045112782e-06, + "loss": 0.4039, + "step": 8830 + }, + { + "epoch": 13.29, + "grad_norm": 4.85552453994751, + "learning_rate": 8.670676691729323e-06, + "loss": 0.3753, + "step": 8840 + }, + { + "epoch": 13.31, + "grad_norm": 6.020550727844238, + "learning_rate": 8.669172932330829e-06, + "loss": 0.4039, + "step": 8850 + }, + { + "epoch": 13.32, + "grad_norm": 3.4875411987304688, + "learning_rate": 8.667669172932332e-06, + "loss": 0.3829, + "step": 8860 + }, + { + "epoch": 13.34, + "grad_norm": 6.239095211029053, + "learning_rate": 8.666165413533836e-06, + "loss": 0.3511, + "step": 8870 + }, + { + "epoch": 13.35, + "grad_norm": 4.244966983795166, + "learning_rate": 8.66466165413534e-06, + "loss": 0.4268, + "step": 8880 + }, + { + "epoch": 13.37, + "grad_norm": 12.684317588806152, + "learning_rate": 8.663157894736843e-06, + "loss": 0.3471, + "step": 8890 + }, + { + "epoch": 13.38, + "grad_norm": 8.664961814880371, + "learning_rate": 8.661654135338346e-06, + "loss": 0.3822, + "step": 8900 + }, + { + "epoch": 13.4, + "grad_norm": 5.7766804695129395, + "learning_rate": 8.660150375939851e-06, + "loss": 0.4199, + "step": 8910 + }, + { + "epoch": 13.41, + "grad_norm": 4.019351959228516, + "learning_rate": 8.658646616541353e-06, + "loss": 0.4137, + "step": 8920 + }, + { + "epoch": 13.43, + "grad_norm": 6.156152248382568, + "learning_rate": 8.657142857142858e-06, + "loss": 0.4606, + "step": 8930 + }, + { + "epoch": 13.44, + "grad_norm": 5.74890661239624, + "learning_rate": 8.655639097744362e-06, + "loss": 0.3221, + "step": 8940 + }, + { + "epoch": 13.46, + "grad_norm": 6.321985721588135, + "learning_rate": 8.654135338345866e-06, + "loss": 0.415, + "step": 8950 + }, + { + "epoch": 13.47, + "grad_norm": 8.88508129119873, + "learning_rate": 8.652631578947369e-06, + "loss": 0.3714, + "step": 8960 + }, + { + "epoch": 13.49, + "grad_norm": 6.999327659606934, + "learning_rate": 8.651127819548873e-06, + "loss": 0.3576, + "step": 8970 + }, + { + "epoch": 13.5, + "grad_norm": 7.313613414764404, + "learning_rate": 8.649624060150376e-06, + "loss": 0.4096, + "step": 8980 + }, + { + "epoch": 13.52, + "grad_norm": 6.363276958465576, + "learning_rate": 8.64812030075188e-06, + "loss": 0.3356, + "step": 8990 + }, + { + "epoch": 13.53, + "grad_norm": 7.81085729598999, + "learning_rate": 8.646616541353385e-06, + "loss": 0.4216, + "step": 9000 + }, + { + "epoch": 13.55, + "grad_norm": 8.093158721923828, + "learning_rate": 8.645112781954887e-06, + "loss": 0.5105, + "step": 9010 + }, + { + "epoch": 13.56, + "grad_norm": 3.801630735397339, + "learning_rate": 8.643609022556392e-06, + "loss": 0.439, + "step": 9020 + }, + { + "epoch": 13.58, + "grad_norm": 5.564939975738525, + "learning_rate": 8.642105263157895e-06, + "loss": 0.3854, + "step": 9030 + }, + { + "epoch": 13.59, + "grad_norm": 9.847439765930176, + "learning_rate": 8.640601503759399e-06, + "loss": 0.4034, + "step": 9040 + }, + { + "epoch": 13.61, + "grad_norm": 9.21834659576416, + "learning_rate": 8.639097744360902e-06, + "loss": 0.4448, + "step": 9050 + }, + { + "epoch": 13.62, + "grad_norm": 4.98524808883667, + "learning_rate": 8.637593984962408e-06, + "loss": 0.3646, + "step": 9060 + }, + { + "epoch": 13.64, + "grad_norm": 6.707414150238037, + "learning_rate": 8.63609022556391e-06, + "loss": 0.3618, + "step": 9070 + }, + { + "epoch": 13.65, + "grad_norm": 5.5840840339660645, + "learning_rate": 8.634586466165415e-06, + "loss": 0.3628, + "step": 9080 + }, + { + "epoch": 13.67, + "grad_norm": 4.939608097076416, + "learning_rate": 8.633082706766918e-06, + "loss": 0.3785, + "step": 9090 + }, + { + "epoch": 13.68, + "grad_norm": 7.449197769165039, + "learning_rate": 8.631578947368422e-06, + "loss": 0.4354, + "step": 9100 + }, + { + "epoch": 13.7, + "grad_norm": 9.470358848571777, + "learning_rate": 8.630075187969925e-06, + "loss": 0.4075, + "step": 9110 + }, + { + "epoch": 13.71, + "grad_norm": 7.6183085441589355, + "learning_rate": 8.628571428571429e-06, + "loss": 0.4067, + "step": 9120 + }, + { + "epoch": 13.73, + "grad_norm": 3.0916943550109863, + "learning_rate": 8.627067669172932e-06, + "loss": 0.365, + "step": 9130 + }, + { + "epoch": 13.74, + "grad_norm": 4.251070499420166, + "learning_rate": 8.625563909774437e-06, + "loss": 0.4255, + "step": 9140 + }, + { + "epoch": 13.76, + "grad_norm": 6.8059282302856445, + "learning_rate": 8.624060150375941e-06, + "loss": 0.32, + "step": 9150 + }, + { + "epoch": 13.77, + "grad_norm": 7.302189826965332, + "learning_rate": 8.622556390977444e-06, + "loss": 0.4131, + "step": 9160 + }, + { + "epoch": 13.79, + "grad_norm": 6.402463436126709, + "learning_rate": 8.621052631578948e-06, + "loss": 0.3848, + "step": 9170 + }, + { + "epoch": 13.8, + "grad_norm": 4.343325138092041, + "learning_rate": 8.619548872180452e-06, + "loss": 0.4014, + "step": 9180 + }, + { + "epoch": 13.82, + "grad_norm": 9.013459205627441, + "learning_rate": 8.618045112781955e-06, + "loss": 0.3739, + "step": 9190 + }, + { + "epoch": 13.83, + "grad_norm": 7.037381172180176, + "learning_rate": 8.61654135338346e-06, + "loss": 0.4189, + "step": 9200 + }, + { + "epoch": 13.85, + "grad_norm": 4.7024760246276855, + "learning_rate": 8.615037593984962e-06, + "loss": 0.38, + "step": 9210 + }, + { + "epoch": 13.86, + "grad_norm": 4.808414936065674, + "learning_rate": 8.613533834586467e-06, + "loss": 0.4414, + "step": 9220 + }, + { + "epoch": 13.88, + "grad_norm": 8.237750053405762, + "learning_rate": 8.61203007518797e-06, + "loss": 0.4215, + "step": 9230 + }, + { + "epoch": 13.89, + "grad_norm": 7.862570285797119, + "learning_rate": 8.610526315789474e-06, + "loss": 0.4727, + "step": 9240 + }, + { + "epoch": 13.91, + "grad_norm": 7.045783519744873, + "learning_rate": 8.609022556390978e-06, + "loss": 0.4109, + "step": 9250 + }, + { + "epoch": 13.92, + "grad_norm": 5.3544135093688965, + "learning_rate": 8.607518796992483e-06, + "loss": 0.3824, + "step": 9260 + }, + { + "epoch": 13.94, + "grad_norm": 14.21022891998291, + "learning_rate": 8.606015037593985e-06, + "loss": 0.3659, + "step": 9270 + }, + { + "epoch": 13.95, + "grad_norm": 7.408153533935547, + "learning_rate": 8.60451127819549e-06, + "loss": 0.3508, + "step": 9280 + }, + { + "epoch": 13.97, + "grad_norm": 3.206442356109619, + "learning_rate": 8.603007518796994e-06, + "loss": 0.4451, + "step": 9290 + }, + { + "epoch": 13.98, + "grad_norm": 4.974185466766357, + "learning_rate": 8.601503759398497e-06, + "loss": 0.3888, + "step": 9300 + }, + { + "epoch": 14.0, + "grad_norm": 15.39065170288086, + "learning_rate": 8.6e-06, + "loss": 0.4003, + "step": 9310 + }, + { + "epoch": 14.0, + "eval_accuracy": 0.932, + "eval_loss": 0.2616865336894989, + "eval_runtime": 84.8808, + "eval_samples_per_second": 117.812, + "eval_steps_per_second": 0.471, + "step": 9310 + }, + { + "epoch": 14.02, + "grad_norm": 5.1268534660339355, + "learning_rate": 8.598496240601504e-06, + "loss": 0.4195, + "step": 9320 + }, + { + "epoch": 14.03, + "grad_norm": 6.874084949493408, + "learning_rate": 8.596992481203008e-06, + "loss": 0.3901, + "step": 9330 + }, + { + "epoch": 14.05, + "grad_norm": 11.405204772949219, + "learning_rate": 8.595488721804513e-06, + "loss": 0.3234, + "step": 9340 + }, + { + "epoch": 14.06, + "grad_norm": 4.844882965087891, + "learning_rate": 8.593984962406016e-06, + "loss": 0.3437, + "step": 9350 + }, + { + "epoch": 14.08, + "grad_norm": 7.187948226928711, + "learning_rate": 8.59248120300752e-06, + "loss": 0.3895, + "step": 9360 + }, + { + "epoch": 14.09, + "grad_norm": 3.7594106197357178, + "learning_rate": 8.590977443609023e-06, + "loss": 0.3329, + "step": 9370 + }, + { + "epoch": 14.11, + "grad_norm": 4.245199203491211, + "learning_rate": 8.589473684210527e-06, + "loss": 0.3644, + "step": 9380 + }, + { + "epoch": 14.12, + "grad_norm": 6.302145004272461, + "learning_rate": 8.58796992481203e-06, + "loss": 0.4615, + "step": 9390 + }, + { + "epoch": 14.14, + "grad_norm": 6.26497220993042, + "learning_rate": 8.586466165413536e-06, + "loss": 0.3983, + "step": 9400 + }, + { + "epoch": 14.15, + "grad_norm": 7.315799236297607, + "learning_rate": 8.584962406015038e-06, + "loss": 0.3474, + "step": 9410 + }, + { + "epoch": 14.17, + "grad_norm": 8.584407806396484, + "learning_rate": 8.583458646616543e-06, + "loss": 0.3858, + "step": 9420 + }, + { + "epoch": 14.18, + "grad_norm": 6.192986488342285, + "learning_rate": 8.581954887218046e-06, + "loss": 0.4653, + "step": 9430 + }, + { + "epoch": 14.2, + "grad_norm": 6.261072635650635, + "learning_rate": 8.58045112781955e-06, + "loss": 0.3686, + "step": 9440 + }, + { + "epoch": 14.21, + "grad_norm": 6.7162017822265625, + "learning_rate": 8.578947368421053e-06, + "loss": 0.4143, + "step": 9450 + }, + { + "epoch": 14.23, + "grad_norm": 5.550053119659424, + "learning_rate": 8.577443609022557e-06, + "loss": 0.4262, + "step": 9460 + }, + { + "epoch": 14.24, + "grad_norm": 6.601341247558594, + "learning_rate": 8.57593984962406e-06, + "loss": 0.3615, + "step": 9470 + }, + { + "epoch": 14.26, + "grad_norm": 6.859097957611084, + "learning_rate": 8.574436090225564e-06, + "loss": 0.3584, + "step": 9480 + }, + { + "epoch": 14.27, + "grad_norm": 3.824615478515625, + "learning_rate": 8.572932330827069e-06, + "loss": 0.3941, + "step": 9490 + }, + { + "epoch": 14.29, + "grad_norm": 6.923836708068848, + "learning_rate": 8.571428571428571e-06, + "loss": 0.4016, + "step": 9500 + }, + { + "epoch": 14.3, + "grad_norm": 6.395806789398193, + "learning_rate": 8.569924812030076e-06, + "loss": 0.368, + "step": 9510 + }, + { + "epoch": 14.32, + "grad_norm": 6.522418022155762, + "learning_rate": 8.56842105263158e-06, + "loss": 0.377, + "step": 9520 + }, + { + "epoch": 14.33, + "grad_norm": 7.502889633178711, + "learning_rate": 8.566917293233083e-06, + "loss": 0.4179, + "step": 9530 + }, + { + "epoch": 14.35, + "grad_norm": 6.025669574737549, + "learning_rate": 8.565413533834587e-06, + "loss": 0.3395, + "step": 9540 + }, + { + "epoch": 14.36, + "grad_norm": 7.751435279846191, + "learning_rate": 8.563909774436092e-06, + "loss": 0.3531, + "step": 9550 + }, + { + "epoch": 14.38, + "grad_norm": 6.964672088623047, + "learning_rate": 8.562406015037594e-06, + "loss": 0.3905, + "step": 9560 + }, + { + "epoch": 14.39, + "grad_norm": 2.502666473388672, + "learning_rate": 8.560902255639099e-06, + "loss": 0.3172, + "step": 9570 + }, + { + "epoch": 14.41, + "grad_norm": 7.133659839630127, + "learning_rate": 8.559398496240602e-06, + "loss": 0.442, + "step": 9580 + }, + { + "epoch": 14.42, + "grad_norm": 4.32753324508667, + "learning_rate": 8.557894736842106e-06, + "loss": 0.3458, + "step": 9590 + }, + { + "epoch": 14.44, + "grad_norm": 6.268803119659424, + "learning_rate": 8.55639097744361e-06, + "loss": 0.3431, + "step": 9600 + }, + { + "epoch": 14.45, + "grad_norm": 7.018800735473633, + "learning_rate": 8.554887218045113e-06, + "loss": 0.4369, + "step": 9610 + }, + { + "epoch": 14.47, + "grad_norm": 7.435917377471924, + "learning_rate": 8.553383458646617e-06, + "loss": 0.3861, + "step": 9620 + }, + { + "epoch": 14.48, + "grad_norm": 7.8388752937316895, + "learning_rate": 8.551879699248122e-06, + "loss": 0.4239, + "step": 9630 + }, + { + "epoch": 14.5, + "grad_norm": 7.880455493927002, + "learning_rate": 8.550375939849625e-06, + "loss": 0.3829, + "step": 9640 + }, + { + "epoch": 14.51, + "grad_norm": 5.568830490112305, + "learning_rate": 8.548872180451129e-06, + "loss": 0.4011, + "step": 9650 + }, + { + "epoch": 14.53, + "grad_norm": 5.6487274169921875, + "learning_rate": 8.547368421052632e-06, + "loss": 0.4091, + "step": 9660 + }, + { + "epoch": 14.54, + "grad_norm": 4.51718807220459, + "learning_rate": 8.545864661654136e-06, + "loss": 0.4087, + "step": 9670 + }, + { + "epoch": 14.56, + "grad_norm": 7.874798774719238, + "learning_rate": 8.54436090225564e-06, + "loss": 0.3494, + "step": 9680 + }, + { + "epoch": 14.57, + "grad_norm": 4.497681617736816, + "learning_rate": 8.542857142857145e-06, + "loss": 0.3783, + "step": 9690 + }, + { + "epoch": 14.59, + "grad_norm": 5.411101818084717, + "learning_rate": 8.541353383458646e-06, + "loss": 0.3395, + "step": 9700 + }, + { + "epoch": 14.6, + "grad_norm": 5.714541435241699, + "learning_rate": 8.539849624060152e-06, + "loss": 0.4507, + "step": 9710 + }, + { + "epoch": 14.62, + "grad_norm": 7.042336940765381, + "learning_rate": 8.538345864661655e-06, + "loss": 0.3363, + "step": 9720 + }, + { + "epoch": 14.63, + "grad_norm": 6.70949649810791, + "learning_rate": 8.536842105263159e-06, + "loss": 0.3559, + "step": 9730 + }, + { + "epoch": 14.65, + "grad_norm": 5.84644889831543, + "learning_rate": 8.535338345864662e-06, + "loss": 0.3936, + "step": 9740 + }, + { + "epoch": 14.66, + "grad_norm": 5.070087432861328, + "learning_rate": 8.533834586466166e-06, + "loss": 0.4014, + "step": 9750 + }, + { + "epoch": 14.68, + "grad_norm": 5.353463172912598, + "learning_rate": 8.53233082706767e-06, + "loss": 0.4091, + "step": 9760 + }, + { + "epoch": 14.69, + "grad_norm": 9.118497848510742, + "learning_rate": 8.530827067669174e-06, + "loss": 0.3922, + "step": 9770 + }, + { + "epoch": 14.71, + "grad_norm": 7.667191982269287, + "learning_rate": 8.529323308270678e-06, + "loss": 0.4087, + "step": 9780 + }, + { + "epoch": 14.72, + "grad_norm": 7.210267066955566, + "learning_rate": 8.527819548872181e-06, + "loss": 0.3458, + "step": 9790 + }, + { + "epoch": 14.74, + "grad_norm": 5.242373466491699, + "learning_rate": 8.526315789473685e-06, + "loss": 0.3293, + "step": 9800 + }, + { + "epoch": 14.75, + "grad_norm": 7.6933393478393555, + "learning_rate": 8.524812030075188e-06, + "loss": 0.4406, + "step": 9810 + }, + { + "epoch": 14.77, + "grad_norm": 5.179628372192383, + "learning_rate": 8.523308270676692e-06, + "loss": 0.3827, + "step": 9820 + }, + { + "epoch": 14.78, + "grad_norm": 9.525907516479492, + "learning_rate": 8.521804511278197e-06, + "loss": 0.4095, + "step": 9830 + }, + { + "epoch": 14.8, + "grad_norm": 6.132147789001465, + "learning_rate": 8.520300751879699e-06, + "loss": 0.3778, + "step": 9840 + }, + { + "epoch": 14.81, + "grad_norm": 8.04976749420166, + "learning_rate": 8.518796992481204e-06, + "loss": 0.3996, + "step": 9850 + }, + { + "epoch": 14.83, + "grad_norm": 9.131913185119629, + "learning_rate": 8.517293233082708e-06, + "loss": 0.4103, + "step": 9860 + }, + { + "epoch": 14.84, + "grad_norm": 5.724211692810059, + "learning_rate": 8.515789473684211e-06, + "loss": 0.4166, + "step": 9870 + }, + { + "epoch": 14.86, + "grad_norm": 4.537842750549316, + "learning_rate": 8.514285714285715e-06, + "loss": 0.4357, + "step": 9880 + }, + { + "epoch": 14.87, + "grad_norm": 9.75554370880127, + "learning_rate": 8.51278195488722e-06, + "loss": 0.351, + "step": 9890 + }, + { + "epoch": 14.89, + "grad_norm": 5.427340030670166, + "learning_rate": 8.511278195488722e-06, + "loss": 0.3488, + "step": 9900 + }, + { + "epoch": 14.9, + "grad_norm": 4.465277671813965, + "learning_rate": 8.509774436090227e-06, + "loss": 0.32, + "step": 9910 + }, + { + "epoch": 14.92, + "grad_norm": 4.331689834594727, + "learning_rate": 8.50827067669173e-06, + "loss": 0.4442, + "step": 9920 + }, + { + "epoch": 14.93, + "grad_norm": 5.798705577850342, + "learning_rate": 8.506766917293232e-06, + "loss": 0.4003, + "step": 9930 + }, + { + "epoch": 14.95, + "grad_norm": 9.353456497192383, + "learning_rate": 8.505263157894738e-06, + "loss": 0.3969, + "step": 9940 + }, + { + "epoch": 14.96, + "grad_norm": 7.088143825531006, + "learning_rate": 8.503759398496241e-06, + "loss": 0.3749, + "step": 9950 + }, + { + "epoch": 14.98, + "grad_norm": 6.825297832489014, + "learning_rate": 8.502255639097745e-06, + "loss": 0.3532, + "step": 9960 + }, + { + "epoch": 14.99, + "grad_norm": 3.3977503776550293, + "learning_rate": 8.500751879699248e-06, + "loss": 0.3227, + "step": 9970 + }, + { + "epoch": 15.0, + "eval_accuracy": 0.9335, + "eval_loss": 0.2566547989845276, + "eval_runtime": 84.4818, + "eval_samples_per_second": 118.369, + "eval_steps_per_second": 0.473, + "step": 9975 + }, + { + "epoch": 15.01, + "grad_norm": 3.752779483795166, + "learning_rate": 8.499248120300753e-06, + "loss": 0.2741, + "step": 9980 + }, + { + "epoch": 15.02, + "grad_norm": 5.626667499542236, + "learning_rate": 8.497744360902255e-06, + "loss": 0.3584, + "step": 9990 + }, + { + "epoch": 15.04, + "grad_norm": 5.8750834465026855, + "learning_rate": 8.49624060150376e-06, + "loss": 0.3563, + "step": 10000 + }, + { + "epoch": 15.05, + "grad_norm": 6.653073787689209, + "learning_rate": 8.494736842105264e-06, + "loss": 0.3808, + "step": 10010 + }, + { + "epoch": 15.07, + "grad_norm": 6.017663955688477, + "learning_rate": 8.493233082706767e-06, + "loss": 0.3165, + "step": 10020 + }, + { + "epoch": 15.08, + "grad_norm": 4.427550792694092, + "learning_rate": 8.491729323308271e-06, + "loss": 0.4417, + "step": 10030 + }, + { + "epoch": 15.1, + "grad_norm": 7.968047142028809, + "learning_rate": 8.490225563909775e-06, + "loss": 0.3807, + "step": 10040 + }, + { + "epoch": 15.11, + "grad_norm": 6.646710395812988, + "learning_rate": 8.488721804511278e-06, + "loss": 0.3014, + "step": 10050 + }, + { + "epoch": 15.13, + "grad_norm": 5.180790424346924, + "learning_rate": 8.487218045112783e-06, + "loss": 0.3737, + "step": 10060 + }, + { + "epoch": 15.14, + "grad_norm": 8.142125129699707, + "learning_rate": 8.485714285714287e-06, + "loss": 0.3953, + "step": 10070 + }, + { + "epoch": 15.16, + "grad_norm": 5.258510589599609, + "learning_rate": 8.48421052631579e-06, + "loss": 0.2742, + "step": 10080 + }, + { + "epoch": 15.17, + "grad_norm": 7.299388408660889, + "learning_rate": 8.482706766917294e-06, + "loss": 0.388, + "step": 10090 + }, + { + "epoch": 15.19, + "grad_norm": 9.05027961730957, + "learning_rate": 8.481203007518797e-06, + "loss": 0.3364, + "step": 10100 + }, + { + "epoch": 15.2, + "grad_norm": 8.713417053222656, + "learning_rate": 8.4796992481203e-06, + "loss": 0.2847, + "step": 10110 + }, + { + "epoch": 15.22, + "grad_norm": 4.598002910614014, + "learning_rate": 8.478195488721806e-06, + "loss": 0.3536, + "step": 10120 + }, + { + "epoch": 15.23, + "grad_norm": 8.883098602294922, + "learning_rate": 8.476691729323308e-06, + "loss": 0.3917, + "step": 10130 + }, + { + "epoch": 15.25, + "grad_norm": 9.343592643737793, + "learning_rate": 8.475187969924813e-06, + "loss": 0.3912, + "step": 10140 + }, + { + "epoch": 15.26, + "grad_norm": 11.456267356872559, + "learning_rate": 8.473684210526317e-06, + "loss": 0.3171, + "step": 10150 + }, + { + "epoch": 15.28, + "grad_norm": 7.874906539916992, + "learning_rate": 8.47218045112782e-06, + "loss": 0.4443, + "step": 10160 + }, + { + "epoch": 15.29, + "grad_norm": 7.280247211456299, + "learning_rate": 8.470676691729324e-06, + "loss": 0.4121, + "step": 10170 + }, + { + "epoch": 15.31, + "grad_norm": 7.839987754821777, + "learning_rate": 8.469172932330829e-06, + "loss": 0.4284, + "step": 10180 + }, + { + "epoch": 15.32, + "grad_norm": 5.39335298538208, + "learning_rate": 8.46766917293233e-06, + "loss": 0.4311, + "step": 10190 + }, + { + "epoch": 15.34, + "grad_norm": 8.08191204071045, + "learning_rate": 8.466165413533836e-06, + "loss": 0.3789, + "step": 10200 + }, + { + "epoch": 15.35, + "grad_norm": 3.493443250656128, + "learning_rate": 8.46466165413534e-06, + "loss": 0.3526, + "step": 10210 + }, + { + "epoch": 15.37, + "grad_norm": 7.41270637512207, + "learning_rate": 8.463157894736843e-06, + "loss": 0.4283, + "step": 10220 + }, + { + "epoch": 15.38, + "grad_norm": 6.4891486167907715, + "learning_rate": 8.461654135338346e-06, + "loss": 0.298, + "step": 10230 + }, + { + "epoch": 15.4, + "grad_norm": 6.028573989868164, + "learning_rate": 8.46015037593985e-06, + "loss": 0.3996, + "step": 10240 + }, + { + "epoch": 15.41, + "grad_norm": 3.365438938140869, + "learning_rate": 8.458646616541353e-06, + "loss": 0.334, + "step": 10250 + }, + { + "epoch": 15.43, + "grad_norm": 5.2097015380859375, + "learning_rate": 8.457142857142859e-06, + "loss": 0.3555, + "step": 10260 + }, + { + "epoch": 15.44, + "grad_norm": 4.656721591949463, + "learning_rate": 8.455639097744362e-06, + "loss": 0.398, + "step": 10270 + }, + { + "epoch": 15.46, + "grad_norm": 11.353671073913574, + "learning_rate": 8.454135338345866e-06, + "loss": 0.3341, + "step": 10280 + }, + { + "epoch": 15.47, + "grad_norm": 6.971073627471924, + "learning_rate": 8.45263157894737e-06, + "loss": 0.3569, + "step": 10290 + }, + { + "epoch": 15.49, + "grad_norm": 5.809013843536377, + "learning_rate": 8.451127819548873e-06, + "loss": 0.3642, + "step": 10300 + }, + { + "epoch": 15.5, + "grad_norm": 7.3322834968566895, + "learning_rate": 8.449624060150376e-06, + "loss": 0.3814, + "step": 10310 + }, + { + "epoch": 15.52, + "grad_norm": 9.322681427001953, + "learning_rate": 8.448120300751882e-06, + "loss": 0.3455, + "step": 10320 + }, + { + "epoch": 15.53, + "grad_norm": 7.197205066680908, + "learning_rate": 8.446616541353383e-06, + "loss": 0.3652, + "step": 10330 + }, + { + "epoch": 15.55, + "grad_norm": 7.1502766609191895, + "learning_rate": 8.445112781954889e-06, + "loss": 0.3882, + "step": 10340 + }, + { + "epoch": 15.56, + "grad_norm": 4.826005935668945, + "learning_rate": 8.443609022556392e-06, + "loss": 0.3878, + "step": 10350 + }, + { + "epoch": 15.58, + "grad_norm": 8.432343482971191, + "learning_rate": 8.442105263157896e-06, + "loss": 0.3778, + "step": 10360 + }, + { + "epoch": 15.59, + "grad_norm": 6.166329383850098, + "learning_rate": 8.440601503759399e-06, + "loss": 0.3675, + "step": 10370 + }, + { + "epoch": 15.61, + "grad_norm": 9.84304428100586, + "learning_rate": 8.439097744360903e-06, + "loss": 0.385, + "step": 10380 + }, + { + "epoch": 15.62, + "grad_norm": 4.937039375305176, + "learning_rate": 8.437593984962406e-06, + "loss": 0.3558, + "step": 10390 + }, + { + "epoch": 15.64, + "grad_norm": 5.817636966705322, + "learning_rate": 8.436090225563911e-06, + "loss": 0.3605, + "step": 10400 + }, + { + "epoch": 15.65, + "grad_norm": 6.982740879058838, + "learning_rate": 8.434586466165415e-06, + "loss": 0.2966, + "step": 10410 + }, + { + "epoch": 15.67, + "grad_norm": 5.2945098876953125, + "learning_rate": 8.433082706766918e-06, + "loss": 0.3279, + "step": 10420 + }, + { + "epoch": 15.68, + "grad_norm": 7.526950359344482, + "learning_rate": 8.431578947368422e-06, + "loss": 0.414, + "step": 10430 + }, + { + "epoch": 15.7, + "grad_norm": 8.325518608093262, + "learning_rate": 8.430075187969925e-06, + "loss": 0.314, + "step": 10440 + }, + { + "epoch": 15.71, + "grad_norm": 2.876897096633911, + "learning_rate": 8.428571428571429e-06, + "loss": 0.354, + "step": 10450 + }, + { + "epoch": 15.73, + "grad_norm": 6.776325702667236, + "learning_rate": 8.427067669172932e-06, + "loss": 0.3462, + "step": 10460 + }, + { + "epoch": 15.74, + "grad_norm": 8.158499717712402, + "learning_rate": 8.425563909774438e-06, + "loss": 0.3307, + "step": 10470 + }, + { + "epoch": 15.76, + "grad_norm": 10.01845645904541, + "learning_rate": 8.42406015037594e-06, + "loss": 0.3196, + "step": 10480 + }, + { + "epoch": 15.77, + "grad_norm": 4.778624534606934, + "learning_rate": 8.422556390977445e-06, + "loss": 0.3412, + "step": 10490 + }, + { + "epoch": 15.79, + "grad_norm": 5.8504157066345215, + "learning_rate": 8.421052631578948e-06, + "loss": 0.4183, + "step": 10500 + }, + { + "epoch": 15.8, + "grad_norm": 5.351130962371826, + "learning_rate": 8.419548872180452e-06, + "loss": 0.2639, + "step": 10510 + }, + { + "epoch": 15.82, + "grad_norm": 7.211291313171387, + "learning_rate": 8.418045112781955e-06, + "loss": 0.3021, + "step": 10520 + }, + { + "epoch": 15.83, + "grad_norm": 6.899810791015625, + "learning_rate": 8.416541353383459e-06, + "loss": 0.4069, + "step": 10530 + }, + { + "epoch": 15.85, + "grad_norm": 4.64746618270874, + "learning_rate": 8.415037593984962e-06, + "loss": 0.2798, + "step": 10540 + }, + { + "epoch": 15.86, + "grad_norm": 9.508644104003906, + "learning_rate": 8.413533834586468e-06, + "loss": 0.3791, + "step": 10550 + }, + { + "epoch": 15.88, + "grad_norm": 6.956771373748779, + "learning_rate": 8.412030075187971e-06, + "loss": 0.4188, + "step": 10560 + }, + { + "epoch": 15.89, + "grad_norm": 6.4203667640686035, + "learning_rate": 8.410526315789475e-06, + "loss": 0.3749, + "step": 10570 + }, + { + "epoch": 15.91, + "grad_norm": 8.310030937194824, + "learning_rate": 8.409022556390978e-06, + "loss": 0.4605, + "step": 10580 + }, + { + "epoch": 15.92, + "grad_norm": 8.788355827331543, + "learning_rate": 8.407518796992482e-06, + "loss": 0.3811, + "step": 10590 + }, + { + "epoch": 15.94, + "grad_norm": 5.931136131286621, + "learning_rate": 8.406015037593985e-06, + "loss": 0.4129, + "step": 10600 + }, + { + "epoch": 15.95, + "grad_norm": 7.846260070800781, + "learning_rate": 8.40451127819549e-06, + "loss": 0.3889, + "step": 10610 + }, + { + "epoch": 15.97, + "grad_norm": 6.834481239318848, + "learning_rate": 8.403007518796992e-06, + "loss": 0.4303, + "step": 10620 + }, + { + "epoch": 15.98, + "grad_norm": 3.6619720458984375, + "learning_rate": 8.401503759398497e-06, + "loss": 0.3483, + "step": 10630 + }, + { + "epoch": 16.0, + "grad_norm": 12.63433837890625, + "learning_rate": 8.400000000000001e-06, + "loss": 0.386, + "step": 10640 + }, + { + "epoch": 16.0, + "eval_accuracy": 0.931, + "eval_loss": 0.25709524750709534, + "eval_runtime": 84.9959, + "eval_samples_per_second": 117.653, + "eval_steps_per_second": 0.471, + "step": 10640 + }, + { + "epoch": 16.02, + "grad_norm": 7.7511796951293945, + "learning_rate": 8.398496240601504e-06, + "loss": 0.3916, + "step": 10650 + }, + { + "epoch": 16.03, + "grad_norm": 4.640677452087402, + "learning_rate": 8.396992481203008e-06, + "loss": 0.395, + "step": 10660 + }, + { + "epoch": 16.05, + "grad_norm": 7.270589828491211, + "learning_rate": 8.395488721804511e-06, + "loss": 0.4299, + "step": 10670 + }, + { + "epoch": 16.06, + "grad_norm": 5.698379993438721, + "learning_rate": 8.393984962406015e-06, + "loss": 0.3792, + "step": 10680 + }, + { + "epoch": 16.08, + "grad_norm": 29.69732093811035, + "learning_rate": 8.39248120300752e-06, + "loss": 0.3564, + "step": 10690 + }, + { + "epoch": 16.09, + "grad_norm": 5.11942720413208, + "learning_rate": 8.390977443609024e-06, + "loss": 0.3284, + "step": 10700 + }, + { + "epoch": 16.11, + "grad_norm": 6.748551368713379, + "learning_rate": 8.389473684210527e-06, + "loss": 0.3582, + "step": 10710 + }, + { + "epoch": 16.12, + "grad_norm": 6.748464584350586, + "learning_rate": 8.38796992481203e-06, + "loss": 0.3707, + "step": 10720 + }, + { + "epoch": 16.14, + "grad_norm": 6.992805480957031, + "learning_rate": 8.386466165413534e-06, + "loss": 0.4388, + "step": 10730 + }, + { + "epoch": 16.15, + "grad_norm": 4.689752578735352, + "learning_rate": 8.384962406015038e-06, + "loss": 0.3394, + "step": 10740 + }, + { + "epoch": 16.17, + "grad_norm": 7.47608757019043, + "learning_rate": 8.383458646616543e-06, + "loss": 0.3662, + "step": 10750 + }, + { + "epoch": 16.18, + "grad_norm": 8.161937713623047, + "learning_rate": 8.381954887218045e-06, + "loss": 0.3729, + "step": 10760 + }, + { + "epoch": 16.2, + "grad_norm": 4.649080276489258, + "learning_rate": 8.38045112781955e-06, + "loss": 0.4012, + "step": 10770 + }, + { + "epoch": 16.21, + "grad_norm": 5.3081512451171875, + "learning_rate": 8.378947368421054e-06, + "loss": 0.3275, + "step": 10780 + }, + { + "epoch": 16.23, + "grad_norm": 8.424674034118652, + "learning_rate": 8.377443609022557e-06, + "loss": 0.3169, + "step": 10790 + }, + { + "epoch": 16.24, + "grad_norm": 7.213728427886963, + "learning_rate": 8.37593984962406e-06, + "loss": 0.4158, + "step": 10800 + }, + { + "epoch": 16.26, + "grad_norm": 4.228058815002441, + "learning_rate": 8.374436090225566e-06, + "loss": 0.2967, + "step": 10810 + }, + { + "epoch": 16.27, + "grad_norm": 9.091151237487793, + "learning_rate": 8.372932330827068e-06, + "loss": 0.3631, + "step": 10820 + }, + { + "epoch": 16.29, + "grad_norm": 7.325952053070068, + "learning_rate": 8.371428571428573e-06, + "loss": 0.4045, + "step": 10830 + }, + { + "epoch": 16.3, + "grad_norm": 8.557323455810547, + "learning_rate": 8.369924812030076e-06, + "loss": 0.3657, + "step": 10840 + }, + { + "epoch": 16.32, + "grad_norm": 8.98796558380127, + "learning_rate": 8.36842105263158e-06, + "loss": 0.3812, + "step": 10850 + }, + { + "epoch": 16.33, + "grad_norm": 14.74909496307373, + "learning_rate": 8.366917293233083e-06, + "loss": 0.3966, + "step": 10860 + }, + { + "epoch": 16.35, + "grad_norm": 6.872273921966553, + "learning_rate": 8.365413533834587e-06, + "loss": 0.3797, + "step": 10870 + }, + { + "epoch": 16.36, + "grad_norm": 8.392032623291016, + "learning_rate": 8.36390977443609e-06, + "loss": 0.3864, + "step": 10880 + }, + { + "epoch": 16.38, + "grad_norm": 8.61768627166748, + "learning_rate": 8.362406015037596e-06, + "loss": 0.3434, + "step": 10890 + }, + { + "epoch": 16.39, + "grad_norm": 6.622392177581787, + "learning_rate": 8.3609022556391e-06, + "loss": 0.3751, + "step": 10900 + }, + { + "epoch": 16.41, + "grad_norm": 7.108027458190918, + "learning_rate": 8.359398496240603e-06, + "loss": 0.3542, + "step": 10910 + }, + { + "epoch": 16.42, + "grad_norm": 7.145939826965332, + "learning_rate": 8.357894736842106e-06, + "loss": 0.3586, + "step": 10920 + }, + { + "epoch": 16.44, + "grad_norm": 16.715761184692383, + "learning_rate": 8.35639097744361e-06, + "loss": 0.4365, + "step": 10930 + }, + { + "epoch": 16.45, + "grad_norm": 3.3526134490966797, + "learning_rate": 8.354887218045113e-06, + "loss": 0.38, + "step": 10940 + }, + { + "epoch": 16.47, + "grad_norm": 4.425145149230957, + "learning_rate": 8.353383458646617e-06, + "loss": 0.2944, + "step": 10950 + }, + { + "epoch": 16.48, + "grad_norm": 4.0147552490234375, + "learning_rate": 8.35187969924812e-06, + "loss": 0.3836, + "step": 10960 + }, + { + "epoch": 16.5, + "grad_norm": 6.398830413818359, + "learning_rate": 8.350375939849624e-06, + "loss": 0.3402, + "step": 10970 + }, + { + "epoch": 16.51, + "grad_norm": 5.556189060211182, + "learning_rate": 8.348872180451129e-06, + "loss": 0.3728, + "step": 10980 + }, + { + "epoch": 16.53, + "grad_norm": 4.928891658782959, + "learning_rate": 8.347368421052633e-06, + "loss": 0.2993, + "step": 10990 + }, + { + "epoch": 16.54, + "grad_norm": 8.429544448852539, + "learning_rate": 8.345864661654136e-06, + "loss": 0.3896, + "step": 11000 + }, + { + "epoch": 16.56, + "grad_norm": 9.285510063171387, + "learning_rate": 8.34436090225564e-06, + "loss": 0.3428, + "step": 11010 + }, + { + "epoch": 16.57, + "grad_norm": 5.118491172790527, + "learning_rate": 8.342857142857143e-06, + "loss": 0.3583, + "step": 11020 + }, + { + "epoch": 16.59, + "grad_norm": 9.391587257385254, + "learning_rate": 8.341353383458647e-06, + "loss": 0.312, + "step": 11030 + }, + { + "epoch": 16.6, + "grad_norm": 22.18227767944336, + "learning_rate": 8.339849624060152e-06, + "loss": 0.3549, + "step": 11040 + }, + { + "epoch": 16.62, + "grad_norm": 5.834057331085205, + "learning_rate": 8.338345864661654e-06, + "loss": 0.4242, + "step": 11050 + }, + { + "epoch": 16.63, + "grad_norm": 5.136647701263428, + "learning_rate": 8.336842105263159e-06, + "loss": 0.3623, + "step": 11060 + }, + { + "epoch": 16.65, + "grad_norm": 5.481499195098877, + "learning_rate": 8.335338345864662e-06, + "loss": 0.3745, + "step": 11070 + }, + { + "epoch": 16.66, + "grad_norm": 3.6383814811706543, + "learning_rate": 8.333834586466166e-06, + "loss": 0.3526, + "step": 11080 + }, + { + "epoch": 16.68, + "grad_norm": 4.198364734649658, + "learning_rate": 8.33233082706767e-06, + "loss": 0.3247, + "step": 11090 + }, + { + "epoch": 16.69, + "grad_norm": 6.066871166229248, + "learning_rate": 8.330827067669175e-06, + "loss": 0.346, + "step": 11100 + }, + { + "epoch": 16.71, + "grad_norm": 4.822031497955322, + "learning_rate": 8.329323308270676e-06, + "loss": 0.3421, + "step": 11110 + }, + { + "epoch": 16.72, + "grad_norm": 8.0927152633667, + "learning_rate": 8.327819548872182e-06, + "loss": 0.3212, + "step": 11120 + }, + { + "epoch": 16.74, + "grad_norm": 5.737279415130615, + "learning_rate": 8.326315789473685e-06, + "loss": 0.338, + "step": 11130 + }, + { + "epoch": 16.75, + "grad_norm": 6.6013994216918945, + "learning_rate": 8.324812030075189e-06, + "loss": 0.4221, + "step": 11140 + }, + { + "epoch": 16.77, + "grad_norm": 3.22804856300354, + "learning_rate": 8.323308270676692e-06, + "loss": 0.2824, + "step": 11150 + }, + { + "epoch": 16.78, + "grad_norm": 7.42767333984375, + "learning_rate": 8.321804511278196e-06, + "loss": 0.3654, + "step": 11160 + }, + { + "epoch": 16.8, + "grad_norm": 6.393558979034424, + "learning_rate": 8.3203007518797e-06, + "loss": 0.3939, + "step": 11170 + }, + { + "epoch": 16.81, + "grad_norm": 6.1491241455078125, + "learning_rate": 8.318796992481204e-06, + "loss": 0.3454, + "step": 11180 + }, + { + "epoch": 16.83, + "grad_norm": 4.844079971313477, + "learning_rate": 8.317293233082708e-06, + "loss": 0.3744, + "step": 11190 + }, + { + "epoch": 16.84, + "grad_norm": 7.577675819396973, + "learning_rate": 8.315789473684212e-06, + "loss": 0.4259, + "step": 11200 + }, + { + "epoch": 16.86, + "grad_norm": 5.073234558105469, + "learning_rate": 8.314285714285715e-06, + "loss": 0.343, + "step": 11210 + }, + { + "epoch": 16.87, + "grad_norm": 4.934657096862793, + "learning_rate": 8.312781954887219e-06, + "loss": 0.3834, + "step": 11220 + }, + { + "epoch": 16.89, + "grad_norm": 4.744530200958252, + "learning_rate": 8.311278195488722e-06, + "loss": 0.2937, + "step": 11230 + }, + { + "epoch": 16.9, + "grad_norm": 7.569250583648682, + "learning_rate": 8.309774436090227e-06, + "loss": 0.3719, + "step": 11240 + }, + { + "epoch": 16.92, + "grad_norm": 7.076653480529785, + "learning_rate": 8.308270676691729e-06, + "loss": 0.3489, + "step": 11250 + }, + { + "epoch": 16.93, + "grad_norm": 7.1391520500183105, + "learning_rate": 8.306766917293234e-06, + "loss": 0.3325, + "step": 11260 + }, + { + "epoch": 16.95, + "grad_norm": 4.94738245010376, + "learning_rate": 8.305263157894738e-06, + "loss": 0.3421, + "step": 11270 + }, + { + "epoch": 16.96, + "grad_norm": 6.052053451538086, + "learning_rate": 8.303759398496241e-06, + "loss": 0.4267, + "step": 11280 + }, + { + "epoch": 16.98, + "grad_norm": 6.822144985198975, + "learning_rate": 8.302255639097745e-06, + "loss": 0.4932, + "step": 11290 + }, + { + "epoch": 16.99, + "grad_norm": 7.0719218254089355, + "learning_rate": 8.300751879699248e-06, + "loss": 0.3688, + "step": 11300 + }, + { + "epoch": 17.0, + "eval_accuracy": 0.9346, + "eval_loss": 0.25758126378059387, + "eval_runtime": 84.7562, + "eval_samples_per_second": 117.986, + "eval_steps_per_second": 0.472, + "step": 11305 + }, + { + "epoch": 17.01, + "grad_norm": 4.741504669189453, + "learning_rate": 8.299248120300752e-06, + "loss": 0.3837, + "step": 11310 + }, + { + "epoch": 17.02, + "grad_norm": 6.912674427032471, + "learning_rate": 8.297744360902257e-06, + "loss": 0.2938, + "step": 11320 + }, + { + "epoch": 17.04, + "grad_norm": 5.881788730621338, + "learning_rate": 8.29624060150376e-06, + "loss": 0.3984, + "step": 11330 + }, + { + "epoch": 17.05, + "grad_norm": 5.705871105194092, + "learning_rate": 8.294736842105264e-06, + "loss": 0.3793, + "step": 11340 + }, + { + "epoch": 17.07, + "grad_norm": 5.036585330963135, + "learning_rate": 8.293233082706768e-06, + "loss": 0.365, + "step": 11350 + }, + { + "epoch": 17.08, + "grad_norm": 5.040714740753174, + "learning_rate": 8.291729323308271e-06, + "loss": 0.3662, + "step": 11360 + }, + { + "epoch": 17.1, + "grad_norm": 7.933087348937988, + "learning_rate": 8.290225563909775e-06, + "loss": 0.3204, + "step": 11370 + }, + { + "epoch": 17.11, + "grad_norm": 8.096324920654297, + "learning_rate": 8.28872180451128e-06, + "loss": 0.3383, + "step": 11380 + }, + { + "epoch": 17.13, + "grad_norm": 5.693844318389893, + "learning_rate": 8.287218045112782e-06, + "loss": 0.3455, + "step": 11390 + }, + { + "epoch": 17.14, + "grad_norm": 5.274537086486816, + "learning_rate": 8.285714285714287e-06, + "loss": 0.3695, + "step": 11400 + }, + { + "epoch": 17.16, + "grad_norm": 5.249573230743408, + "learning_rate": 8.28421052631579e-06, + "loss": 0.3801, + "step": 11410 + }, + { + "epoch": 17.17, + "grad_norm": 6.644190311431885, + "learning_rate": 8.282706766917294e-06, + "loss": 0.3355, + "step": 11420 + }, + { + "epoch": 17.19, + "grad_norm": 8.79143238067627, + "learning_rate": 8.281203007518798e-06, + "loss": 0.3857, + "step": 11430 + }, + { + "epoch": 17.2, + "grad_norm": 4.0490851402282715, + "learning_rate": 8.279699248120301e-06, + "loss": 0.3224, + "step": 11440 + }, + { + "epoch": 17.22, + "grad_norm": 8.35059928894043, + "learning_rate": 8.278195488721805e-06, + "loss": 0.372, + "step": 11450 + }, + { + "epoch": 17.23, + "grad_norm": 5.733313083648682, + "learning_rate": 8.276691729323308e-06, + "loss": 0.3511, + "step": 11460 + }, + { + "epoch": 17.25, + "grad_norm": 4.974298477172852, + "learning_rate": 8.275187969924813e-06, + "loss": 0.332, + "step": 11470 + }, + { + "epoch": 17.26, + "grad_norm": 6.859165191650391, + "learning_rate": 8.273684210526317e-06, + "loss": 0.3053, + "step": 11480 + }, + { + "epoch": 17.28, + "grad_norm": 7.4038591384887695, + "learning_rate": 8.27218045112782e-06, + "loss": 0.3419, + "step": 11490 + }, + { + "epoch": 17.29, + "grad_norm": 4.045393943786621, + "learning_rate": 8.270676691729324e-06, + "loss": 0.3656, + "step": 11500 + }, + { + "epoch": 17.31, + "grad_norm": 2.965898036956787, + "learning_rate": 8.269172932330827e-06, + "loss": 0.4222, + "step": 11510 + }, + { + "epoch": 17.32, + "grad_norm": 6.6445746421813965, + "learning_rate": 8.267669172932331e-06, + "loss": 0.4351, + "step": 11520 + }, + { + "epoch": 17.34, + "grad_norm": 11.220673561096191, + "learning_rate": 8.266165413533836e-06, + "loss": 0.3738, + "step": 11530 + }, + { + "epoch": 17.35, + "grad_norm": 3.7990376949310303, + "learning_rate": 8.264661654135338e-06, + "loss": 0.2779, + "step": 11540 + }, + { + "epoch": 17.37, + "grad_norm": 6.193857669830322, + "learning_rate": 8.263157894736843e-06, + "loss": 0.3864, + "step": 11550 + }, + { + "epoch": 17.38, + "grad_norm": 7.089908123016357, + "learning_rate": 8.261654135338347e-06, + "loss": 0.3243, + "step": 11560 + }, + { + "epoch": 17.4, + "grad_norm": 10.148313522338867, + "learning_rate": 8.26015037593985e-06, + "loss": 0.3848, + "step": 11570 + }, + { + "epoch": 17.41, + "grad_norm": 7.47261905670166, + "learning_rate": 8.258646616541354e-06, + "loss": 0.3958, + "step": 11580 + }, + { + "epoch": 17.43, + "grad_norm": 8.237654685974121, + "learning_rate": 8.257142857142857e-06, + "loss": 0.4104, + "step": 11590 + }, + { + "epoch": 17.44, + "grad_norm": 7.028960227966309, + "learning_rate": 8.25563909774436e-06, + "loss": 0.3515, + "step": 11600 + }, + { + "epoch": 17.46, + "grad_norm": 6.804955959320068, + "learning_rate": 8.254135338345866e-06, + "loss": 0.3555, + "step": 11610 + }, + { + "epoch": 17.47, + "grad_norm": 8.740710258483887, + "learning_rate": 8.25263157894737e-06, + "loss": 0.328, + "step": 11620 + }, + { + "epoch": 17.49, + "grad_norm": 6.383413314819336, + "learning_rate": 8.251127819548873e-06, + "loss": 0.3352, + "step": 11630 + }, + { + "epoch": 17.5, + "grad_norm": 8.289705276489258, + "learning_rate": 8.249624060150377e-06, + "loss": 0.4067, + "step": 11640 + }, + { + "epoch": 17.52, + "grad_norm": 6.0566911697387695, + "learning_rate": 8.24812030075188e-06, + "loss": 0.3556, + "step": 11650 + }, + { + "epoch": 17.53, + "grad_norm": 9.813027381896973, + "learning_rate": 8.246616541353384e-06, + "loss": 0.3856, + "step": 11660 + }, + { + "epoch": 17.55, + "grad_norm": 6.0970988273620605, + "learning_rate": 8.245112781954889e-06, + "loss": 0.396, + "step": 11670 + }, + { + "epoch": 17.56, + "grad_norm": 4.837037086486816, + "learning_rate": 8.24360902255639e-06, + "loss": 0.3194, + "step": 11680 + }, + { + "epoch": 17.58, + "grad_norm": 10.178328514099121, + "learning_rate": 8.242105263157896e-06, + "loss": 0.4108, + "step": 11690 + }, + { + "epoch": 17.59, + "grad_norm": 3.696746826171875, + "learning_rate": 8.2406015037594e-06, + "loss": 0.3198, + "step": 11700 + }, + { + "epoch": 17.61, + "grad_norm": 6.4773993492126465, + "learning_rate": 8.239097744360903e-06, + "loss": 0.2889, + "step": 11710 + }, + { + "epoch": 17.62, + "grad_norm": 9.036526679992676, + "learning_rate": 8.237593984962406e-06, + "loss": 0.4616, + "step": 11720 + }, + { + "epoch": 17.64, + "grad_norm": 5.1061320304870605, + "learning_rate": 8.236090225563912e-06, + "loss": 0.3941, + "step": 11730 + }, + { + "epoch": 17.65, + "grad_norm": 5.17496919631958, + "learning_rate": 8.234586466165413e-06, + "loss": 0.3883, + "step": 11740 + }, + { + "epoch": 17.67, + "grad_norm": 4.007594585418701, + "learning_rate": 8.233082706766919e-06, + "loss": 0.3097, + "step": 11750 + }, + { + "epoch": 17.68, + "grad_norm": 3.903956174850464, + "learning_rate": 8.231578947368422e-06, + "loss": 0.3473, + "step": 11760 + }, + { + "epoch": 17.7, + "grad_norm": 9.270066261291504, + "learning_rate": 8.230075187969926e-06, + "loss": 0.3255, + "step": 11770 + }, + { + "epoch": 17.71, + "grad_norm": 4.118042469024658, + "learning_rate": 8.22857142857143e-06, + "loss": 0.335, + "step": 11780 + }, + { + "epoch": 17.73, + "grad_norm": 5.715611457824707, + "learning_rate": 8.227067669172933e-06, + "loss": 0.422, + "step": 11790 + }, + { + "epoch": 17.74, + "grad_norm": 5.848507881164551, + "learning_rate": 8.225563909774436e-06, + "loss": 0.3465, + "step": 11800 + }, + { + "epoch": 17.76, + "grad_norm": 5.273082733154297, + "learning_rate": 8.224060150375941e-06, + "loss": 0.3479, + "step": 11810 + }, + { + "epoch": 17.77, + "grad_norm": 4.500287055969238, + "learning_rate": 8.222556390977445e-06, + "loss": 0.3813, + "step": 11820 + }, + { + "epoch": 17.79, + "grad_norm": 7.676726341247559, + "learning_rate": 8.221052631578948e-06, + "loss": 0.3973, + "step": 11830 + }, + { + "epoch": 17.8, + "grad_norm": 6.1550211906433105, + "learning_rate": 8.219548872180452e-06, + "loss": 0.3209, + "step": 11840 + }, + { + "epoch": 17.82, + "grad_norm": 3.796853542327881, + "learning_rate": 8.218045112781955e-06, + "loss": 0.3241, + "step": 11850 + }, + { + "epoch": 17.83, + "grad_norm": 7.188779354095459, + "learning_rate": 8.216541353383459e-06, + "loss": 0.3923, + "step": 11860 + }, + { + "epoch": 17.85, + "grad_norm": 5.088048934936523, + "learning_rate": 8.215037593984964e-06, + "loss": 0.3528, + "step": 11870 + }, + { + "epoch": 17.86, + "grad_norm": 6.49263334274292, + "learning_rate": 8.213533834586466e-06, + "loss": 0.3724, + "step": 11880 + }, + { + "epoch": 17.88, + "grad_norm": 8.370095252990723, + "learning_rate": 8.212030075187971e-06, + "loss": 0.3697, + "step": 11890 + }, + { + "epoch": 17.89, + "grad_norm": 7.578341007232666, + "learning_rate": 8.210526315789475e-06, + "loss": 0.3317, + "step": 11900 + }, + { + "epoch": 17.91, + "grad_norm": 5.1709723472595215, + "learning_rate": 8.209022556390978e-06, + "loss": 0.3304, + "step": 11910 + }, + { + "epoch": 17.92, + "grad_norm": 4.586398124694824, + "learning_rate": 8.207518796992482e-06, + "loss": 0.3942, + "step": 11920 + }, + { + "epoch": 17.94, + "grad_norm": 3.6240298748016357, + "learning_rate": 8.206015037593985e-06, + "loss": 0.2839, + "step": 11930 + }, + { + "epoch": 17.95, + "grad_norm": 7.672499179840088, + "learning_rate": 8.204511278195489e-06, + "loss": 0.3539, + "step": 11940 + }, + { + "epoch": 17.97, + "grad_norm": 5.807362079620361, + "learning_rate": 8.203007518796992e-06, + "loss": 0.3006, + "step": 11950 + }, + { + "epoch": 17.98, + "grad_norm": 5.875560283660889, + "learning_rate": 8.201503759398498e-06, + "loss": 0.3857, + "step": 11960 + }, + { + "epoch": 18.0, + "grad_norm": 47.7349967956543, + "learning_rate": 8.2e-06, + "loss": 0.3985, + "step": 11970 + }, + { + "epoch": 18.0, + "eval_accuracy": 0.9356, + "eval_loss": 0.25322866439819336, + "eval_runtime": 84.6682, + "eval_samples_per_second": 118.108, + "eval_steps_per_second": 0.472, + "step": 11970 + }, + { + "epoch": 18.02, + "grad_norm": 5.787768363952637, + "learning_rate": 8.198496240601505e-06, + "loss": 0.4037, + "step": 11980 + }, + { + "epoch": 18.03, + "grad_norm": 9.899946212768555, + "learning_rate": 8.196992481203008e-06, + "loss": 0.3305, + "step": 11990 + }, + { + "epoch": 18.05, + "grad_norm": 7.430878639221191, + "learning_rate": 8.195488721804512e-06, + "loss": 0.3072, + "step": 12000 + }, + { + "epoch": 18.06, + "grad_norm": 4.635775566101074, + "learning_rate": 8.193984962406015e-06, + "loss": 0.3824, + "step": 12010 + }, + { + "epoch": 18.08, + "grad_norm": 11.533434867858887, + "learning_rate": 8.19248120300752e-06, + "loss": 0.3494, + "step": 12020 + }, + { + "epoch": 18.09, + "grad_norm": 9.492911338806152, + "learning_rate": 8.190977443609022e-06, + "loss": 0.2719, + "step": 12030 + }, + { + "epoch": 18.11, + "grad_norm": 4.291653156280518, + "learning_rate": 8.189473684210527e-06, + "loss": 0.3766, + "step": 12040 + }, + { + "epoch": 18.12, + "grad_norm": 6.437026023864746, + "learning_rate": 8.187969924812031e-06, + "loss": 0.3757, + "step": 12050 + }, + { + "epoch": 18.14, + "grad_norm": 6.730251789093018, + "learning_rate": 8.186466165413534e-06, + "loss": 0.3426, + "step": 12060 + }, + { + "epoch": 18.15, + "grad_norm": 4.403579235076904, + "learning_rate": 8.184962406015038e-06, + "loss": 0.4402, + "step": 12070 + }, + { + "epoch": 18.17, + "grad_norm": 4.053240776062012, + "learning_rate": 8.183458646616542e-06, + "loss": 0.4017, + "step": 12080 + }, + { + "epoch": 18.18, + "grad_norm": 9.355215072631836, + "learning_rate": 8.181954887218045e-06, + "loss": 0.3034, + "step": 12090 + }, + { + "epoch": 18.2, + "grad_norm": 6.016571521759033, + "learning_rate": 8.18045112781955e-06, + "loss": 0.2966, + "step": 12100 + }, + { + "epoch": 18.21, + "grad_norm": 3.7155065536499023, + "learning_rate": 8.178947368421054e-06, + "loss": 0.325, + "step": 12110 + }, + { + "epoch": 18.23, + "grad_norm": 4.735385417938232, + "learning_rate": 8.177443609022557e-06, + "loss": 0.3916, + "step": 12120 + }, + { + "epoch": 18.24, + "grad_norm": 5.728586673736572, + "learning_rate": 8.17593984962406e-06, + "loss": 0.3307, + "step": 12130 + }, + { + "epoch": 18.26, + "grad_norm": 3.7535057067871094, + "learning_rate": 8.174436090225564e-06, + "loss": 0.2576, + "step": 12140 + }, + { + "epoch": 18.27, + "grad_norm": 7.422114849090576, + "learning_rate": 8.172932330827068e-06, + "loss": 0.405, + "step": 12150 + }, + { + "epoch": 18.29, + "grad_norm": 7.051729679107666, + "learning_rate": 8.171428571428573e-06, + "loss": 0.3504, + "step": 12160 + }, + { + "epoch": 18.3, + "grad_norm": 5.593104362487793, + "learning_rate": 8.169924812030075e-06, + "loss": 0.3071, + "step": 12170 + }, + { + "epoch": 18.32, + "grad_norm": 6.640134811401367, + "learning_rate": 8.16842105263158e-06, + "loss": 0.38, + "step": 12180 + }, + { + "epoch": 18.33, + "grad_norm": 7.922089099884033, + "learning_rate": 8.166917293233084e-06, + "loss": 0.3383, + "step": 12190 + }, + { + "epoch": 18.35, + "grad_norm": 6.431313514709473, + "learning_rate": 8.165413533834587e-06, + "loss": 0.2918, + "step": 12200 + }, + { + "epoch": 18.36, + "grad_norm": 5.122470855712891, + "learning_rate": 8.16390977443609e-06, + "loss": 0.3129, + "step": 12210 + }, + { + "epoch": 18.38, + "grad_norm": 5.576056957244873, + "learning_rate": 8.162406015037594e-06, + "loss": 0.3316, + "step": 12220 + }, + { + "epoch": 18.39, + "grad_norm": 5.235702037811279, + "learning_rate": 8.160902255639098e-06, + "loss": 0.2955, + "step": 12230 + }, + { + "epoch": 18.41, + "grad_norm": 5.954382419586182, + "learning_rate": 8.159398496240603e-06, + "loss": 0.3256, + "step": 12240 + }, + { + "epoch": 18.42, + "grad_norm": 5.545419216156006, + "learning_rate": 8.157894736842106e-06, + "loss": 0.358, + "step": 12250 + }, + { + "epoch": 18.44, + "grad_norm": 12.001044273376465, + "learning_rate": 8.15639097744361e-06, + "loss": 0.3185, + "step": 12260 + }, + { + "epoch": 18.45, + "grad_norm": 10.437068939208984, + "learning_rate": 8.154887218045113e-06, + "loss": 0.3234, + "step": 12270 + }, + { + "epoch": 18.47, + "grad_norm": 5.405700206756592, + "learning_rate": 8.153383458646617e-06, + "loss": 0.251, + "step": 12280 + }, + { + "epoch": 18.48, + "grad_norm": 6.865713119506836, + "learning_rate": 8.15187969924812e-06, + "loss": 0.4126, + "step": 12290 + }, + { + "epoch": 18.5, + "grad_norm": 6.4221038818359375, + "learning_rate": 8.150375939849626e-06, + "loss": 0.3338, + "step": 12300 + }, + { + "epoch": 18.51, + "grad_norm": 5.260381698608398, + "learning_rate": 8.148872180451128e-06, + "loss": 0.3482, + "step": 12310 + }, + { + "epoch": 18.53, + "grad_norm": 8.89275074005127, + "learning_rate": 8.147368421052633e-06, + "loss": 0.3684, + "step": 12320 + }, + { + "epoch": 18.54, + "grad_norm": 4.887394428253174, + "learning_rate": 8.145864661654136e-06, + "loss": 0.3673, + "step": 12330 + }, + { + "epoch": 18.56, + "grad_norm": 3.8077456951141357, + "learning_rate": 8.14436090225564e-06, + "loss": 0.3478, + "step": 12340 + }, + { + "epoch": 18.57, + "grad_norm": 5.575742721557617, + "learning_rate": 8.142857142857143e-06, + "loss": 0.332, + "step": 12350 + }, + { + "epoch": 18.59, + "grad_norm": 6.4698333740234375, + "learning_rate": 8.141353383458649e-06, + "loss": 0.4422, + "step": 12360 + }, + { + "epoch": 18.6, + "grad_norm": 5.90009880065918, + "learning_rate": 8.13984962406015e-06, + "loss": 0.4025, + "step": 12370 + }, + { + "epoch": 18.62, + "grad_norm": 7.366026401519775, + "learning_rate": 8.138345864661656e-06, + "loss": 0.3364, + "step": 12380 + }, + { + "epoch": 18.63, + "grad_norm": 5.042256832122803, + "learning_rate": 8.136842105263159e-06, + "loss": 0.2686, + "step": 12390 + }, + { + "epoch": 18.65, + "grad_norm": 5.483407020568848, + "learning_rate": 8.135338345864663e-06, + "loss": 0.2765, + "step": 12400 + }, + { + "epoch": 18.66, + "grad_norm": 8.821917533874512, + "learning_rate": 8.133834586466166e-06, + "loss": 0.3899, + "step": 12410 + }, + { + "epoch": 18.68, + "grad_norm": 5.6711297035217285, + "learning_rate": 8.13233082706767e-06, + "loss": 0.4293, + "step": 12420 + }, + { + "epoch": 18.69, + "grad_norm": 6.44101619720459, + "learning_rate": 8.130827067669173e-06, + "loss": 0.3191, + "step": 12430 + }, + { + "epoch": 18.71, + "grad_norm": 4.042926788330078, + "learning_rate": 8.129323308270677e-06, + "loss": 0.3414, + "step": 12440 + }, + { + "epoch": 18.72, + "grad_norm": 7.297868728637695, + "learning_rate": 8.127819548872182e-06, + "loss": 0.2944, + "step": 12450 + }, + { + "epoch": 18.74, + "grad_norm": 7.283233642578125, + "learning_rate": 8.126315789473684e-06, + "loss": 0.3262, + "step": 12460 + }, + { + "epoch": 18.75, + "grad_norm": 7.256216526031494, + "learning_rate": 8.124812030075189e-06, + "loss": 0.3714, + "step": 12470 + }, + { + "epoch": 18.77, + "grad_norm": 5.120934009552002, + "learning_rate": 8.123308270676692e-06, + "loss": 0.3963, + "step": 12480 + }, + { + "epoch": 18.78, + "grad_norm": 5.541153907775879, + "learning_rate": 8.121804511278196e-06, + "loss": 0.3602, + "step": 12490 + }, + { + "epoch": 18.8, + "grad_norm": 5.731190204620361, + "learning_rate": 8.1203007518797e-06, + "loss": 0.4258, + "step": 12500 + }, + { + "epoch": 18.81, + "grad_norm": 5.054348468780518, + "learning_rate": 8.118796992481203e-06, + "loss": 0.2603, + "step": 12510 + }, + { + "epoch": 18.83, + "grad_norm": 4.947210311889648, + "learning_rate": 8.117293233082707e-06, + "loss": 0.332, + "step": 12520 + }, + { + "epoch": 18.84, + "grad_norm": 7.019644737243652, + "learning_rate": 8.115789473684212e-06, + "loss": 0.3752, + "step": 12530 + }, + { + "epoch": 18.86, + "grad_norm": 6.0113325119018555, + "learning_rate": 8.114285714285715e-06, + "loss": 0.2853, + "step": 12540 + }, + { + "epoch": 18.87, + "grad_norm": 10.12833309173584, + "learning_rate": 8.112781954887219e-06, + "loss": 0.2721, + "step": 12550 + }, + { + "epoch": 18.89, + "grad_norm": 3.987274408340454, + "learning_rate": 8.111278195488722e-06, + "loss": 0.3351, + "step": 12560 + }, + { + "epoch": 18.9, + "grad_norm": 6.421701431274414, + "learning_rate": 8.109774436090226e-06, + "loss": 0.3202, + "step": 12570 + }, + { + "epoch": 18.92, + "grad_norm": 6.776229381561279, + "learning_rate": 8.10827067669173e-06, + "loss": 0.3123, + "step": 12580 + }, + { + "epoch": 18.93, + "grad_norm": 8.584750175476074, + "learning_rate": 8.106766917293235e-06, + "loss": 0.4226, + "step": 12590 + }, + { + "epoch": 18.95, + "grad_norm": 4.636366844177246, + "learning_rate": 8.105263157894736e-06, + "loss": 0.4647, + "step": 12600 + }, + { + "epoch": 18.96, + "grad_norm": 5.144723415374756, + "learning_rate": 8.103759398496242e-06, + "loss": 0.287, + "step": 12610 + }, + { + "epoch": 18.98, + "grad_norm": 7.949281692504883, + "learning_rate": 8.102255639097745e-06, + "loss": 0.3361, + "step": 12620 + }, + { + "epoch": 18.99, + "grad_norm": 6.716331958770752, + "learning_rate": 8.100751879699249e-06, + "loss": 0.3213, + "step": 12630 + }, + { + "epoch": 19.0, + "eval_accuracy": 0.9321, + "eval_loss": 0.27284207940101624, + "eval_runtime": 84.4705, + "eval_samples_per_second": 118.384, + "eval_steps_per_second": 0.474, + "step": 12635 + }, + { + "epoch": 19.01, + "grad_norm": 5.840856075286865, + "learning_rate": 8.099248120300752e-06, + "loss": 0.3154, + "step": 12640 + }, + { + "epoch": 19.02, + "grad_norm": 6.531993389129639, + "learning_rate": 8.097744360902257e-06, + "loss": 0.3244, + "step": 12650 + }, + { + "epoch": 19.04, + "grad_norm": 8.1339111328125, + "learning_rate": 8.09624060150376e-06, + "loss": 0.3372, + "step": 12660 + }, + { + "epoch": 19.05, + "grad_norm": 5.9135518074035645, + "learning_rate": 8.094736842105264e-06, + "loss": 0.3514, + "step": 12670 + }, + { + "epoch": 19.07, + "grad_norm": 4.900998592376709, + "learning_rate": 8.093233082706768e-06, + "loss": 0.3248, + "step": 12680 + }, + { + "epoch": 19.08, + "grad_norm": 6.988156795501709, + "learning_rate": 8.091729323308271e-06, + "loss": 0.3665, + "step": 12690 + }, + { + "epoch": 19.1, + "grad_norm": 6.229936599731445, + "learning_rate": 8.090225563909775e-06, + "loss": 0.2717, + "step": 12700 + }, + { + "epoch": 19.11, + "grad_norm": 7.66952657699585, + "learning_rate": 8.088721804511278e-06, + "loss": 0.4116, + "step": 12710 + }, + { + "epoch": 19.13, + "grad_norm": 5.332541465759277, + "learning_rate": 8.087218045112782e-06, + "loss": 0.2748, + "step": 12720 + }, + { + "epoch": 19.14, + "grad_norm": 6.3354268074035645, + "learning_rate": 8.085714285714287e-06, + "loss": 0.3508, + "step": 12730 + }, + { + "epoch": 19.16, + "grad_norm": 2.598149538040161, + "learning_rate": 8.08421052631579e-06, + "loss": 0.2887, + "step": 12740 + }, + { + "epoch": 19.17, + "grad_norm": 3.951981544494629, + "learning_rate": 8.082706766917294e-06, + "loss": 0.3201, + "step": 12750 + }, + { + "epoch": 19.19, + "grad_norm": 6.856266021728516, + "learning_rate": 8.081203007518798e-06, + "loss": 0.3015, + "step": 12760 + }, + { + "epoch": 19.2, + "grad_norm": 8.871770858764648, + "learning_rate": 8.079699248120301e-06, + "loss": 0.4151, + "step": 12770 + }, + { + "epoch": 19.22, + "grad_norm": 7.119663238525391, + "learning_rate": 8.078195488721805e-06, + "loss": 0.2968, + "step": 12780 + }, + { + "epoch": 19.23, + "grad_norm": 5.254119396209717, + "learning_rate": 8.07669172932331e-06, + "loss": 0.2823, + "step": 12790 + }, + { + "epoch": 19.25, + "grad_norm": 8.958704948425293, + "learning_rate": 8.075187969924812e-06, + "loss": 0.3453, + "step": 12800 + }, + { + "epoch": 19.26, + "grad_norm": 8.45573616027832, + "learning_rate": 8.073684210526317e-06, + "loss": 0.3827, + "step": 12810 + }, + { + "epoch": 19.28, + "grad_norm": 3.3245856761932373, + "learning_rate": 8.07218045112782e-06, + "loss": 0.3515, + "step": 12820 + }, + { + "epoch": 19.29, + "grad_norm": 4.837510585784912, + "learning_rate": 8.070676691729324e-06, + "loss": 0.3397, + "step": 12830 + }, + { + "epoch": 19.31, + "grad_norm": 3.3870489597320557, + "learning_rate": 8.069172932330828e-06, + "loss": 0.3151, + "step": 12840 + }, + { + "epoch": 19.32, + "grad_norm": 6.1325201988220215, + "learning_rate": 8.067669172932333e-06, + "loss": 0.3491, + "step": 12850 + }, + { + "epoch": 19.34, + "grad_norm": 6.618387699127197, + "learning_rate": 8.066165413533835e-06, + "loss": 0.3685, + "step": 12860 + }, + { + "epoch": 19.35, + "grad_norm": 6.2998223304748535, + "learning_rate": 8.06466165413534e-06, + "loss": 0.4341, + "step": 12870 + }, + { + "epoch": 19.37, + "grad_norm": 5.405755043029785, + "learning_rate": 8.063157894736843e-06, + "loss": 0.3056, + "step": 12880 + }, + { + "epoch": 19.38, + "grad_norm": 5.74697732925415, + "learning_rate": 8.061654135338347e-06, + "loss": 0.4015, + "step": 12890 + }, + { + "epoch": 19.4, + "grad_norm": 4.222403526306152, + "learning_rate": 8.06015037593985e-06, + "loss": 0.2818, + "step": 12900 + }, + { + "epoch": 19.41, + "grad_norm": 11.104449272155762, + "learning_rate": 8.058646616541354e-06, + "loss": 0.3528, + "step": 12910 + }, + { + "epoch": 19.43, + "grad_norm": 5.580349445343018, + "learning_rate": 8.057142857142857e-06, + "loss": 0.3511, + "step": 12920 + }, + { + "epoch": 19.44, + "grad_norm": 4.909841060638428, + "learning_rate": 8.055639097744361e-06, + "loss": 0.2696, + "step": 12930 + }, + { + "epoch": 19.46, + "grad_norm": 4.32835054397583, + "learning_rate": 8.054135338345866e-06, + "loss": 0.2714, + "step": 12940 + }, + { + "epoch": 19.47, + "grad_norm": 4.742302894592285, + "learning_rate": 8.052631578947368e-06, + "loss": 0.3427, + "step": 12950 + }, + { + "epoch": 19.49, + "grad_norm": 6.302316665649414, + "learning_rate": 8.051127819548873e-06, + "loss": 0.406, + "step": 12960 + }, + { + "epoch": 19.5, + "grad_norm": 6.898422718048096, + "learning_rate": 8.049624060150377e-06, + "loss": 0.3048, + "step": 12970 + }, + { + "epoch": 19.52, + "grad_norm": 9.071293830871582, + "learning_rate": 8.04812030075188e-06, + "loss": 0.3603, + "step": 12980 + }, + { + "epoch": 19.53, + "grad_norm": 5.120814323425293, + "learning_rate": 8.046616541353384e-06, + "loss": 0.3555, + "step": 12990 + }, + { + "epoch": 19.55, + "grad_norm": 8.293437957763672, + "learning_rate": 8.045112781954887e-06, + "loss": 0.3061, + "step": 13000 + }, + { + "epoch": 19.56, + "grad_norm": 6.051889419555664, + "learning_rate": 8.04360902255639e-06, + "loss": 0.3308, + "step": 13010 + }, + { + "epoch": 19.58, + "grad_norm": 6.6684889793396, + "learning_rate": 8.042105263157896e-06, + "loss": 0.3497, + "step": 13020 + }, + { + "epoch": 19.59, + "grad_norm": 6.767400741577148, + "learning_rate": 8.0406015037594e-06, + "loss": 0.3037, + "step": 13030 + }, + { + "epoch": 19.61, + "grad_norm": 6.514301300048828, + "learning_rate": 8.039097744360903e-06, + "loss": 0.266, + "step": 13040 + }, + { + "epoch": 19.62, + "grad_norm": 4.410576820373535, + "learning_rate": 8.037593984962407e-06, + "loss": 0.3086, + "step": 13050 + }, + { + "epoch": 19.64, + "grad_norm": 8.338872909545898, + "learning_rate": 8.03609022556391e-06, + "loss": 0.3505, + "step": 13060 + }, + { + "epoch": 19.65, + "grad_norm": 7.640175819396973, + "learning_rate": 8.034586466165414e-06, + "loss": 0.3573, + "step": 13070 + }, + { + "epoch": 19.67, + "grad_norm": 5.462586402893066, + "learning_rate": 8.033082706766919e-06, + "loss": 0.3053, + "step": 13080 + }, + { + "epoch": 19.68, + "grad_norm": 4.294394016265869, + "learning_rate": 8.03157894736842e-06, + "loss": 0.3741, + "step": 13090 + }, + { + "epoch": 19.7, + "grad_norm": 4.442080497741699, + "learning_rate": 8.030075187969926e-06, + "loss": 0.2855, + "step": 13100 + }, + { + "epoch": 19.71, + "grad_norm": 3.8873209953308105, + "learning_rate": 8.02857142857143e-06, + "loss": 0.2914, + "step": 13110 + }, + { + "epoch": 19.73, + "grad_norm": 9.044189453125, + "learning_rate": 8.027067669172933e-06, + "loss": 0.3345, + "step": 13120 + }, + { + "epoch": 19.74, + "grad_norm": 9.596213340759277, + "learning_rate": 8.025563909774436e-06, + "loss": 0.3249, + "step": 13130 + }, + { + "epoch": 19.76, + "grad_norm": 6.889251708984375, + "learning_rate": 8.02406015037594e-06, + "loss": 0.3135, + "step": 13140 + }, + { + "epoch": 19.77, + "grad_norm": 6.394194602966309, + "learning_rate": 8.022556390977443e-06, + "loss": 0.2785, + "step": 13150 + }, + { + "epoch": 19.79, + "grad_norm": 8.294524192810059, + "learning_rate": 8.021052631578949e-06, + "loss": 0.3854, + "step": 13160 + }, + { + "epoch": 19.8, + "grad_norm": 9.557373046875, + "learning_rate": 8.019548872180452e-06, + "loss": 0.3031, + "step": 13170 + }, + { + "epoch": 19.82, + "grad_norm": 5.370851516723633, + "learning_rate": 8.018045112781956e-06, + "loss": 0.3836, + "step": 13180 + }, + { + "epoch": 19.83, + "grad_norm": 7.893476963043213, + "learning_rate": 8.01654135338346e-06, + "loss": 0.4143, + "step": 13190 + }, + { + "epoch": 19.85, + "grad_norm": 6.872874736785889, + "learning_rate": 8.015037593984963e-06, + "loss": 0.3131, + "step": 13200 + }, + { + "epoch": 19.86, + "grad_norm": 8.026068687438965, + "learning_rate": 8.013533834586466e-06, + "loss": 0.3558, + "step": 13210 + }, + { + "epoch": 19.88, + "grad_norm": 5.648526668548584, + "learning_rate": 8.012030075187971e-06, + "loss": 0.3377, + "step": 13220 + }, + { + "epoch": 19.89, + "grad_norm": 9.711199760437012, + "learning_rate": 8.010526315789473e-06, + "loss": 0.3421, + "step": 13230 + }, + { + "epoch": 19.91, + "grad_norm": 5.787407875061035, + "learning_rate": 8.009022556390979e-06, + "loss": 0.3919, + "step": 13240 + }, + { + "epoch": 19.92, + "grad_norm": 6.500585079193115, + "learning_rate": 8.007518796992482e-06, + "loss": 0.3383, + "step": 13250 + }, + { + "epoch": 19.94, + "grad_norm": 6.25896692276001, + "learning_rate": 8.006015037593986e-06, + "loss": 0.3031, + "step": 13260 + }, + { + "epoch": 19.95, + "grad_norm": 4.565229415893555, + "learning_rate": 8.004511278195489e-06, + "loss": 0.379, + "step": 13270 + }, + { + "epoch": 19.97, + "grad_norm": 5.765014171600342, + "learning_rate": 8.003007518796994e-06, + "loss": 0.2949, + "step": 13280 + }, + { + "epoch": 19.98, + "grad_norm": 5.820449352264404, + "learning_rate": 8.001503759398496e-06, + "loss": 0.2912, + "step": 13290 + }, + { + "epoch": 20.0, + "grad_norm": 0.44211289286613464, + "learning_rate": 8.000000000000001e-06, + "loss": 0.3046, + "step": 13300 + }, + { + "epoch": 20.0, + "eval_accuracy": 0.9334, + "eval_loss": 0.27018824219703674, + "eval_runtime": 84.9396, + "eval_samples_per_second": 117.731, + "eval_steps_per_second": 0.471, + "step": 13300 + }, + { + "epoch": 20.02, + "grad_norm": 6.310322284698486, + "learning_rate": 7.998496240601505e-06, + "loss": 0.3525, + "step": 13310 + }, + { + "epoch": 20.03, + "grad_norm": 4.756250381469727, + "learning_rate": 7.996992481203008e-06, + "loss": 0.2493, + "step": 13320 + }, + { + "epoch": 20.05, + "grad_norm": 6.389369487762451, + "learning_rate": 7.995488721804512e-06, + "loss": 0.3425, + "step": 13330 + }, + { + "epoch": 20.06, + "grad_norm": 6.042266368865967, + "learning_rate": 7.993984962406015e-06, + "loss": 0.2894, + "step": 13340 + }, + { + "epoch": 20.08, + "grad_norm": 5.721156597137451, + "learning_rate": 7.992481203007519e-06, + "loss": 0.3251, + "step": 13350 + }, + { + "epoch": 20.09, + "grad_norm": 8.065733909606934, + "learning_rate": 7.990977443609024e-06, + "loss": 0.2898, + "step": 13360 + }, + { + "epoch": 20.11, + "grad_norm": 5.018655300140381, + "learning_rate": 7.989473684210528e-06, + "loss": 0.3525, + "step": 13370 + }, + { + "epoch": 20.12, + "grad_norm": 7.109757900238037, + "learning_rate": 7.987969924812031e-06, + "loss": 0.2977, + "step": 13380 + }, + { + "epoch": 20.14, + "grad_norm": 5.0374274253845215, + "learning_rate": 7.986466165413535e-06, + "loss": 0.2893, + "step": 13390 + }, + { + "epoch": 20.15, + "grad_norm": 6.046887397766113, + "learning_rate": 7.984962406015038e-06, + "loss": 0.2753, + "step": 13400 + }, + { + "epoch": 20.17, + "grad_norm": 4.841386795043945, + "learning_rate": 7.983458646616542e-06, + "loss": 0.3502, + "step": 13410 + }, + { + "epoch": 20.18, + "grad_norm": 6.6681036949157715, + "learning_rate": 7.981954887218045e-06, + "loss": 0.3064, + "step": 13420 + }, + { + "epoch": 20.2, + "grad_norm": 7.210595607757568, + "learning_rate": 7.980451127819549e-06, + "loss": 0.3887, + "step": 13430 + }, + { + "epoch": 20.21, + "grad_norm": 7.28894567489624, + "learning_rate": 7.978947368421052e-06, + "loss": 0.3476, + "step": 13440 + }, + { + "epoch": 20.23, + "grad_norm": 14.942082405090332, + "learning_rate": 7.977443609022558e-06, + "loss": 0.3151, + "step": 13450 + }, + { + "epoch": 20.24, + "grad_norm": 6.0876054763793945, + "learning_rate": 7.975939849624061e-06, + "loss": 0.3542, + "step": 13460 + }, + { + "epoch": 20.26, + "grad_norm": 8.294320106506348, + "learning_rate": 7.974436090225565e-06, + "loss": 0.3312, + "step": 13470 + }, + { + "epoch": 20.27, + "grad_norm": 5.090022087097168, + "learning_rate": 7.972932330827068e-06, + "loss": 0.323, + "step": 13480 + }, + { + "epoch": 20.29, + "grad_norm": 2.9050402641296387, + "learning_rate": 7.971428571428572e-06, + "loss": 0.2753, + "step": 13490 + }, + { + "epoch": 20.3, + "grad_norm": 7.342869758605957, + "learning_rate": 7.969924812030075e-06, + "loss": 0.2639, + "step": 13500 + }, + { + "epoch": 20.32, + "grad_norm": 5.894117832183838, + "learning_rate": 7.96842105263158e-06, + "loss": 0.2893, + "step": 13510 + }, + { + "epoch": 20.33, + "grad_norm": 5.4692792892456055, + "learning_rate": 7.966917293233082e-06, + "loss": 0.3384, + "step": 13520 + }, + { + "epoch": 20.35, + "grad_norm": 7.577072620391846, + "learning_rate": 7.965413533834587e-06, + "loss": 0.3865, + "step": 13530 + }, + { + "epoch": 20.36, + "grad_norm": 22.644380569458008, + "learning_rate": 7.963909774436091e-06, + "loss": 0.3332, + "step": 13540 + }, + { + "epoch": 20.38, + "grad_norm": 6.491385459899902, + "learning_rate": 7.962406015037594e-06, + "loss": 0.3662, + "step": 13550 + }, + { + "epoch": 20.39, + "grad_norm": 5.722530841827393, + "learning_rate": 7.960902255639098e-06, + "loss": 0.3452, + "step": 13560 + }, + { + "epoch": 20.41, + "grad_norm": 5.905157566070557, + "learning_rate": 7.959398496240603e-06, + "loss": 0.3424, + "step": 13570 + }, + { + "epoch": 20.42, + "grad_norm": 8.227418899536133, + "learning_rate": 7.957894736842105e-06, + "loss": 0.3635, + "step": 13580 + }, + { + "epoch": 20.44, + "grad_norm": 3.507075309753418, + "learning_rate": 7.95639097744361e-06, + "loss": 0.2959, + "step": 13590 + }, + { + "epoch": 20.45, + "grad_norm": 10.020979881286621, + "learning_rate": 7.954887218045114e-06, + "loss": 0.325, + "step": 13600 + }, + { + "epoch": 20.47, + "grad_norm": 6.452096939086914, + "learning_rate": 7.953383458646617e-06, + "loss": 0.2817, + "step": 13610 + }, + { + "epoch": 20.48, + "grad_norm": 5.931590557098389, + "learning_rate": 7.95187969924812e-06, + "loss": 0.3138, + "step": 13620 + }, + { + "epoch": 20.5, + "grad_norm": 11.746089935302734, + "learning_rate": 7.950375939849624e-06, + "loss": 0.3263, + "step": 13630 + }, + { + "epoch": 20.51, + "grad_norm": 5.600671291351318, + "learning_rate": 7.948872180451128e-06, + "loss": 0.2635, + "step": 13640 + }, + { + "epoch": 20.53, + "grad_norm": 4.570649147033691, + "learning_rate": 7.947368421052633e-06, + "loss": 0.3372, + "step": 13650 + }, + { + "epoch": 20.54, + "grad_norm": 10.411113739013672, + "learning_rate": 7.945864661654136e-06, + "loss": 0.3403, + "step": 13660 + }, + { + "epoch": 20.56, + "grad_norm": 7.8483710289001465, + "learning_rate": 7.94436090225564e-06, + "loss": 0.4401, + "step": 13670 + }, + { + "epoch": 20.57, + "grad_norm": 6.16978645324707, + "learning_rate": 7.942857142857144e-06, + "loss": 0.2905, + "step": 13680 + }, + { + "epoch": 20.59, + "grad_norm": 5.427683353424072, + "learning_rate": 7.941353383458647e-06, + "loss": 0.3853, + "step": 13690 + }, + { + "epoch": 20.6, + "grad_norm": 10.262079238891602, + "learning_rate": 7.93984962406015e-06, + "loss": 0.3126, + "step": 13700 + }, + { + "epoch": 20.62, + "grad_norm": 2.678656578063965, + "learning_rate": 7.938345864661656e-06, + "loss": 0.2864, + "step": 13710 + }, + { + "epoch": 20.63, + "grad_norm": 4.784215450286865, + "learning_rate": 7.936842105263158e-06, + "loss": 0.2532, + "step": 13720 + }, + { + "epoch": 20.65, + "grad_norm": 6.748628616333008, + "learning_rate": 7.935338345864663e-06, + "loss": 0.3338, + "step": 13730 + }, + { + "epoch": 20.66, + "grad_norm": 6.877127647399902, + "learning_rate": 7.933834586466166e-06, + "loss": 0.2662, + "step": 13740 + }, + { + "epoch": 20.68, + "grad_norm": 7.904510974884033, + "learning_rate": 7.93233082706767e-06, + "loss": 0.3532, + "step": 13750 + }, + { + "epoch": 20.69, + "grad_norm": 3.242637872695923, + "learning_rate": 7.930827067669173e-06, + "loss": 0.3012, + "step": 13760 + }, + { + "epoch": 20.71, + "grad_norm": 3.0852160453796387, + "learning_rate": 7.929323308270679e-06, + "loss": 0.2948, + "step": 13770 + }, + { + "epoch": 20.72, + "grad_norm": 10.306611061096191, + "learning_rate": 7.92781954887218e-06, + "loss": 0.2837, + "step": 13780 + }, + { + "epoch": 20.74, + "grad_norm": 3.293861150741577, + "learning_rate": 7.926315789473686e-06, + "loss": 0.3023, + "step": 13790 + }, + { + "epoch": 20.75, + "grad_norm": 6.937739849090576, + "learning_rate": 7.924812030075189e-06, + "loss": 0.3147, + "step": 13800 + }, + { + "epoch": 20.77, + "grad_norm": 4.617713928222656, + "learning_rate": 7.923308270676693e-06, + "loss": 0.3503, + "step": 13810 + }, + { + "epoch": 20.78, + "grad_norm": 4.726008892059326, + "learning_rate": 7.921804511278196e-06, + "loss": 0.3526, + "step": 13820 + }, + { + "epoch": 20.8, + "grad_norm": 6.7416558265686035, + "learning_rate": 7.9203007518797e-06, + "loss": 0.2887, + "step": 13830 + }, + { + "epoch": 20.81, + "grad_norm": 6.563324451446533, + "learning_rate": 7.918796992481203e-06, + "loss": 0.2874, + "step": 13840 + }, + { + "epoch": 20.83, + "grad_norm": 5.6465840339660645, + "learning_rate": 7.917293233082708e-06, + "loss": 0.3598, + "step": 13850 + }, + { + "epoch": 20.84, + "grad_norm": 5.625797748565674, + "learning_rate": 7.915789473684212e-06, + "loss": 0.3195, + "step": 13860 + }, + { + "epoch": 20.86, + "grad_norm": 7.804876327514648, + "learning_rate": 7.914285714285715e-06, + "loss": 0.3369, + "step": 13870 + }, + { + "epoch": 20.87, + "grad_norm": 8.359640121459961, + "learning_rate": 7.912781954887219e-06, + "loss": 0.3332, + "step": 13880 + }, + { + "epoch": 20.89, + "grad_norm": 5.867996692657471, + "learning_rate": 7.911278195488723e-06, + "loss": 0.4172, + "step": 13890 + }, + { + "epoch": 20.9, + "grad_norm": 5.789632320404053, + "learning_rate": 7.909774436090226e-06, + "loss": 0.3705, + "step": 13900 + }, + { + "epoch": 20.92, + "grad_norm": 4.992484092712402, + "learning_rate": 7.90827067669173e-06, + "loss": 0.3287, + "step": 13910 + }, + { + "epoch": 20.93, + "grad_norm": 8.009930610656738, + "learning_rate": 7.906766917293233e-06, + "loss": 0.3065, + "step": 13920 + }, + { + "epoch": 20.95, + "grad_norm": 5.598669528961182, + "learning_rate": 7.905263157894737e-06, + "loss": 0.3115, + "step": 13930 + }, + { + "epoch": 20.96, + "grad_norm": 5.0005364418029785, + "learning_rate": 7.903759398496242e-06, + "loss": 0.3635, + "step": 13940 + }, + { + "epoch": 20.98, + "grad_norm": 6.9587788581848145, + "learning_rate": 7.902255639097745e-06, + "loss": 0.3386, + "step": 13950 + }, + { + "epoch": 20.99, + "grad_norm": 6.115482330322266, + "learning_rate": 7.900751879699249e-06, + "loss": 0.3676, + "step": 13960 + }, + { + "epoch": 21.0, + "eval_accuracy": 0.9319, + "eval_loss": 0.2700176537036896, + "eval_runtime": 84.5019, + "eval_samples_per_second": 118.341, + "eval_steps_per_second": 0.473, + "step": 13965 + }, + { + "epoch": 21.01, + "grad_norm": 3.4582622051239014, + "learning_rate": 7.899248120300752e-06, + "loss": 0.6324, + "step": 13970 + }, + { + "epoch": 21.02, + "grad_norm": 5.3081512451171875, + "learning_rate": 7.897744360902256e-06, + "loss": 0.3913, + "step": 13980 + }, + { + "epoch": 21.04, + "grad_norm": 7.172489166259766, + "learning_rate": 7.89624060150376e-06, + "loss": 0.3863, + "step": 13990 + }, + { + "epoch": 21.05, + "grad_norm": 6.308826446533203, + "learning_rate": 7.894736842105265e-06, + "loss": 0.3214, + "step": 14000 + }, + { + "epoch": 21.07, + "grad_norm": 5.6395111083984375, + "learning_rate": 7.893233082706766e-06, + "loss": 0.2801, + "step": 14010 + }, + { + "epoch": 21.08, + "grad_norm": 2.734475612640381, + "learning_rate": 7.891729323308272e-06, + "loss": 0.3212, + "step": 14020 + }, + { + "epoch": 21.1, + "grad_norm": 4.720375061035156, + "learning_rate": 7.890225563909775e-06, + "loss": 0.3271, + "step": 14030 + }, + { + "epoch": 21.11, + "grad_norm": 3.2389378547668457, + "learning_rate": 7.888721804511279e-06, + "loss": 0.3118, + "step": 14040 + }, + { + "epoch": 21.13, + "grad_norm": 3.7250821590423584, + "learning_rate": 7.887218045112782e-06, + "loss": 0.337, + "step": 14050 + }, + { + "epoch": 21.14, + "grad_norm": 8.08519458770752, + "learning_rate": 7.885714285714286e-06, + "loss": 0.3126, + "step": 14060 + }, + { + "epoch": 21.16, + "grad_norm": 5.902373313903809, + "learning_rate": 7.88421052631579e-06, + "loss": 0.3088, + "step": 14070 + }, + { + "epoch": 21.17, + "grad_norm": 4.858761787414551, + "learning_rate": 7.882706766917294e-06, + "loss": 0.3258, + "step": 14080 + }, + { + "epoch": 21.19, + "grad_norm": 6.120957374572754, + "learning_rate": 7.881203007518798e-06, + "loss": 0.2718, + "step": 14090 + }, + { + "epoch": 21.2, + "grad_norm": 3.7284491062164307, + "learning_rate": 7.879699248120301e-06, + "loss": 0.3521, + "step": 14100 + }, + { + "epoch": 21.22, + "grad_norm": 6.68389892578125, + "learning_rate": 7.878195488721805e-06, + "loss": 0.345, + "step": 14110 + }, + { + "epoch": 21.23, + "grad_norm": 6.401340484619141, + "learning_rate": 7.876691729323309e-06, + "loss": 0.302, + "step": 14120 + }, + { + "epoch": 21.25, + "grad_norm": 3.490999698638916, + "learning_rate": 7.875187969924812e-06, + "loss": 0.3658, + "step": 14130 + }, + { + "epoch": 21.26, + "grad_norm": 7.066733360290527, + "learning_rate": 7.873684210526317e-06, + "loss": 0.3089, + "step": 14140 + }, + { + "epoch": 21.28, + "grad_norm": 5.916280269622803, + "learning_rate": 7.872180451127819e-06, + "loss": 0.2996, + "step": 14150 + }, + { + "epoch": 21.29, + "grad_norm": 5.348523139953613, + "learning_rate": 7.870676691729324e-06, + "loss": 0.3389, + "step": 14160 + }, + { + "epoch": 21.31, + "grad_norm": 5.067401885986328, + "learning_rate": 7.869172932330828e-06, + "loss": 0.334, + "step": 14170 + }, + { + "epoch": 21.32, + "grad_norm": 4.67525053024292, + "learning_rate": 7.867669172932331e-06, + "loss": 0.3098, + "step": 14180 + }, + { + "epoch": 21.34, + "grad_norm": 7.282690525054932, + "learning_rate": 7.866165413533835e-06, + "loss": 0.3779, + "step": 14190 + }, + { + "epoch": 21.35, + "grad_norm": 6.01992654800415, + "learning_rate": 7.86466165413534e-06, + "loss": 0.2547, + "step": 14200 + }, + { + "epoch": 21.37, + "grad_norm": 3.902848720550537, + "learning_rate": 7.863157894736842e-06, + "loss": 0.2964, + "step": 14210 + }, + { + "epoch": 21.38, + "grad_norm": 5.350156784057617, + "learning_rate": 7.861654135338347e-06, + "loss": 0.325, + "step": 14220 + }, + { + "epoch": 21.4, + "grad_norm": 6.831077575683594, + "learning_rate": 7.86015037593985e-06, + "loss": 0.3629, + "step": 14230 + }, + { + "epoch": 21.41, + "grad_norm": 6.829043865203857, + "learning_rate": 7.858646616541354e-06, + "loss": 0.364, + "step": 14240 + }, + { + "epoch": 21.43, + "grad_norm": 4.683376789093018, + "learning_rate": 7.857142857142858e-06, + "loss": 0.3197, + "step": 14250 + }, + { + "epoch": 21.44, + "grad_norm": 5.582406520843506, + "learning_rate": 7.855639097744361e-06, + "loss": 0.2911, + "step": 14260 + }, + { + "epoch": 21.46, + "grad_norm": 3.7415030002593994, + "learning_rate": 7.854135338345865e-06, + "loss": 0.321, + "step": 14270 + }, + { + "epoch": 21.47, + "grad_norm": 5.881400108337402, + "learning_rate": 7.85263157894737e-06, + "loss": 0.2524, + "step": 14280 + }, + { + "epoch": 21.49, + "grad_norm": 8.578994750976562, + "learning_rate": 7.851127819548873e-06, + "loss": 0.2918, + "step": 14290 + }, + { + "epoch": 21.5, + "grad_norm": 8.9381742477417, + "learning_rate": 7.849624060150377e-06, + "loss": 0.4393, + "step": 14300 + }, + { + "epoch": 21.52, + "grad_norm": 4.9871602058410645, + "learning_rate": 7.84812030075188e-06, + "loss": 0.3624, + "step": 14310 + }, + { + "epoch": 21.53, + "grad_norm": 6.881855487823486, + "learning_rate": 7.846616541353384e-06, + "loss": 0.3561, + "step": 14320 + }, + { + "epoch": 21.55, + "grad_norm": 4.248598575592041, + "learning_rate": 7.845112781954888e-06, + "loss": 0.2925, + "step": 14330 + }, + { + "epoch": 21.56, + "grad_norm": 3.864591121673584, + "learning_rate": 7.843609022556393e-06, + "loss": 0.327, + "step": 14340 + }, + { + "epoch": 21.58, + "grad_norm": 6.71173095703125, + "learning_rate": 7.842105263157895e-06, + "loss": 0.3397, + "step": 14350 + }, + { + "epoch": 21.59, + "grad_norm": 3.9306325912475586, + "learning_rate": 7.8406015037594e-06, + "loss": 0.2788, + "step": 14360 + }, + { + "epoch": 21.61, + "grad_norm": 5.838838577270508, + "learning_rate": 7.839097744360903e-06, + "loss": 0.3116, + "step": 14370 + }, + { + "epoch": 21.62, + "grad_norm": 13.80683708190918, + "learning_rate": 7.837593984962407e-06, + "loss": 0.2882, + "step": 14380 + }, + { + "epoch": 21.64, + "grad_norm": 5.687681198120117, + "learning_rate": 7.83609022556391e-06, + "loss": 0.3521, + "step": 14390 + }, + { + "epoch": 21.65, + "grad_norm": 6.886200904846191, + "learning_rate": 7.834586466165414e-06, + "loss": 0.3544, + "step": 14400 + }, + { + "epoch": 21.67, + "grad_norm": 7.935529708862305, + "learning_rate": 7.833082706766917e-06, + "loss": 0.3214, + "step": 14410 + }, + { + "epoch": 21.68, + "grad_norm": 8.129573822021484, + "learning_rate": 7.831578947368421e-06, + "loss": 0.3341, + "step": 14420 + }, + { + "epoch": 21.7, + "grad_norm": 6.135876178741455, + "learning_rate": 7.830075187969926e-06, + "loss": 0.2829, + "step": 14430 + }, + { + "epoch": 21.71, + "grad_norm": 8.596725463867188, + "learning_rate": 7.828571428571428e-06, + "loss": 0.3856, + "step": 14440 + }, + { + "epoch": 21.73, + "grad_norm": 3.492475748062134, + "learning_rate": 7.827067669172933e-06, + "loss": 0.4355, + "step": 14450 + }, + { + "epoch": 21.74, + "grad_norm": 8.090415000915527, + "learning_rate": 7.825563909774437e-06, + "loss": 0.2981, + "step": 14460 + }, + { + "epoch": 21.76, + "grad_norm": 6.346333026885986, + "learning_rate": 7.82406015037594e-06, + "loss": 0.3715, + "step": 14470 + }, + { + "epoch": 21.77, + "grad_norm": 4.496158123016357, + "learning_rate": 7.822556390977444e-06, + "loss": 0.3275, + "step": 14480 + }, + { + "epoch": 21.79, + "grad_norm": 5.410872459411621, + "learning_rate": 7.821052631578949e-06, + "loss": 0.3401, + "step": 14490 + }, + { + "epoch": 21.8, + "grad_norm": 6.038545608520508, + "learning_rate": 7.81954887218045e-06, + "loss": 0.374, + "step": 14500 + }, + { + "epoch": 21.82, + "grad_norm": 6.986004829406738, + "learning_rate": 7.818045112781956e-06, + "loss": 0.3294, + "step": 14510 + }, + { + "epoch": 21.83, + "grad_norm": 6.893804550170898, + "learning_rate": 7.81654135338346e-06, + "loss": 0.3189, + "step": 14520 + }, + { + "epoch": 21.85, + "grad_norm": 3.2566752433776855, + "learning_rate": 7.815037593984963e-06, + "loss": 0.3837, + "step": 14530 + }, + { + "epoch": 21.86, + "grad_norm": 18.923295974731445, + "learning_rate": 7.813533834586466e-06, + "loss": 0.3319, + "step": 14540 + }, + { + "epoch": 21.88, + "grad_norm": 4.527941703796387, + "learning_rate": 7.81203007518797e-06, + "loss": 0.2917, + "step": 14550 + }, + { + "epoch": 21.89, + "grad_norm": 4.466981410980225, + "learning_rate": 7.810526315789474e-06, + "loss": 0.3239, + "step": 14560 + }, + { + "epoch": 21.91, + "grad_norm": 7.884292125701904, + "learning_rate": 7.809022556390979e-06, + "loss": 0.3068, + "step": 14570 + }, + { + "epoch": 21.92, + "grad_norm": 5.265153408050537, + "learning_rate": 7.807518796992482e-06, + "loss": 0.3472, + "step": 14580 + }, + { + "epoch": 21.94, + "grad_norm": 5.18987512588501, + "learning_rate": 7.806015037593986e-06, + "loss": 0.3452, + "step": 14590 + }, + { + "epoch": 21.95, + "grad_norm": 2.9694736003875732, + "learning_rate": 7.80451127819549e-06, + "loss": 0.3544, + "step": 14600 + }, + { + "epoch": 21.97, + "grad_norm": 5.018842697143555, + "learning_rate": 7.803007518796993e-06, + "loss": 0.2905, + "step": 14610 + }, + { + "epoch": 21.98, + "grad_norm": 7.495065212249756, + "learning_rate": 7.801503759398496e-06, + "loss": 0.3675, + "step": 14620 + }, + { + "epoch": 22.0, + "grad_norm": 4.347672939300537, + "learning_rate": 7.800000000000002e-06, + "loss": 0.3329, + "step": 14630 + }, + { + "epoch": 22.0, + "eval_accuracy": 0.9333, + "eval_loss": 0.272014319896698, + "eval_runtime": 84.1987, + "eval_samples_per_second": 118.767, + "eval_steps_per_second": 0.475, + "step": 14630 + }, + { + "epoch": 22.02, + "grad_norm": 6.415873050689697, + "learning_rate": 7.798496240601503e-06, + "loss": 0.3666, + "step": 14640 + }, + { + "epoch": 22.03, + "grad_norm": 3.398313522338867, + "learning_rate": 7.796992481203009e-06, + "loss": 0.2848, + "step": 14650 + }, + { + "epoch": 22.05, + "grad_norm": 6.848686218261719, + "learning_rate": 7.795488721804512e-06, + "loss": 0.3241, + "step": 14660 + }, + { + "epoch": 22.06, + "grad_norm": 4.208133697509766, + "learning_rate": 7.793984962406016e-06, + "loss": 0.3478, + "step": 14670 + }, + { + "epoch": 22.08, + "grad_norm": 7.1864213943481445, + "learning_rate": 7.792481203007519e-06, + "loss": 0.3515, + "step": 14680 + }, + { + "epoch": 22.09, + "grad_norm": 6.161532878875732, + "learning_rate": 7.790977443609024e-06, + "loss": 0.3254, + "step": 14690 + }, + { + "epoch": 22.11, + "grad_norm": 5.770120143890381, + "learning_rate": 7.789473684210526e-06, + "loss": 0.3253, + "step": 14700 + }, + { + "epoch": 22.12, + "grad_norm": 6.7618021965026855, + "learning_rate": 7.787969924812031e-06, + "loss": 0.2816, + "step": 14710 + }, + { + "epoch": 22.14, + "grad_norm": 8.137096405029297, + "learning_rate": 7.786466165413535e-06, + "loss": 0.3144, + "step": 14720 + }, + { + "epoch": 22.15, + "grad_norm": 10.209376335144043, + "learning_rate": 7.784962406015038e-06, + "loss": 0.3009, + "step": 14730 + }, + { + "epoch": 22.17, + "grad_norm": 4.7329792976379395, + "learning_rate": 7.783458646616542e-06, + "loss": 0.2923, + "step": 14740 + }, + { + "epoch": 22.18, + "grad_norm": 7.629988670349121, + "learning_rate": 7.781954887218045e-06, + "loss": 0.2598, + "step": 14750 + }, + { + "epoch": 22.2, + "grad_norm": 3.4664206504821777, + "learning_rate": 7.780451127819549e-06, + "loss": 0.2701, + "step": 14760 + }, + { + "epoch": 22.21, + "grad_norm": 7.403353691101074, + "learning_rate": 7.778947368421054e-06, + "loss": 0.2668, + "step": 14770 + }, + { + "epoch": 22.23, + "grad_norm": 7.961404800415039, + "learning_rate": 7.777443609022558e-06, + "loss": 0.2531, + "step": 14780 + }, + { + "epoch": 22.24, + "grad_norm": 6.001272201538086, + "learning_rate": 7.775939849624061e-06, + "loss": 0.3847, + "step": 14790 + }, + { + "epoch": 22.26, + "grad_norm": 9.084465026855469, + "learning_rate": 7.774436090225565e-06, + "loss": 0.2646, + "step": 14800 + }, + { + "epoch": 22.27, + "grad_norm": 7.2393012046813965, + "learning_rate": 7.772932330827068e-06, + "loss": 0.3203, + "step": 14810 + }, + { + "epoch": 22.29, + "grad_norm": 8.683479309082031, + "learning_rate": 7.771428571428572e-06, + "loss": 0.285, + "step": 14820 + }, + { + "epoch": 22.3, + "grad_norm": 5.802537441253662, + "learning_rate": 7.769924812030077e-06, + "loss": 0.3638, + "step": 14830 + }, + { + "epoch": 22.32, + "grad_norm": 2.3165462017059326, + "learning_rate": 7.768421052631579e-06, + "loss": 0.3051, + "step": 14840 + }, + { + "epoch": 22.33, + "grad_norm": 7.184248924255371, + "learning_rate": 7.766917293233084e-06, + "loss": 0.3008, + "step": 14850 + }, + { + "epoch": 22.35, + "grad_norm": 4.384416580200195, + "learning_rate": 7.765413533834588e-06, + "loss": 0.3827, + "step": 14860 + }, + { + "epoch": 22.36, + "grad_norm": 4.507965087890625, + "learning_rate": 7.763909774436091e-06, + "loss": 0.2811, + "step": 14870 + }, + { + "epoch": 22.38, + "grad_norm": 6.374339580535889, + "learning_rate": 7.762406015037595e-06, + "loss": 0.2874, + "step": 14880 + }, + { + "epoch": 22.39, + "grad_norm": 6.437895774841309, + "learning_rate": 7.760902255639098e-06, + "loss": 0.3101, + "step": 14890 + }, + { + "epoch": 22.41, + "grad_norm": 7.283292770385742, + "learning_rate": 7.759398496240602e-06, + "loss": 0.3169, + "step": 14900 + }, + { + "epoch": 22.42, + "grad_norm": 7.374083042144775, + "learning_rate": 7.757894736842105e-06, + "loss": 0.334, + "step": 14910 + }, + { + "epoch": 22.44, + "grad_norm": 8.147022247314453, + "learning_rate": 7.75639097744361e-06, + "loss": 0.382, + "step": 14920 + }, + { + "epoch": 22.45, + "grad_norm": 4.416728496551514, + "learning_rate": 7.754887218045112e-06, + "loss": 0.329, + "step": 14930 + }, + { + "epoch": 22.47, + "grad_norm": 4.376534938812256, + "learning_rate": 7.753383458646617e-06, + "loss": 0.4032, + "step": 14940 + }, + { + "epoch": 22.48, + "grad_norm": 9.247624397277832, + "learning_rate": 7.751879699248121e-06, + "loss": 0.3748, + "step": 14950 + }, + { + "epoch": 22.5, + "grad_norm": 6.153301239013672, + "learning_rate": 7.750375939849624e-06, + "loss": 0.295, + "step": 14960 + }, + { + "epoch": 22.51, + "grad_norm": 9.856022834777832, + "learning_rate": 7.748872180451128e-06, + "loss": 0.3033, + "step": 14970 + }, + { + "epoch": 22.53, + "grad_norm": 6.767205715179443, + "learning_rate": 7.747368421052631e-06, + "loss": 0.3249, + "step": 14980 + }, + { + "epoch": 22.54, + "grad_norm": 7.620680809020996, + "learning_rate": 7.745864661654135e-06, + "loss": 0.3775, + "step": 14990 + }, + { + "epoch": 22.56, + "grad_norm": 4.250925064086914, + "learning_rate": 7.74436090225564e-06, + "loss": 0.3464, + "step": 15000 + }, + { + "epoch": 22.57, + "grad_norm": 5.965568542480469, + "learning_rate": 7.742857142857144e-06, + "loss": 0.287, + "step": 15010 + }, + { + "epoch": 22.59, + "grad_norm": 7.024303913116455, + "learning_rate": 7.741353383458647e-06, + "loss": 0.3145, + "step": 15020 + }, + { + "epoch": 22.6, + "grad_norm": 5.691739559173584, + "learning_rate": 7.73984962406015e-06, + "loss": 0.2593, + "step": 15030 + }, + { + "epoch": 22.62, + "grad_norm": 4.950546741485596, + "learning_rate": 7.738345864661654e-06, + "loss": 0.2312, + "step": 15040 + }, + { + "epoch": 22.63, + "grad_norm": 8.352819442749023, + "learning_rate": 7.736842105263158e-06, + "loss": 0.3138, + "step": 15050 + }, + { + "epoch": 22.65, + "grad_norm": 7.059927940368652, + "learning_rate": 7.735338345864663e-06, + "loss": 0.3281, + "step": 15060 + }, + { + "epoch": 22.66, + "grad_norm": 8.698114395141602, + "learning_rate": 7.733834586466165e-06, + "loss": 0.3882, + "step": 15070 + }, + { + "epoch": 22.68, + "grad_norm": 9.284893989562988, + "learning_rate": 7.73233082706767e-06, + "loss": 0.3517, + "step": 15080 + }, + { + "epoch": 22.69, + "grad_norm": 10.998963356018066, + "learning_rate": 7.730827067669174e-06, + "loss": 0.3258, + "step": 15090 + }, + { + "epoch": 22.71, + "grad_norm": 6.2202277183532715, + "learning_rate": 7.729323308270677e-06, + "loss": 0.3219, + "step": 15100 + }, + { + "epoch": 22.72, + "grad_norm": 5.044315814971924, + "learning_rate": 7.72781954887218e-06, + "loss": 0.3523, + "step": 15110 + }, + { + "epoch": 22.74, + "grad_norm": 7.376669883728027, + "learning_rate": 7.726315789473686e-06, + "loss": 0.3644, + "step": 15120 + }, + { + "epoch": 22.75, + "grad_norm": 4.171261787414551, + "learning_rate": 7.724812030075188e-06, + "loss": 0.356, + "step": 15130 + }, + { + "epoch": 22.77, + "grad_norm": 4.358337879180908, + "learning_rate": 7.723308270676693e-06, + "loss": 0.3316, + "step": 15140 + }, + { + "epoch": 22.78, + "grad_norm": 7.347938060760498, + "learning_rate": 7.721804511278196e-06, + "loss": 0.3134, + "step": 15150 + }, + { + "epoch": 22.8, + "grad_norm": 6.51662540435791, + "learning_rate": 7.7203007518797e-06, + "loss": 0.3304, + "step": 15160 + }, + { + "epoch": 22.81, + "grad_norm": 4.554670810699463, + "learning_rate": 7.718796992481203e-06, + "loss": 0.3568, + "step": 15170 + }, + { + "epoch": 22.83, + "grad_norm": 5.587891101837158, + "learning_rate": 7.717293233082707e-06, + "loss": 0.2347, + "step": 15180 + }, + { + "epoch": 22.84, + "grad_norm": 4.167472839355469, + "learning_rate": 7.71578947368421e-06, + "loss": 0.2926, + "step": 15190 + }, + { + "epoch": 22.86, + "grad_norm": 7.967960834503174, + "learning_rate": 7.714285714285716e-06, + "loss": 0.3413, + "step": 15200 + }, + { + "epoch": 22.87, + "grad_norm": 4.611883640289307, + "learning_rate": 7.71278195488722e-06, + "loss": 0.328, + "step": 15210 + }, + { + "epoch": 22.89, + "grad_norm": 6.755552291870117, + "learning_rate": 7.711278195488723e-06, + "loss": 0.3418, + "step": 15220 + }, + { + "epoch": 22.9, + "grad_norm": 7.961472034454346, + "learning_rate": 7.709774436090226e-06, + "loss": 0.3717, + "step": 15230 + }, + { + "epoch": 22.92, + "grad_norm": 6.6936187744140625, + "learning_rate": 7.70827067669173e-06, + "loss": 0.318, + "step": 15240 + }, + { + "epoch": 22.93, + "grad_norm": 5.8512444496154785, + "learning_rate": 7.706766917293233e-06, + "loss": 0.3481, + "step": 15250 + }, + { + "epoch": 22.95, + "grad_norm": 6.354591369628906, + "learning_rate": 7.705263157894738e-06, + "loss": 0.3909, + "step": 15260 + }, + { + "epoch": 22.96, + "grad_norm": 5.42380428314209, + "learning_rate": 7.70375939849624e-06, + "loss": 0.34, + "step": 15270 + }, + { + "epoch": 22.98, + "grad_norm": 9.55280876159668, + "learning_rate": 7.702255639097746e-06, + "loss": 0.3557, + "step": 15280 + }, + { + "epoch": 22.99, + "grad_norm": 5.095829963684082, + "learning_rate": 7.700751879699249e-06, + "loss": 0.4089, + "step": 15290 + }, + { + "epoch": 23.0, + "eval_accuracy": 0.9325, + "eval_loss": 0.27643856406211853, + "eval_runtime": 84.9045, + "eval_samples_per_second": 117.779, + "eval_steps_per_second": 0.471, + "step": 15295 + }, + { + "epoch": 23.01, + "grad_norm": 5.0213518142700195, + "learning_rate": 7.699248120300753e-06, + "loss": 0.3049, + "step": 15300 + }, + { + "epoch": 23.02, + "grad_norm": 7.806211948394775, + "learning_rate": 7.697744360902256e-06, + "loss": 0.3641, + "step": 15310 + }, + { + "epoch": 23.04, + "grad_norm": 3.328399896621704, + "learning_rate": 7.696240601503761e-06, + "loss": 0.3035, + "step": 15320 + }, + { + "epoch": 23.05, + "grad_norm": 4.0139875411987305, + "learning_rate": 7.694736842105263e-06, + "loss": 0.3657, + "step": 15330 + }, + { + "epoch": 23.07, + "grad_norm": 5.296818256378174, + "learning_rate": 7.693233082706768e-06, + "loss": 0.3067, + "step": 15340 + }, + { + "epoch": 23.08, + "grad_norm": 6.957002639770508, + "learning_rate": 7.691729323308272e-06, + "loss": 0.3746, + "step": 15350 + }, + { + "epoch": 23.1, + "grad_norm": 5.104499816894531, + "learning_rate": 7.690225563909775e-06, + "loss": 0.3475, + "step": 15360 + }, + { + "epoch": 23.11, + "grad_norm": 4.252979755401611, + "learning_rate": 7.688721804511279e-06, + "loss": 0.3913, + "step": 15370 + }, + { + "epoch": 23.13, + "grad_norm": 6.049491882324219, + "learning_rate": 7.687218045112782e-06, + "loss": 0.2903, + "step": 15380 + }, + { + "epoch": 23.14, + "grad_norm": 5.293207168579102, + "learning_rate": 7.685714285714286e-06, + "loss": 0.2508, + "step": 15390 + }, + { + "epoch": 23.16, + "grad_norm": 5.338700771331787, + "learning_rate": 7.68421052631579e-06, + "loss": 0.2695, + "step": 15400 + }, + { + "epoch": 23.17, + "grad_norm": 3.670703172683716, + "learning_rate": 7.682706766917295e-06, + "loss": 0.3534, + "step": 15410 + }, + { + "epoch": 23.19, + "grad_norm": 5.252980709075928, + "learning_rate": 7.681203007518796e-06, + "loss": 0.3555, + "step": 15420 + }, + { + "epoch": 23.2, + "grad_norm": 8.209909439086914, + "learning_rate": 7.679699248120302e-06, + "loss": 0.3796, + "step": 15430 + }, + { + "epoch": 23.22, + "grad_norm": 4.704070091247559, + "learning_rate": 7.678195488721805e-06, + "loss": 0.3024, + "step": 15440 + }, + { + "epoch": 23.23, + "grad_norm": 5.453511714935303, + "learning_rate": 7.676691729323309e-06, + "loss": 0.2894, + "step": 15450 + }, + { + "epoch": 23.25, + "grad_norm": 7.019674777984619, + "learning_rate": 7.675187969924812e-06, + "loss": 0.3011, + "step": 15460 + }, + { + "epoch": 23.26, + "grad_norm": 16.94940185546875, + "learning_rate": 7.673684210526316e-06, + "loss": 0.3526, + "step": 15470 + }, + { + "epoch": 23.28, + "grad_norm": 5.086287498474121, + "learning_rate": 7.67218045112782e-06, + "loss": 0.3219, + "step": 15480 + }, + { + "epoch": 23.29, + "grad_norm": 3.3719613552093506, + "learning_rate": 7.670676691729325e-06, + "loss": 0.2706, + "step": 15490 + }, + { + "epoch": 23.31, + "grad_norm": 6.172971248626709, + "learning_rate": 7.669172932330828e-06, + "loss": 0.2633, + "step": 15500 + }, + { + "epoch": 23.32, + "grad_norm": 6.304495811462402, + "learning_rate": 7.667669172932332e-06, + "loss": 0.3932, + "step": 15510 + }, + { + "epoch": 23.34, + "grad_norm": 7.454652309417725, + "learning_rate": 7.666165413533835e-06, + "loss": 0.3278, + "step": 15520 + }, + { + "epoch": 23.35, + "grad_norm": 4.993924617767334, + "learning_rate": 7.664661654135339e-06, + "loss": 0.3158, + "step": 15530 + }, + { + "epoch": 23.37, + "grad_norm": 8.269258499145508, + "learning_rate": 7.663157894736842e-06, + "loss": 0.2965, + "step": 15540 + }, + { + "epoch": 23.38, + "grad_norm": 9.472188949584961, + "learning_rate": 7.661654135338347e-06, + "loss": 0.3321, + "step": 15550 + }, + { + "epoch": 23.4, + "grad_norm": 4.879781246185303, + "learning_rate": 7.66015037593985e-06, + "loss": 0.3012, + "step": 15560 + }, + { + "epoch": 23.41, + "grad_norm": 3.718254566192627, + "learning_rate": 7.658646616541354e-06, + "loss": 0.3498, + "step": 15570 + }, + { + "epoch": 23.43, + "grad_norm": 3.531419038772583, + "learning_rate": 7.657142857142858e-06, + "loss": 0.2622, + "step": 15580 + }, + { + "epoch": 23.44, + "grad_norm": 4.4930949211120605, + "learning_rate": 7.655639097744361e-06, + "loss": 0.2778, + "step": 15590 + }, + { + "epoch": 23.46, + "grad_norm": 4.008451461791992, + "learning_rate": 7.654135338345865e-06, + "loss": 0.3077, + "step": 15600 + }, + { + "epoch": 23.47, + "grad_norm": 6.081947326660156, + "learning_rate": 7.65263157894737e-06, + "loss": 0.2566, + "step": 15610 + }, + { + "epoch": 23.49, + "grad_norm": 3.446821689605713, + "learning_rate": 7.651127819548872e-06, + "loss": 0.3096, + "step": 15620 + }, + { + "epoch": 23.5, + "grad_norm": 10.16897201538086, + "learning_rate": 7.649624060150377e-06, + "loss": 0.3254, + "step": 15630 + }, + { + "epoch": 23.52, + "grad_norm": 1.691789150238037, + "learning_rate": 7.64812030075188e-06, + "loss": 0.3094, + "step": 15640 + }, + { + "epoch": 23.53, + "grad_norm": 4.911680698394775, + "learning_rate": 7.646616541353384e-06, + "loss": 0.2983, + "step": 15650 + }, + { + "epoch": 23.55, + "grad_norm": 6.379064559936523, + "learning_rate": 7.645112781954888e-06, + "loss": 0.2172, + "step": 15660 + }, + { + "epoch": 23.56, + "grad_norm": 4.125355243682861, + "learning_rate": 7.643609022556391e-06, + "loss": 0.2838, + "step": 15670 + }, + { + "epoch": 23.58, + "grad_norm": 6.153583526611328, + "learning_rate": 7.642105263157895e-06, + "loss": 0.3016, + "step": 15680 + }, + { + "epoch": 23.59, + "grad_norm": 7.581343173980713, + "learning_rate": 7.6406015037594e-06, + "loss": 0.3684, + "step": 15690 + }, + { + "epoch": 23.61, + "grad_norm": 8.388538360595703, + "learning_rate": 7.639097744360904e-06, + "loss": 0.3272, + "step": 15700 + }, + { + "epoch": 23.62, + "grad_norm": 7.5239362716674805, + "learning_rate": 7.637593984962407e-06, + "loss": 0.3148, + "step": 15710 + }, + { + "epoch": 23.64, + "grad_norm": 7.033330917358398, + "learning_rate": 7.63609022556391e-06, + "loss": 0.3008, + "step": 15720 + }, + { + "epoch": 23.65, + "grad_norm": 5.851361274719238, + "learning_rate": 7.634586466165414e-06, + "loss": 0.3205, + "step": 15730 + }, + { + "epoch": 23.67, + "grad_norm": 2.9884681701660156, + "learning_rate": 7.633082706766918e-06, + "loss": 0.2622, + "step": 15740 + }, + { + "epoch": 23.68, + "grad_norm": 7.287815570831299, + "learning_rate": 7.631578947368423e-06, + "loss": 0.3984, + "step": 15750 + }, + { + "epoch": 23.7, + "grad_norm": 4.209038734436035, + "learning_rate": 7.630075187969925e-06, + "loss": 0.2849, + "step": 15760 + }, + { + "epoch": 23.71, + "grad_norm": 3.485328197479248, + "learning_rate": 7.62857142857143e-06, + "loss": 0.3395, + "step": 15770 + }, + { + "epoch": 23.73, + "grad_norm": 4.945652484893799, + "learning_rate": 7.6270676691729325e-06, + "loss": 0.3045, + "step": 15780 + }, + { + "epoch": 23.74, + "grad_norm": 9.309234619140625, + "learning_rate": 7.625563909774437e-06, + "loss": 0.3023, + "step": 15790 + }, + { + "epoch": 23.76, + "grad_norm": 4.634711265563965, + "learning_rate": 7.62406015037594e-06, + "loss": 0.371, + "step": 15800 + }, + { + "epoch": 23.77, + "grad_norm": 5.632133483886719, + "learning_rate": 7.622556390977445e-06, + "loss": 0.3813, + "step": 15810 + }, + { + "epoch": 23.79, + "grad_norm": 3.4830102920532227, + "learning_rate": 7.621052631578948e-06, + "loss": 0.2663, + "step": 15820 + }, + { + "epoch": 23.8, + "grad_norm": 4.266145706176758, + "learning_rate": 7.619548872180453e-06, + "loss": 0.3295, + "step": 15830 + }, + { + "epoch": 23.82, + "grad_norm": 6.466432571411133, + "learning_rate": 7.618045112781955e-06, + "loss": 0.3109, + "step": 15840 + }, + { + "epoch": 23.83, + "grad_norm": 7.617573261260986, + "learning_rate": 7.61654135338346e-06, + "loss": 0.3228, + "step": 15850 + }, + { + "epoch": 23.85, + "grad_norm": 6.581226825714111, + "learning_rate": 7.615037593984963e-06, + "loss": 0.293, + "step": 15860 + }, + { + "epoch": 23.86, + "grad_norm": 3.765594720840454, + "learning_rate": 7.6135338345864676e-06, + "loss": 0.2756, + "step": 15870 + }, + { + "epoch": 23.88, + "grad_norm": 5.244023323059082, + "learning_rate": 7.61203007518797e-06, + "loss": 0.3237, + "step": 15880 + }, + { + "epoch": 23.89, + "grad_norm": 4.341951370239258, + "learning_rate": 7.610526315789474e-06, + "loss": 0.3115, + "step": 15890 + }, + { + "epoch": 23.91, + "grad_norm": 7.612969875335693, + "learning_rate": 7.609022556390978e-06, + "loss": 0.3097, + "step": 15900 + }, + { + "epoch": 23.92, + "grad_norm": 4.730962753295898, + "learning_rate": 7.607518796992482e-06, + "loss": 0.381, + "step": 15910 + }, + { + "epoch": 23.94, + "grad_norm": 6.929048538208008, + "learning_rate": 7.606015037593986e-06, + "loss": 0.4364, + "step": 15920 + }, + { + "epoch": 23.95, + "grad_norm": 8.269315719604492, + "learning_rate": 7.604511278195489e-06, + "loss": 0.3312, + "step": 15930 + }, + { + "epoch": 23.97, + "grad_norm": 5.69378662109375, + "learning_rate": 7.603007518796993e-06, + "loss": 0.3105, + "step": 15940 + }, + { + "epoch": 23.98, + "grad_norm": 2.1282405853271484, + "learning_rate": 7.6015037593984966e-06, + "loss": 0.3473, + "step": 15950 + }, + { + "epoch": 24.0, + "grad_norm": 64.98310089111328, + "learning_rate": 7.600000000000001e-06, + "loss": 0.3196, + "step": 15960 + }, + { + "epoch": 24.0, + "eval_accuracy": 0.9305, + "eval_loss": 0.27353373169898987, + "eval_runtime": 84.6965, + "eval_samples_per_second": 118.069, + "eval_steps_per_second": 0.472, + "step": 15960 + }, + { + "epoch": 24.02, + "grad_norm": 6.7349653244018555, + "learning_rate": 7.598496240601504e-06, + "loss": 0.2695, + "step": 15970 + }, + { + "epoch": 24.03, + "grad_norm": 3.1804494857788086, + "learning_rate": 7.596992481203008e-06, + "loss": 0.2994, + "step": 15980 + }, + { + "epoch": 24.05, + "grad_norm": 5.326109886169434, + "learning_rate": 7.5954887218045115e-06, + "loss": 0.3565, + "step": 15990 + }, + { + "epoch": 24.06, + "grad_norm": 5.608499526977539, + "learning_rate": 7.593984962406016e-06, + "loss": 0.2904, + "step": 16000 + }, + { + "epoch": 24.08, + "grad_norm": 3.598764181137085, + "learning_rate": 7.592481203007519e-06, + "loss": 0.2959, + "step": 16010 + }, + { + "epoch": 24.09, + "grad_norm": 11.522496223449707, + "learning_rate": 7.590977443609024e-06, + "loss": 0.2509, + "step": 16020 + }, + { + "epoch": 24.11, + "grad_norm": 5.142250061035156, + "learning_rate": 7.589473684210526e-06, + "loss": 0.2768, + "step": 16030 + }, + { + "epoch": 24.12, + "grad_norm": 4.173882484436035, + "learning_rate": 7.587969924812031e-06, + "loss": 0.3249, + "step": 16040 + }, + { + "epoch": 24.14, + "grad_norm": 5.356581211090088, + "learning_rate": 7.586466165413534e-06, + "loss": 0.342, + "step": 16050 + }, + { + "epoch": 24.15, + "grad_norm": 5.133426666259766, + "learning_rate": 7.584962406015039e-06, + "loss": 0.372, + "step": 16060 + }, + { + "epoch": 24.17, + "grad_norm": 6.920117378234863, + "learning_rate": 7.583458646616541e-06, + "loss": 0.2469, + "step": 16070 + }, + { + "epoch": 24.18, + "grad_norm": 5.305706024169922, + "learning_rate": 7.581954887218046e-06, + "loss": 0.3355, + "step": 16080 + }, + { + "epoch": 24.2, + "grad_norm": 5.647949695587158, + "learning_rate": 7.580451127819549e-06, + "loss": 0.3394, + "step": 16090 + }, + { + "epoch": 24.21, + "grad_norm": 8.31951904296875, + "learning_rate": 7.578947368421054e-06, + "loss": 0.2804, + "step": 16100 + }, + { + "epoch": 24.23, + "grad_norm": 7.295516490936279, + "learning_rate": 7.577443609022557e-06, + "loss": 0.2522, + "step": 16110 + }, + { + "epoch": 24.24, + "grad_norm": 4.9929022789001465, + "learning_rate": 7.575939849624061e-06, + "loss": 0.2747, + "step": 16120 + }, + { + "epoch": 24.26, + "grad_norm": 4.699337482452393, + "learning_rate": 7.574436090225564e-06, + "loss": 0.2968, + "step": 16130 + }, + { + "epoch": 24.27, + "grad_norm": 8.267194747924805, + "learning_rate": 7.5729323308270685e-06, + "loss": 0.396, + "step": 16140 + }, + { + "epoch": 24.29, + "grad_norm": 5.007888317108154, + "learning_rate": 7.571428571428572e-06, + "loss": 0.2286, + "step": 16150 + }, + { + "epoch": 24.3, + "grad_norm": 4.469249725341797, + "learning_rate": 7.569924812030076e-06, + "loss": 0.2907, + "step": 16160 + }, + { + "epoch": 24.32, + "grad_norm": 5.973073482513428, + "learning_rate": 7.568421052631579e-06, + "loss": 0.2583, + "step": 16170 + }, + { + "epoch": 24.33, + "grad_norm": 7.025624752044678, + "learning_rate": 7.5669172932330834e-06, + "loss": 0.332, + "step": 16180 + }, + { + "epoch": 24.35, + "grad_norm": 6.508031368255615, + "learning_rate": 7.565413533834587e-06, + "loss": 0.3216, + "step": 16190 + }, + { + "epoch": 24.36, + "grad_norm": 5.830172538757324, + "learning_rate": 7.563909774436091e-06, + "loss": 0.2935, + "step": 16200 + }, + { + "epoch": 24.38, + "grad_norm": 8.18553638458252, + "learning_rate": 7.562406015037595e-06, + "loss": 0.309, + "step": 16210 + }, + { + "epoch": 24.39, + "grad_norm": 19.962448120117188, + "learning_rate": 7.560902255639098e-06, + "loss": 0.3435, + "step": 16220 + }, + { + "epoch": 24.41, + "grad_norm": 4.706577301025391, + "learning_rate": 7.559398496240602e-06, + "loss": 0.3436, + "step": 16230 + }, + { + "epoch": 24.42, + "grad_norm": 5.416121959686279, + "learning_rate": 7.557894736842106e-06, + "loss": 0.3253, + "step": 16240 + }, + { + "epoch": 24.44, + "grad_norm": 7.063791275024414, + "learning_rate": 7.55639097744361e-06, + "loss": 0.3762, + "step": 16250 + }, + { + "epoch": 24.45, + "grad_norm": 4.680507659912109, + "learning_rate": 7.554887218045114e-06, + "loss": 0.2883, + "step": 16260 + }, + { + "epoch": 24.47, + "grad_norm": 8.150341987609863, + "learning_rate": 7.553383458646617e-06, + "loss": 0.3714, + "step": 16270 + }, + { + "epoch": 24.48, + "grad_norm": 3.5707345008850098, + "learning_rate": 7.551879699248121e-06, + "loss": 0.2659, + "step": 16280 + }, + { + "epoch": 24.5, + "grad_norm": 7.740113735198975, + "learning_rate": 7.550375939849625e-06, + "loss": 0.3134, + "step": 16290 + }, + { + "epoch": 24.51, + "grad_norm": 28.682573318481445, + "learning_rate": 7.548872180451129e-06, + "loss": 0.3294, + "step": 16300 + }, + { + "epoch": 24.53, + "grad_norm": 5.2670793533325195, + "learning_rate": 7.547368421052632e-06, + "loss": 0.272, + "step": 16310 + }, + { + "epoch": 24.54, + "grad_norm": 6.7343339920043945, + "learning_rate": 7.545864661654136e-06, + "loss": 0.2836, + "step": 16320 + }, + { + "epoch": 24.56, + "grad_norm": 5.1457672119140625, + "learning_rate": 7.54436090225564e-06, + "loss": 0.222, + "step": 16330 + }, + { + "epoch": 24.57, + "grad_norm": 5.616130828857422, + "learning_rate": 7.542857142857144e-06, + "loss": 0.2684, + "step": 16340 + }, + { + "epoch": 24.59, + "grad_norm": 6.380970478057861, + "learning_rate": 7.5413533834586475e-06, + "loss": 0.3899, + "step": 16350 + }, + { + "epoch": 24.6, + "grad_norm": 9.467977523803711, + "learning_rate": 7.539849624060152e-06, + "loss": 0.2699, + "step": 16360 + }, + { + "epoch": 24.62, + "grad_norm": 5.1410136222839355, + "learning_rate": 7.5383458646616545e-06, + "loss": 0.2859, + "step": 16370 + }, + { + "epoch": 24.63, + "grad_norm": 7.608031749725342, + "learning_rate": 7.536842105263158e-06, + "loss": 0.342, + "step": 16380 + }, + { + "epoch": 24.65, + "grad_norm": 3.234297513961792, + "learning_rate": 7.535338345864662e-06, + "loss": 0.3229, + "step": 16390 + }, + { + "epoch": 24.66, + "grad_norm": 5.070233345031738, + "learning_rate": 7.533834586466165e-06, + "loss": 0.3494, + "step": 16400 + }, + { + "epoch": 24.68, + "grad_norm": 5.208074569702148, + "learning_rate": 7.5323308270676694e-06, + "loss": 0.3288, + "step": 16410 + }, + { + "epoch": 24.69, + "grad_norm": 3.5855026245117188, + "learning_rate": 7.530827067669173e-06, + "loss": 0.3241, + "step": 16420 + }, + { + "epoch": 24.71, + "grad_norm": 5.344216823577881, + "learning_rate": 7.529323308270677e-06, + "loss": 0.2997, + "step": 16430 + }, + { + "epoch": 24.72, + "grad_norm": 7.309630393981934, + "learning_rate": 7.527819548872181e-06, + "loss": 0.304, + "step": 16440 + }, + { + "epoch": 24.74, + "grad_norm": 4.991174697875977, + "learning_rate": 7.526315789473685e-06, + "loss": 0.3107, + "step": 16450 + }, + { + "epoch": 24.75, + "grad_norm": 6.519332408905029, + "learning_rate": 7.524812030075188e-06, + "loss": 0.2857, + "step": 16460 + }, + { + "epoch": 24.77, + "grad_norm": 6.599658012390137, + "learning_rate": 7.523308270676692e-06, + "loss": 0.3334, + "step": 16470 + }, + { + "epoch": 24.78, + "grad_norm": 6.142018795013428, + "learning_rate": 7.521804511278196e-06, + "loss": 0.3121, + "step": 16480 + }, + { + "epoch": 24.8, + "grad_norm": 5.483290195465088, + "learning_rate": 7.5203007518797e-06, + "loss": 0.3155, + "step": 16490 + }, + { + "epoch": 24.81, + "grad_norm": 10.318331718444824, + "learning_rate": 7.518796992481203e-06, + "loss": 0.3272, + "step": 16500 + }, + { + "epoch": 24.83, + "grad_norm": 4.9072771072387695, + "learning_rate": 7.517293233082707e-06, + "loss": 0.3807, + "step": 16510 + }, + { + "epoch": 24.84, + "grad_norm": 5.348799228668213, + "learning_rate": 7.515789473684211e-06, + "loss": 0.3662, + "step": 16520 + }, + { + "epoch": 24.86, + "grad_norm": 4.338939189910889, + "learning_rate": 7.514285714285715e-06, + "loss": 0.3022, + "step": 16530 + }, + { + "epoch": 24.87, + "grad_norm": 7.069118499755859, + "learning_rate": 7.512781954887219e-06, + "loss": 0.3111, + "step": 16540 + }, + { + "epoch": 24.89, + "grad_norm": 7.8967790603637695, + "learning_rate": 7.511278195488723e-06, + "loss": 0.2683, + "step": 16550 + }, + { + "epoch": 24.9, + "grad_norm": 6.323410987854004, + "learning_rate": 7.509774436090226e-06, + "loss": 0.3274, + "step": 16560 + }, + { + "epoch": 24.92, + "grad_norm": 6.77821683883667, + "learning_rate": 7.50827067669173e-06, + "loss": 0.2803, + "step": 16570 + }, + { + "epoch": 24.93, + "grad_norm": 6.189555644989014, + "learning_rate": 7.5067669172932335e-06, + "loss": 0.3104, + "step": 16580 + }, + { + "epoch": 24.95, + "grad_norm": 3.698765277862549, + "learning_rate": 7.505263157894738e-06, + "loss": 0.3238, + "step": 16590 + }, + { + "epoch": 24.96, + "grad_norm": 5.7407355308532715, + "learning_rate": 7.5037593984962405e-06, + "loss": 0.3003, + "step": 16600 + }, + { + "epoch": 24.98, + "grad_norm": 4.214737892150879, + "learning_rate": 7.502255639097745e-06, + "loss": 0.3016, + "step": 16610 + }, + { + "epoch": 24.99, + "grad_norm": 6.830847263336182, + "learning_rate": 7.5007518796992484e-06, + "loss": 0.2982, + "step": 16620 + }, + { + "epoch": 25.0, + "eval_accuracy": 0.9312, + "eval_loss": 0.27709507942199707, + "eval_runtime": 84.7229, + "eval_samples_per_second": 118.032, + "eval_steps_per_second": 0.472, + "step": 16625 + }, + { + "epoch": 25.01, + "grad_norm": 8.3620023727417, + "learning_rate": 7.499248120300753e-06, + "loss": 0.2671, + "step": 16630 + }, + { + "epoch": 25.02, + "grad_norm": 6.188384532928467, + "learning_rate": 7.497744360902256e-06, + "loss": 0.3315, + "step": 16640 + }, + { + "epoch": 25.04, + "grad_norm": 4.457509994506836, + "learning_rate": 7.496240601503761e-06, + "loss": 0.2999, + "step": 16650 + }, + { + "epoch": 25.05, + "grad_norm": 3.5126192569732666, + "learning_rate": 7.494736842105263e-06, + "loss": 0.2304, + "step": 16660 + }, + { + "epoch": 25.07, + "grad_norm": 8.772309303283691, + "learning_rate": 7.493233082706768e-06, + "loss": 0.3678, + "step": 16670 + }, + { + "epoch": 25.08, + "grad_norm": 6.1839189529418945, + "learning_rate": 7.491729323308271e-06, + "loss": 0.3029, + "step": 16680 + }, + { + "epoch": 25.1, + "grad_norm": 3.8670859336853027, + "learning_rate": 7.490225563909776e-06, + "loss": 0.2816, + "step": 16690 + }, + { + "epoch": 25.11, + "grad_norm": 6.036872863769531, + "learning_rate": 7.488721804511278e-06, + "loss": 0.3167, + "step": 16700 + }, + { + "epoch": 25.13, + "grad_norm": 5.029440879821777, + "learning_rate": 7.487218045112783e-06, + "loss": 0.3886, + "step": 16710 + }, + { + "epoch": 25.14, + "grad_norm": 6.317779541015625, + "learning_rate": 7.485714285714286e-06, + "loss": 0.3374, + "step": 16720 + }, + { + "epoch": 25.16, + "grad_norm": 4.726473808288574, + "learning_rate": 7.4842105263157905e-06, + "loss": 0.2748, + "step": 16730 + }, + { + "epoch": 25.17, + "grad_norm": 7.509809494018555, + "learning_rate": 7.482706766917294e-06, + "loss": 0.3124, + "step": 16740 + }, + { + "epoch": 25.19, + "grad_norm": 6.474162578582764, + "learning_rate": 7.481203007518798e-06, + "loss": 0.3249, + "step": 16750 + }, + { + "epoch": 25.2, + "grad_norm": 5.663277626037598, + "learning_rate": 7.479699248120301e-06, + "loss": 0.3175, + "step": 16760 + }, + { + "epoch": 25.22, + "grad_norm": 5.958733558654785, + "learning_rate": 7.4781954887218055e-06, + "loss": 0.2671, + "step": 16770 + }, + { + "epoch": 25.23, + "grad_norm": 3.8807878494262695, + "learning_rate": 7.476691729323309e-06, + "loss": 0.2347, + "step": 16780 + }, + { + "epoch": 25.25, + "grad_norm": 8.403759956359863, + "learning_rate": 7.475187969924813e-06, + "loss": 0.2834, + "step": 16790 + }, + { + "epoch": 25.26, + "grad_norm": 4.284714698791504, + "learning_rate": 7.473684210526316e-06, + "loss": 0.2763, + "step": 16800 + }, + { + "epoch": 25.28, + "grad_norm": 5.796482563018799, + "learning_rate": 7.47218045112782e-06, + "loss": 0.3044, + "step": 16810 + }, + { + "epoch": 25.29, + "grad_norm": 4.833478927612305, + "learning_rate": 7.470676691729324e-06, + "loss": 0.3536, + "step": 16820 + }, + { + "epoch": 25.31, + "grad_norm": 6.935545921325684, + "learning_rate": 7.469172932330828e-06, + "loss": 0.3231, + "step": 16830 + }, + { + "epoch": 25.32, + "grad_norm": 7.3656229972839355, + "learning_rate": 7.467669172932332e-06, + "loss": 0.2747, + "step": 16840 + }, + { + "epoch": 25.34, + "grad_norm": 6.294661998748779, + "learning_rate": 7.466165413533836e-06, + "loss": 0.3172, + "step": 16850 + }, + { + "epoch": 25.35, + "grad_norm": 5.849190711975098, + "learning_rate": 7.464661654135339e-06, + "loss": 0.307, + "step": 16860 + }, + { + "epoch": 25.37, + "grad_norm": 3.9488296508789062, + "learning_rate": 7.463157894736843e-06, + "loss": 0.2835, + "step": 16870 + }, + { + "epoch": 25.38, + "grad_norm": 5.17800760269165, + "learning_rate": 7.461654135338347e-06, + "loss": 0.3037, + "step": 16880 + }, + { + "epoch": 25.4, + "grad_norm": 5.4995436668396, + "learning_rate": 7.460150375939849e-06, + "loss": 0.3726, + "step": 16890 + }, + { + "epoch": 25.41, + "grad_norm": 5.913205623626709, + "learning_rate": 7.458646616541354e-06, + "loss": 0.244, + "step": 16900 + }, + { + "epoch": 25.43, + "grad_norm": 7.40120267868042, + "learning_rate": 7.457142857142857e-06, + "loss": 0.3658, + "step": 16910 + }, + { + "epoch": 25.44, + "grad_norm": 6.155134677886963, + "learning_rate": 7.455639097744362e-06, + "loss": 0.2988, + "step": 16920 + }, + { + "epoch": 25.46, + "grad_norm": 7.154820919036865, + "learning_rate": 7.454135338345865e-06, + "loss": 0.2828, + "step": 16930 + }, + { + "epoch": 25.47, + "grad_norm": 15.181897163391113, + "learning_rate": 7.4526315789473695e-06, + "loss": 0.2919, + "step": 16940 + }, + { + "epoch": 25.49, + "grad_norm": 15.616399765014648, + "learning_rate": 7.451127819548872e-06, + "loss": 0.2602, + "step": 16950 + }, + { + "epoch": 25.5, + "grad_norm": 3.783785104751587, + "learning_rate": 7.4496240601503765e-06, + "loss": 0.2663, + "step": 16960 + }, + { + "epoch": 25.52, + "grad_norm": 3.7820167541503906, + "learning_rate": 7.44812030075188e-06, + "loss": 0.3069, + "step": 16970 + }, + { + "epoch": 25.53, + "grad_norm": 22.630653381347656, + "learning_rate": 7.4466165413533844e-06, + "loss": 0.3701, + "step": 16980 + }, + { + "epoch": 25.55, + "grad_norm": 7.006454944610596, + "learning_rate": 7.445112781954887e-06, + "loss": 0.2762, + "step": 16990 + }, + { + "epoch": 25.56, + "grad_norm": 3.897531747817993, + "learning_rate": 7.4436090225563915e-06, + "loss": 0.2871, + "step": 17000 + }, + { + "epoch": 25.58, + "grad_norm": 9.737010955810547, + "learning_rate": 7.442105263157895e-06, + "loss": 0.3115, + "step": 17010 + }, + { + "epoch": 25.59, + "grad_norm": 9.285514831542969, + "learning_rate": 7.440601503759399e-06, + "loss": 0.3061, + "step": 17020 + }, + { + "epoch": 25.61, + "grad_norm": 6.54390811920166, + "learning_rate": 7.439097744360903e-06, + "loss": 0.4048, + "step": 17030 + }, + { + "epoch": 25.62, + "grad_norm": 7.407090663909912, + "learning_rate": 7.437593984962406e-06, + "loss": 0.335, + "step": 17040 + }, + { + "epoch": 25.64, + "grad_norm": 5.179807186126709, + "learning_rate": 7.43609022556391e-06, + "loss": 0.3313, + "step": 17050 + }, + { + "epoch": 25.65, + "grad_norm": 3.106466770172119, + "learning_rate": 7.434586466165414e-06, + "loss": 0.3702, + "step": 17060 + }, + { + "epoch": 25.67, + "grad_norm": 4.030908584594727, + "learning_rate": 7.433082706766918e-06, + "loss": 0.262, + "step": 17070 + }, + { + "epoch": 25.68, + "grad_norm": 5.041976451873779, + "learning_rate": 7.431578947368422e-06, + "loss": 0.3317, + "step": 17080 + }, + { + "epoch": 25.7, + "grad_norm": 7.29601526260376, + "learning_rate": 7.430075187969925e-06, + "loss": 0.295, + "step": 17090 + }, + { + "epoch": 25.71, + "grad_norm": 6.026291370391846, + "learning_rate": 7.428571428571429e-06, + "loss": 0.3131, + "step": 17100 + }, + { + "epoch": 25.73, + "grad_norm": 6.111357688903809, + "learning_rate": 7.427067669172933e-06, + "loss": 0.2887, + "step": 17110 + }, + { + "epoch": 25.74, + "grad_norm": 6.1451520919799805, + "learning_rate": 7.425563909774437e-06, + "loss": 0.2974, + "step": 17120 + }, + { + "epoch": 25.76, + "grad_norm": 4.193663120269775, + "learning_rate": 7.424060150375941e-06, + "loss": 0.2106, + "step": 17130 + }, + { + "epoch": 25.77, + "grad_norm": 5.288703441619873, + "learning_rate": 7.422556390977444e-06, + "loss": 0.2647, + "step": 17140 + }, + { + "epoch": 25.79, + "grad_norm": 9.568791389465332, + "learning_rate": 7.421052631578948e-06, + "loss": 0.383, + "step": 17150 + }, + { + "epoch": 25.8, + "grad_norm": 4.507370471954346, + "learning_rate": 7.419548872180452e-06, + "loss": 0.3109, + "step": 17160 + }, + { + "epoch": 25.82, + "grad_norm": 4.298795223236084, + "learning_rate": 7.4180451127819555e-06, + "loss": 0.3037, + "step": 17170 + }, + { + "epoch": 25.83, + "grad_norm": 7.694872856140137, + "learning_rate": 7.41654135338346e-06, + "loss": 0.353, + "step": 17180 + }, + { + "epoch": 25.85, + "grad_norm": 4.656931400299072, + "learning_rate": 7.4150375939849626e-06, + "loss": 0.3047, + "step": 17190 + }, + { + "epoch": 25.86, + "grad_norm": 6.448072910308838, + "learning_rate": 7.413533834586467e-06, + "loss": 0.3562, + "step": 17200 + }, + { + "epoch": 25.88, + "grad_norm": 7.515552043914795, + "learning_rate": 7.4120300751879705e-06, + "loss": 0.3391, + "step": 17210 + }, + { + "epoch": 25.89, + "grad_norm": 9.326400756835938, + "learning_rate": 7.410526315789475e-06, + "loss": 0.2554, + "step": 17220 + }, + { + "epoch": 25.91, + "grad_norm": 4.4977498054504395, + "learning_rate": 7.4090225563909775e-06, + "loss": 0.3115, + "step": 17230 + }, + { + "epoch": 25.92, + "grad_norm": 5.859455585479736, + "learning_rate": 7.407518796992482e-06, + "loss": 0.2923, + "step": 17240 + }, + { + "epoch": 25.94, + "grad_norm": 3.0939574241638184, + "learning_rate": 7.406015037593985e-06, + "loss": 0.3387, + "step": 17250 + }, + { + "epoch": 25.95, + "grad_norm": 6.690404415130615, + "learning_rate": 7.40451127819549e-06, + "loss": 0.2812, + "step": 17260 + }, + { + "epoch": 25.97, + "grad_norm": 3.768836736679077, + "learning_rate": 7.403007518796993e-06, + "loss": 0.2778, + "step": 17270 + }, + { + "epoch": 25.98, + "grad_norm": 4.476847171783447, + "learning_rate": 7.401503759398498e-06, + "loss": 0.3684, + "step": 17280 + }, + { + "epoch": 26.0, + "grad_norm": 0.6643197536468506, + "learning_rate": 7.4e-06, + "loss": 0.1884, + "step": 17290 + }, + { + "epoch": 26.0, + "eval_accuracy": 0.9304, + "eval_loss": 0.2943102717399597, + "eval_runtime": 84.682, + "eval_samples_per_second": 118.089, + "eval_steps_per_second": 0.472, + "step": 17290 + }, + { + "epoch": 26.02, + "grad_norm": 6.030354976654053, + "learning_rate": 7.398496240601505e-06, + "loss": 0.2949, + "step": 17300 + }, + { + "epoch": 26.03, + "grad_norm": 6.9479193687438965, + "learning_rate": 7.396992481203008e-06, + "loss": 0.3083, + "step": 17310 + }, + { + "epoch": 26.05, + "grad_norm": 5.633476257324219, + "learning_rate": 7.3954887218045126e-06, + "loss": 0.2906, + "step": 17320 + }, + { + "epoch": 26.06, + "grad_norm": 4.715734481811523, + "learning_rate": 7.393984962406015e-06, + "loss": 0.3034, + "step": 17330 + }, + { + "epoch": 26.08, + "grad_norm": 5.452293395996094, + "learning_rate": 7.39248120300752e-06, + "loss": 0.3242, + "step": 17340 + }, + { + "epoch": 26.09, + "grad_norm": 5.496037006378174, + "learning_rate": 7.390977443609023e-06, + "loss": 0.2861, + "step": 17350 + }, + { + "epoch": 26.11, + "grad_norm": 8.563288688659668, + "learning_rate": 7.3894736842105275e-06, + "loss": 0.3451, + "step": 17360 + }, + { + "epoch": 26.12, + "grad_norm": 3.828183889389038, + "learning_rate": 7.387969924812031e-06, + "loss": 0.3346, + "step": 17370 + }, + { + "epoch": 26.14, + "grad_norm": 6.062718391418457, + "learning_rate": 7.386466165413534e-06, + "loss": 0.2778, + "step": 17380 + }, + { + "epoch": 26.15, + "grad_norm": 5.245468616485596, + "learning_rate": 7.384962406015038e-06, + "loss": 0.2623, + "step": 17390 + }, + { + "epoch": 26.17, + "grad_norm": 5.32465934753418, + "learning_rate": 7.3834586466165416e-06, + "loss": 0.2771, + "step": 17400 + }, + { + "epoch": 26.18, + "grad_norm": 6.1053009033203125, + "learning_rate": 7.381954887218046e-06, + "loss": 0.2738, + "step": 17410 + }, + { + "epoch": 26.2, + "grad_norm": 4.224456310272217, + "learning_rate": 7.380451127819549e-06, + "loss": 0.2576, + "step": 17420 + }, + { + "epoch": 26.21, + "grad_norm": 5.59401798248291, + "learning_rate": 7.378947368421053e-06, + "loss": 0.2733, + "step": 17430 + }, + { + "epoch": 26.23, + "grad_norm": 9.04089641571045, + "learning_rate": 7.3774436090225565e-06, + "loss": 0.294, + "step": 17440 + }, + { + "epoch": 26.24, + "grad_norm": 5.4782233238220215, + "learning_rate": 7.375939849624061e-06, + "loss": 0.2776, + "step": 17450 + }, + { + "epoch": 26.26, + "grad_norm": 3.4573464393615723, + "learning_rate": 7.374436090225564e-06, + "loss": 0.2913, + "step": 17460 + }, + { + "epoch": 26.27, + "grad_norm": 4.618410587310791, + "learning_rate": 7.372932330827069e-06, + "loss": 0.3398, + "step": 17470 + }, + { + "epoch": 26.29, + "grad_norm": 3.339372396469116, + "learning_rate": 7.371428571428571e-06, + "loss": 0.2932, + "step": 17480 + }, + { + "epoch": 26.3, + "grad_norm": 4.173111438751221, + "learning_rate": 7.369924812030076e-06, + "loss": 0.3159, + "step": 17490 + }, + { + "epoch": 26.32, + "grad_norm": 5.077332973480225, + "learning_rate": 7.368421052631579e-06, + "loss": 0.2747, + "step": 17500 + }, + { + "epoch": 26.33, + "grad_norm": 3.467737913131714, + "learning_rate": 7.366917293233084e-06, + "loss": 0.3015, + "step": 17510 + }, + { + "epoch": 26.35, + "grad_norm": 4.046982288360596, + "learning_rate": 7.365413533834586e-06, + "loss": 0.2847, + "step": 17520 + }, + { + "epoch": 26.36, + "grad_norm": 5.868749141693115, + "learning_rate": 7.363909774436091e-06, + "loss": 0.2743, + "step": 17530 + }, + { + "epoch": 26.38, + "grad_norm": 4.371850967407227, + "learning_rate": 7.362406015037594e-06, + "loss": 0.2951, + "step": 17540 + }, + { + "epoch": 26.39, + "grad_norm": 5.016296863555908, + "learning_rate": 7.3609022556390986e-06, + "loss": 0.3517, + "step": 17550 + }, + { + "epoch": 26.41, + "grad_norm": 4.3615946769714355, + "learning_rate": 7.359398496240602e-06, + "loss": 0.4145, + "step": 17560 + }, + { + "epoch": 26.42, + "grad_norm": 5.725963115692139, + "learning_rate": 7.3578947368421065e-06, + "loss": 0.2518, + "step": 17570 + }, + { + "epoch": 26.44, + "grad_norm": 9.736555099487305, + "learning_rate": 7.356390977443609e-06, + "loss": 0.3154, + "step": 17580 + }, + { + "epoch": 26.45, + "grad_norm": 7.777997016906738, + "learning_rate": 7.3548872180451135e-06, + "loss": 0.325, + "step": 17590 + }, + { + "epoch": 26.47, + "grad_norm": 4.7586894035339355, + "learning_rate": 7.353383458646617e-06, + "loss": 0.3155, + "step": 17600 + }, + { + "epoch": 26.48, + "grad_norm": 5.5998430252075195, + "learning_rate": 7.351879699248121e-06, + "loss": 0.3358, + "step": 17610 + }, + { + "epoch": 26.5, + "grad_norm": 6.661489486694336, + "learning_rate": 7.350375939849624e-06, + "loss": 0.2973, + "step": 17620 + }, + { + "epoch": 26.51, + "grad_norm": 8.244695663452148, + "learning_rate": 7.348872180451128e-06, + "loss": 0.2739, + "step": 17630 + }, + { + "epoch": 26.53, + "grad_norm": 7.346087455749512, + "learning_rate": 7.347368421052632e-06, + "loss": 0.3281, + "step": 17640 + }, + { + "epoch": 26.54, + "grad_norm": 8.243110656738281, + "learning_rate": 7.345864661654136e-06, + "loss": 0.2994, + "step": 17650 + }, + { + "epoch": 26.56, + "grad_norm": 3.7893612384796143, + "learning_rate": 7.34436090225564e-06, + "loss": 0.3569, + "step": 17660 + }, + { + "epoch": 26.57, + "grad_norm": 4.714302062988281, + "learning_rate": 7.342857142857144e-06, + "loss": 0.2856, + "step": 17670 + }, + { + "epoch": 26.59, + "grad_norm": 7.72084903717041, + "learning_rate": 7.341353383458647e-06, + "loss": 0.2402, + "step": 17680 + }, + { + "epoch": 26.6, + "grad_norm": 4.239202499389648, + "learning_rate": 7.339849624060151e-06, + "loss": 0.2312, + "step": 17690 + }, + { + "epoch": 26.62, + "grad_norm": 6.118954658508301, + "learning_rate": 7.338345864661655e-06, + "loss": 0.3843, + "step": 17700 + }, + { + "epoch": 26.63, + "grad_norm": 6.067955017089844, + "learning_rate": 7.336842105263159e-06, + "loss": 0.3087, + "step": 17710 + }, + { + "epoch": 26.65, + "grad_norm": 6.024227619171143, + "learning_rate": 7.335338345864662e-06, + "loss": 0.2224, + "step": 17720 + }, + { + "epoch": 26.66, + "grad_norm": 3.350494146347046, + "learning_rate": 7.333834586466166e-06, + "loss": 0.2977, + "step": 17730 + }, + { + "epoch": 26.68, + "grad_norm": 5.893447399139404, + "learning_rate": 7.33233082706767e-06, + "loss": 0.2636, + "step": 17740 + }, + { + "epoch": 26.69, + "grad_norm": 5.77102518081665, + "learning_rate": 7.330827067669174e-06, + "loss": 0.2412, + "step": 17750 + }, + { + "epoch": 26.71, + "grad_norm": 4.75433349609375, + "learning_rate": 7.3293233082706776e-06, + "loss": 0.3262, + "step": 17760 + }, + { + "epoch": 26.72, + "grad_norm": 9.549809455871582, + "learning_rate": 7.327819548872182e-06, + "loss": 0.2673, + "step": 17770 + }, + { + "epoch": 26.74, + "grad_norm": 7.770995616912842, + "learning_rate": 7.326315789473685e-06, + "loss": 0.3227, + "step": 17780 + }, + { + "epoch": 26.75, + "grad_norm": 2.771538257598877, + "learning_rate": 7.324812030075189e-06, + "loss": 0.2894, + "step": 17790 + }, + { + "epoch": 26.77, + "grad_norm": 5.751589298248291, + "learning_rate": 7.3233082706766925e-06, + "loss": 0.281, + "step": 17800 + }, + { + "epoch": 26.78, + "grad_norm": 6.880566596984863, + "learning_rate": 7.321804511278197e-06, + "loss": 0.3191, + "step": 17810 + }, + { + "epoch": 26.8, + "grad_norm": 5.7594218254089355, + "learning_rate": 7.3203007518796995e-06, + "loss": 0.2659, + "step": 17820 + }, + { + "epoch": 26.81, + "grad_norm": 3.8072729110717773, + "learning_rate": 7.318796992481204e-06, + "loss": 0.2825, + "step": 17830 + }, + { + "epoch": 26.83, + "grad_norm": 7.311132431030273, + "learning_rate": 7.317293233082707e-06, + "loss": 0.3758, + "step": 17840 + }, + { + "epoch": 26.84, + "grad_norm": 6.675231456756592, + "learning_rate": 7.315789473684212e-06, + "loss": 0.2856, + "step": 17850 + }, + { + "epoch": 26.86, + "grad_norm": 9.13115119934082, + "learning_rate": 7.314285714285715e-06, + "loss": 0.266, + "step": 17860 + }, + { + "epoch": 26.87, + "grad_norm": 5.860391616821289, + "learning_rate": 7.312781954887218e-06, + "loss": 0.3354, + "step": 17870 + }, + { + "epoch": 26.89, + "grad_norm": 5.685858249664307, + "learning_rate": 7.311278195488722e-06, + "loss": 0.3235, + "step": 17880 + }, + { + "epoch": 26.9, + "grad_norm": 6.518139839172363, + "learning_rate": 7.309774436090226e-06, + "loss": 0.313, + "step": 17890 + }, + { + "epoch": 26.92, + "grad_norm": 14.772072792053223, + "learning_rate": 7.30827067669173e-06, + "loss": 0.3201, + "step": 17900 + }, + { + "epoch": 26.93, + "grad_norm": 4.325756072998047, + "learning_rate": 7.306766917293233e-06, + "loss": 0.2681, + "step": 17910 + }, + { + "epoch": 26.95, + "grad_norm": 4.291186809539795, + "learning_rate": 7.305263157894737e-06, + "loss": 0.3831, + "step": 17920 + }, + { + "epoch": 26.96, + "grad_norm": 8.093550682067871, + "learning_rate": 7.303759398496241e-06, + "loss": 0.2855, + "step": 17930 + }, + { + "epoch": 26.98, + "grad_norm": 4.962594985961914, + "learning_rate": 7.302255639097745e-06, + "loss": 0.2369, + "step": 17940 + }, + { + "epoch": 26.99, + "grad_norm": 5.591581344604492, + "learning_rate": 7.300751879699249e-06, + "loss": 0.3624, + "step": 17950 + }, + { + "epoch": 27.0, + "eval_accuracy": 0.9316, + "eval_loss": 0.2865539491176605, + "eval_runtime": 84.8448, + "eval_samples_per_second": 117.862, + "eval_steps_per_second": 0.471, + "step": 17955 + }, + { + "epoch": 27.01, + "grad_norm": 4.0642547607421875, + "learning_rate": 7.299248120300752e-06, + "loss": 0.2308, + "step": 17960 + }, + { + "epoch": 27.02, + "grad_norm": 5.927849292755127, + "learning_rate": 7.297744360902256e-06, + "loss": 0.2986, + "step": 17970 + }, + { + "epoch": 27.04, + "grad_norm": 6.369534492492676, + "learning_rate": 7.29624060150376e-06, + "loss": 0.2482, + "step": 17980 + }, + { + "epoch": 27.05, + "grad_norm": 4.340782165527344, + "learning_rate": 7.2947368421052636e-06, + "loss": 0.2583, + "step": 17990 + }, + { + "epoch": 27.07, + "grad_norm": 4.993666648864746, + "learning_rate": 7.293233082706768e-06, + "loss": 0.237, + "step": 18000 + }, + { + "epoch": 27.08, + "grad_norm": 5.864019870758057, + "learning_rate": 7.291729323308271e-06, + "loss": 0.3072, + "step": 18010 + }, + { + "epoch": 27.1, + "grad_norm": 6.993382930755615, + "learning_rate": 7.290225563909775e-06, + "loss": 0.2972, + "step": 18020 + }, + { + "epoch": 27.11, + "grad_norm": 5.060710430145264, + "learning_rate": 7.2887218045112785e-06, + "loss": 0.269, + "step": 18030 + }, + { + "epoch": 27.13, + "grad_norm": 3.3099091053009033, + "learning_rate": 7.287218045112783e-06, + "loss": 0.3175, + "step": 18040 + }, + { + "epoch": 27.14, + "grad_norm": 5.775263786315918, + "learning_rate": 7.285714285714286e-06, + "loss": 0.2844, + "step": 18050 + }, + { + "epoch": 27.16, + "grad_norm": 5.468781471252441, + "learning_rate": 7.28421052631579e-06, + "loss": 0.2273, + "step": 18060 + }, + { + "epoch": 27.17, + "grad_norm": 6.351372241973877, + "learning_rate": 7.282706766917293e-06, + "loss": 0.3525, + "step": 18070 + }, + { + "epoch": 27.19, + "grad_norm": 7.59521484375, + "learning_rate": 7.281203007518798e-06, + "loss": 0.3068, + "step": 18080 + }, + { + "epoch": 27.2, + "grad_norm": 3.0641674995422363, + "learning_rate": 7.279699248120301e-06, + "loss": 0.2865, + "step": 18090 + }, + { + "epoch": 27.22, + "grad_norm": 6.9775238037109375, + "learning_rate": 7.278195488721806e-06, + "loss": 0.3022, + "step": 18100 + }, + { + "epoch": 27.23, + "grad_norm": 4.060088634490967, + "learning_rate": 7.276691729323308e-06, + "loss": 0.3148, + "step": 18110 + }, + { + "epoch": 27.25, + "grad_norm": 7.767538070678711, + "learning_rate": 7.275187969924813e-06, + "loss": 0.229, + "step": 18120 + }, + { + "epoch": 27.26, + "grad_norm": 4.900406360626221, + "learning_rate": 7.273684210526316e-06, + "loss": 0.2672, + "step": 18130 + }, + { + "epoch": 27.28, + "grad_norm": 6.7180962562561035, + "learning_rate": 7.272180451127821e-06, + "loss": 0.24, + "step": 18140 + }, + { + "epoch": 27.29, + "grad_norm": 6.126708507537842, + "learning_rate": 7.270676691729323e-06, + "loss": 0.284, + "step": 18150 + }, + { + "epoch": 27.31, + "grad_norm": 3.9110794067382812, + "learning_rate": 7.269172932330828e-06, + "loss": 0.2719, + "step": 18160 + }, + { + "epoch": 27.32, + "grad_norm": 6.212155342102051, + "learning_rate": 7.267669172932331e-06, + "loss": 0.3151, + "step": 18170 + }, + { + "epoch": 27.34, + "grad_norm": 3.0043139457702637, + "learning_rate": 7.2661654135338355e-06, + "loss": 0.2592, + "step": 18180 + }, + { + "epoch": 27.35, + "grad_norm": 3.2263989448547363, + "learning_rate": 7.264661654135339e-06, + "loss": 0.3309, + "step": 18190 + }, + { + "epoch": 27.37, + "grad_norm": 4.7933197021484375, + "learning_rate": 7.263157894736843e-06, + "loss": 0.3262, + "step": 18200 + }, + { + "epoch": 27.38, + "grad_norm": 2.227823495864868, + "learning_rate": 7.261654135338346e-06, + "loss": 0.2645, + "step": 18210 + }, + { + "epoch": 27.4, + "grad_norm": 3.900000810623169, + "learning_rate": 7.2601503759398504e-06, + "loss": 0.3359, + "step": 18220 + }, + { + "epoch": 27.41, + "grad_norm": 3.5033364295959473, + "learning_rate": 7.258646616541354e-06, + "loss": 0.3467, + "step": 18230 + }, + { + "epoch": 27.43, + "grad_norm": 6.635146617889404, + "learning_rate": 7.257142857142858e-06, + "loss": 0.3276, + "step": 18240 + }, + { + "epoch": 27.44, + "grad_norm": 4.950628757476807, + "learning_rate": 7.255639097744361e-06, + "loss": 0.2459, + "step": 18250 + }, + { + "epoch": 27.46, + "grad_norm": 8.33364486694336, + "learning_rate": 7.254135338345865e-06, + "loss": 0.3154, + "step": 18260 + }, + { + "epoch": 27.47, + "grad_norm": 6.362850666046143, + "learning_rate": 7.252631578947369e-06, + "loss": 0.3349, + "step": 18270 + }, + { + "epoch": 27.49, + "grad_norm": 2.358121395111084, + "learning_rate": 7.251127819548873e-06, + "loss": 0.247, + "step": 18280 + }, + { + "epoch": 27.5, + "grad_norm": 7.640157699584961, + "learning_rate": 7.249624060150377e-06, + "loss": 0.259, + "step": 18290 + }, + { + "epoch": 27.52, + "grad_norm": 4.780099868774414, + "learning_rate": 7.248120300751881e-06, + "loss": 0.2818, + "step": 18300 + }, + { + "epoch": 27.53, + "grad_norm": 4.6383256912231445, + "learning_rate": 7.246616541353384e-06, + "loss": 0.2534, + "step": 18310 + }, + { + "epoch": 27.55, + "grad_norm": 4.298056125640869, + "learning_rate": 7.245112781954888e-06, + "loss": 0.2586, + "step": 18320 + }, + { + "epoch": 27.56, + "grad_norm": 7.920955181121826, + "learning_rate": 7.243609022556392e-06, + "loss": 0.2583, + "step": 18330 + }, + { + "epoch": 27.58, + "grad_norm": 2.8801300525665283, + "learning_rate": 7.242105263157896e-06, + "loss": 0.2478, + "step": 18340 + }, + { + "epoch": 27.59, + "grad_norm": 5.475898265838623, + "learning_rate": 7.240601503759399e-06, + "loss": 0.3193, + "step": 18350 + }, + { + "epoch": 27.61, + "grad_norm": 26.629573822021484, + "learning_rate": 7.239097744360903e-06, + "loss": 0.2725, + "step": 18360 + }, + { + "epoch": 27.62, + "grad_norm": 5.835658073425293, + "learning_rate": 7.237593984962407e-06, + "loss": 0.274, + "step": 18370 + }, + { + "epoch": 27.64, + "grad_norm": 7.282139301300049, + "learning_rate": 7.23609022556391e-06, + "loss": 0.288, + "step": 18380 + }, + { + "epoch": 27.65, + "grad_norm": 11.25385856628418, + "learning_rate": 7.2345864661654145e-06, + "loss": 0.2611, + "step": 18390 + }, + { + "epoch": 27.67, + "grad_norm": 6.181314945220947, + "learning_rate": 7.233082706766917e-06, + "loss": 0.3109, + "step": 18400 + }, + { + "epoch": 27.68, + "grad_norm": 5.535643100738525, + "learning_rate": 7.2315789473684215e-06, + "loss": 0.2892, + "step": 18410 + }, + { + "epoch": 27.7, + "grad_norm": 7.809751510620117, + "learning_rate": 7.230075187969925e-06, + "loss": 0.3626, + "step": 18420 + }, + { + "epoch": 27.71, + "grad_norm": 5.477492332458496, + "learning_rate": 7.2285714285714294e-06, + "loss": 0.3423, + "step": 18430 + }, + { + "epoch": 27.73, + "grad_norm": 5.311155796051025, + "learning_rate": 7.227067669172932e-06, + "loss": 0.2859, + "step": 18440 + }, + { + "epoch": 27.74, + "grad_norm": 6.58491325378418, + "learning_rate": 7.2255639097744365e-06, + "loss": 0.3323, + "step": 18450 + }, + { + "epoch": 27.76, + "grad_norm": 10.060500144958496, + "learning_rate": 7.22406015037594e-06, + "loss": 0.2803, + "step": 18460 + }, + { + "epoch": 27.77, + "grad_norm": 3.332087278366089, + "learning_rate": 7.222556390977444e-06, + "loss": 0.3011, + "step": 18470 + }, + { + "epoch": 27.79, + "grad_norm": 3.494025945663452, + "learning_rate": 7.221052631578948e-06, + "loss": 0.2756, + "step": 18480 + }, + { + "epoch": 27.8, + "grad_norm": 6.302395820617676, + "learning_rate": 7.219548872180452e-06, + "loss": 0.2454, + "step": 18490 + }, + { + "epoch": 27.82, + "grad_norm": 4.8681793212890625, + "learning_rate": 7.218045112781955e-06, + "loss": 0.349, + "step": 18500 + }, + { + "epoch": 27.83, + "grad_norm": 5.359793663024902, + "learning_rate": 7.216541353383459e-06, + "loss": 0.3316, + "step": 18510 + }, + { + "epoch": 27.85, + "grad_norm": 7.901219367980957, + "learning_rate": 7.215037593984963e-06, + "loss": 0.255, + "step": 18520 + }, + { + "epoch": 27.86, + "grad_norm": 6.270127773284912, + "learning_rate": 7.213533834586467e-06, + "loss": 0.3856, + "step": 18530 + }, + { + "epoch": 27.88, + "grad_norm": 3.3462986946105957, + "learning_rate": 7.21203007518797e-06, + "loss": 0.2656, + "step": 18540 + }, + { + "epoch": 27.89, + "grad_norm": 5.023731708526611, + "learning_rate": 7.210526315789474e-06, + "loss": 0.3367, + "step": 18550 + }, + { + "epoch": 27.91, + "grad_norm": 7.475311756134033, + "learning_rate": 7.209022556390978e-06, + "loss": 0.3652, + "step": 18560 + }, + { + "epoch": 27.92, + "grad_norm": 5.593987464904785, + "learning_rate": 7.207518796992482e-06, + "loss": 0.2514, + "step": 18570 + }, + { + "epoch": 27.94, + "grad_norm": 52.27437973022461, + "learning_rate": 7.206015037593986e-06, + "loss": 0.3338, + "step": 18580 + }, + { + "epoch": 27.95, + "grad_norm": 6.0754499435424805, + "learning_rate": 7.20451127819549e-06, + "loss": 0.2033, + "step": 18590 + }, + { + "epoch": 27.97, + "grad_norm": 13.490704536437988, + "learning_rate": 7.203007518796993e-06, + "loss": 0.328, + "step": 18600 + }, + { + "epoch": 27.98, + "grad_norm": 8.996991157531738, + "learning_rate": 7.201503759398497e-06, + "loss": 0.3082, + "step": 18610 + }, + { + "epoch": 28.0, + "grad_norm": 1.09871506690979, + "learning_rate": 7.2000000000000005e-06, + "loss": 0.2957, + "step": 18620 + }, + { + "epoch": 28.0, + "eval_accuracy": 0.932, + "eval_loss": 0.2707752585411072, + "eval_runtime": 84.4192, + "eval_samples_per_second": 118.456, + "eval_steps_per_second": 0.474, + "step": 18620 + }, + { + "epoch": 28.02, + "grad_norm": 4.388790130615234, + "learning_rate": 7.198496240601505e-06, + "loss": 0.2982, + "step": 18630 + }, + { + "epoch": 28.03, + "grad_norm": 11.981380462646484, + "learning_rate": 7.1969924812030076e-06, + "loss": 0.2254, + "step": 18640 + }, + { + "epoch": 28.05, + "grad_norm": 5.74471378326416, + "learning_rate": 7.195488721804512e-06, + "loss": 0.2806, + "step": 18650 + }, + { + "epoch": 28.06, + "grad_norm": 6.635989189147949, + "learning_rate": 7.1939849624060154e-06, + "loss": 0.3081, + "step": 18660 + }, + { + "epoch": 28.08, + "grad_norm": 8.036993026733398, + "learning_rate": 7.19248120300752e-06, + "loss": 0.2684, + "step": 18670 + }, + { + "epoch": 28.09, + "grad_norm": 5.605617523193359, + "learning_rate": 7.190977443609023e-06, + "loss": 0.3037, + "step": 18680 + }, + { + "epoch": 28.11, + "grad_norm": 5.046873569488525, + "learning_rate": 7.189473684210527e-06, + "loss": 0.2489, + "step": 18690 + }, + { + "epoch": 28.12, + "grad_norm": 7.5858154296875, + "learning_rate": 7.18796992481203e-06, + "loss": 0.2868, + "step": 18700 + }, + { + "epoch": 28.14, + "grad_norm": 5.2180399894714355, + "learning_rate": 7.186466165413535e-06, + "loss": 0.282, + "step": 18710 + }, + { + "epoch": 28.15, + "grad_norm": 5.189599990844727, + "learning_rate": 7.184962406015038e-06, + "loss": 0.3185, + "step": 18720 + }, + { + "epoch": 28.17, + "grad_norm": 3.309946298599243, + "learning_rate": 7.183458646616543e-06, + "loss": 0.3342, + "step": 18730 + }, + { + "epoch": 28.18, + "grad_norm": 4.891626834869385, + "learning_rate": 7.181954887218045e-06, + "loss": 0.2972, + "step": 18740 + }, + { + "epoch": 28.2, + "grad_norm": 4.263134002685547, + "learning_rate": 7.18045112781955e-06, + "loss": 0.2384, + "step": 18750 + }, + { + "epoch": 28.21, + "grad_norm": 4.385184288024902, + "learning_rate": 7.178947368421053e-06, + "loss": 0.2699, + "step": 18760 + }, + { + "epoch": 28.23, + "grad_norm": 10.802248001098633, + "learning_rate": 7.1774436090225575e-06, + "loss": 0.3162, + "step": 18770 + }, + { + "epoch": 28.24, + "grad_norm": 4.073564052581787, + "learning_rate": 7.175939849624061e-06, + "loss": 0.2886, + "step": 18780 + }, + { + "epoch": 28.26, + "grad_norm": 4.669020175933838, + "learning_rate": 7.174436090225565e-06, + "loss": 0.3056, + "step": 18790 + }, + { + "epoch": 28.27, + "grad_norm": 7.317276477813721, + "learning_rate": 7.172932330827068e-06, + "loss": 0.3272, + "step": 18800 + }, + { + "epoch": 28.29, + "grad_norm": 2.6165945529937744, + "learning_rate": 7.1714285714285725e-06, + "loss": 0.2903, + "step": 18810 + }, + { + "epoch": 28.3, + "grad_norm": 4.007689476013184, + "learning_rate": 7.169924812030076e-06, + "loss": 0.3343, + "step": 18820 + }, + { + "epoch": 28.32, + "grad_norm": 4.415470123291016, + "learning_rate": 7.16842105263158e-06, + "loss": 0.2953, + "step": 18830 + }, + { + "epoch": 28.33, + "grad_norm": 3.293377637863159, + "learning_rate": 7.166917293233083e-06, + "loss": 0.2722, + "step": 18840 + }, + { + "epoch": 28.35, + "grad_norm": 5.47039270401001, + "learning_rate": 7.165413533834587e-06, + "loss": 0.2798, + "step": 18850 + }, + { + "epoch": 28.36, + "grad_norm": 5.256561756134033, + "learning_rate": 7.163909774436091e-06, + "loss": 0.2649, + "step": 18860 + }, + { + "epoch": 28.38, + "grad_norm": 3.793679714202881, + "learning_rate": 7.1624060150375944e-06, + "loss": 0.2919, + "step": 18870 + }, + { + "epoch": 28.39, + "grad_norm": 3.6800131797790527, + "learning_rate": 7.160902255639098e-06, + "loss": 0.2941, + "step": 18880 + }, + { + "epoch": 28.41, + "grad_norm": 6.140032768249512, + "learning_rate": 7.1593984962406015e-06, + "loss": 0.307, + "step": 18890 + }, + { + "epoch": 28.42, + "grad_norm": 5.978692531585693, + "learning_rate": 7.157894736842106e-06, + "loss": 0.2973, + "step": 18900 + }, + { + "epoch": 28.44, + "grad_norm": 6.257145404815674, + "learning_rate": 7.156390977443609e-06, + "loss": 0.3043, + "step": 18910 + }, + { + "epoch": 28.45, + "grad_norm": 6.124258518218994, + "learning_rate": 7.154887218045114e-06, + "loss": 0.2877, + "step": 18920 + }, + { + "epoch": 28.47, + "grad_norm": 4.110513687133789, + "learning_rate": 7.153383458646616e-06, + "loss": 0.2723, + "step": 18930 + }, + { + "epoch": 28.48, + "grad_norm": 8.390369415283203, + "learning_rate": 7.151879699248121e-06, + "loss": 0.2854, + "step": 18940 + }, + { + "epoch": 28.5, + "grad_norm": 4.641889572143555, + "learning_rate": 7.150375939849624e-06, + "loss": 0.2325, + "step": 18950 + }, + { + "epoch": 28.51, + "grad_norm": 9.678013801574707, + "learning_rate": 7.148872180451129e-06, + "loss": 0.2325, + "step": 18960 + }, + { + "epoch": 28.53, + "grad_norm": 4.375498294830322, + "learning_rate": 7.147368421052631e-06, + "loss": 0.2814, + "step": 18970 + }, + { + "epoch": 28.54, + "grad_norm": 11.331188201904297, + "learning_rate": 7.145864661654136e-06, + "loss": 0.2659, + "step": 18980 + }, + { + "epoch": 28.56, + "grad_norm": 4.929275035858154, + "learning_rate": 7.144360902255639e-06, + "loss": 0.3207, + "step": 18990 + }, + { + "epoch": 28.57, + "grad_norm": 6.941195487976074, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.2677, + "step": 19000 + }, + { + "epoch": 28.59, + "grad_norm": 7.25919771194458, + "learning_rate": 7.141353383458647e-06, + "loss": 0.265, + "step": 19010 + }, + { + "epoch": 28.6, + "grad_norm": 4.009887218475342, + "learning_rate": 7.1398496240601514e-06, + "loss": 0.289, + "step": 19020 + }, + { + "epoch": 28.62, + "grad_norm": 7.65335750579834, + "learning_rate": 7.138345864661654e-06, + "loss": 0.2977, + "step": 19030 + }, + { + "epoch": 28.63, + "grad_norm": 7.200645923614502, + "learning_rate": 7.1368421052631585e-06, + "loss": 0.3497, + "step": 19040 + }, + { + "epoch": 28.65, + "grad_norm": 4.218509674072266, + "learning_rate": 7.135338345864662e-06, + "loss": 0.2505, + "step": 19050 + }, + { + "epoch": 28.66, + "grad_norm": 5.385000228881836, + "learning_rate": 7.133834586466166e-06, + "loss": 0.2603, + "step": 19060 + }, + { + "epoch": 28.68, + "grad_norm": 6.183909893035889, + "learning_rate": 7.132330827067669e-06, + "loss": 0.3193, + "step": 19070 + }, + { + "epoch": 28.69, + "grad_norm": 7.682650089263916, + "learning_rate": 7.130827067669173e-06, + "loss": 0.3485, + "step": 19080 + }, + { + "epoch": 28.71, + "grad_norm": 6.737081527709961, + "learning_rate": 7.129323308270677e-06, + "loss": 0.4199, + "step": 19090 + }, + { + "epoch": 28.72, + "grad_norm": 6.185298442840576, + "learning_rate": 7.127819548872181e-06, + "loss": 0.3144, + "step": 19100 + }, + { + "epoch": 28.74, + "grad_norm": 3.423515558242798, + "learning_rate": 7.126315789473685e-06, + "loss": 0.2768, + "step": 19110 + }, + { + "epoch": 28.75, + "grad_norm": 7.198062896728516, + "learning_rate": 7.124812030075189e-06, + "loss": 0.3282, + "step": 19120 + }, + { + "epoch": 28.77, + "grad_norm": 5.3529372215271, + "learning_rate": 7.123308270676692e-06, + "loss": 0.2467, + "step": 19130 + }, + { + "epoch": 28.78, + "grad_norm": 7.850579261779785, + "learning_rate": 7.121804511278196e-06, + "loss": 0.3204, + "step": 19140 + }, + { + "epoch": 28.8, + "grad_norm": 8.140227317810059, + "learning_rate": 7.1203007518797e-06, + "loss": 0.3087, + "step": 19150 + }, + { + "epoch": 28.81, + "grad_norm": 6.9892778396606445, + "learning_rate": 7.118796992481204e-06, + "loss": 0.3148, + "step": 19160 + }, + { + "epoch": 28.83, + "grad_norm": 6.189218521118164, + "learning_rate": 7.117293233082707e-06, + "loss": 0.2779, + "step": 19170 + }, + { + "epoch": 28.84, + "grad_norm": 4.910006999969482, + "learning_rate": 7.115789473684211e-06, + "loss": 0.2469, + "step": 19180 + }, + { + "epoch": 28.86, + "grad_norm": 6.6792449951171875, + "learning_rate": 7.114285714285715e-06, + "loss": 0.3604, + "step": 19190 + }, + { + "epoch": 28.87, + "grad_norm": 8.408853530883789, + "learning_rate": 7.112781954887219e-06, + "loss": 0.3493, + "step": 19200 + }, + { + "epoch": 28.89, + "grad_norm": 5.114132404327393, + "learning_rate": 7.1112781954887225e-06, + "loss": 0.2874, + "step": 19210 + }, + { + "epoch": 28.9, + "grad_norm": 7.833899021148682, + "learning_rate": 7.109774436090227e-06, + "loss": 0.2365, + "step": 19220 + }, + { + "epoch": 28.92, + "grad_norm": 3.8841164112091064, + "learning_rate": 7.10827067669173e-06, + "loss": 0.2937, + "step": 19230 + }, + { + "epoch": 28.93, + "grad_norm": 11.486865997314453, + "learning_rate": 7.106766917293234e-06, + "loss": 0.2851, + "step": 19240 + }, + { + "epoch": 28.95, + "grad_norm": 9.058024406433105, + "learning_rate": 7.1052631578947375e-06, + "loss": 0.2317, + "step": 19250 + }, + { + "epoch": 28.96, + "grad_norm": 9.031519889831543, + "learning_rate": 7.103759398496242e-06, + "loss": 0.4064, + "step": 19260 + }, + { + "epoch": 28.98, + "grad_norm": 5.929012298583984, + "learning_rate": 7.1022556390977445e-06, + "loss": 0.2661, + "step": 19270 + }, + { + "epoch": 28.99, + "grad_norm": 2.968751907348633, + "learning_rate": 7.100751879699249e-06, + "loss": 0.3013, + "step": 19280 + }, + { + "epoch": 29.0, + "eval_accuracy": 0.932, + "eval_loss": 0.28813642263412476, + "eval_runtime": 84.4456, + "eval_samples_per_second": 118.419, + "eval_steps_per_second": 0.474, + "step": 19285 + }, + { + "epoch": 29.01, + "grad_norm": 5.468835830688477, + "learning_rate": 7.099248120300752e-06, + "loss": 0.2397, + "step": 19290 + }, + { + "epoch": 29.02, + "grad_norm": 7.077803134918213, + "learning_rate": 7.097744360902257e-06, + "loss": 0.3068, + "step": 19300 + }, + { + "epoch": 29.04, + "grad_norm": 5.543747425079346, + "learning_rate": 7.09624060150376e-06, + "loss": 0.2652, + "step": 19310 + }, + { + "epoch": 29.05, + "grad_norm": 7.105257034301758, + "learning_rate": 7.094736842105265e-06, + "loss": 0.3016, + "step": 19320 + }, + { + "epoch": 29.07, + "grad_norm": 7.57663106918335, + "learning_rate": 7.093233082706767e-06, + "loss": 0.3195, + "step": 19330 + }, + { + "epoch": 29.08, + "grad_norm": 3.0557851791381836, + "learning_rate": 7.091729323308272e-06, + "loss": 0.2686, + "step": 19340 + }, + { + "epoch": 29.1, + "grad_norm": 6.048924446105957, + "learning_rate": 7.090225563909775e-06, + "loss": 0.2936, + "step": 19350 + }, + { + "epoch": 29.11, + "grad_norm": 5.691340446472168, + "learning_rate": 7.088721804511278e-06, + "loss": 0.2832, + "step": 19360 + }, + { + "epoch": 29.13, + "grad_norm": 0.4570540189743042, + "learning_rate": 7.087218045112782e-06, + "loss": 0.2204, + "step": 19370 + }, + { + "epoch": 29.14, + "grad_norm": 7.902672290802002, + "learning_rate": 7.085714285714286e-06, + "loss": 0.2519, + "step": 19380 + }, + { + "epoch": 29.16, + "grad_norm": 6.772716045379639, + "learning_rate": 7.08421052631579e-06, + "loss": 0.3449, + "step": 19390 + }, + { + "epoch": 29.17, + "grad_norm": 5.602826118469238, + "learning_rate": 7.082706766917294e-06, + "loss": 0.2434, + "step": 19400 + }, + { + "epoch": 29.19, + "grad_norm": 8.90966510772705, + "learning_rate": 7.081203007518798e-06, + "loss": 0.3094, + "step": 19410 + }, + { + "epoch": 29.2, + "grad_norm": 4.287824630737305, + "learning_rate": 7.079699248120301e-06, + "loss": 0.2961, + "step": 19420 + }, + { + "epoch": 29.22, + "grad_norm": 5.467939853668213, + "learning_rate": 7.078195488721805e-06, + "loss": 0.2937, + "step": 19430 + }, + { + "epoch": 29.23, + "grad_norm": 4.856724739074707, + "learning_rate": 7.0766917293233086e-06, + "loss": 0.3074, + "step": 19440 + }, + { + "epoch": 29.25, + "grad_norm": 4.881340980529785, + "learning_rate": 7.075187969924813e-06, + "loss": 0.3334, + "step": 19450 + }, + { + "epoch": 29.26, + "grad_norm": 6.680235862731934, + "learning_rate": 7.073684210526316e-06, + "loss": 0.2404, + "step": 19460 + }, + { + "epoch": 29.28, + "grad_norm": 3.575651168823242, + "learning_rate": 7.07218045112782e-06, + "loss": 0.2544, + "step": 19470 + }, + { + "epoch": 29.29, + "grad_norm": 3.3242225646972656, + "learning_rate": 7.0706766917293235e-06, + "loss": 0.2732, + "step": 19480 + }, + { + "epoch": 29.31, + "grad_norm": 3.5619566440582275, + "learning_rate": 7.069172932330828e-06, + "loss": 0.3299, + "step": 19490 + }, + { + "epoch": 29.32, + "grad_norm": 6.326242923736572, + "learning_rate": 7.067669172932331e-06, + "loss": 0.3078, + "step": 19500 + }, + { + "epoch": 29.34, + "grad_norm": 3.983919620513916, + "learning_rate": 7.066165413533836e-06, + "loss": 0.2805, + "step": 19510 + }, + { + "epoch": 29.35, + "grad_norm": 7.784421443939209, + "learning_rate": 7.064661654135338e-06, + "loss": 0.3146, + "step": 19520 + }, + { + "epoch": 29.37, + "grad_norm": 7.162094593048096, + "learning_rate": 7.063157894736843e-06, + "loss": 0.2942, + "step": 19530 + }, + { + "epoch": 29.38, + "grad_norm": 4.964542388916016, + "learning_rate": 7.061654135338346e-06, + "loss": 0.3356, + "step": 19540 + }, + { + "epoch": 29.4, + "grad_norm": 6.946933269500732, + "learning_rate": 7.060150375939851e-06, + "loss": 0.312, + "step": 19550 + }, + { + "epoch": 29.41, + "grad_norm": 6.954648494720459, + "learning_rate": 7.058646616541353e-06, + "loss": 0.3188, + "step": 19560 + }, + { + "epoch": 29.43, + "grad_norm": 6.528570175170898, + "learning_rate": 7.057142857142858e-06, + "loss": 0.2966, + "step": 19570 + }, + { + "epoch": 29.44, + "grad_norm": 5.95659875869751, + "learning_rate": 7.055639097744361e-06, + "loss": 0.3398, + "step": 19580 + }, + { + "epoch": 29.46, + "grad_norm": 2.7002809047698975, + "learning_rate": 7.054135338345866e-06, + "loss": 0.2737, + "step": 19590 + }, + { + "epoch": 29.47, + "grad_norm": 4.33442497253418, + "learning_rate": 7.052631578947369e-06, + "loss": 0.3002, + "step": 19600 + }, + { + "epoch": 29.49, + "grad_norm": 6.4437713623046875, + "learning_rate": 7.051127819548873e-06, + "loss": 0.28, + "step": 19610 + }, + { + "epoch": 29.5, + "grad_norm": 3.9194564819335938, + "learning_rate": 7.049624060150376e-06, + "loss": 0.3069, + "step": 19620 + }, + { + "epoch": 29.52, + "grad_norm": 5.887040615081787, + "learning_rate": 7.0481203007518805e-06, + "loss": 0.2634, + "step": 19630 + }, + { + "epoch": 29.53, + "grad_norm": 4.538432598114014, + "learning_rate": 7.046616541353384e-06, + "loss": 0.2684, + "step": 19640 + }, + { + "epoch": 29.55, + "grad_norm": 7.666621208190918, + "learning_rate": 7.045112781954888e-06, + "loss": 0.2344, + "step": 19650 + }, + { + "epoch": 29.56, + "grad_norm": 5.944088935852051, + "learning_rate": 7.043609022556391e-06, + "loss": 0.2358, + "step": 19660 + }, + { + "epoch": 29.58, + "grad_norm": 4.063729286193848, + "learning_rate": 7.0421052631578954e-06, + "loss": 0.2532, + "step": 19670 + }, + { + "epoch": 29.59, + "grad_norm": 3.9652276039123535, + "learning_rate": 7.040601503759399e-06, + "loss": 0.3471, + "step": 19680 + }, + { + "epoch": 29.61, + "grad_norm": 5.847537040710449, + "learning_rate": 7.039097744360903e-06, + "loss": 0.2433, + "step": 19690 + }, + { + "epoch": 29.62, + "grad_norm": 4.780336380004883, + "learning_rate": 7.037593984962407e-06, + "loss": 0.3358, + "step": 19700 + }, + { + "epoch": 29.64, + "grad_norm": 3.0303971767425537, + "learning_rate": 7.03609022556391e-06, + "loss": 0.2168, + "step": 19710 + }, + { + "epoch": 29.65, + "grad_norm": 3.0998427867889404, + "learning_rate": 7.034586466165414e-06, + "loss": 0.2995, + "step": 19720 + }, + { + "epoch": 29.67, + "grad_norm": 2.832582473754883, + "learning_rate": 7.033082706766918e-06, + "loss": 0.2413, + "step": 19730 + }, + { + "epoch": 29.68, + "grad_norm": 7.660305023193359, + "learning_rate": 7.031578947368422e-06, + "loss": 0.2962, + "step": 19740 + }, + { + "epoch": 29.7, + "grad_norm": 5.547909259796143, + "learning_rate": 7.030075187969926e-06, + "loss": 0.3422, + "step": 19750 + }, + { + "epoch": 29.71, + "grad_norm": 8.43567180633545, + "learning_rate": 7.028571428571429e-06, + "loss": 0.3488, + "step": 19760 + }, + { + "epoch": 29.73, + "grad_norm": 8.269464492797852, + "learning_rate": 7.027067669172933e-06, + "loss": 0.2961, + "step": 19770 + }, + { + "epoch": 29.74, + "grad_norm": 10.270258903503418, + "learning_rate": 7.025563909774437e-06, + "loss": 0.2522, + "step": 19780 + }, + { + "epoch": 29.76, + "grad_norm": 5.3482537269592285, + "learning_rate": 7.024060150375941e-06, + "loss": 0.2841, + "step": 19790 + }, + { + "epoch": 29.77, + "grad_norm": 5.868763446807861, + "learning_rate": 7.022556390977444e-06, + "loss": 0.2728, + "step": 19800 + }, + { + "epoch": 29.79, + "grad_norm": 12.772025108337402, + "learning_rate": 7.021052631578948e-06, + "loss": 0.2202, + "step": 19810 + }, + { + "epoch": 29.8, + "grad_norm": 9.140650749206543, + "learning_rate": 7.019548872180452e-06, + "loss": 0.3589, + "step": 19820 + }, + { + "epoch": 29.82, + "grad_norm": 5.013607978820801, + "learning_rate": 7.018045112781956e-06, + "loss": 0.2229, + "step": 19830 + }, + { + "epoch": 29.83, + "grad_norm": 8.760464668273926, + "learning_rate": 7.0165413533834595e-06, + "loss": 0.3337, + "step": 19840 + }, + { + "epoch": 29.85, + "grad_norm": 7.938315391540527, + "learning_rate": 7.015037593984964e-06, + "loss": 0.3052, + "step": 19850 + }, + { + "epoch": 29.86, + "grad_norm": 7.728393077850342, + "learning_rate": 7.0135338345864665e-06, + "loss": 0.2861, + "step": 19860 + }, + { + "epoch": 29.88, + "grad_norm": 6.810389995574951, + "learning_rate": 7.01203007518797e-06, + "loss": 0.3451, + "step": 19870 + }, + { + "epoch": 29.89, + "grad_norm": 3.8391294479370117, + "learning_rate": 7.010526315789474e-06, + "loss": 0.2302, + "step": 19880 + }, + { + "epoch": 29.91, + "grad_norm": 8.415011405944824, + "learning_rate": 7.009022556390977e-06, + "loss": 0.3055, + "step": 19890 + }, + { + "epoch": 29.92, + "grad_norm": 3.6795806884765625, + "learning_rate": 7.0075187969924815e-06, + "loss": 0.3007, + "step": 19900 + }, + { + "epoch": 29.94, + "grad_norm": 6.91939640045166, + "learning_rate": 7.006015037593985e-06, + "loss": 0.296, + "step": 19910 + }, + { + "epoch": 29.95, + "grad_norm": 4.2215681076049805, + "learning_rate": 7.004511278195489e-06, + "loss": 0.26, + "step": 19920 + }, + { + "epoch": 29.97, + "grad_norm": 3.9634735584259033, + "learning_rate": 7.003007518796993e-06, + "loss": 0.2929, + "step": 19930 + }, + { + "epoch": 29.98, + "grad_norm": 4.356061935424805, + "learning_rate": 7.001503759398497e-06, + "loss": 0.2259, + "step": 19940 + }, + { + "epoch": 30.0, + "grad_norm": 1.2935045957565308, + "learning_rate": 7e-06, + "loss": 0.2811, + "step": 19950 + }, + { + "epoch": 30.0, + "eval_accuracy": 0.9304, + "eval_loss": 0.2939961552619934, + "eval_runtime": 85.2279, + "eval_samples_per_second": 117.333, + "eval_steps_per_second": 0.469, + "step": 19950 + }, + { + "epoch": 30.02, + "grad_norm": 4.659907817840576, + "learning_rate": 6.998496240601504e-06, + "loss": 0.2815, + "step": 19960 + }, + { + "epoch": 30.03, + "grad_norm": 7.678098201751709, + "learning_rate": 6.996992481203008e-06, + "loss": 0.2224, + "step": 19970 + }, + { + "epoch": 30.05, + "grad_norm": 3.063901662826538, + "learning_rate": 6.995488721804512e-06, + "loss": 0.2971, + "step": 19980 + }, + { + "epoch": 30.06, + "grad_norm": 5.704662322998047, + "learning_rate": 6.993984962406015e-06, + "loss": 0.276, + "step": 19990 + }, + { + "epoch": 30.08, + "grad_norm": 5.150294780731201, + "learning_rate": 6.992481203007519e-06, + "loss": 0.3514, + "step": 20000 + }, + { + "epoch": 30.09, + "grad_norm": 6.354630947113037, + "learning_rate": 6.990977443609023e-06, + "loss": 0.2391, + "step": 20010 + }, + { + "epoch": 30.11, + "grad_norm": 3.2037034034729004, + "learning_rate": 6.989473684210527e-06, + "loss": 0.2754, + "step": 20020 + }, + { + "epoch": 30.12, + "grad_norm": 9.49938678741455, + "learning_rate": 6.987969924812031e-06, + "loss": 0.3042, + "step": 20030 + }, + { + "epoch": 30.14, + "grad_norm": 7.142296314239502, + "learning_rate": 6.986466165413535e-06, + "loss": 0.2314, + "step": 20040 + }, + { + "epoch": 30.15, + "grad_norm": 7.547910690307617, + "learning_rate": 6.984962406015038e-06, + "loss": 0.2545, + "step": 20050 + }, + { + "epoch": 30.17, + "grad_norm": 3.509197235107422, + "learning_rate": 6.983458646616542e-06, + "loss": 0.2241, + "step": 20060 + }, + { + "epoch": 30.18, + "grad_norm": 5.678248882293701, + "learning_rate": 6.9819548872180455e-06, + "loss": 0.2485, + "step": 20070 + }, + { + "epoch": 30.2, + "grad_norm": 6.966419696807861, + "learning_rate": 6.98045112781955e-06, + "loss": 0.2342, + "step": 20080 + }, + { + "epoch": 30.21, + "grad_norm": 5.888791084289551, + "learning_rate": 6.9789473684210525e-06, + "loss": 0.2433, + "step": 20090 + }, + { + "epoch": 30.23, + "grad_norm": 6.074886322021484, + "learning_rate": 6.977443609022557e-06, + "loss": 0.3245, + "step": 20100 + }, + { + "epoch": 30.24, + "grad_norm": 8.158697128295898, + "learning_rate": 6.9759398496240604e-06, + "loss": 0.2656, + "step": 20110 + }, + { + "epoch": 30.26, + "grad_norm": 4.1151909828186035, + "learning_rate": 6.974436090225565e-06, + "loss": 0.2862, + "step": 20120 + }, + { + "epoch": 30.27, + "grad_norm": 5.835092067718506, + "learning_rate": 6.972932330827068e-06, + "loss": 0.3154, + "step": 20130 + }, + { + "epoch": 30.29, + "grad_norm": 7.01699161529541, + "learning_rate": 6.971428571428573e-06, + "loss": 0.3146, + "step": 20140 + }, + { + "epoch": 30.3, + "grad_norm": 4.701117515563965, + "learning_rate": 6.969924812030075e-06, + "loss": 0.2476, + "step": 20150 + }, + { + "epoch": 30.32, + "grad_norm": 7.054307460784912, + "learning_rate": 6.96842105263158e-06, + "loss": 0.2467, + "step": 20160 + }, + { + "epoch": 30.33, + "grad_norm": 6.54818868637085, + "learning_rate": 6.966917293233083e-06, + "loss": 0.2588, + "step": 20170 + }, + { + "epoch": 30.35, + "grad_norm": 4.79921817779541, + "learning_rate": 6.965413533834588e-06, + "loss": 0.2452, + "step": 20180 + }, + { + "epoch": 30.36, + "grad_norm": 7.635383129119873, + "learning_rate": 6.96390977443609e-06, + "loss": 0.2971, + "step": 20190 + }, + { + "epoch": 30.38, + "grad_norm": 6.174952507019043, + "learning_rate": 6.962406015037595e-06, + "loss": 0.2438, + "step": 20200 + }, + { + "epoch": 30.39, + "grad_norm": 8.053534507751465, + "learning_rate": 6.960902255639098e-06, + "loss": 0.3147, + "step": 20210 + }, + { + "epoch": 30.41, + "grad_norm": 5.930545806884766, + "learning_rate": 6.9593984962406025e-06, + "loss": 0.3246, + "step": 20220 + }, + { + "epoch": 30.42, + "grad_norm": 5.0360493659973145, + "learning_rate": 6.957894736842106e-06, + "loss": 0.3028, + "step": 20230 + }, + { + "epoch": 30.44, + "grad_norm": 5.536324977874756, + "learning_rate": 6.95639097744361e-06, + "loss": 0.2762, + "step": 20240 + }, + { + "epoch": 30.45, + "grad_norm": 3.6296212673187256, + "learning_rate": 6.954887218045113e-06, + "loss": 0.2799, + "step": 20250 + }, + { + "epoch": 30.47, + "grad_norm": 5.655179977416992, + "learning_rate": 6.9533834586466175e-06, + "loss": 0.3003, + "step": 20260 + }, + { + "epoch": 30.48, + "grad_norm": 5.260562419891357, + "learning_rate": 6.951879699248121e-06, + "loss": 0.2221, + "step": 20270 + }, + { + "epoch": 30.5, + "grad_norm": 4.750830173492432, + "learning_rate": 6.950375939849625e-06, + "loss": 0.2439, + "step": 20280 + }, + { + "epoch": 30.51, + "grad_norm": 6.058375835418701, + "learning_rate": 6.948872180451128e-06, + "loss": 0.2716, + "step": 20290 + }, + { + "epoch": 30.53, + "grad_norm": 4.154491901397705, + "learning_rate": 6.947368421052632e-06, + "loss": 0.2311, + "step": 20300 + }, + { + "epoch": 30.54, + "grad_norm": 4.645699977874756, + "learning_rate": 6.945864661654136e-06, + "loss": 0.3293, + "step": 20310 + }, + { + "epoch": 30.56, + "grad_norm": 4.276103973388672, + "learning_rate": 6.94436090225564e-06, + "loss": 0.2236, + "step": 20320 + }, + { + "epoch": 30.57, + "grad_norm": 5.338918685913086, + "learning_rate": 6.942857142857144e-06, + "loss": 0.3162, + "step": 20330 + }, + { + "epoch": 30.59, + "grad_norm": 4.5537285804748535, + "learning_rate": 6.941353383458648e-06, + "loss": 0.2801, + "step": 20340 + }, + { + "epoch": 30.6, + "grad_norm": 4.682258605957031, + "learning_rate": 6.939849624060151e-06, + "loss": 0.2674, + "step": 20350 + }, + { + "epoch": 30.62, + "grad_norm": 5.651388168334961, + "learning_rate": 6.938345864661654e-06, + "loss": 0.3353, + "step": 20360 + }, + { + "epoch": 30.63, + "grad_norm": 4.2494797706604, + "learning_rate": 6.936842105263159e-06, + "loss": 0.3508, + "step": 20370 + }, + { + "epoch": 30.65, + "grad_norm": 5.229772090911865, + "learning_rate": 6.935338345864661e-06, + "loss": 0.2578, + "step": 20380 + }, + { + "epoch": 30.66, + "grad_norm": 1.8404473066329956, + "learning_rate": 6.933834586466166e-06, + "loss": 0.2398, + "step": 20390 + }, + { + "epoch": 30.68, + "grad_norm": 3.122826337814331, + "learning_rate": 6.932330827067669e-06, + "loss": 0.2732, + "step": 20400 + }, + { + "epoch": 30.69, + "grad_norm": 3.0387120246887207, + "learning_rate": 6.930827067669174e-06, + "loss": 0.2722, + "step": 20410 + }, + { + "epoch": 30.71, + "grad_norm": 3.9582316875457764, + "learning_rate": 6.929323308270677e-06, + "loss": 0.2502, + "step": 20420 + }, + { + "epoch": 30.72, + "grad_norm": 4.217288494110107, + "learning_rate": 6.9278195488721815e-06, + "loss": 0.2763, + "step": 20430 + }, + { + "epoch": 30.74, + "grad_norm": 5.593020915985107, + "learning_rate": 6.926315789473684e-06, + "loss": 0.3234, + "step": 20440 + }, + { + "epoch": 30.75, + "grad_norm": 6.763814926147461, + "learning_rate": 6.9248120300751886e-06, + "loss": 0.3275, + "step": 20450 + }, + { + "epoch": 30.77, + "grad_norm": 4.887606620788574, + "learning_rate": 6.923308270676692e-06, + "loss": 0.266, + "step": 20460 + }, + { + "epoch": 30.78, + "grad_norm": 2.9521090984344482, + "learning_rate": 6.9218045112781964e-06, + "loss": 0.2542, + "step": 20470 + }, + { + "epoch": 30.8, + "grad_norm": 6.818195819854736, + "learning_rate": 6.920300751879699e-06, + "loss": 0.3018, + "step": 20480 + }, + { + "epoch": 30.81, + "grad_norm": 9.507380485534668, + "learning_rate": 6.9187969924812035e-06, + "loss": 0.2462, + "step": 20490 + }, + { + "epoch": 30.83, + "grad_norm": 6.438077926635742, + "learning_rate": 6.917293233082707e-06, + "loss": 0.2951, + "step": 20500 + }, + { + "epoch": 30.84, + "grad_norm": 2.8363983631134033, + "learning_rate": 6.915789473684211e-06, + "loss": 0.3104, + "step": 20510 + }, + { + "epoch": 30.86, + "grad_norm": 4.733820915222168, + "learning_rate": 6.914285714285715e-06, + "loss": 0.2756, + "step": 20520 + }, + { + "epoch": 30.87, + "grad_norm": 6.546677112579346, + "learning_rate": 6.912781954887218e-06, + "loss": 0.3449, + "step": 20530 + }, + { + "epoch": 30.89, + "grad_norm": 7.878147125244141, + "learning_rate": 6.911278195488722e-06, + "loss": 0.2393, + "step": 20540 + }, + { + "epoch": 30.9, + "grad_norm": 4.541097164154053, + "learning_rate": 6.909774436090226e-06, + "loss": 0.3316, + "step": 20550 + }, + { + "epoch": 30.92, + "grad_norm": 6.208898544311523, + "learning_rate": 6.90827067669173e-06, + "loss": 0.3414, + "step": 20560 + }, + { + "epoch": 30.93, + "grad_norm": 5.853148460388184, + "learning_rate": 6.906766917293234e-06, + "loss": 0.356, + "step": 20570 + }, + { + "epoch": 30.95, + "grad_norm": 5.502560615539551, + "learning_rate": 6.905263157894737e-06, + "loss": 0.2357, + "step": 20580 + }, + { + "epoch": 30.96, + "grad_norm": 3.3541431427001953, + "learning_rate": 6.903759398496241e-06, + "loss": 0.2527, + "step": 20590 + }, + { + "epoch": 30.98, + "grad_norm": 20.790668487548828, + "learning_rate": 6.902255639097745e-06, + "loss": 0.3085, + "step": 20600 + }, + { + "epoch": 30.99, + "grad_norm": 6.3316216468811035, + "learning_rate": 6.900751879699249e-06, + "loss": 0.2031, + "step": 20610 + }, + { + "epoch": 31.0, + "eval_accuracy": 0.9335, + "eval_loss": 0.2801915109157562, + "eval_runtime": 84.9003, + "eval_samples_per_second": 117.785, + "eval_steps_per_second": 0.471, + "step": 20615 + }, + { + "epoch": 31.01, + "grad_norm": 7.1735148429870605, + "learning_rate": 6.899248120300753e-06, + "loss": 0.2616, + "step": 20620 + }, + { + "epoch": 31.02, + "grad_norm": 3.6667120456695557, + "learning_rate": 6.897744360902256e-06, + "loss": 0.354, + "step": 20630 + }, + { + "epoch": 31.04, + "grad_norm": 5.409661293029785, + "learning_rate": 6.89624060150376e-06, + "loss": 0.323, + "step": 20640 + }, + { + "epoch": 31.05, + "grad_norm": 4.91942834854126, + "learning_rate": 6.894736842105264e-06, + "loss": 0.3082, + "step": 20650 + }, + { + "epoch": 31.07, + "grad_norm": 7.898626804351807, + "learning_rate": 6.8932330827067675e-06, + "loss": 0.2154, + "step": 20660 + }, + { + "epoch": 31.08, + "grad_norm": 5.627191543579102, + "learning_rate": 6.891729323308272e-06, + "loss": 0.3336, + "step": 20670 + }, + { + "epoch": 31.1, + "grad_norm": 6.512294769287109, + "learning_rate": 6.8902255639097746e-06, + "loss": 0.2925, + "step": 20680 + }, + { + "epoch": 31.11, + "grad_norm": 5.055330276489258, + "learning_rate": 6.888721804511279e-06, + "loss": 0.195, + "step": 20690 + }, + { + "epoch": 31.13, + "grad_norm": 4.006707191467285, + "learning_rate": 6.8872180451127825e-06, + "loss": 0.2374, + "step": 20700 + }, + { + "epoch": 31.14, + "grad_norm": 6.585967063903809, + "learning_rate": 6.885714285714287e-06, + "loss": 0.2938, + "step": 20710 + }, + { + "epoch": 31.16, + "grad_norm": 7.993644714355469, + "learning_rate": 6.8842105263157895e-06, + "loss": 0.2862, + "step": 20720 + }, + { + "epoch": 31.17, + "grad_norm": 6.300648212432861, + "learning_rate": 6.882706766917294e-06, + "loss": 0.3122, + "step": 20730 + }, + { + "epoch": 31.19, + "grad_norm": 6.135032653808594, + "learning_rate": 6.881203007518797e-06, + "loss": 0.2494, + "step": 20740 + }, + { + "epoch": 31.2, + "grad_norm": 3.280155658721924, + "learning_rate": 6.879699248120302e-06, + "loss": 0.3043, + "step": 20750 + }, + { + "epoch": 31.22, + "grad_norm": 6.118671417236328, + "learning_rate": 6.878195488721805e-06, + "loss": 0.2779, + "step": 20760 + }, + { + "epoch": 31.23, + "grad_norm": 8.142518043518066, + "learning_rate": 6.87669172932331e-06, + "loss": 0.3617, + "step": 20770 + }, + { + "epoch": 31.25, + "grad_norm": 5.192366123199463, + "learning_rate": 6.875187969924812e-06, + "loss": 0.3242, + "step": 20780 + }, + { + "epoch": 31.26, + "grad_norm": 5.72282075881958, + "learning_rate": 6.873684210526317e-06, + "loss": 0.2499, + "step": 20790 + }, + { + "epoch": 31.28, + "grad_norm": 6.699811935424805, + "learning_rate": 6.87218045112782e-06, + "loss": 0.2588, + "step": 20800 + }, + { + "epoch": 31.29, + "grad_norm": 5.5423173904418945, + "learning_rate": 6.8706766917293246e-06, + "loss": 0.3162, + "step": 20810 + }, + { + "epoch": 31.31, + "grad_norm": 6.422053813934326, + "learning_rate": 6.869172932330827e-06, + "loss": 0.2665, + "step": 20820 + }, + { + "epoch": 31.32, + "grad_norm": 6.014066219329834, + "learning_rate": 6.867669172932332e-06, + "loss": 0.2328, + "step": 20830 + }, + { + "epoch": 31.34, + "grad_norm": 13.063108444213867, + "learning_rate": 6.866165413533835e-06, + "loss": 0.2635, + "step": 20840 + }, + { + "epoch": 31.35, + "grad_norm": 5.6524882316589355, + "learning_rate": 6.864661654135339e-06, + "loss": 0.3334, + "step": 20850 + }, + { + "epoch": 31.37, + "grad_norm": 4.057956218719482, + "learning_rate": 6.863157894736843e-06, + "loss": 0.3186, + "step": 20860 + }, + { + "epoch": 31.38, + "grad_norm": 4.220489025115967, + "learning_rate": 6.861654135338346e-06, + "loss": 0.2551, + "step": 20870 + }, + { + "epoch": 31.4, + "grad_norm": 7.07074499130249, + "learning_rate": 6.86015037593985e-06, + "loss": 0.2236, + "step": 20880 + }, + { + "epoch": 31.41, + "grad_norm": 4.640635967254639, + "learning_rate": 6.8586466165413536e-06, + "loss": 0.2972, + "step": 20890 + }, + { + "epoch": 31.43, + "grad_norm": 11.102641105651855, + "learning_rate": 6.857142857142858e-06, + "loss": 0.2724, + "step": 20900 + }, + { + "epoch": 31.44, + "grad_norm": 8.299273490905762, + "learning_rate": 6.855639097744361e-06, + "loss": 0.267, + "step": 20910 + }, + { + "epoch": 31.46, + "grad_norm": 6.167851448059082, + "learning_rate": 6.854135338345865e-06, + "loss": 0.2695, + "step": 20920 + }, + { + "epoch": 31.47, + "grad_norm": 6.481257438659668, + "learning_rate": 6.8526315789473685e-06, + "loss": 0.2886, + "step": 20930 + }, + { + "epoch": 31.49, + "grad_norm": 6.720365047454834, + "learning_rate": 6.851127819548873e-06, + "loss": 0.2833, + "step": 20940 + }, + { + "epoch": 31.5, + "grad_norm": 6.032981872558594, + "learning_rate": 6.849624060150376e-06, + "loss": 0.2235, + "step": 20950 + }, + { + "epoch": 31.52, + "grad_norm": 7.538634300231934, + "learning_rate": 6.848120300751881e-06, + "loss": 0.261, + "step": 20960 + }, + { + "epoch": 31.53, + "grad_norm": 3.033374071121216, + "learning_rate": 6.846616541353383e-06, + "loss": 0.2572, + "step": 20970 + }, + { + "epoch": 31.55, + "grad_norm": 4.783783435821533, + "learning_rate": 6.845112781954888e-06, + "loss": 0.268, + "step": 20980 + }, + { + "epoch": 31.56, + "grad_norm": 5.95822811126709, + "learning_rate": 6.843609022556391e-06, + "loss": 0.265, + "step": 20990 + }, + { + "epoch": 31.58, + "grad_norm": 8.217907905578613, + "learning_rate": 6.842105263157896e-06, + "loss": 0.3141, + "step": 21000 + }, + { + "epoch": 31.59, + "grad_norm": 9.597149848937988, + "learning_rate": 6.840601503759398e-06, + "loss": 0.2925, + "step": 21010 + }, + { + "epoch": 31.61, + "grad_norm": 7.296209812164307, + "learning_rate": 6.839097744360903e-06, + "loss": 0.3045, + "step": 21020 + }, + { + "epoch": 31.62, + "grad_norm": 5.84061336517334, + "learning_rate": 6.837593984962406e-06, + "loss": 0.3293, + "step": 21030 + }, + { + "epoch": 31.64, + "grad_norm": 4.444825172424316, + "learning_rate": 6.8360902255639106e-06, + "loss": 0.2902, + "step": 21040 + }, + { + "epoch": 31.65, + "grad_norm": 12.595419883728027, + "learning_rate": 6.834586466165414e-06, + "loss": 0.253, + "step": 21050 + }, + { + "epoch": 31.67, + "grad_norm": 8.716811180114746, + "learning_rate": 6.8330827067669185e-06, + "loss": 0.2794, + "step": 21060 + }, + { + "epoch": 31.68, + "grad_norm": 6.067722320556641, + "learning_rate": 6.831578947368421e-06, + "loss": 0.2841, + "step": 21070 + }, + { + "epoch": 31.7, + "grad_norm": 4.765297889709473, + "learning_rate": 6.8300751879699255e-06, + "loss": 0.2732, + "step": 21080 + }, + { + "epoch": 31.71, + "grad_norm": 7.085923671722412, + "learning_rate": 6.828571428571429e-06, + "loss": 0.2453, + "step": 21090 + }, + { + "epoch": 31.73, + "grad_norm": 4.44352912902832, + "learning_rate": 6.827067669172933e-06, + "loss": 0.3153, + "step": 21100 + }, + { + "epoch": 31.74, + "grad_norm": 6.792245864868164, + "learning_rate": 6.825563909774436e-06, + "loss": 0.2952, + "step": 21110 + }, + { + "epoch": 31.76, + "grad_norm": 4.471166133880615, + "learning_rate": 6.82406015037594e-06, + "loss": 0.282, + "step": 21120 + }, + { + "epoch": 31.77, + "grad_norm": 57.37477493286133, + "learning_rate": 6.822556390977444e-06, + "loss": 0.2059, + "step": 21130 + }, + { + "epoch": 31.79, + "grad_norm": 4.992650985717773, + "learning_rate": 6.821052631578948e-06, + "loss": 0.2746, + "step": 21140 + }, + { + "epoch": 31.8, + "grad_norm": 4.329148292541504, + "learning_rate": 6.819548872180452e-06, + "loss": 0.2674, + "step": 21150 + }, + { + "epoch": 31.82, + "grad_norm": 4.691008567810059, + "learning_rate": 6.818045112781956e-06, + "loss": 0.3004, + "step": 21160 + }, + { + "epoch": 31.83, + "grad_norm": 7.85280704498291, + "learning_rate": 6.816541353383459e-06, + "loss": 0.2769, + "step": 21170 + }, + { + "epoch": 31.85, + "grad_norm": 7.473185062408447, + "learning_rate": 6.815037593984963e-06, + "loss": 0.3217, + "step": 21180 + }, + { + "epoch": 31.86, + "grad_norm": 5.13551664352417, + "learning_rate": 6.813533834586467e-06, + "loss": 0.2425, + "step": 21190 + }, + { + "epoch": 31.88, + "grad_norm": 4.801725387573242, + "learning_rate": 6.812030075187971e-06, + "loss": 0.3008, + "step": 21200 + }, + { + "epoch": 31.89, + "grad_norm": 6.320078372955322, + "learning_rate": 6.810526315789474e-06, + "loss": 0.2616, + "step": 21210 + }, + { + "epoch": 31.91, + "grad_norm": 7.002920150756836, + "learning_rate": 6.809022556390978e-06, + "loss": 0.2647, + "step": 21220 + }, + { + "epoch": 31.92, + "grad_norm": 7.263726711273193, + "learning_rate": 6.807518796992482e-06, + "loss": 0.3434, + "step": 21230 + }, + { + "epoch": 31.94, + "grad_norm": 6.2287139892578125, + "learning_rate": 6.806015037593986e-06, + "loss": 0.2775, + "step": 21240 + }, + { + "epoch": 31.95, + "grad_norm": 3.209961175918579, + "learning_rate": 6.8045112781954896e-06, + "loss": 0.2689, + "step": 21250 + }, + { + "epoch": 31.97, + "grad_norm": 5.191007614135742, + "learning_rate": 6.803007518796994e-06, + "loss": 0.2824, + "step": 21260 + }, + { + "epoch": 31.98, + "grad_norm": 7.311644077301025, + "learning_rate": 6.801503759398497e-06, + "loss": 0.3207, + "step": 21270 + }, + { + "epoch": 32.0, + "grad_norm": 33.7657356262207, + "learning_rate": 6.800000000000001e-06, + "loss": 0.3268, + "step": 21280 + }, + { + "epoch": 32.0, + "eval_accuracy": 0.9312, + "eval_loss": 0.2803204655647278, + "eval_runtime": 84.7157, + "eval_samples_per_second": 118.042, + "eval_steps_per_second": 0.472, + "step": 21280 + }, + { + "epoch": 32.02, + "grad_norm": 8.19914722442627, + "learning_rate": 6.7984962406015045e-06, + "loss": 0.3492, + "step": 21290 + }, + { + "epoch": 32.03, + "grad_norm": 5.234302043914795, + "learning_rate": 6.796992481203009e-06, + "loss": 0.2722, + "step": 21300 + }, + { + "epoch": 32.05, + "grad_norm": 5.019562244415283, + "learning_rate": 6.7954887218045115e-06, + "loss": 0.245, + "step": 21310 + }, + { + "epoch": 32.06, + "grad_norm": 4.851130962371826, + "learning_rate": 6.793984962406016e-06, + "loss": 0.3073, + "step": 21320 + }, + { + "epoch": 32.08, + "grad_norm": 4.4718241691589355, + "learning_rate": 6.792481203007519e-06, + "loss": 0.2929, + "step": 21330 + }, + { + "epoch": 32.09, + "grad_norm": 4.94941520690918, + "learning_rate": 6.790977443609023e-06, + "loss": 0.2941, + "step": 21340 + }, + { + "epoch": 32.11, + "grad_norm": 5.517939567565918, + "learning_rate": 6.789473684210527e-06, + "loss": 0.2528, + "step": 21350 + }, + { + "epoch": 32.12, + "grad_norm": 6.751891136169434, + "learning_rate": 6.78796992481203e-06, + "loss": 0.3063, + "step": 21360 + }, + { + "epoch": 32.14, + "grad_norm": 6.197683811187744, + "learning_rate": 6.786466165413534e-06, + "loss": 0.2686, + "step": 21370 + }, + { + "epoch": 32.15, + "grad_norm": 5.212826728820801, + "learning_rate": 6.784962406015038e-06, + "loss": 0.2153, + "step": 21380 + }, + { + "epoch": 32.17, + "grad_norm": 6.147881984710693, + "learning_rate": 6.783458646616542e-06, + "loss": 0.3603, + "step": 21390 + }, + { + "epoch": 32.18, + "grad_norm": 6.068096160888672, + "learning_rate": 6.781954887218045e-06, + "loss": 0.3144, + "step": 21400 + }, + { + "epoch": 32.2, + "grad_norm": 6.553431987762451, + "learning_rate": 6.780451127819549e-06, + "loss": 0.2902, + "step": 21410 + }, + { + "epoch": 32.21, + "grad_norm": 6.4035515785217285, + "learning_rate": 6.778947368421053e-06, + "loss": 0.2156, + "step": 21420 + }, + { + "epoch": 32.23, + "grad_norm": 4.220142841339111, + "learning_rate": 6.777443609022557e-06, + "loss": 0.2905, + "step": 21430 + }, + { + "epoch": 32.24, + "grad_norm": 4.17386531829834, + "learning_rate": 6.775939849624061e-06, + "loss": 0.3002, + "step": 21440 + }, + { + "epoch": 32.26, + "grad_norm": 3.2023849487304688, + "learning_rate": 6.774436090225564e-06, + "loss": 0.3396, + "step": 21450 + }, + { + "epoch": 32.27, + "grad_norm": 2.8168628215789795, + "learning_rate": 6.772932330827068e-06, + "loss": 0.248, + "step": 21460 + }, + { + "epoch": 32.29, + "grad_norm": 4.326303482055664, + "learning_rate": 6.771428571428572e-06, + "loss": 0.2821, + "step": 21470 + }, + { + "epoch": 32.3, + "grad_norm": 3.3244757652282715, + "learning_rate": 6.769924812030076e-06, + "loss": 0.2907, + "step": 21480 + }, + { + "epoch": 32.32, + "grad_norm": 5.797328948974609, + "learning_rate": 6.76842105263158e-06, + "loss": 0.2961, + "step": 21490 + }, + { + "epoch": 32.33, + "grad_norm": 3.571841239929199, + "learning_rate": 6.766917293233083e-06, + "loss": 0.1967, + "step": 21500 + }, + { + "epoch": 32.35, + "grad_norm": 5.261970520019531, + "learning_rate": 6.765413533834587e-06, + "loss": 0.241, + "step": 21510 + }, + { + "epoch": 32.36, + "grad_norm": 6.464574813842773, + "learning_rate": 6.7639097744360905e-06, + "loss": 0.2727, + "step": 21520 + }, + { + "epoch": 32.38, + "grad_norm": 4.337974548339844, + "learning_rate": 6.762406015037595e-06, + "loss": 0.2508, + "step": 21530 + }, + { + "epoch": 32.39, + "grad_norm": 3.5019898414611816, + "learning_rate": 6.760902255639098e-06, + "loss": 0.2173, + "step": 21540 + }, + { + "epoch": 32.41, + "grad_norm": 7.147461891174316, + "learning_rate": 6.759398496240602e-06, + "loss": 0.3561, + "step": 21550 + }, + { + "epoch": 32.42, + "grad_norm": 5.347845077514648, + "learning_rate": 6.7578947368421054e-06, + "loss": 0.2572, + "step": 21560 + }, + { + "epoch": 32.44, + "grad_norm": 3.541206121444702, + "learning_rate": 6.75639097744361e-06, + "loss": 0.266, + "step": 21570 + }, + { + "epoch": 32.45, + "grad_norm": 4.880330562591553, + "learning_rate": 6.754887218045113e-06, + "loss": 0.2968, + "step": 21580 + }, + { + "epoch": 32.47, + "grad_norm": 6.621898174285889, + "learning_rate": 6.753383458646618e-06, + "loss": 0.1805, + "step": 21590 + }, + { + "epoch": 32.48, + "grad_norm": 7.193774700164795, + "learning_rate": 6.75187969924812e-06, + "loss": 0.2282, + "step": 21600 + }, + { + "epoch": 32.5, + "grad_norm": 5.0164408683776855, + "learning_rate": 6.750375939849625e-06, + "loss": 0.3077, + "step": 21610 + }, + { + "epoch": 32.51, + "grad_norm": 8.061211585998535, + "learning_rate": 6.748872180451128e-06, + "loss": 0.2758, + "step": 21620 + }, + { + "epoch": 32.53, + "grad_norm": 6.3904852867126465, + "learning_rate": 6.747368421052633e-06, + "loss": 0.2499, + "step": 21630 + }, + { + "epoch": 32.54, + "grad_norm": 8.771563529968262, + "learning_rate": 6.745864661654135e-06, + "loss": 0.3302, + "step": 21640 + }, + { + "epoch": 32.56, + "grad_norm": 7.61922550201416, + "learning_rate": 6.74436090225564e-06, + "loss": 0.2691, + "step": 21650 + }, + { + "epoch": 32.57, + "grad_norm": 7.187370300292969, + "learning_rate": 6.742857142857143e-06, + "loss": 0.3071, + "step": 21660 + }, + { + "epoch": 32.59, + "grad_norm": 3.4237334728240967, + "learning_rate": 6.7413533834586475e-06, + "loss": 0.296, + "step": 21670 + }, + { + "epoch": 32.6, + "grad_norm": 4.828934192657471, + "learning_rate": 6.739849624060151e-06, + "loss": 0.3158, + "step": 21680 + }, + { + "epoch": 32.62, + "grad_norm": 5.043696880340576, + "learning_rate": 6.738345864661655e-06, + "loss": 0.2656, + "step": 21690 + }, + { + "epoch": 32.63, + "grad_norm": 6.4835004806518555, + "learning_rate": 6.736842105263158e-06, + "loss": 0.2796, + "step": 21700 + }, + { + "epoch": 32.65, + "grad_norm": 2.3352560997009277, + "learning_rate": 6.7353383458646624e-06, + "loss": 0.294, + "step": 21710 + }, + { + "epoch": 32.66, + "grad_norm": 7.649822235107422, + "learning_rate": 6.733834586466166e-06, + "loss": 0.2857, + "step": 21720 + }, + { + "epoch": 32.68, + "grad_norm": 5.9862565994262695, + "learning_rate": 6.73233082706767e-06, + "loss": 0.2401, + "step": 21730 + }, + { + "epoch": 32.69, + "grad_norm": 5.723448276519775, + "learning_rate": 6.730827067669173e-06, + "loss": 0.3426, + "step": 21740 + }, + { + "epoch": 32.71, + "grad_norm": 2.8246076107025146, + "learning_rate": 6.729323308270677e-06, + "loss": 0.2431, + "step": 21750 + }, + { + "epoch": 32.72, + "grad_norm": 7.292664527893066, + "learning_rate": 6.727819548872181e-06, + "loss": 0.2624, + "step": 21760 + }, + { + "epoch": 32.74, + "grad_norm": 4.674510478973389, + "learning_rate": 6.726315789473685e-06, + "loss": 0.3025, + "step": 21770 + }, + { + "epoch": 32.75, + "grad_norm": 4.479643821716309, + "learning_rate": 6.724812030075189e-06, + "loss": 0.2175, + "step": 21780 + }, + { + "epoch": 32.77, + "grad_norm": 4.182172775268555, + "learning_rate": 6.723308270676693e-06, + "loss": 0.3039, + "step": 21790 + }, + { + "epoch": 32.78, + "grad_norm": 5.148652076721191, + "learning_rate": 6.721804511278196e-06, + "loss": 0.2735, + "step": 21800 + }, + { + "epoch": 32.8, + "grad_norm": 4.162076950073242, + "learning_rate": 6.7203007518797e-06, + "loss": 0.2652, + "step": 21810 + }, + { + "epoch": 32.81, + "grad_norm": 5.020504474639893, + "learning_rate": 6.718796992481204e-06, + "loss": 0.3151, + "step": 21820 + }, + { + "epoch": 32.83, + "grad_norm": 3.3114843368530273, + "learning_rate": 6.717293233082708e-06, + "loss": 0.3088, + "step": 21830 + }, + { + "epoch": 32.84, + "grad_norm": 7.768105983734131, + "learning_rate": 6.715789473684211e-06, + "loss": 0.2574, + "step": 21840 + }, + { + "epoch": 32.86, + "grad_norm": 5.251821517944336, + "learning_rate": 6.714285714285714e-06, + "loss": 0.2308, + "step": 21850 + }, + { + "epoch": 32.87, + "grad_norm": 3.617433786392212, + "learning_rate": 6.712781954887219e-06, + "loss": 0.2609, + "step": 21860 + }, + { + "epoch": 32.89, + "grad_norm": 7.9688615798950195, + "learning_rate": 6.711278195488722e-06, + "loss": 0.2844, + "step": 21870 + }, + { + "epoch": 32.9, + "grad_norm": 3.2969768047332764, + "learning_rate": 6.7097744360902265e-06, + "loss": 0.2347, + "step": 21880 + }, + { + "epoch": 32.92, + "grad_norm": 7.453803539276123, + "learning_rate": 6.708270676691729e-06, + "loss": 0.2808, + "step": 21890 + }, + { + "epoch": 32.93, + "grad_norm": 4.9906744956970215, + "learning_rate": 6.7067669172932335e-06, + "loss": 0.2362, + "step": 21900 + }, + { + "epoch": 32.95, + "grad_norm": 3.7684195041656494, + "learning_rate": 6.705263157894737e-06, + "loss": 0.2816, + "step": 21910 + }, + { + "epoch": 32.96, + "grad_norm": 4.246355056762695, + "learning_rate": 6.7037593984962414e-06, + "loss": 0.2608, + "step": 21920 + }, + { + "epoch": 32.98, + "grad_norm": 10.584210395812988, + "learning_rate": 6.702255639097744e-06, + "loss": 0.3571, + "step": 21930 + }, + { + "epoch": 32.99, + "grad_norm": 7.687515735626221, + "learning_rate": 6.7007518796992485e-06, + "loss": 0.218, + "step": 21940 + }, + { + "epoch": 33.0, + "eval_accuracy": 0.9307, + "eval_loss": 0.28834185004234314, + "eval_runtime": 84.8678, + "eval_samples_per_second": 117.83, + "eval_steps_per_second": 0.471, + "step": 21945 + }, + { + "epoch": 33.01, + "grad_norm": 2.6484274864196777, + "learning_rate": 6.699248120300752e-06, + "loss": 0.2635, + "step": 21950 + }, + { + "epoch": 33.02, + "grad_norm": 6.136809825897217, + "learning_rate": 6.697744360902256e-06, + "loss": 0.2438, + "step": 21960 + }, + { + "epoch": 33.04, + "grad_norm": 2.4499707221984863, + "learning_rate": 6.69624060150376e-06, + "loss": 0.2448, + "step": 21970 + }, + { + "epoch": 33.05, + "grad_norm": 8.61922550201416, + "learning_rate": 6.694736842105264e-06, + "loss": 0.2895, + "step": 21980 + }, + { + "epoch": 33.07, + "grad_norm": 4.760676383972168, + "learning_rate": 6.693233082706767e-06, + "loss": 0.2138, + "step": 21990 + }, + { + "epoch": 33.08, + "grad_norm": 4.3553853034973145, + "learning_rate": 6.691729323308271e-06, + "loss": 0.3182, + "step": 22000 + }, + { + "epoch": 33.1, + "grad_norm": 6.192696571350098, + "learning_rate": 6.690225563909775e-06, + "loss": 0.2446, + "step": 22010 + }, + { + "epoch": 33.11, + "grad_norm": 8.466830253601074, + "learning_rate": 6.688721804511279e-06, + "loss": 0.3202, + "step": 22020 + }, + { + "epoch": 33.13, + "grad_norm": 5.688916206359863, + "learning_rate": 6.687218045112782e-06, + "loss": 0.3136, + "step": 22030 + }, + { + "epoch": 33.14, + "grad_norm": 7.089521408081055, + "learning_rate": 6.685714285714286e-06, + "loss": 0.3003, + "step": 22040 + }, + { + "epoch": 33.16, + "grad_norm": 5.240951061248779, + "learning_rate": 6.68421052631579e-06, + "loss": 0.2772, + "step": 22050 + }, + { + "epoch": 33.17, + "grad_norm": 4.90128755569458, + "learning_rate": 6.682706766917294e-06, + "loss": 0.2949, + "step": 22060 + }, + { + "epoch": 33.19, + "grad_norm": 1.9220607280731201, + "learning_rate": 6.681203007518798e-06, + "loss": 0.2716, + "step": 22070 + }, + { + "epoch": 33.2, + "grad_norm": 11.761991500854492, + "learning_rate": 6.679699248120302e-06, + "loss": 0.2519, + "step": 22080 + }, + { + "epoch": 33.22, + "grad_norm": 3.534703016281128, + "learning_rate": 6.678195488721805e-06, + "loss": 0.2398, + "step": 22090 + }, + { + "epoch": 33.23, + "grad_norm": 5.2441887855529785, + "learning_rate": 6.676691729323309e-06, + "loss": 0.2638, + "step": 22100 + }, + { + "epoch": 33.25, + "grad_norm": 5.488763809204102, + "learning_rate": 6.6751879699248125e-06, + "loss": 0.2381, + "step": 22110 + }, + { + "epoch": 33.26, + "grad_norm": 4.989120006561279, + "learning_rate": 6.673684210526317e-06, + "loss": 0.2582, + "step": 22120 + }, + { + "epoch": 33.28, + "grad_norm": 5.641976833343506, + "learning_rate": 6.6721804511278196e-06, + "loss": 0.2608, + "step": 22130 + }, + { + "epoch": 33.29, + "grad_norm": 5.41449499130249, + "learning_rate": 6.670676691729324e-06, + "loss": 0.3076, + "step": 22140 + }, + { + "epoch": 33.31, + "grad_norm": 5.977665424346924, + "learning_rate": 6.6691729323308274e-06, + "loss": 0.2189, + "step": 22150 + }, + { + "epoch": 33.32, + "grad_norm": 6.3149213790893555, + "learning_rate": 6.667669172932332e-06, + "loss": 0.3281, + "step": 22160 + }, + { + "epoch": 33.34, + "grad_norm": 2.910731077194214, + "learning_rate": 6.666165413533835e-06, + "loss": 0.2563, + "step": 22170 + }, + { + "epoch": 33.35, + "grad_norm": 5.496181011199951, + "learning_rate": 6.664661654135339e-06, + "loss": 0.2087, + "step": 22180 + }, + { + "epoch": 33.37, + "grad_norm": 4.933781623840332, + "learning_rate": 6.663157894736842e-06, + "loss": 0.2249, + "step": 22190 + }, + { + "epoch": 33.38, + "grad_norm": 6.685660362243652, + "learning_rate": 6.661654135338347e-06, + "loss": 0.3751, + "step": 22200 + }, + { + "epoch": 33.4, + "grad_norm": 3.7350828647613525, + "learning_rate": 6.66015037593985e-06, + "loss": 0.2126, + "step": 22210 + }, + { + "epoch": 33.41, + "grad_norm": 7.765669822692871, + "learning_rate": 6.658646616541355e-06, + "loss": 0.3009, + "step": 22220 + }, + { + "epoch": 33.43, + "grad_norm": 5.381826877593994, + "learning_rate": 6.657142857142857e-06, + "loss": 0.3344, + "step": 22230 + }, + { + "epoch": 33.44, + "grad_norm": 3.3078157901763916, + "learning_rate": 6.655639097744362e-06, + "loss": 0.3077, + "step": 22240 + }, + { + "epoch": 33.46, + "grad_norm": 3.342693328857422, + "learning_rate": 6.654135338345865e-06, + "loss": 0.2235, + "step": 22250 + }, + { + "epoch": 33.47, + "grad_norm": 5.725019454956055, + "learning_rate": 6.6526315789473695e-06, + "loss": 0.3253, + "step": 22260 + }, + { + "epoch": 33.49, + "grad_norm": 7.685015678405762, + "learning_rate": 6.651127819548873e-06, + "loss": 0.2343, + "step": 22270 + }, + { + "epoch": 33.5, + "grad_norm": 3.291060447692871, + "learning_rate": 6.649624060150377e-06, + "loss": 0.2907, + "step": 22280 + }, + { + "epoch": 33.52, + "grad_norm": 5.01746940612793, + "learning_rate": 6.64812030075188e-06, + "loss": 0.2124, + "step": 22290 + }, + { + "epoch": 33.53, + "grad_norm": 8.099120140075684, + "learning_rate": 6.6466165413533845e-06, + "loss": 0.2862, + "step": 22300 + }, + { + "epoch": 33.55, + "grad_norm": 6.425881862640381, + "learning_rate": 6.645112781954888e-06, + "loss": 0.216, + "step": 22310 + }, + { + "epoch": 33.56, + "grad_norm": 6.16859769821167, + "learning_rate": 6.643609022556392e-06, + "loss": 0.3214, + "step": 22320 + }, + { + "epoch": 33.58, + "grad_norm": 4.426242828369141, + "learning_rate": 6.642105263157895e-06, + "loss": 0.2471, + "step": 22330 + }, + { + "epoch": 33.59, + "grad_norm": 4.199456691741943, + "learning_rate": 6.6406015037593985e-06, + "loss": 0.2696, + "step": 22340 + }, + { + "epoch": 33.61, + "grad_norm": 4.16933536529541, + "learning_rate": 6.639097744360903e-06, + "loss": 0.1926, + "step": 22350 + }, + { + "epoch": 33.62, + "grad_norm": 7.852532863616943, + "learning_rate": 6.6375939849624064e-06, + "loss": 0.2983, + "step": 22360 + }, + { + "epoch": 33.64, + "grad_norm": 5.3104119300842285, + "learning_rate": 6.63609022556391e-06, + "loss": 0.2412, + "step": 22370 + }, + { + "epoch": 33.65, + "grad_norm": 4.502700328826904, + "learning_rate": 6.6345864661654135e-06, + "loss": 0.3023, + "step": 22380 + }, + { + "epoch": 33.67, + "grad_norm": 4.96920108795166, + "learning_rate": 6.633082706766918e-06, + "loss": 0.2634, + "step": 22390 + }, + { + "epoch": 33.68, + "grad_norm": 7.31613826751709, + "learning_rate": 6.631578947368421e-06, + "loss": 0.3551, + "step": 22400 + }, + { + "epoch": 33.7, + "grad_norm": 4.635395526885986, + "learning_rate": 6.630075187969926e-06, + "loss": 0.2552, + "step": 22410 + }, + { + "epoch": 33.71, + "grad_norm": 7.699193477630615, + "learning_rate": 6.628571428571428e-06, + "loss": 0.2955, + "step": 22420 + }, + { + "epoch": 33.73, + "grad_norm": 5.194084644317627, + "learning_rate": 6.627067669172933e-06, + "loss": 0.2743, + "step": 22430 + }, + { + "epoch": 33.74, + "grad_norm": 9.023811340332031, + "learning_rate": 6.625563909774436e-06, + "loss": 0.2817, + "step": 22440 + }, + { + "epoch": 33.76, + "grad_norm": 4.306658744812012, + "learning_rate": 6.624060150375941e-06, + "loss": 0.3008, + "step": 22450 + }, + { + "epoch": 33.77, + "grad_norm": 2.301297187805176, + "learning_rate": 6.622556390977443e-06, + "loss": 0.2354, + "step": 22460 + }, + { + "epoch": 33.79, + "grad_norm": 4.28262186050415, + "learning_rate": 6.621052631578948e-06, + "loss": 0.2656, + "step": 22470 + }, + { + "epoch": 33.8, + "grad_norm": 6.531603813171387, + "learning_rate": 6.619548872180451e-06, + "loss": 0.3753, + "step": 22480 + }, + { + "epoch": 33.82, + "grad_norm": 8.290949821472168, + "learning_rate": 6.6180451127819556e-06, + "loss": 0.2842, + "step": 22490 + }, + { + "epoch": 33.83, + "grad_norm": 3.5719752311706543, + "learning_rate": 6.616541353383459e-06, + "loss": 0.2617, + "step": 22500 + }, + { + "epoch": 33.85, + "grad_norm": 6.209147930145264, + "learning_rate": 6.6150375939849635e-06, + "loss": 0.3299, + "step": 22510 + }, + { + "epoch": 33.86, + "grad_norm": 3.604736566543579, + "learning_rate": 6.613533834586466e-06, + "loss": 0.1786, + "step": 22520 + }, + { + "epoch": 33.88, + "grad_norm": 8.47739028930664, + "learning_rate": 6.6120300751879705e-06, + "loss": 0.3399, + "step": 22530 + }, + { + "epoch": 33.89, + "grad_norm": 4.806243896484375, + "learning_rate": 6.610526315789474e-06, + "loss": 0.2474, + "step": 22540 + }, + { + "epoch": 33.91, + "grad_norm": 4.551697731018066, + "learning_rate": 6.609022556390978e-06, + "loss": 0.2451, + "step": 22550 + }, + { + "epoch": 33.92, + "grad_norm": 4.965244293212891, + "learning_rate": 6.607518796992481e-06, + "loss": 0.2971, + "step": 22560 + }, + { + "epoch": 33.94, + "grad_norm": 4.888105392456055, + "learning_rate": 6.606015037593985e-06, + "loss": 0.3368, + "step": 22570 + }, + { + "epoch": 33.95, + "grad_norm": 7.72111701965332, + "learning_rate": 6.604511278195489e-06, + "loss": 0.2713, + "step": 22580 + }, + { + "epoch": 33.97, + "grad_norm": 4.930686950683594, + "learning_rate": 6.603007518796993e-06, + "loss": 0.2445, + "step": 22590 + }, + { + "epoch": 33.98, + "grad_norm": 6.420777797698975, + "learning_rate": 6.601503759398497e-06, + "loss": 0.2183, + "step": 22600 + }, + { + "epoch": 34.0, + "grad_norm": 0.24922990798950195, + "learning_rate": 6.600000000000001e-06, + "loss": 0.217, + "step": 22610 + }, + { + "epoch": 34.0, + "eval_accuracy": 0.9356, + "eval_loss": 0.2865731716156006, + "eval_runtime": 84.2017, + "eval_samples_per_second": 118.763, + "eval_steps_per_second": 0.475, + "step": 22610 + }, + { + "epoch": 34.02, + "grad_norm": 5.616089820861816, + "learning_rate": 6.598496240601504e-06, + "loss": 0.3432, + "step": 22620 + }, + { + "epoch": 34.03, + "grad_norm": 1.8118317127227783, + "learning_rate": 6.596992481203008e-06, + "loss": 0.2664, + "step": 22630 + }, + { + "epoch": 34.05, + "grad_norm": 5.037443161010742, + "learning_rate": 6.595488721804512e-06, + "loss": 0.27, + "step": 22640 + }, + { + "epoch": 34.06, + "grad_norm": 3.1154747009277344, + "learning_rate": 6.593984962406016e-06, + "loss": 0.2335, + "step": 22650 + }, + { + "epoch": 34.08, + "grad_norm": 5.144960880279541, + "learning_rate": 6.592481203007519e-06, + "loss": 0.3204, + "step": 22660 + }, + { + "epoch": 34.09, + "grad_norm": 5.48844575881958, + "learning_rate": 6.590977443609023e-06, + "loss": 0.3072, + "step": 22670 + }, + { + "epoch": 34.11, + "grad_norm": 8.889720916748047, + "learning_rate": 6.589473684210527e-06, + "loss": 0.2398, + "step": 22680 + }, + { + "epoch": 34.12, + "grad_norm": 4.211205959320068, + "learning_rate": 6.587969924812031e-06, + "loss": 0.2425, + "step": 22690 + }, + { + "epoch": 34.14, + "grad_norm": 6.309680938720703, + "learning_rate": 6.5864661654135345e-06, + "loss": 0.2828, + "step": 22700 + }, + { + "epoch": 34.15, + "grad_norm": 5.038494110107422, + "learning_rate": 6.584962406015039e-06, + "loss": 0.2633, + "step": 22710 + }, + { + "epoch": 34.17, + "grad_norm": 1.1278971433639526, + "learning_rate": 6.583458646616542e-06, + "loss": 0.2935, + "step": 22720 + }, + { + "epoch": 34.18, + "grad_norm": 5.858100891113281, + "learning_rate": 6.581954887218046e-06, + "loss": 0.2771, + "step": 22730 + }, + { + "epoch": 34.2, + "grad_norm": 6.1508402824401855, + "learning_rate": 6.5804511278195495e-06, + "loss": 0.3256, + "step": 22740 + }, + { + "epoch": 34.21, + "grad_norm": 5.791116237640381, + "learning_rate": 6.578947368421054e-06, + "loss": 0.2354, + "step": 22750 + }, + { + "epoch": 34.23, + "grad_norm": 6.087039470672607, + "learning_rate": 6.5774436090225565e-06, + "loss": 0.2098, + "step": 22760 + }, + { + "epoch": 34.24, + "grad_norm": 6.835604667663574, + "learning_rate": 6.575939849624061e-06, + "loss": 0.285, + "step": 22770 + }, + { + "epoch": 34.26, + "grad_norm": 6.213393688201904, + "learning_rate": 6.574436090225564e-06, + "loss": 0.2488, + "step": 22780 + }, + { + "epoch": 34.27, + "grad_norm": 6.763580322265625, + "learning_rate": 6.572932330827069e-06, + "loss": 0.2043, + "step": 22790 + }, + { + "epoch": 34.29, + "grad_norm": 5.975349426269531, + "learning_rate": 6.571428571428572e-06, + "loss": 0.3282, + "step": 22800 + }, + { + "epoch": 34.3, + "grad_norm": 5.7898406982421875, + "learning_rate": 6.569924812030077e-06, + "loss": 0.2369, + "step": 22810 + }, + { + "epoch": 34.32, + "grad_norm": 3.7875592708587646, + "learning_rate": 6.568421052631579e-06, + "loss": 0.3036, + "step": 22820 + }, + { + "epoch": 34.33, + "grad_norm": 7.54277229309082, + "learning_rate": 6.566917293233083e-06, + "loss": 0.2441, + "step": 22830 + }, + { + "epoch": 34.35, + "grad_norm": 5.216723442077637, + "learning_rate": 6.565413533834587e-06, + "loss": 0.2537, + "step": 22840 + }, + { + "epoch": 34.36, + "grad_norm": 5.119565486907959, + "learning_rate": 6.56390977443609e-06, + "loss": 0.3228, + "step": 22850 + }, + { + "epoch": 34.38, + "grad_norm": 10.754874229431152, + "learning_rate": 6.562406015037594e-06, + "loss": 0.1848, + "step": 22860 + }, + { + "epoch": 34.39, + "grad_norm": 8.560099601745605, + "learning_rate": 6.560902255639098e-06, + "loss": 0.2648, + "step": 22870 + }, + { + "epoch": 34.41, + "grad_norm": 6.255738258361816, + "learning_rate": 6.559398496240602e-06, + "loss": 0.2562, + "step": 22880 + }, + { + "epoch": 34.42, + "grad_norm": 4.35892391204834, + "learning_rate": 6.557894736842106e-06, + "loss": 0.2838, + "step": 22890 + }, + { + "epoch": 34.44, + "grad_norm": 6.469654083251953, + "learning_rate": 6.55639097744361e-06, + "loss": 0.2885, + "step": 22900 + }, + { + "epoch": 34.45, + "grad_norm": 7.00548791885376, + "learning_rate": 6.554887218045113e-06, + "loss": 0.327, + "step": 22910 + }, + { + "epoch": 34.47, + "grad_norm": 6.044936180114746, + "learning_rate": 6.553383458646617e-06, + "loss": 0.2582, + "step": 22920 + }, + { + "epoch": 34.48, + "grad_norm": 6.903324127197266, + "learning_rate": 6.5518796992481206e-06, + "loss": 0.2673, + "step": 22930 + }, + { + "epoch": 34.5, + "grad_norm": 4.936728477478027, + "learning_rate": 6.550375939849625e-06, + "loss": 0.2469, + "step": 22940 + }, + { + "epoch": 34.51, + "grad_norm": 5.086190223693848, + "learning_rate": 6.548872180451128e-06, + "loss": 0.2323, + "step": 22950 + }, + { + "epoch": 34.53, + "grad_norm": 2.871715784072876, + "learning_rate": 6.547368421052632e-06, + "loss": 0.2158, + "step": 22960 + }, + { + "epoch": 34.54, + "grad_norm": 4.698546886444092, + "learning_rate": 6.5458646616541355e-06, + "loss": 0.257, + "step": 22970 + }, + { + "epoch": 34.56, + "grad_norm": 6.885629177093506, + "learning_rate": 6.54436090225564e-06, + "loss": 0.2187, + "step": 22980 + }, + { + "epoch": 34.57, + "grad_norm": 3.128361463546753, + "learning_rate": 6.542857142857143e-06, + "loss": 0.2876, + "step": 22990 + }, + { + "epoch": 34.59, + "grad_norm": 3.115068197250366, + "learning_rate": 6.541353383458648e-06, + "loss": 0.2565, + "step": 23000 + }, + { + "epoch": 34.6, + "grad_norm": 4.966819763183594, + "learning_rate": 6.53984962406015e-06, + "loss": 0.3301, + "step": 23010 + }, + { + "epoch": 34.62, + "grad_norm": 3.8072173595428467, + "learning_rate": 6.538345864661655e-06, + "loss": 0.2815, + "step": 23020 + }, + { + "epoch": 34.63, + "grad_norm": 7.346180438995361, + "learning_rate": 6.536842105263158e-06, + "loss": 0.2456, + "step": 23030 + }, + { + "epoch": 34.65, + "grad_norm": 5.6631317138671875, + "learning_rate": 6.535338345864663e-06, + "loss": 0.2629, + "step": 23040 + }, + { + "epoch": 34.66, + "grad_norm": 4.316861629486084, + "learning_rate": 6.533834586466165e-06, + "loss": 0.218, + "step": 23050 + }, + { + "epoch": 34.68, + "grad_norm": 5.202853202819824, + "learning_rate": 6.53233082706767e-06, + "loss": 0.3226, + "step": 23060 + }, + { + "epoch": 34.69, + "grad_norm": 5.017209529876709, + "learning_rate": 6.530827067669173e-06, + "loss": 0.3087, + "step": 23070 + }, + { + "epoch": 34.71, + "grad_norm": 9.907183647155762, + "learning_rate": 6.529323308270678e-06, + "loss": 0.2096, + "step": 23080 + }, + { + "epoch": 34.72, + "grad_norm": 5.446949005126953, + "learning_rate": 6.527819548872181e-06, + "loss": 0.2725, + "step": 23090 + }, + { + "epoch": 34.74, + "grad_norm": 5.2780537605285645, + "learning_rate": 6.526315789473685e-06, + "loss": 0.2429, + "step": 23100 + }, + { + "epoch": 34.75, + "grad_norm": 3.5522546768188477, + "learning_rate": 6.524812030075188e-06, + "loss": 0.257, + "step": 23110 + }, + { + "epoch": 34.77, + "grad_norm": 6.132695198059082, + "learning_rate": 6.5233082706766925e-06, + "loss": 0.3128, + "step": 23120 + }, + { + "epoch": 34.78, + "grad_norm": 7.575121879577637, + "learning_rate": 6.521804511278196e-06, + "loss": 0.2529, + "step": 23130 + }, + { + "epoch": 34.8, + "grad_norm": 6.6957244873046875, + "learning_rate": 6.5203007518797e-06, + "loss": 0.2612, + "step": 23140 + }, + { + "epoch": 34.81, + "grad_norm": 4.827077865600586, + "learning_rate": 6.518796992481203e-06, + "loss": 0.307, + "step": 23150 + }, + { + "epoch": 34.83, + "grad_norm": 7.132809638977051, + "learning_rate": 6.5172932330827074e-06, + "loss": 0.2111, + "step": 23160 + }, + { + "epoch": 34.84, + "grad_norm": 3.0563974380493164, + "learning_rate": 6.515789473684211e-06, + "loss": 0.2332, + "step": 23170 + }, + { + "epoch": 34.86, + "grad_norm": 5.9987592697143555, + "learning_rate": 6.514285714285715e-06, + "loss": 0.3418, + "step": 23180 + }, + { + "epoch": 34.87, + "grad_norm": 4.280242443084717, + "learning_rate": 6.512781954887219e-06, + "loss": 0.2447, + "step": 23190 + }, + { + "epoch": 34.89, + "grad_norm": 4.552205562591553, + "learning_rate": 6.511278195488722e-06, + "loss": 0.2697, + "step": 23200 + }, + { + "epoch": 34.9, + "grad_norm": 5.720810890197754, + "learning_rate": 6.509774436090226e-06, + "loss": 0.3134, + "step": 23210 + }, + { + "epoch": 34.92, + "grad_norm": 5.941717147827148, + "learning_rate": 6.50827067669173e-06, + "loss": 0.2535, + "step": 23220 + }, + { + "epoch": 34.93, + "grad_norm": 5.756423473358154, + "learning_rate": 6.506766917293234e-06, + "loss": 0.2717, + "step": 23230 + }, + { + "epoch": 34.95, + "grad_norm": 6.748666763305664, + "learning_rate": 6.505263157894738e-06, + "loss": 0.2522, + "step": 23240 + }, + { + "epoch": 34.96, + "grad_norm": 3.929311990737915, + "learning_rate": 6.503759398496241e-06, + "loss": 0.3033, + "step": 23250 + }, + { + "epoch": 34.98, + "grad_norm": 5.266348838806152, + "learning_rate": 6.502255639097745e-06, + "loss": 0.2428, + "step": 23260 + }, + { + "epoch": 34.99, + "grad_norm": 6.238527774810791, + "learning_rate": 6.500751879699249e-06, + "loss": 0.2032, + "step": 23270 + }, + { + "epoch": 35.0, + "eval_accuracy": 0.9317, + "eval_loss": 0.29047271609306335, + "eval_runtime": 84.6472, + "eval_samples_per_second": 118.137, + "eval_steps_per_second": 0.473, + "step": 23275 + }, + { + "epoch": 35.01, + "grad_norm": 4.792929172515869, + "learning_rate": 6.499248120300753e-06, + "loss": 0.2373, + "step": 23280 + }, + { + "epoch": 35.02, + "grad_norm": 5.885370254516602, + "learning_rate": 6.497744360902256e-06, + "loss": 0.2296, + "step": 23290 + }, + { + "epoch": 35.04, + "grad_norm": 3.2437541484832764, + "learning_rate": 6.49624060150376e-06, + "loss": 0.3242, + "step": 23300 + }, + { + "epoch": 35.05, + "grad_norm": 4.990633964538574, + "learning_rate": 6.494736842105264e-06, + "loss": 0.3038, + "step": 23310 + }, + { + "epoch": 35.07, + "grad_norm": 4.416833877563477, + "learning_rate": 6.493233082706768e-06, + "loss": 0.2001, + "step": 23320 + }, + { + "epoch": 35.08, + "grad_norm": 8.694164276123047, + "learning_rate": 6.4917293233082715e-06, + "loss": 0.2442, + "step": 23330 + }, + { + "epoch": 35.1, + "grad_norm": 9.367162704467773, + "learning_rate": 6.490225563909774e-06, + "loss": 0.3088, + "step": 23340 + }, + { + "epoch": 35.11, + "grad_norm": 3.4011213779449463, + "learning_rate": 6.4887218045112785e-06, + "loss": 0.2324, + "step": 23350 + }, + { + "epoch": 35.13, + "grad_norm": 5.543578624725342, + "learning_rate": 6.487218045112782e-06, + "loss": 0.2571, + "step": 23360 + }, + { + "epoch": 35.14, + "grad_norm": 6.542150974273682, + "learning_rate": 6.485714285714286e-06, + "loss": 0.233, + "step": 23370 + }, + { + "epoch": 35.16, + "grad_norm": 3.572125196456909, + "learning_rate": 6.484210526315789e-06, + "loss": 0.317, + "step": 23380 + }, + { + "epoch": 35.17, + "grad_norm": 6.729075908660889, + "learning_rate": 6.4827067669172935e-06, + "loss": 0.1965, + "step": 23390 + }, + { + "epoch": 35.19, + "grad_norm": 5.777679920196533, + "learning_rate": 6.481203007518797e-06, + "loss": 0.2543, + "step": 23400 + }, + { + "epoch": 35.2, + "grad_norm": 4.606062412261963, + "learning_rate": 6.479699248120301e-06, + "loss": 0.2478, + "step": 23410 + }, + { + "epoch": 35.22, + "grad_norm": 4.347212791442871, + "learning_rate": 6.478195488721805e-06, + "loss": 0.2716, + "step": 23420 + }, + { + "epoch": 35.23, + "grad_norm": 6.0750732421875, + "learning_rate": 6.476691729323309e-06, + "loss": 0.3095, + "step": 23430 + }, + { + "epoch": 35.25, + "grad_norm": 3.2727339267730713, + "learning_rate": 6.475187969924812e-06, + "loss": 0.2904, + "step": 23440 + }, + { + "epoch": 35.26, + "grad_norm": 10.178025245666504, + "learning_rate": 6.473684210526316e-06, + "loss": 0.2603, + "step": 23450 + }, + { + "epoch": 35.28, + "grad_norm": 6.457759380340576, + "learning_rate": 6.47218045112782e-06, + "loss": 0.3092, + "step": 23460 + }, + { + "epoch": 35.29, + "grad_norm": 1.357625126838684, + "learning_rate": 6.470676691729324e-06, + "loss": 0.1796, + "step": 23470 + }, + { + "epoch": 35.31, + "grad_norm": 6.197760105133057, + "learning_rate": 6.469172932330827e-06, + "loss": 0.331, + "step": 23480 + }, + { + "epoch": 35.32, + "grad_norm": 3.364699602127075, + "learning_rate": 6.467669172932331e-06, + "loss": 0.2559, + "step": 23490 + }, + { + "epoch": 35.34, + "grad_norm": 6.3956618309021, + "learning_rate": 6.466165413533835e-06, + "loss": 0.2026, + "step": 23500 + }, + { + "epoch": 35.35, + "grad_norm": 6.079551696777344, + "learning_rate": 6.464661654135339e-06, + "loss": 0.2687, + "step": 23510 + }, + { + "epoch": 35.37, + "grad_norm": 5.305349349975586, + "learning_rate": 6.463157894736843e-06, + "loss": 0.2628, + "step": 23520 + }, + { + "epoch": 35.38, + "grad_norm": 3.124565362930298, + "learning_rate": 6.461654135338347e-06, + "loss": 0.2969, + "step": 23530 + }, + { + "epoch": 35.4, + "grad_norm": 5.130784511566162, + "learning_rate": 6.46015037593985e-06, + "loss": 0.2696, + "step": 23540 + }, + { + "epoch": 35.41, + "grad_norm": 5.65298318862915, + "learning_rate": 6.458646616541354e-06, + "loss": 0.2035, + "step": 23550 + }, + { + "epoch": 35.43, + "grad_norm": 5.994067192077637, + "learning_rate": 6.4571428571428575e-06, + "loss": 0.3008, + "step": 23560 + }, + { + "epoch": 35.44, + "grad_norm": 4.849883556365967, + "learning_rate": 6.455639097744362e-06, + "loss": 0.2681, + "step": 23570 + }, + { + "epoch": 35.46, + "grad_norm": 7.175223350524902, + "learning_rate": 6.4541353383458646e-06, + "loss": 0.3364, + "step": 23580 + }, + { + "epoch": 35.47, + "grad_norm": 5.904483318328857, + "learning_rate": 6.452631578947369e-06, + "loss": 0.2725, + "step": 23590 + }, + { + "epoch": 35.49, + "grad_norm": 6.1249260902404785, + "learning_rate": 6.4511278195488724e-06, + "loss": 0.3174, + "step": 23600 + }, + { + "epoch": 35.5, + "grad_norm": 5.8022332191467285, + "learning_rate": 6.449624060150377e-06, + "loss": 0.2296, + "step": 23610 + }, + { + "epoch": 35.52, + "grad_norm": 4.012286186218262, + "learning_rate": 6.44812030075188e-06, + "loss": 0.2594, + "step": 23620 + }, + { + "epoch": 35.53, + "grad_norm": 5.536571502685547, + "learning_rate": 6.446616541353385e-06, + "loss": 0.2567, + "step": 23630 + }, + { + "epoch": 35.55, + "grad_norm": 5.743587017059326, + "learning_rate": 6.445112781954887e-06, + "loss": 0.2928, + "step": 23640 + }, + { + "epoch": 35.56, + "grad_norm": 9.780457496643066, + "learning_rate": 6.443609022556392e-06, + "loss": 0.2897, + "step": 23650 + }, + { + "epoch": 35.58, + "grad_norm": 4.584736347198486, + "learning_rate": 6.442105263157895e-06, + "loss": 0.1786, + "step": 23660 + }, + { + "epoch": 35.59, + "grad_norm": 1.9408100843429565, + "learning_rate": 6.4406015037594e-06, + "loss": 0.3392, + "step": 23670 + }, + { + "epoch": 35.61, + "grad_norm": 6.093573570251465, + "learning_rate": 6.439097744360902e-06, + "loss": 0.286, + "step": 23680 + }, + { + "epoch": 35.62, + "grad_norm": 7.220097541809082, + "learning_rate": 6.437593984962407e-06, + "loss": 0.297, + "step": 23690 + }, + { + "epoch": 35.64, + "grad_norm": 5.479585647583008, + "learning_rate": 6.43609022556391e-06, + "loss": 0.269, + "step": 23700 + }, + { + "epoch": 35.65, + "grad_norm": 6.430034160614014, + "learning_rate": 6.4345864661654145e-06, + "loss": 0.2266, + "step": 23710 + }, + { + "epoch": 35.67, + "grad_norm": 2.1765003204345703, + "learning_rate": 6.433082706766918e-06, + "loss": 0.2206, + "step": 23720 + }, + { + "epoch": 35.68, + "grad_norm": 4.447315216064453, + "learning_rate": 6.431578947368422e-06, + "loss": 0.235, + "step": 23730 + }, + { + "epoch": 35.7, + "grad_norm": 8.542555809020996, + "learning_rate": 6.430075187969925e-06, + "loss": 0.2793, + "step": 23740 + }, + { + "epoch": 35.71, + "grad_norm": 7.156379699707031, + "learning_rate": 6.4285714285714295e-06, + "loss": 0.258, + "step": 23750 + }, + { + "epoch": 35.73, + "grad_norm": 5.014525413513184, + "learning_rate": 6.427067669172933e-06, + "loss": 0.2489, + "step": 23760 + }, + { + "epoch": 35.74, + "grad_norm": 3.7913882732391357, + "learning_rate": 6.425563909774437e-06, + "loss": 0.2875, + "step": 23770 + }, + { + "epoch": 35.76, + "grad_norm": 5.546550750732422, + "learning_rate": 6.42406015037594e-06, + "loss": 0.2528, + "step": 23780 + }, + { + "epoch": 35.77, + "grad_norm": 5.281984329223633, + "learning_rate": 6.422556390977444e-06, + "loss": 0.2684, + "step": 23790 + }, + { + "epoch": 35.79, + "grad_norm": 4.275721073150635, + "learning_rate": 6.421052631578948e-06, + "loss": 0.2681, + "step": 23800 + }, + { + "epoch": 35.8, + "grad_norm": 5.552488803863525, + "learning_rate": 6.419548872180452e-06, + "loss": 0.2607, + "step": 23810 + }, + { + "epoch": 35.82, + "grad_norm": 4.155028343200684, + "learning_rate": 6.418045112781956e-06, + "loss": 0.246, + "step": 23820 + }, + { + "epoch": 35.83, + "grad_norm": 10.015910148620605, + "learning_rate": 6.4165413533834585e-06, + "loss": 0.3312, + "step": 23830 + }, + { + "epoch": 35.85, + "grad_norm": 6.853920936584473, + "learning_rate": 6.415037593984963e-06, + "loss": 0.3212, + "step": 23840 + }, + { + "epoch": 35.86, + "grad_norm": 7.707338809967041, + "learning_rate": 6.413533834586466e-06, + "loss": 0.2773, + "step": 23850 + }, + { + "epoch": 35.88, + "grad_norm": 6.877796649932861, + "learning_rate": 6.412030075187971e-06, + "loss": 0.306, + "step": 23860 + }, + { + "epoch": 35.89, + "grad_norm": 6.9555487632751465, + "learning_rate": 6.410526315789473e-06, + "loss": 0.3005, + "step": 23870 + }, + { + "epoch": 35.91, + "grad_norm": 5.553525924682617, + "learning_rate": 6.409022556390978e-06, + "loss": 0.3166, + "step": 23880 + }, + { + "epoch": 35.92, + "grad_norm": 3.6754038333892822, + "learning_rate": 6.407518796992481e-06, + "loss": 0.2731, + "step": 23890 + }, + { + "epoch": 35.94, + "grad_norm": 11.875730514526367, + "learning_rate": 6.406015037593986e-06, + "loss": 0.3046, + "step": 23900 + }, + { + "epoch": 35.95, + "grad_norm": 5.181977272033691, + "learning_rate": 6.404511278195489e-06, + "loss": 0.2291, + "step": 23910 + }, + { + "epoch": 35.97, + "grad_norm": 4.224656105041504, + "learning_rate": 6.4030075187969935e-06, + "loss": 0.2335, + "step": 23920 + }, + { + "epoch": 35.98, + "grad_norm": 3.9968817234039307, + "learning_rate": 6.401503759398496e-06, + "loss": 0.3294, + "step": 23930 + }, + { + "epoch": 36.0, + "grad_norm": 0.03734013810753822, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.2539, + "step": 23940 + }, + { + "epoch": 36.0, + "eval_accuracy": 0.9313, + "eval_loss": 0.28184354305267334, + "eval_runtime": 84.4602, + "eval_samples_per_second": 118.399, + "eval_steps_per_second": 0.474, + "step": 23940 + }, + { + "epoch": 36.02, + "grad_norm": 4.844832897186279, + "learning_rate": 6.398496240601504e-06, + "loss": 0.247, + "step": 23950 + }, + { + "epoch": 36.03, + "grad_norm": 8.93395709991455, + "learning_rate": 6.3969924812030084e-06, + "loss": 0.2341, + "step": 23960 + }, + { + "epoch": 36.05, + "grad_norm": 6.799169063568115, + "learning_rate": 6.395488721804511e-06, + "loss": 0.1804, + "step": 23970 + }, + { + "epoch": 36.06, + "grad_norm": 6.8090901374816895, + "learning_rate": 6.3939849624060155e-06, + "loss": 0.3, + "step": 23980 + }, + { + "epoch": 36.08, + "grad_norm": 4.476424217224121, + "learning_rate": 6.392481203007519e-06, + "loss": 0.2273, + "step": 23990 + }, + { + "epoch": 36.09, + "grad_norm": 5.188058853149414, + "learning_rate": 6.390977443609023e-06, + "loss": 0.3322, + "step": 24000 + }, + { + "epoch": 36.11, + "grad_norm": 6.895328998565674, + "learning_rate": 6.389473684210527e-06, + "loss": 0.2552, + "step": 24010 + }, + { + "epoch": 36.12, + "grad_norm": 6.052617073059082, + "learning_rate": 6.38796992481203e-06, + "loss": 0.3099, + "step": 24020 + }, + { + "epoch": 36.14, + "grad_norm": 4.1569366455078125, + "learning_rate": 6.386466165413534e-06, + "loss": 0.2408, + "step": 24030 + }, + { + "epoch": 36.15, + "grad_norm": 10.491902351379395, + "learning_rate": 6.384962406015038e-06, + "loss": 0.3039, + "step": 24040 + }, + { + "epoch": 36.17, + "grad_norm": 1.8690141439437866, + "learning_rate": 6.383458646616542e-06, + "loss": 0.2466, + "step": 24050 + }, + { + "epoch": 36.18, + "grad_norm": 6.9134650230407715, + "learning_rate": 6.381954887218046e-06, + "loss": 0.2627, + "step": 24060 + }, + { + "epoch": 36.2, + "grad_norm": 6.333374977111816, + "learning_rate": 6.380451127819549e-06, + "loss": 0.2438, + "step": 24070 + }, + { + "epoch": 36.21, + "grad_norm": 5.581048488616943, + "learning_rate": 6.378947368421053e-06, + "loss": 0.2583, + "step": 24080 + }, + { + "epoch": 36.23, + "grad_norm": 14.36394214630127, + "learning_rate": 6.377443609022557e-06, + "loss": 0.29, + "step": 24090 + }, + { + "epoch": 36.24, + "grad_norm": 3.045477867126465, + "learning_rate": 6.375939849624061e-06, + "loss": 0.238, + "step": 24100 + }, + { + "epoch": 36.26, + "grad_norm": 7.110077857971191, + "learning_rate": 6.374436090225565e-06, + "loss": 0.3271, + "step": 24110 + }, + { + "epoch": 36.27, + "grad_norm": 5.218198299407959, + "learning_rate": 6.372932330827068e-06, + "loss": 0.2952, + "step": 24120 + }, + { + "epoch": 36.29, + "grad_norm": 10.754549026489258, + "learning_rate": 6.371428571428572e-06, + "loss": 0.2661, + "step": 24130 + }, + { + "epoch": 36.3, + "grad_norm": 3.623293876647949, + "learning_rate": 6.369924812030076e-06, + "loss": 0.189, + "step": 24140 + }, + { + "epoch": 36.32, + "grad_norm": 5.730505466461182, + "learning_rate": 6.3684210526315795e-06, + "loss": 0.2611, + "step": 24150 + }, + { + "epoch": 36.33, + "grad_norm": 6.075179100036621, + "learning_rate": 6.366917293233084e-06, + "loss": 0.3031, + "step": 24160 + }, + { + "epoch": 36.35, + "grad_norm": 5.460084915161133, + "learning_rate": 6.365413533834587e-06, + "loss": 0.2325, + "step": 24170 + }, + { + "epoch": 36.36, + "grad_norm": 5.883102893829346, + "learning_rate": 6.363909774436091e-06, + "loss": 0.3088, + "step": 24180 + }, + { + "epoch": 36.38, + "grad_norm": 2.4602198600769043, + "learning_rate": 6.3624060150375945e-06, + "loss": 0.2467, + "step": 24190 + }, + { + "epoch": 36.39, + "grad_norm": 7.747447490692139, + "learning_rate": 6.360902255639099e-06, + "loss": 0.2818, + "step": 24200 + }, + { + "epoch": 36.41, + "grad_norm": 5.207604885101318, + "learning_rate": 6.3593984962406015e-06, + "loss": 0.2313, + "step": 24210 + }, + { + "epoch": 36.42, + "grad_norm": 3.6121327877044678, + "learning_rate": 6.357894736842106e-06, + "loss": 0.2216, + "step": 24220 + }, + { + "epoch": 36.44, + "grad_norm": 8.649584770202637, + "learning_rate": 6.356390977443609e-06, + "loss": 0.2514, + "step": 24230 + }, + { + "epoch": 36.45, + "grad_norm": 8.641491889953613, + "learning_rate": 6.354887218045114e-06, + "loss": 0.2535, + "step": 24240 + }, + { + "epoch": 36.47, + "grad_norm": 5.185530662536621, + "learning_rate": 6.353383458646617e-06, + "loss": 0.1882, + "step": 24250 + }, + { + "epoch": 36.48, + "grad_norm": 6.481791019439697, + "learning_rate": 6.351879699248122e-06, + "loss": 0.2954, + "step": 24260 + }, + { + "epoch": 36.5, + "grad_norm": 3.8771073818206787, + "learning_rate": 6.350375939849624e-06, + "loss": 0.2594, + "step": 24270 + }, + { + "epoch": 36.51, + "grad_norm": 3.2521779537200928, + "learning_rate": 6.348872180451129e-06, + "loss": 0.1955, + "step": 24280 + }, + { + "epoch": 36.53, + "grad_norm": 4.331993103027344, + "learning_rate": 6.347368421052632e-06, + "loss": 0.2964, + "step": 24290 + }, + { + "epoch": 36.54, + "grad_norm": 5.428518295288086, + "learning_rate": 6.3458646616541366e-06, + "loss": 0.3339, + "step": 24300 + }, + { + "epoch": 36.56, + "grad_norm": 5.912198543548584, + "learning_rate": 6.344360902255639e-06, + "loss": 0.2986, + "step": 24310 + }, + { + "epoch": 36.57, + "grad_norm": 6.707327365875244, + "learning_rate": 6.342857142857143e-06, + "loss": 0.2508, + "step": 24320 + }, + { + "epoch": 36.59, + "grad_norm": 5.767425060272217, + "learning_rate": 6.341353383458647e-06, + "loss": 0.2468, + "step": 24330 + }, + { + "epoch": 36.6, + "grad_norm": 6.399289608001709, + "learning_rate": 6.339849624060151e-06, + "loss": 0.2553, + "step": 24340 + }, + { + "epoch": 36.62, + "grad_norm": 4.582364082336426, + "learning_rate": 6.338345864661655e-06, + "loss": 0.2534, + "step": 24350 + }, + { + "epoch": 36.63, + "grad_norm": 3.3339309692382812, + "learning_rate": 6.336842105263158e-06, + "loss": 0.2576, + "step": 24360 + }, + { + "epoch": 36.65, + "grad_norm": 4.761340141296387, + "learning_rate": 6.335338345864662e-06, + "loss": 0.2606, + "step": 24370 + }, + { + "epoch": 36.66, + "grad_norm": 4.896874904632568, + "learning_rate": 6.3338345864661656e-06, + "loss": 0.2359, + "step": 24380 + }, + { + "epoch": 36.68, + "grad_norm": 8.0689697265625, + "learning_rate": 6.33233082706767e-06, + "loss": 0.2927, + "step": 24390 + }, + { + "epoch": 36.69, + "grad_norm": 7.064751148223877, + "learning_rate": 6.330827067669173e-06, + "loss": 0.2416, + "step": 24400 + }, + { + "epoch": 36.71, + "grad_norm": 4.654138565063477, + "learning_rate": 6.329323308270677e-06, + "loss": 0.2719, + "step": 24410 + }, + { + "epoch": 36.72, + "grad_norm": 3.553248167037964, + "learning_rate": 6.3278195488721805e-06, + "loss": 0.2545, + "step": 24420 + }, + { + "epoch": 36.74, + "grad_norm": 4.331087589263916, + "learning_rate": 6.326315789473685e-06, + "loss": 0.2351, + "step": 24430 + }, + { + "epoch": 36.75, + "grad_norm": 5.628582954406738, + "learning_rate": 6.324812030075188e-06, + "loss": 0.3051, + "step": 24440 + }, + { + "epoch": 36.77, + "grad_norm": 6.611728191375732, + "learning_rate": 6.323308270676693e-06, + "loss": 0.24, + "step": 24450 + }, + { + "epoch": 36.78, + "grad_norm": 5.588087558746338, + "learning_rate": 6.321804511278195e-06, + "loss": 0.2702, + "step": 24460 + }, + { + "epoch": 36.8, + "grad_norm": 2.716883420944214, + "learning_rate": 6.3203007518797e-06, + "loss": 0.2452, + "step": 24470 + }, + { + "epoch": 36.81, + "grad_norm": 3.0164976119995117, + "learning_rate": 6.318796992481203e-06, + "loss": 0.2998, + "step": 24480 + }, + { + "epoch": 36.83, + "grad_norm": 9.347999572753906, + "learning_rate": 6.317293233082708e-06, + "loss": 0.2446, + "step": 24490 + }, + { + "epoch": 36.84, + "grad_norm": 13.030348777770996, + "learning_rate": 6.31578947368421e-06, + "loss": 0.2017, + "step": 24500 + }, + { + "epoch": 36.86, + "grad_norm": 4.36257791519165, + "learning_rate": 6.314285714285715e-06, + "loss": 0.331, + "step": 24510 + }, + { + "epoch": 36.87, + "grad_norm": 6.986863136291504, + "learning_rate": 6.312781954887218e-06, + "loss": 0.289, + "step": 24520 + }, + { + "epoch": 36.89, + "grad_norm": 2.9986495971679688, + "learning_rate": 6.311278195488723e-06, + "loss": 0.2632, + "step": 24530 + }, + { + "epoch": 36.9, + "grad_norm": 6.963717460632324, + "learning_rate": 6.309774436090226e-06, + "loss": 0.283, + "step": 24540 + }, + { + "epoch": 36.92, + "grad_norm": 2.926877975463867, + "learning_rate": 6.3082706766917305e-06, + "loss": 0.2462, + "step": 24550 + }, + { + "epoch": 36.93, + "grad_norm": 2.751361131668091, + "learning_rate": 6.306766917293233e-06, + "loss": 0.2986, + "step": 24560 + }, + { + "epoch": 36.95, + "grad_norm": 7.566164493560791, + "learning_rate": 6.3052631578947375e-06, + "loss": 0.2437, + "step": 24570 + }, + { + "epoch": 36.96, + "grad_norm": 10.196669578552246, + "learning_rate": 6.303759398496241e-06, + "loss": 0.3045, + "step": 24580 + }, + { + "epoch": 36.98, + "grad_norm": 3.9259514808654785, + "learning_rate": 6.302255639097745e-06, + "loss": 0.2523, + "step": 24590 + }, + { + "epoch": 36.99, + "grad_norm": 3.9328627586364746, + "learning_rate": 6.300751879699248e-06, + "loss": 0.2104, + "step": 24600 + }, + { + "epoch": 37.0, + "eval_accuracy": 0.9329, + "eval_loss": 0.2907086908817291, + "eval_runtime": 84.7416, + "eval_samples_per_second": 118.006, + "eval_steps_per_second": 0.472, + "step": 24605 + }, + { + "epoch": 37.01, + "grad_norm": 5.213517189025879, + "learning_rate": 6.2992481203007524e-06, + "loss": 0.2175, + "step": 24610 + }, + { + "epoch": 37.02, + "grad_norm": 5.479267120361328, + "learning_rate": 6.297744360902256e-06, + "loss": 0.2723, + "step": 24620 + }, + { + "epoch": 37.04, + "grad_norm": 5.910623073577881, + "learning_rate": 6.29624060150376e-06, + "loss": 0.2277, + "step": 24630 + }, + { + "epoch": 37.05, + "grad_norm": 3.9250035285949707, + "learning_rate": 6.294736842105264e-06, + "loss": 0.2364, + "step": 24640 + }, + { + "epoch": 37.07, + "grad_norm": 5.7687201499938965, + "learning_rate": 6.293233082706768e-06, + "loss": 0.2434, + "step": 24650 + }, + { + "epoch": 37.08, + "grad_norm": 6.585549354553223, + "learning_rate": 6.291729323308271e-06, + "loss": 0.2501, + "step": 24660 + }, + { + "epoch": 37.1, + "grad_norm": 9.130806922912598, + "learning_rate": 6.290225563909775e-06, + "loss": 0.3176, + "step": 24670 + }, + { + "epoch": 37.11, + "grad_norm": 5.826327800750732, + "learning_rate": 6.288721804511279e-06, + "loss": 0.259, + "step": 24680 + }, + { + "epoch": 37.13, + "grad_norm": 3.6899287700653076, + "learning_rate": 6.287218045112783e-06, + "loss": 0.225, + "step": 24690 + }, + { + "epoch": 37.14, + "grad_norm": 7.567675590515137, + "learning_rate": 6.285714285714286e-06, + "loss": 0.3511, + "step": 24700 + }, + { + "epoch": 37.16, + "grad_norm": 4.670961856842041, + "learning_rate": 6.28421052631579e-06, + "loss": 0.2718, + "step": 24710 + }, + { + "epoch": 37.17, + "grad_norm": 6.8157057762146, + "learning_rate": 6.282706766917294e-06, + "loss": 0.2676, + "step": 24720 + }, + { + "epoch": 37.19, + "grad_norm": 9.021499633789062, + "learning_rate": 6.281203007518798e-06, + "loss": 0.2615, + "step": 24730 + }, + { + "epoch": 37.2, + "grad_norm": 5.826773166656494, + "learning_rate": 6.2796992481203016e-06, + "loss": 0.2652, + "step": 24740 + }, + { + "epoch": 37.22, + "grad_norm": 6.296904563903809, + "learning_rate": 6.278195488721806e-06, + "loss": 0.2632, + "step": 24750 + }, + { + "epoch": 37.23, + "grad_norm": 3.506654739379883, + "learning_rate": 6.276691729323309e-06, + "loss": 0.2946, + "step": 24760 + }, + { + "epoch": 37.25, + "grad_norm": 6.164925575256348, + "learning_rate": 6.275187969924813e-06, + "loss": 0.2214, + "step": 24770 + }, + { + "epoch": 37.26, + "grad_norm": 5.420670986175537, + "learning_rate": 6.2736842105263165e-06, + "loss": 0.2709, + "step": 24780 + }, + { + "epoch": 37.28, + "grad_norm": 5.200439453125, + "learning_rate": 6.272180451127821e-06, + "loss": 0.2599, + "step": 24790 + }, + { + "epoch": 37.29, + "grad_norm": 5.116550445556641, + "learning_rate": 6.2706766917293235e-06, + "loss": 0.2643, + "step": 24800 + }, + { + "epoch": 37.31, + "grad_norm": 4.102775573730469, + "learning_rate": 6.269172932330828e-06, + "loss": 0.2768, + "step": 24810 + }, + { + "epoch": 37.32, + "grad_norm": 5.1978254318237305, + "learning_rate": 6.267669172932331e-06, + "loss": 0.2132, + "step": 24820 + }, + { + "epoch": 37.34, + "grad_norm": 7.613659858703613, + "learning_rate": 6.266165413533835e-06, + "loss": 0.3066, + "step": 24830 + }, + { + "epoch": 37.35, + "grad_norm": 7.9959917068481445, + "learning_rate": 6.264661654135339e-06, + "loss": 0.2219, + "step": 24840 + }, + { + "epoch": 37.37, + "grad_norm": 7.0650811195373535, + "learning_rate": 6.263157894736842e-06, + "loss": 0.2497, + "step": 24850 + }, + { + "epoch": 37.38, + "grad_norm": 4.512048721313477, + "learning_rate": 6.261654135338346e-06, + "loss": 0.243, + "step": 24860 + }, + { + "epoch": 37.4, + "grad_norm": 3.7339611053466797, + "learning_rate": 6.26015037593985e-06, + "loss": 0.1984, + "step": 24870 + }, + { + "epoch": 37.41, + "grad_norm": 5.475858211517334, + "learning_rate": 6.258646616541354e-06, + "loss": 0.2657, + "step": 24880 + }, + { + "epoch": 37.43, + "grad_norm": 5.605356693267822, + "learning_rate": 6.257142857142857e-06, + "loss": 0.2905, + "step": 24890 + }, + { + "epoch": 37.44, + "grad_norm": 9.095086097717285, + "learning_rate": 6.255639097744361e-06, + "loss": 0.3472, + "step": 24900 + }, + { + "epoch": 37.46, + "grad_norm": 6.019258975982666, + "learning_rate": 6.254135338345865e-06, + "loss": 0.3156, + "step": 24910 + }, + { + "epoch": 37.47, + "grad_norm": 1.5480716228485107, + "learning_rate": 6.252631578947369e-06, + "loss": 0.244, + "step": 24920 + }, + { + "epoch": 37.49, + "grad_norm": 5.178092002868652, + "learning_rate": 6.251127819548873e-06, + "loss": 0.2058, + "step": 24930 + }, + { + "epoch": 37.5, + "grad_norm": 5.011317729949951, + "learning_rate": 6.249624060150376e-06, + "loss": 0.2563, + "step": 24940 + }, + { + "epoch": 37.52, + "grad_norm": 6.064964771270752, + "learning_rate": 6.24812030075188e-06, + "loss": 0.2321, + "step": 24950 + }, + { + "epoch": 37.53, + "grad_norm": 4.71431827545166, + "learning_rate": 6.246616541353384e-06, + "loss": 0.2923, + "step": 24960 + }, + { + "epoch": 37.55, + "grad_norm": 7.876862525939941, + "learning_rate": 6.245112781954888e-06, + "loss": 0.3188, + "step": 24970 + }, + { + "epoch": 37.56, + "grad_norm": 5.848691940307617, + "learning_rate": 6.243609022556392e-06, + "loss": 0.2144, + "step": 24980 + }, + { + "epoch": 37.58, + "grad_norm": 10.56225299835205, + "learning_rate": 6.242105263157895e-06, + "loss": 0.2761, + "step": 24990 + }, + { + "epoch": 37.59, + "grad_norm": 4.970668792724609, + "learning_rate": 6.240601503759399e-06, + "loss": 0.2745, + "step": 25000 + }, + { + "epoch": 37.61, + "grad_norm": 9.470555305480957, + "learning_rate": 6.2390977443609025e-06, + "loss": 0.2858, + "step": 25010 + }, + { + "epoch": 37.62, + "grad_norm": 7.466954708099365, + "learning_rate": 6.237593984962407e-06, + "loss": 0.2461, + "step": 25020 + }, + { + "epoch": 37.64, + "grad_norm": 7.034529685974121, + "learning_rate": 6.23609022556391e-06, + "loss": 0.2675, + "step": 25030 + }, + { + "epoch": 37.65, + "grad_norm": 11.616925239562988, + "learning_rate": 6.234586466165414e-06, + "loss": 0.2296, + "step": 25040 + }, + { + "epoch": 37.67, + "grad_norm": 4.31875467300415, + "learning_rate": 6.2330827067669174e-06, + "loss": 0.2634, + "step": 25050 + }, + { + "epoch": 37.68, + "grad_norm": 5.886952877044678, + "learning_rate": 6.231578947368422e-06, + "loss": 0.2379, + "step": 25060 + }, + { + "epoch": 37.7, + "grad_norm": 4.664031982421875, + "learning_rate": 6.230075187969925e-06, + "loss": 0.2202, + "step": 25070 + }, + { + "epoch": 37.71, + "grad_norm": 4.743470191955566, + "learning_rate": 6.22857142857143e-06, + "loss": 0.2281, + "step": 25080 + }, + { + "epoch": 37.73, + "grad_norm": 2.948579788208008, + "learning_rate": 6.227067669172932e-06, + "loss": 0.225, + "step": 25090 + }, + { + "epoch": 37.74, + "grad_norm": 4.467003345489502, + "learning_rate": 6.225563909774437e-06, + "loss": 0.241, + "step": 25100 + }, + { + "epoch": 37.76, + "grad_norm": 5.358434677124023, + "learning_rate": 6.22406015037594e-06, + "loss": 0.2336, + "step": 25110 + }, + { + "epoch": 37.77, + "grad_norm": 3.714233875274658, + "learning_rate": 6.222556390977445e-06, + "loss": 0.2112, + "step": 25120 + }, + { + "epoch": 37.79, + "grad_norm": 6.145929336547852, + "learning_rate": 6.221052631578947e-06, + "loss": 0.3233, + "step": 25130 + }, + { + "epoch": 37.8, + "grad_norm": 5.789137840270996, + "learning_rate": 6.219548872180452e-06, + "loss": 0.2396, + "step": 25140 + }, + { + "epoch": 37.82, + "grad_norm": 7.418402671813965, + "learning_rate": 6.218045112781955e-06, + "loss": 0.2444, + "step": 25150 + }, + { + "epoch": 37.83, + "grad_norm": 4.341475486755371, + "learning_rate": 6.2165413533834595e-06, + "loss": 0.2258, + "step": 25160 + }, + { + "epoch": 37.85, + "grad_norm": 5.642782211303711, + "learning_rate": 6.215037593984963e-06, + "loss": 0.2946, + "step": 25170 + }, + { + "epoch": 37.86, + "grad_norm": 6.9304680824279785, + "learning_rate": 6.213533834586467e-06, + "loss": 0.2476, + "step": 25180 + }, + { + "epoch": 37.88, + "grad_norm": 5.448428153991699, + "learning_rate": 6.21203007518797e-06, + "loss": 0.2455, + "step": 25190 + }, + { + "epoch": 37.89, + "grad_norm": 5.857348918914795, + "learning_rate": 6.2105263157894745e-06, + "loss": 0.2786, + "step": 25200 + }, + { + "epoch": 37.91, + "grad_norm": 3.1548938751220703, + "learning_rate": 6.209022556390978e-06, + "loss": 0.2123, + "step": 25210 + }, + { + "epoch": 37.92, + "grad_norm": 5.268825531005859, + "learning_rate": 6.207518796992482e-06, + "loss": 0.1931, + "step": 25220 + }, + { + "epoch": 37.94, + "grad_norm": 5.20718240737915, + "learning_rate": 6.206015037593985e-06, + "loss": 0.2774, + "step": 25230 + }, + { + "epoch": 37.95, + "grad_norm": 7.931674957275391, + "learning_rate": 6.204511278195489e-06, + "loss": 0.2652, + "step": 25240 + }, + { + "epoch": 37.97, + "grad_norm": 4.287258625030518, + "learning_rate": 6.203007518796993e-06, + "loss": 0.3186, + "step": 25250 + }, + { + "epoch": 37.98, + "grad_norm": 8.485033988952637, + "learning_rate": 6.201503759398497e-06, + "loss": 0.2543, + "step": 25260 + }, + { + "epoch": 38.0, + "grad_norm": 0.33357012271881104, + "learning_rate": 6.200000000000001e-06, + "loss": 0.264, + "step": 25270 + }, + { + "epoch": 38.0, + "eval_accuracy": 0.9298, + "eval_loss": 0.3029746413230896, + "eval_runtime": 84.4594, + "eval_samples_per_second": 118.4, + "eval_steps_per_second": 0.474, + "step": 25270 + }, + { + "epoch": 38.02, + "grad_norm": 5.222786903381348, + "learning_rate": 6.198496240601505e-06, + "loss": 0.2575, + "step": 25280 + }, + { + "epoch": 38.03, + "grad_norm": 2.386619806289673, + "learning_rate": 6.196992481203008e-06, + "loss": 0.2419, + "step": 25290 + }, + { + "epoch": 38.05, + "grad_norm": 3.8002138137817383, + "learning_rate": 6.195488721804512e-06, + "loss": 0.2632, + "step": 25300 + }, + { + "epoch": 38.06, + "grad_norm": 4.710455417633057, + "learning_rate": 6.193984962406016e-06, + "loss": 0.1841, + "step": 25310 + }, + { + "epoch": 38.08, + "grad_norm": 3.5531041622161865, + "learning_rate": 6.192481203007518e-06, + "loss": 0.2501, + "step": 25320 + }, + { + "epoch": 38.09, + "grad_norm": 4.9958062171936035, + "learning_rate": 6.190977443609023e-06, + "loss": 0.2728, + "step": 25330 + }, + { + "epoch": 38.11, + "grad_norm": 3.301966428756714, + "learning_rate": 6.189473684210526e-06, + "loss": 0.2564, + "step": 25340 + }, + { + "epoch": 38.12, + "grad_norm": 4.437223434448242, + "learning_rate": 6.187969924812031e-06, + "loss": 0.3406, + "step": 25350 + }, + { + "epoch": 38.14, + "grad_norm": 3.1734423637390137, + "learning_rate": 6.186466165413534e-06, + "loss": 0.2296, + "step": 25360 + }, + { + "epoch": 38.15, + "grad_norm": 8.440972328186035, + "learning_rate": 6.1849624060150385e-06, + "loss": 0.2089, + "step": 25370 + }, + { + "epoch": 38.17, + "grad_norm": 3.5104334354400635, + "learning_rate": 6.183458646616541e-06, + "loss": 0.3243, + "step": 25380 + }, + { + "epoch": 38.18, + "grad_norm": 5.661349296569824, + "learning_rate": 6.1819548872180455e-06, + "loss": 0.2984, + "step": 25390 + }, + { + "epoch": 38.2, + "grad_norm": 4.282464981079102, + "learning_rate": 6.180451127819549e-06, + "loss": 0.2493, + "step": 25400 + }, + { + "epoch": 38.21, + "grad_norm": 4.8217926025390625, + "learning_rate": 6.1789473684210534e-06, + "loss": 0.289, + "step": 25410 + }, + { + "epoch": 38.23, + "grad_norm": 5.01459264755249, + "learning_rate": 6.177443609022556e-06, + "loss": 0.3262, + "step": 25420 + }, + { + "epoch": 38.24, + "grad_norm": 5.024478435516357, + "learning_rate": 6.1759398496240605e-06, + "loss": 0.297, + "step": 25430 + }, + { + "epoch": 38.26, + "grad_norm": 9.888729095458984, + "learning_rate": 6.174436090225564e-06, + "loss": 0.2414, + "step": 25440 + }, + { + "epoch": 38.27, + "grad_norm": 6.393784046173096, + "learning_rate": 6.172932330827068e-06, + "loss": 0.2329, + "step": 25450 + }, + { + "epoch": 38.29, + "grad_norm": 4.191177845001221, + "learning_rate": 6.171428571428572e-06, + "loss": 0.303, + "step": 25460 + }, + { + "epoch": 38.3, + "grad_norm": 5.586199760437012, + "learning_rate": 6.169924812030076e-06, + "loss": 0.2868, + "step": 25470 + }, + { + "epoch": 38.32, + "grad_norm": 4.708113670349121, + "learning_rate": 6.168421052631579e-06, + "loss": 0.2418, + "step": 25480 + }, + { + "epoch": 38.33, + "grad_norm": 8.40892505645752, + "learning_rate": 6.166917293233083e-06, + "loss": 0.2396, + "step": 25490 + }, + { + "epoch": 38.35, + "grad_norm": 6.81309175491333, + "learning_rate": 6.165413533834587e-06, + "loss": 0.2496, + "step": 25500 + }, + { + "epoch": 38.36, + "grad_norm": 8.350881576538086, + "learning_rate": 6.163909774436091e-06, + "loss": 0.2002, + "step": 25510 + }, + { + "epoch": 38.38, + "grad_norm": 3.867704153060913, + "learning_rate": 6.162406015037594e-06, + "loss": 0.2544, + "step": 25520 + }, + { + "epoch": 38.39, + "grad_norm": 2.9632339477539062, + "learning_rate": 6.160902255639098e-06, + "loss": 0.2444, + "step": 25530 + }, + { + "epoch": 38.41, + "grad_norm": 2.540008544921875, + "learning_rate": 6.159398496240602e-06, + "loss": 0.1792, + "step": 25540 + }, + { + "epoch": 38.42, + "grad_norm": 5.204866886138916, + "learning_rate": 6.157894736842106e-06, + "loss": 0.2785, + "step": 25550 + }, + { + "epoch": 38.44, + "grad_norm": 5.661520957946777, + "learning_rate": 6.15639097744361e-06, + "loss": 0.2656, + "step": 25560 + }, + { + "epoch": 38.45, + "grad_norm": 6.089839458465576, + "learning_rate": 6.154887218045114e-06, + "loss": 0.2698, + "step": 25570 + }, + { + "epoch": 38.47, + "grad_norm": 6.78479528427124, + "learning_rate": 6.153383458646617e-06, + "loss": 0.2754, + "step": 25580 + }, + { + "epoch": 38.48, + "grad_norm": 5.978816032409668, + "learning_rate": 6.151879699248121e-06, + "loss": 0.2906, + "step": 25590 + }, + { + "epoch": 38.5, + "grad_norm": 6.391565799713135, + "learning_rate": 6.1503759398496245e-06, + "loss": 0.2373, + "step": 25600 + }, + { + "epoch": 38.51, + "grad_norm": 6.544304847717285, + "learning_rate": 6.148872180451129e-06, + "loss": 0.256, + "step": 25610 + }, + { + "epoch": 38.53, + "grad_norm": 3.7540414333343506, + "learning_rate": 6.1473684210526316e-06, + "loss": 0.2462, + "step": 25620 + }, + { + "epoch": 38.54, + "grad_norm": 4.831808567047119, + "learning_rate": 6.145864661654136e-06, + "loss": 0.2307, + "step": 25630 + }, + { + "epoch": 38.56, + "grad_norm": 3.9876370429992676, + "learning_rate": 6.1443609022556395e-06, + "loss": 0.182, + "step": 25640 + }, + { + "epoch": 38.57, + "grad_norm": 4.689981460571289, + "learning_rate": 6.142857142857144e-06, + "loss": 0.2577, + "step": 25650 + }, + { + "epoch": 38.59, + "grad_norm": 6.713221073150635, + "learning_rate": 6.141353383458647e-06, + "loss": 0.2902, + "step": 25660 + }, + { + "epoch": 38.6, + "grad_norm": 5.03303861618042, + "learning_rate": 6.139849624060152e-06, + "loss": 0.1913, + "step": 25670 + }, + { + "epoch": 38.62, + "grad_norm": 6.612110137939453, + "learning_rate": 6.138345864661654e-06, + "loss": 0.2724, + "step": 25680 + }, + { + "epoch": 38.63, + "grad_norm": 0.28649356961250305, + "learning_rate": 6.136842105263159e-06, + "loss": 0.2571, + "step": 25690 + }, + { + "epoch": 38.65, + "grad_norm": 3.211318016052246, + "learning_rate": 6.135338345864662e-06, + "loss": 0.2778, + "step": 25700 + }, + { + "epoch": 38.66, + "grad_norm": 2.4958066940307617, + "learning_rate": 6.133834586466167e-06, + "loss": 0.2794, + "step": 25710 + }, + { + "epoch": 38.68, + "grad_norm": 7.281109809875488, + "learning_rate": 6.132330827067669e-06, + "loss": 0.2804, + "step": 25720 + }, + { + "epoch": 38.69, + "grad_norm": 5.622795581817627, + "learning_rate": 6.130827067669174e-06, + "loss": 0.2422, + "step": 25730 + }, + { + "epoch": 38.71, + "grad_norm": 6.729783058166504, + "learning_rate": 6.129323308270677e-06, + "loss": 0.252, + "step": 25740 + }, + { + "epoch": 38.72, + "grad_norm": 8.58928108215332, + "learning_rate": 6.1278195488721816e-06, + "loss": 0.2239, + "step": 25750 + }, + { + "epoch": 38.74, + "grad_norm": 4.566402912139893, + "learning_rate": 6.126315789473685e-06, + "loss": 0.2211, + "step": 25760 + }, + { + "epoch": 38.75, + "grad_norm": 13.606522560119629, + "learning_rate": 6.124812030075189e-06, + "loss": 0.2715, + "step": 25770 + }, + { + "epoch": 38.77, + "grad_norm": 7.567572593688965, + "learning_rate": 6.123308270676692e-06, + "loss": 0.223, + "step": 25780 + }, + { + "epoch": 38.78, + "grad_norm": 5.111462116241455, + "learning_rate": 6.1218045112781965e-06, + "loss": 0.2691, + "step": 25790 + }, + { + "epoch": 38.8, + "grad_norm": 3.890907049179077, + "learning_rate": 6.1203007518797e-06, + "loss": 0.2056, + "step": 25800 + }, + { + "epoch": 38.81, + "grad_norm": 8.117431640625, + "learning_rate": 6.118796992481203e-06, + "loss": 0.2391, + "step": 25810 + }, + { + "epoch": 38.83, + "grad_norm": 3.895110845565796, + "learning_rate": 6.117293233082707e-06, + "loss": 0.2405, + "step": 25820 + }, + { + "epoch": 38.84, + "grad_norm": 5.332907676696777, + "learning_rate": 6.1157894736842106e-06, + "loss": 0.2132, + "step": 25830 + }, + { + "epoch": 38.86, + "grad_norm": 5.767151355743408, + "learning_rate": 6.114285714285715e-06, + "loss": 0.2321, + "step": 25840 + }, + { + "epoch": 38.87, + "grad_norm": 6.4265899658203125, + "learning_rate": 6.1127819548872184e-06, + "loss": 0.265, + "step": 25850 + }, + { + "epoch": 38.89, + "grad_norm": 5.724833011627197, + "learning_rate": 6.111278195488722e-06, + "loss": 0.2308, + "step": 25860 + }, + { + "epoch": 38.9, + "grad_norm": 4.019651412963867, + "learning_rate": 6.1097744360902255e-06, + "loss": 0.2451, + "step": 25870 + }, + { + "epoch": 38.92, + "grad_norm": 6.344007968902588, + "learning_rate": 6.10827067669173e-06, + "loss": 0.2445, + "step": 25880 + }, + { + "epoch": 38.93, + "grad_norm": 4.17384672164917, + "learning_rate": 6.106766917293233e-06, + "loss": 0.2344, + "step": 25890 + }, + { + "epoch": 38.95, + "grad_norm": 7.522851467132568, + "learning_rate": 6.105263157894738e-06, + "loss": 0.2443, + "step": 25900 + }, + { + "epoch": 38.96, + "grad_norm": 5.275865077972412, + "learning_rate": 6.10375939849624e-06, + "loss": 0.2393, + "step": 25910 + }, + { + "epoch": 38.98, + "grad_norm": 12.723134994506836, + "learning_rate": 6.102255639097745e-06, + "loss": 0.2157, + "step": 25920 + }, + { + "epoch": 38.99, + "grad_norm": 6.258028507232666, + "learning_rate": 6.100751879699248e-06, + "loss": 0.3343, + "step": 25930 + }, + { + "epoch": 39.0, + "eval_accuracy": 0.9299, + "eval_loss": 0.30296048521995544, + "eval_runtime": 85.0044, + "eval_samples_per_second": 117.641, + "eval_steps_per_second": 0.471, + "step": 25935 + }, + { + "epoch": 39.01, + "grad_norm": 7.032735347747803, + "learning_rate": 6.099248120300753e-06, + "loss": 0.2688, + "step": 25940 + }, + { + "epoch": 39.02, + "grad_norm": 4.064879417419434, + "learning_rate": 6.097744360902256e-06, + "loss": 0.2388, + "step": 25950 + }, + { + "epoch": 39.04, + "grad_norm": 10.355484008789062, + "learning_rate": 6.09624060150376e-06, + "loss": 0.236, + "step": 25960 + }, + { + "epoch": 39.05, + "grad_norm": 5.589707374572754, + "learning_rate": 6.094736842105263e-06, + "loss": 0.284, + "step": 25970 + }, + { + "epoch": 39.07, + "grad_norm": 5.585343360900879, + "learning_rate": 6.0932330827067676e-06, + "loss": 0.2678, + "step": 25980 + }, + { + "epoch": 39.08, + "grad_norm": 5.8805670738220215, + "learning_rate": 6.091729323308271e-06, + "loss": 0.2468, + "step": 25990 + }, + { + "epoch": 39.1, + "grad_norm": 3.8114516735076904, + "learning_rate": 6.0902255639097755e-06, + "loss": 0.2651, + "step": 26000 + }, + { + "epoch": 39.11, + "grad_norm": 2.4582362174987793, + "learning_rate": 6.088721804511278e-06, + "loss": 0.2564, + "step": 26010 + }, + { + "epoch": 39.13, + "grad_norm": 7.29220724105835, + "learning_rate": 6.0872180451127825e-06, + "loss": 0.2869, + "step": 26020 + }, + { + "epoch": 39.14, + "grad_norm": 5.818304061889648, + "learning_rate": 6.085714285714286e-06, + "loss": 0.1954, + "step": 26030 + }, + { + "epoch": 39.16, + "grad_norm": 3.6301677227020264, + "learning_rate": 6.08421052631579e-06, + "loss": 0.2483, + "step": 26040 + }, + { + "epoch": 39.17, + "grad_norm": 8.798738479614258, + "learning_rate": 6.082706766917293e-06, + "loss": 0.2372, + "step": 26050 + }, + { + "epoch": 39.19, + "grad_norm": 4.8981404304504395, + "learning_rate": 6.081203007518797e-06, + "loss": 0.1533, + "step": 26060 + }, + { + "epoch": 39.2, + "grad_norm": 3.82854962348938, + "learning_rate": 6.079699248120301e-06, + "loss": 0.1997, + "step": 26070 + }, + { + "epoch": 39.22, + "grad_norm": 5.643593788146973, + "learning_rate": 6.078195488721805e-06, + "loss": 0.195, + "step": 26080 + }, + { + "epoch": 39.23, + "grad_norm": 5.393759727478027, + "learning_rate": 6.076691729323309e-06, + "loss": 0.2615, + "step": 26090 + }, + { + "epoch": 39.25, + "grad_norm": 4.524438381195068, + "learning_rate": 6.075187969924813e-06, + "loss": 0.2547, + "step": 26100 + }, + { + "epoch": 39.26, + "grad_norm": 4.402938365936279, + "learning_rate": 6.073684210526316e-06, + "loss": 0.3235, + "step": 26110 + }, + { + "epoch": 39.28, + "grad_norm": 4.850139141082764, + "learning_rate": 6.07218045112782e-06, + "loss": 0.3356, + "step": 26120 + }, + { + "epoch": 39.29, + "grad_norm": 6.940186023712158, + "learning_rate": 6.070676691729324e-06, + "loss": 0.2929, + "step": 26130 + }, + { + "epoch": 39.31, + "grad_norm": 5.906674861907959, + "learning_rate": 6.069172932330828e-06, + "loss": 0.2722, + "step": 26140 + }, + { + "epoch": 39.32, + "grad_norm": 6.132667541503906, + "learning_rate": 6.067669172932331e-06, + "loss": 0.2593, + "step": 26150 + }, + { + "epoch": 39.34, + "grad_norm": 4.581514835357666, + "learning_rate": 6.066165413533835e-06, + "loss": 0.2535, + "step": 26160 + }, + { + "epoch": 39.35, + "grad_norm": 7.306211471557617, + "learning_rate": 6.064661654135339e-06, + "loss": 0.2108, + "step": 26170 + }, + { + "epoch": 39.37, + "grad_norm": 5.0160040855407715, + "learning_rate": 6.063157894736843e-06, + "loss": 0.2747, + "step": 26180 + }, + { + "epoch": 39.38, + "grad_norm": 6.838048458099365, + "learning_rate": 6.0616541353383466e-06, + "loss": 0.2339, + "step": 26190 + }, + { + "epoch": 39.4, + "grad_norm": 7.156051158905029, + "learning_rate": 6.060150375939851e-06, + "loss": 0.2565, + "step": 26200 + }, + { + "epoch": 39.41, + "grad_norm": 3.844694137573242, + "learning_rate": 6.058646616541354e-06, + "loss": 0.2644, + "step": 26210 + }, + { + "epoch": 39.43, + "grad_norm": 9.594381332397461, + "learning_rate": 6.057142857142858e-06, + "loss": 0.2381, + "step": 26220 + }, + { + "epoch": 39.44, + "grad_norm": 6.174018383026123, + "learning_rate": 6.0556390977443615e-06, + "loss": 0.2974, + "step": 26230 + }, + { + "epoch": 39.46, + "grad_norm": 4.462780952453613, + "learning_rate": 6.054135338345866e-06, + "loss": 0.3042, + "step": 26240 + }, + { + "epoch": 39.47, + "grad_norm": 5.761167049407959, + "learning_rate": 6.0526315789473685e-06, + "loss": 0.2494, + "step": 26250 + }, + { + "epoch": 39.49, + "grad_norm": 6.0231733322143555, + "learning_rate": 6.051127819548873e-06, + "loss": 0.2232, + "step": 26260 + }, + { + "epoch": 39.5, + "grad_norm": 6.92008113861084, + "learning_rate": 6.049624060150376e-06, + "loss": 0.2763, + "step": 26270 + }, + { + "epoch": 39.52, + "grad_norm": 8.816802978515625, + "learning_rate": 6.048120300751881e-06, + "loss": 0.2386, + "step": 26280 + }, + { + "epoch": 39.53, + "grad_norm": 9.278828620910645, + "learning_rate": 6.046616541353384e-06, + "loss": 0.305, + "step": 26290 + }, + { + "epoch": 39.55, + "grad_norm": 6.65963888168335, + "learning_rate": 6.045112781954889e-06, + "loss": 0.2616, + "step": 26300 + }, + { + "epoch": 39.56, + "grad_norm": 6.531272888183594, + "learning_rate": 6.043609022556391e-06, + "loss": 0.2339, + "step": 26310 + }, + { + "epoch": 39.58, + "grad_norm": 5.144484043121338, + "learning_rate": 6.042105263157895e-06, + "loss": 0.3106, + "step": 26320 + }, + { + "epoch": 39.59, + "grad_norm": 5.5032243728637695, + "learning_rate": 6.040601503759399e-06, + "loss": 0.2968, + "step": 26330 + }, + { + "epoch": 39.61, + "grad_norm": 3.399604320526123, + "learning_rate": 6.039097744360902e-06, + "loss": 0.2153, + "step": 26340 + }, + { + "epoch": 39.62, + "grad_norm": 6.184195041656494, + "learning_rate": 6.037593984962406e-06, + "loss": 0.306, + "step": 26350 + }, + { + "epoch": 39.64, + "grad_norm": 5.967001914978027, + "learning_rate": 6.03609022556391e-06, + "loss": 0.3201, + "step": 26360 + }, + { + "epoch": 39.65, + "grad_norm": 3.1237454414367676, + "learning_rate": 6.034586466165414e-06, + "loss": 0.2411, + "step": 26370 + }, + { + "epoch": 39.67, + "grad_norm": 5.807510852813721, + "learning_rate": 6.033082706766918e-06, + "loss": 0.2651, + "step": 26380 + }, + { + "epoch": 39.68, + "grad_norm": 3.013021230697632, + "learning_rate": 6.031578947368422e-06, + "loss": 0.2835, + "step": 26390 + }, + { + "epoch": 39.7, + "grad_norm": 6.245032787322998, + "learning_rate": 6.030075187969925e-06, + "loss": 0.3153, + "step": 26400 + }, + { + "epoch": 39.71, + "grad_norm": 2.45760440826416, + "learning_rate": 6.028571428571429e-06, + "loss": 0.2501, + "step": 26410 + }, + { + "epoch": 39.73, + "grad_norm": 6.795231342315674, + "learning_rate": 6.0270676691729326e-06, + "loss": 0.2885, + "step": 26420 + }, + { + "epoch": 39.74, + "grad_norm": 4.70608377456665, + "learning_rate": 6.025563909774437e-06, + "loss": 0.2741, + "step": 26430 + }, + { + "epoch": 39.76, + "grad_norm": 3.3417322635650635, + "learning_rate": 6.02406015037594e-06, + "loss": 0.31, + "step": 26440 + }, + { + "epoch": 39.77, + "grad_norm": 4.526313781738281, + "learning_rate": 6.022556390977444e-06, + "loss": 0.3074, + "step": 26450 + }, + { + "epoch": 39.79, + "grad_norm": 4.290112495422363, + "learning_rate": 6.0210526315789475e-06, + "loss": 0.2311, + "step": 26460 + }, + { + "epoch": 39.8, + "grad_norm": 7.921751499176025, + "learning_rate": 6.019548872180452e-06, + "loss": 0.2454, + "step": 26470 + }, + { + "epoch": 39.82, + "grad_norm": 5.810790538787842, + "learning_rate": 6.018045112781955e-06, + "loss": 0.2412, + "step": 26480 + }, + { + "epoch": 39.83, + "grad_norm": 7.158168792724609, + "learning_rate": 6.01654135338346e-06, + "loss": 0.2827, + "step": 26490 + }, + { + "epoch": 39.85, + "grad_norm": 5.13280725479126, + "learning_rate": 6.015037593984962e-06, + "loss": 0.2417, + "step": 26500 + }, + { + "epoch": 39.86, + "grad_norm": 5.112004280090332, + "learning_rate": 6.013533834586467e-06, + "loss": 0.2607, + "step": 26510 + }, + { + "epoch": 39.88, + "grad_norm": 3.5853915214538574, + "learning_rate": 6.01203007518797e-06, + "loss": 0.2105, + "step": 26520 + }, + { + "epoch": 39.89, + "grad_norm": 7.726737976074219, + "learning_rate": 6.010526315789475e-06, + "loss": 0.2266, + "step": 26530 + }, + { + "epoch": 39.91, + "grad_norm": 4.511539936065674, + "learning_rate": 6.009022556390977e-06, + "loss": 0.2388, + "step": 26540 + }, + { + "epoch": 39.92, + "grad_norm": 6.07355260848999, + "learning_rate": 6.007518796992482e-06, + "loss": 0.2129, + "step": 26550 + }, + { + "epoch": 39.94, + "grad_norm": 4.063088417053223, + "learning_rate": 6.006015037593985e-06, + "loss": 0.2759, + "step": 26560 + }, + { + "epoch": 39.95, + "grad_norm": 8.524380683898926, + "learning_rate": 6.00451127819549e-06, + "loss": 0.2703, + "step": 26570 + }, + { + "epoch": 39.97, + "grad_norm": 5.095054626464844, + "learning_rate": 6.003007518796993e-06, + "loss": 0.2418, + "step": 26580 + }, + { + "epoch": 39.98, + "grad_norm": 4.896398067474365, + "learning_rate": 6.001503759398497e-06, + "loss": 0.294, + "step": 26590 + }, + { + "epoch": 40.0, + "grad_norm": 41.88473892211914, + "learning_rate": 6e-06, + "loss": 0.2252, + "step": 26600 + }, + { + "epoch": 40.0, + "eval_accuracy": 0.9313, + "eval_loss": 0.2959575057029724, + "eval_runtime": 84.9779, + "eval_samples_per_second": 117.678, + "eval_steps_per_second": 0.471, + "step": 26600 + }, + { + "epoch": 40.02, + "grad_norm": 11.272649765014648, + "learning_rate": 5.9984962406015045e-06, + "loss": 0.2401, + "step": 26610 + }, + { + "epoch": 40.03, + "grad_norm": 6.342247486114502, + "learning_rate": 5.996992481203008e-06, + "loss": 0.1853, + "step": 26620 + }, + { + "epoch": 40.05, + "grad_norm": 8.127535820007324, + "learning_rate": 5.995488721804512e-06, + "loss": 0.2091, + "step": 26630 + }, + { + "epoch": 40.06, + "grad_norm": 7.700329303741455, + "learning_rate": 5.993984962406015e-06, + "loss": 0.2537, + "step": 26640 + }, + { + "epoch": 40.08, + "grad_norm": 4.249181747436523, + "learning_rate": 5.9924812030075194e-06, + "loss": 0.2576, + "step": 26650 + }, + { + "epoch": 40.09, + "grad_norm": 6.067375659942627, + "learning_rate": 5.990977443609023e-06, + "loss": 0.2815, + "step": 26660 + }, + { + "epoch": 40.11, + "grad_norm": 5.851858139038086, + "learning_rate": 5.989473684210527e-06, + "loss": 0.2741, + "step": 26670 + }, + { + "epoch": 40.12, + "grad_norm": 2.2680578231811523, + "learning_rate": 5.987969924812031e-06, + "loss": 0.2571, + "step": 26680 + }, + { + "epoch": 40.14, + "grad_norm": 6.329833984375, + "learning_rate": 5.986466165413534e-06, + "loss": 0.1918, + "step": 26690 + }, + { + "epoch": 40.15, + "grad_norm": 3.3135337829589844, + "learning_rate": 5.984962406015038e-06, + "loss": 0.2999, + "step": 26700 + }, + { + "epoch": 40.17, + "grad_norm": 5.360442161560059, + "learning_rate": 5.983458646616542e-06, + "loss": 0.248, + "step": 26710 + }, + { + "epoch": 40.18, + "grad_norm": 5.619331359863281, + "learning_rate": 5.981954887218046e-06, + "loss": 0.2665, + "step": 26720 + }, + { + "epoch": 40.2, + "grad_norm": 6.200700283050537, + "learning_rate": 5.98045112781955e-06, + "loss": 0.2415, + "step": 26730 + }, + { + "epoch": 40.21, + "grad_norm": 6.159794330596924, + "learning_rate": 5.978947368421053e-06, + "loss": 0.3265, + "step": 26740 + }, + { + "epoch": 40.23, + "grad_norm": 4.464012622833252, + "learning_rate": 5.977443609022557e-06, + "loss": 0.2426, + "step": 26750 + }, + { + "epoch": 40.24, + "grad_norm": 6.896475791931152, + "learning_rate": 5.975939849624061e-06, + "loss": 0.2842, + "step": 26760 + }, + { + "epoch": 40.26, + "grad_norm": 5.030261993408203, + "learning_rate": 5.974436090225565e-06, + "loss": 0.2208, + "step": 26770 + }, + { + "epoch": 40.27, + "grad_norm": 7.4109392166137695, + "learning_rate": 5.972932330827068e-06, + "loss": 0.3284, + "step": 26780 + }, + { + "epoch": 40.29, + "grad_norm": 3.889845132827759, + "learning_rate": 5.971428571428572e-06, + "loss": 0.2688, + "step": 26790 + }, + { + "epoch": 40.3, + "grad_norm": 2.1321589946746826, + "learning_rate": 5.969924812030076e-06, + "loss": 0.1939, + "step": 26800 + }, + { + "epoch": 40.32, + "grad_norm": 10.229703903198242, + "learning_rate": 5.968421052631579e-06, + "loss": 0.2851, + "step": 26810 + }, + { + "epoch": 40.33, + "grad_norm": 2.8729848861694336, + "learning_rate": 5.9669172932330835e-06, + "loss": 0.2451, + "step": 26820 + }, + { + "epoch": 40.35, + "grad_norm": 5.045018672943115, + "learning_rate": 5.965413533834586e-06, + "loss": 0.2655, + "step": 26830 + }, + { + "epoch": 40.36, + "grad_norm": 6.061784744262695, + "learning_rate": 5.9639097744360905e-06, + "loss": 0.2106, + "step": 26840 + }, + { + "epoch": 40.38, + "grad_norm": 4.3670654296875, + "learning_rate": 5.962406015037594e-06, + "loss": 0.2508, + "step": 26850 + }, + { + "epoch": 40.39, + "grad_norm": 4.169200897216797, + "learning_rate": 5.9609022556390984e-06, + "loss": 0.2374, + "step": 26860 + }, + { + "epoch": 40.41, + "grad_norm": 9.870522499084473, + "learning_rate": 5.959398496240601e-06, + "loss": 0.2112, + "step": 26870 + }, + { + "epoch": 40.42, + "grad_norm": 4.387085914611816, + "learning_rate": 5.9578947368421055e-06, + "loss": 0.2245, + "step": 26880 + }, + { + "epoch": 40.44, + "grad_norm": 4.728475093841553, + "learning_rate": 5.956390977443609e-06, + "loss": 0.2252, + "step": 26890 + }, + { + "epoch": 40.45, + "grad_norm": 3.2639694213867188, + "learning_rate": 5.954887218045113e-06, + "loss": 0.2637, + "step": 26900 + }, + { + "epoch": 40.47, + "grad_norm": 3.462743043899536, + "learning_rate": 5.953383458646617e-06, + "loss": 0.2298, + "step": 26910 + }, + { + "epoch": 40.48, + "grad_norm": 7.873288631439209, + "learning_rate": 5.951879699248121e-06, + "loss": 0.2781, + "step": 26920 + }, + { + "epoch": 40.5, + "grad_norm": 5.6623334884643555, + "learning_rate": 5.950375939849624e-06, + "loss": 0.3083, + "step": 26930 + }, + { + "epoch": 40.51, + "grad_norm": 4.990388870239258, + "learning_rate": 5.948872180451128e-06, + "loss": 0.2423, + "step": 26940 + }, + { + "epoch": 40.53, + "grad_norm": 4.813294887542725, + "learning_rate": 5.947368421052632e-06, + "loss": 0.3243, + "step": 26950 + }, + { + "epoch": 40.54, + "grad_norm": 4.974578857421875, + "learning_rate": 5.945864661654136e-06, + "loss": 0.2494, + "step": 26960 + }, + { + "epoch": 40.56, + "grad_norm": 5.326529502868652, + "learning_rate": 5.944360902255639e-06, + "loss": 0.3272, + "step": 26970 + }, + { + "epoch": 40.57, + "grad_norm": 6.8446245193481445, + "learning_rate": 5.942857142857143e-06, + "loss": 0.3175, + "step": 26980 + }, + { + "epoch": 40.59, + "grad_norm": 7.015409469604492, + "learning_rate": 5.941353383458647e-06, + "loss": 0.2259, + "step": 26990 + }, + { + "epoch": 40.6, + "grad_norm": 5.787068843841553, + "learning_rate": 5.939849624060151e-06, + "loss": 0.2027, + "step": 27000 + }, + { + "epoch": 40.62, + "grad_norm": 8.761579513549805, + "learning_rate": 5.938345864661655e-06, + "loss": 0.229, + "step": 27010 + }, + { + "epoch": 40.63, + "grad_norm": 6.765686988830566, + "learning_rate": 5.936842105263159e-06, + "loss": 0.2945, + "step": 27020 + }, + { + "epoch": 40.65, + "grad_norm": 8.93405532836914, + "learning_rate": 5.935338345864662e-06, + "loss": 0.3073, + "step": 27030 + }, + { + "epoch": 40.66, + "grad_norm": 6.20574951171875, + "learning_rate": 5.933834586466166e-06, + "loss": 0.2785, + "step": 27040 + }, + { + "epoch": 40.68, + "grad_norm": 7.853849411010742, + "learning_rate": 5.9323308270676695e-06, + "loss": 0.2294, + "step": 27050 + }, + { + "epoch": 40.69, + "grad_norm": 4.646696090698242, + "learning_rate": 5.930827067669174e-06, + "loss": 0.2995, + "step": 27060 + }, + { + "epoch": 40.71, + "grad_norm": 7.266605377197266, + "learning_rate": 5.9293233082706766e-06, + "loss": 0.259, + "step": 27070 + }, + { + "epoch": 40.72, + "grad_norm": 6.365235805511475, + "learning_rate": 5.927819548872181e-06, + "loss": 0.2905, + "step": 27080 + }, + { + "epoch": 40.74, + "grad_norm": 4.6850457191467285, + "learning_rate": 5.9263157894736844e-06, + "loss": 0.2313, + "step": 27090 + }, + { + "epoch": 40.75, + "grad_norm": 2.9510128498077393, + "learning_rate": 5.924812030075189e-06, + "loss": 0.3019, + "step": 27100 + }, + { + "epoch": 40.77, + "grad_norm": 6.478631973266602, + "learning_rate": 5.923308270676692e-06, + "loss": 0.2318, + "step": 27110 + }, + { + "epoch": 40.78, + "grad_norm": 4.466511249542236, + "learning_rate": 5.921804511278197e-06, + "loss": 0.2816, + "step": 27120 + }, + { + "epoch": 40.8, + "grad_norm": 3.0053863525390625, + "learning_rate": 5.920300751879699e-06, + "loss": 0.2526, + "step": 27130 + }, + { + "epoch": 40.81, + "grad_norm": 4.911371231079102, + "learning_rate": 5.918796992481204e-06, + "loss": 0.2546, + "step": 27140 + }, + { + "epoch": 40.83, + "grad_norm": 7.452986240386963, + "learning_rate": 5.917293233082707e-06, + "loss": 0.2378, + "step": 27150 + }, + { + "epoch": 40.84, + "grad_norm": 6.792994499206543, + "learning_rate": 5.915789473684212e-06, + "loss": 0.3151, + "step": 27160 + }, + { + "epoch": 40.86, + "grad_norm": 5.100854396820068, + "learning_rate": 5.914285714285714e-06, + "loss": 0.253, + "step": 27170 + }, + { + "epoch": 40.87, + "grad_norm": 3.16489577293396, + "learning_rate": 5.912781954887219e-06, + "loss": 0.2431, + "step": 27180 + }, + { + "epoch": 40.89, + "grad_norm": 9.801965713500977, + "learning_rate": 5.911278195488722e-06, + "loss": 0.2013, + "step": 27190 + }, + { + "epoch": 40.9, + "grad_norm": 3.110069990158081, + "learning_rate": 5.9097744360902265e-06, + "loss": 0.2555, + "step": 27200 + }, + { + "epoch": 40.92, + "grad_norm": 4.8249006271362305, + "learning_rate": 5.90827067669173e-06, + "loss": 0.1823, + "step": 27210 + }, + { + "epoch": 40.93, + "grad_norm": 2.37221622467041, + "learning_rate": 5.9067669172932344e-06, + "loss": 0.2177, + "step": 27220 + }, + { + "epoch": 40.95, + "grad_norm": 4.915449142456055, + "learning_rate": 5.905263157894737e-06, + "loss": 0.2638, + "step": 27230 + }, + { + "epoch": 40.96, + "grad_norm": 4.170838832855225, + "learning_rate": 5.9037593984962415e-06, + "loss": 0.2424, + "step": 27240 + }, + { + "epoch": 40.98, + "grad_norm": 6.760472297668457, + "learning_rate": 5.902255639097745e-06, + "loss": 0.237, + "step": 27250 + }, + { + "epoch": 40.99, + "grad_norm": 4.23274564743042, + "learning_rate": 5.900751879699249e-06, + "loss": 0.2453, + "step": 27260 + }, + { + "epoch": 41.0, + "eval_accuracy": 0.9302, + "eval_loss": 0.2976926863193512, + "eval_runtime": 84.6737, + "eval_samples_per_second": 118.1, + "eval_steps_per_second": 0.472, + "step": 27265 + }, + { + "epoch": 41.01, + "grad_norm": 7.621011734008789, + "learning_rate": 5.899248120300752e-06, + "loss": 0.2292, + "step": 27270 + }, + { + "epoch": 41.02, + "grad_norm": 3.8886141777038574, + "learning_rate": 5.897744360902256e-06, + "loss": 0.1851, + "step": 27280 + }, + { + "epoch": 41.04, + "grad_norm": 5.533069133758545, + "learning_rate": 5.89624060150376e-06, + "loss": 0.2613, + "step": 27290 + }, + { + "epoch": 41.05, + "grad_norm": 4.005669116973877, + "learning_rate": 5.8947368421052634e-06, + "loss": 0.1644, + "step": 27300 + }, + { + "epoch": 41.07, + "grad_norm": 3.7740938663482666, + "learning_rate": 5.893233082706768e-06, + "loss": 0.1893, + "step": 27310 + }, + { + "epoch": 41.08, + "grad_norm": 8.077632904052734, + "learning_rate": 5.8917293233082705e-06, + "loss": 0.2166, + "step": 27320 + }, + { + "epoch": 41.1, + "grad_norm": 2.8428125381469727, + "learning_rate": 5.890225563909775e-06, + "loss": 0.2274, + "step": 27330 + }, + { + "epoch": 41.11, + "grad_norm": 2.748422861099243, + "learning_rate": 5.888721804511278e-06, + "loss": 0.1786, + "step": 27340 + }, + { + "epoch": 41.13, + "grad_norm": 6.080495834350586, + "learning_rate": 5.887218045112783e-06, + "loss": 0.3044, + "step": 27350 + }, + { + "epoch": 41.14, + "grad_norm": 6.971614837646484, + "learning_rate": 5.885714285714285e-06, + "loss": 0.2141, + "step": 27360 + }, + { + "epoch": 41.16, + "grad_norm": 3.363379716873169, + "learning_rate": 5.88421052631579e-06, + "loss": 0.3337, + "step": 27370 + }, + { + "epoch": 41.17, + "grad_norm": 4.116064548492432, + "learning_rate": 5.882706766917293e-06, + "loss": 0.2026, + "step": 27380 + }, + { + "epoch": 41.19, + "grad_norm": 2.9727118015289307, + "learning_rate": 5.881203007518798e-06, + "loss": 0.2342, + "step": 27390 + }, + { + "epoch": 41.2, + "grad_norm": 4.2843337059021, + "learning_rate": 5.879699248120301e-06, + "loss": 0.2033, + "step": 27400 + }, + { + "epoch": 41.22, + "grad_norm": 11.28203010559082, + "learning_rate": 5.8781954887218055e-06, + "loss": 0.2623, + "step": 27410 + }, + { + "epoch": 41.23, + "grad_norm": 1.7853152751922607, + "learning_rate": 5.876691729323308e-06, + "loss": 0.1762, + "step": 27420 + }, + { + "epoch": 41.25, + "grad_norm": 2.5484893321990967, + "learning_rate": 5.8751879699248126e-06, + "loss": 0.1423, + "step": 27430 + }, + { + "epoch": 41.26, + "grad_norm": 5.5011186599731445, + "learning_rate": 5.873684210526316e-06, + "loss": 0.2804, + "step": 27440 + }, + { + "epoch": 41.28, + "grad_norm": 4.6855645179748535, + "learning_rate": 5.8721804511278204e-06, + "loss": 0.2465, + "step": 27450 + }, + { + "epoch": 41.29, + "grad_norm": 7.020749092102051, + "learning_rate": 5.870676691729323e-06, + "loss": 0.1858, + "step": 27460 + }, + { + "epoch": 41.31, + "grad_norm": 3.362569808959961, + "learning_rate": 5.8691729323308275e-06, + "loss": 0.2039, + "step": 27470 + }, + { + "epoch": 41.32, + "grad_norm": 7.235138893127441, + "learning_rate": 5.867669172932331e-06, + "loss": 0.2206, + "step": 27480 + }, + { + "epoch": 41.34, + "grad_norm": 8.273979187011719, + "learning_rate": 5.866165413533835e-06, + "loss": 0.2561, + "step": 27490 + }, + { + "epoch": 41.35, + "grad_norm": 9.396625518798828, + "learning_rate": 5.864661654135339e-06, + "loss": 0.2517, + "step": 27500 + }, + { + "epoch": 41.37, + "grad_norm": 6.119128704071045, + "learning_rate": 5.863157894736842e-06, + "loss": 0.2438, + "step": 27510 + }, + { + "epoch": 41.38, + "grad_norm": 6.962316989898682, + "learning_rate": 5.861654135338346e-06, + "loss": 0.267, + "step": 27520 + }, + { + "epoch": 41.4, + "grad_norm": 2.296393632888794, + "learning_rate": 5.86015037593985e-06, + "loss": 0.2358, + "step": 27530 + }, + { + "epoch": 41.41, + "grad_norm": 6.2318830490112305, + "learning_rate": 5.858646616541354e-06, + "loss": 0.2647, + "step": 27540 + }, + { + "epoch": 41.43, + "grad_norm": 5.737059116363525, + "learning_rate": 5.857142857142858e-06, + "loss": 0.2347, + "step": 27550 + }, + { + "epoch": 41.44, + "grad_norm": 6.697840213775635, + "learning_rate": 5.855639097744361e-06, + "loss": 0.1956, + "step": 27560 + }, + { + "epoch": 41.46, + "grad_norm": 4.799352169036865, + "learning_rate": 5.854135338345865e-06, + "loss": 0.2545, + "step": 27570 + }, + { + "epoch": 41.47, + "grad_norm": 9.115018844604492, + "learning_rate": 5.852631578947369e-06, + "loss": 0.2221, + "step": 27580 + }, + { + "epoch": 41.49, + "grad_norm": 3.4361791610717773, + "learning_rate": 5.851127819548873e-06, + "loss": 0.2736, + "step": 27590 + }, + { + "epoch": 41.5, + "grad_norm": 6.11599588394165, + "learning_rate": 5.849624060150377e-06, + "loss": 0.2768, + "step": 27600 + }, + { + "epoch": 41.52, + "grad_norm": 17.091064453125, + "learning_rate": 5.84812030075188e-06, + "loss": 0.2645, + "step": 27610 + }, + { + "epoch": 41.53, + "grad_norm": 2.6468470096588135, + "learning_rate": 5.846616541353384e-06, + "loss": 0.2765, + "step": 27620 + }, + { + "epoch": 41.55, + "grad_norm": 8.783407211303711, + "learning_rate": 5.845112781954888e-06, + "loss": 0.2277, + "step": 27630 + }, + { + "epoch": 41.56, + "grad_norm": 9.349631309509277, + "learning_rate": 5.8436090225563915e-06, + "loss": 0.2197, + "step": 27640 + }, + { + "epoch": 41.58, + "grad_norm": 5.436890125274658, + "learning_rate": 5.842105263157896e-06, + "loss": 0.2632, + "step": 27650 + }, + { + "epoch": 41.59, + "grad_norm": 2.644437313079834, + "learning_rate": 5.840601503759399e-06, + "loss": 0.2113, + "step": 27660 + }, + { + "epoch": 41.61, + "grad_norm": 7.14797306060791, + "learning_rate": 5.839097744360903e-06, + "loss": 0.2395, + "step": 27670 + }, + { + "epoch": 41.62, + "grad_norm": 5.408485412597656, + "learning_rate": 5.8375939849624065e-06, + "loss": 0.23, + "step": 27680 + }, + { + "epoch": 41.64, + "grad_norm": 6.316678524017334, + "learning_rate": 5.836090225563911e-06, + "loss": 0.2506, + "step": 27690 + }, + { + "epoch": 41.65, + "grad_norm": 4.856528282165527, + "learning_rate": 5.8345864661654135e-06, + "loss": 0.2565, + "step": 27700 + }, + { + "epoch": 41.67, + "grad_norm": 3.731872797012329, + "learning_rate": 5.833082706766918e-06, + "loss": 0.2733, + "step": 27710 + }, + { + "epoch": 41.68, + "grad_norm": 10.921002388000488, + "learning_rate": 5.831578947368421e-06, + "loss": 0.233, + "step": 27720 + }, + { + "epoch": 41.7, + "grad_norm": 8.051673889160156, + "learning_rate": 5.830075187969926e-06, + "loss": 0.2357, + "step": 27730 + }, + { + "epoch": 41.71, + "grad_norm": 5.143133640289307, + "learning_rate": 5.828571428571429e-06, + "loss": 0.2399, + "step": 27740 + }, + { + "epoch": 41.73, + "grad_norm": 6.688822269439697, + "learning_rate": 5.827067669172934e-06, + "loss": 0.2199, + "step": 27750 + }, + { + "epoch": 41.74, + "grad_norm": 5.77666711807251, + "learning_rate": 5.825563909774436e-06, + "loss": 0.2227, + "step": 27760 + }, + { + "epoch": 41.76, + "grad_norm": 5.851940155029297, + "learning_rate": 5.824060150375941e-06, + "loss": 0.2161, + "step": 27770 + }, + { + "epoch": 41.77, + "grad_norm": 3.7167811393737793, + "learning_rate": 5.822556390977444e-06, + "loss": 0.2276, + "step": 27780 + }, + { + "epoch": 41.79, + "grad_norm": 1.5249907970428467, + "learning_rate": 5.8210526315789486e-06, + "loss": 0.2857, + "step": 27790 + }, + { + "epoch": 41.8, + "grad_norm": 3.788996458053589, + "learning_rate": 5.819548872180451e-06, + "loss": 0.2901, + "step": 27800 + }, + { + "epoch": 41.82, + "grad_norm": 3.4979562759399414, + "learning_rate": 5.818045112781955e-06, + "loss": 0.2643, + "step": 27810 + }, + { + "epoch": 41.83, + "grad_norm": 5.074125289916992, + "learning_rate": 5.816541353383459e-06, + "loss": 0.2909, + "step": 27820 + }, + { + "epoch": 41.85, + "grad_norm": 7.354587078094482, + "learning_rate": 5.815037593984963e-06, + "loss": 0.2756, + "step": 27830 + }, + { + "epoch": 41.86, + "grad_norm": 5.332225322723389, + "learning_rate": 5.813533834586467e-06, + "loss": 0.2369, + "step": 27840 + }, + { + "epoch": 41.88, + "grad_norm": 3.5564475059509277, + "learning_rate": 5.81203007518797e-06, + "loss": 0.3292, + "step": 27850 + }, + { + "epoch": 41.89, + "grad_norm": 5.977893829345703, + "learning_rate": 5.810526315789474e-06, + "loss": 0.2441, + "step": 27860 + }, + { + "epoch": 41.91, + "grad_norm": 7.151272773742676, + "learning_rate": 5.8090225563909776e-06, + "loss": 0.2375, + "step": 27870 + }, + { + "epoch": 41.92, + "grad_norm": 3.628330707550049, + "learning_rate": 5.807518796992482e-06, + "loss": 0.2102, + "step": 27880 + }, + { + "epoch": 41.94, + "grad_norm": 6.060537338256836, + "learning_rate": 5.806015037593985e-06, + "loss": 0.2511, + "step": 27890 + }, + { + "epoch": 41.95, + "grad_norm": 6.281295299530029, + "learning_rate": 5.804511278195489e-06, + "loss": 0.2389, + "step": 27900 + }, + { + "epoch": 41.97, + "grad_norm": 6.898097515106201, + "learning_rate": 5.8030075187969925e-06, + "loss": 0.282, + "step": 27910 + }, + { + "epoch": 41.98, + "grad_norm": 4.405340671539307, + "learning_rate": 5.801503759398497e-06, + "loss": 0.2674, + "step": 27920 + }, + { + "epoch": 42.0, + "grad_norm": 4.30500602722168, + "learning_rate": 5.8e-06, + "loss": 0.2467, + "step": 27930 + }, + { + "epoch": 42.0, + "eval_accuracy": 0.9293, + "eval_loss": 0.30341148376464844, + "eval_runtime": 84.726, + "eval_samples_per_second": 118.028, + "eval_steps_per_second": 0.472, + "step": 27930 + }, + { + "epoch": 42.02, + "grad_norm": 6.07786750793457, + "learning_rate": 5.798496240601505e-06, + "loss": 0.2712, + "step": 27940 + }, + { + "epoch": 42.03, + "grad_norm": 6.053528308868408, + "learning_rate": 5.796992481203007e-06, + "loss": 0.2482, + "step": 27950 + }, + { + "epoch": 42.05, + "grad_norm": 5.752837657928467, + "learning_rate": 5.795488721804512e-06, + "loss": 0.2579, + "step": 27960 + }, + { + "epoch": 42.06, + "grad_norm": 8.296350479125977, + "learning_rate": 5.793984962406015e-06, + "loss": 0.2481, + "step": 27970 + }, + { + "epoch": 42.08, + "grad_norm": 4.709738254547119, + "learning_rate": 5.79248120300752e-06, + "loss": 0.1771, + "step": 27980 + }, + { + "epoch": 42.09, + "grad_norm": 3.9645566940307617, + "learning_rate": 5.790977443609022e-06, + "loss": 0.218, + "step": 27990 + }, + { + "epoch": 42.11, + "grad_norm": 5.714948654174805, + "learning_rate": 5.789473684210527e-06, + "loss": 0.2734, + "step": 28000 + }, + { + "epoch": 42.12, + "grad_norm": 3.882260799407959, + "learning_rate": 5.78796992481203e-06, + "loss": 0.2747, + "step": 28010 + }, + { + "epoch": 42.14, + "grad_norm": 4.841667175292969, + "learning_rate": 5.786466165413535e-06, + "loss": 0.2728, + "step": 28020 + }, + { + "epoch": 42.15, + "grad_norm": 7.8732523918151855, + "learning_rate": 5.784962406015038e-06, + "loss": 0.2417, + "step": 28030 + }, + { + "epoch": 42.17, + "grad_norm": 6.003359317779541, + "learning_rate": 5.7834586466165425e-06, + "loss": 0.2442, + "step": 28040 + }, + { + "epoch": 42.18, + "grad_norm": 4.850991249084473, + "learning_rate": 5.781954887218045e-06, + "loss": 0.2731, + "step": 28050 + }, + { + "epoch": 42.2, + "grad_norm": 5.1097025871276855, + "learning_rate": 5.7804511278195495e-06, + "loss": 0.1836, + "step": 28060 + }, + { + "epoch": 42.21, + "grad_norm": 6.158971309661865, + "learning_rate": 5.778947368421053e-06, + "loss": 0.2186, + "step": 28070 + }, + { + "epoch": 42.23, + "grad_norm": 4.05385160446167, + "learning_rate": 5.777443609022557e-06, + "loss": 0.2081, + "step": 28080 + }, + { + "epoch": 42.24, + "grad_norm": 6.728810787200928, + "learning_rate": 5.77593984962406e-06, + "loss": 0.2637, + "step": 28090 + }, + { + "epoch": 42.26, + "grad_norm": 4.585230350494385, + "learning_rate": 5.7744360902255644e-06, + "loss": 0.2556, + "step": 28100 + }, + { + "epoch": 42.27, + "grad_norm": 4.899889945983887, + "learning_rate": 5.772932330827068e-06, + "loss": 0.2692, + "step": 28110 + }, + { + "epoch": 42.29, + "grad_norm": 8.909112930297852, + "learning_rate": 5.771428571428572e-06, + "loss": 0.1978, + "step": 28120 + }, + { + "epoch": 42.3, + "grad_norm": 5.728882312774658, + "learning_rate": 5.769924812030076e-06, + "loss": 0.2635, + "step": 28130 + }, + { + "epoch": 42.32, + "grad_norm": 4.432710647583008, + "learning_rate": 5.76842105263158e-06, + "loss": 0.2616, + "step": 28140 + }, + { + "epoch": 42.33, + "grad_norm": 3.3793838024139404, + "learning_rate": 5.766917293233083e-06, + "loss": 0.2387, + "step": 28150 + }, + { + "epoch": 42.35, + "grad_norm": 10.177301406860352, + "learning_rate": 5.765413533834587e-06, + "loss": 0.2494, + "step": 28160 + }, + { + "epoch": 42.36, + "grad_norm": 5.089664459228516, + "learning_rate": 5.763909774436091e-06, + "loss": 0.2588, + "step": 28170 + }, + { + "epoch": 42.38, + "grad_norm": 3.203890800476074, + "learning_rate": 5.762406015037595e-06, + "loss": 0.3037, + "step": 28180 + }, + { + "epoch": 42.39, + "grad_norm": 5.658225059509277, + "learning_rate": 5.760902255639098e-06, + "loss": 0.2522, + "step": 28190 + }, + { + "epoch": 42.41, + "grad_norm": 3.704941987991333, + "learning_rate": 5.759398496240602e-06, + "loss": 0.2382, + "step": 28200 + }, + { + "epoch": 42.42, + "grad_norm": 4.1319780349731445, + "learning_rate": 5.757894736842106e-06, + "loss": 0.2014, + "step": 28210 + }, + { + "epoch": 42.44, + "grad_norm": 4.998769760131836, + "learning_rate": 5.75639097744361e-06, + "loss": 0.2205, + "step": 28220 + }, + { + "epoch": 42.45, + "grad_norm": 2.3330812454223633, + "learning_rate": 5.7548872180451136e-06, + "loss": 0.2242, + "step": 28230 + }, + { + "epoch": 42.47, + "grad_norm": 3.1000936031341553, + "learning_rate": 5.753383458646618e-06, + "loss": 0.2238, + "step": 28240 + }, + { + "epoch": 42.48, + "grad_norm": 5.262942790985107, + "learning_rate": 5.751879699248121e-06, + "loss": 0.2225, + "step": 28250 + }, + { + "epoch": 42.5, + "grad_norm": 7.2910475730896, + "learning_rate": 5.750375939849625e-06, + "loss": 0.2829, + "step": 28260 + }, + { + "epoch": 42.51, + "grad_norm": 2.4421637058258057, + "learning_rate": 5.7488721804511285e-06, + "loss": 0.284, + "step": 28270 + }, + { + "epoch": 42.53, + "grad_norm": 2.7164719104766846, + "learning_rate": 5.747368421052633e-06, + "loss": 0.2569, + "step": 28280 + }, + { + "epoch": 42.54, + "grad_norm": 5.2328877449035645, + "learning_rate": 5.7458646616541355e-06, + "loss": 0.2374, + "step": 28290 + }, + { + "epoch": 42.56, + "grad_norm": 3.4275119304656982, + "learning_rate": 5.744360902255639e-06, + "loss": 0.2283, + "step": 28300 + }, + { + "epoch": 42.57, + "grad_norm": 3.65972900390625, + "learning_rate": 5.742857142857143e-06, + "loss": 0.307, + "step": 28310 + }, + { + "epoch": 42.59, + "grad_norm": 4.684558391571045, + "learning_rate": 5.741353383458647e-06, + "loss": 0.2323, + "step": 28320 + }, + { + "epoch": 42.6, + "grad_norm": 7.553560733795166, + "learning_rate": 5.739849624060151e-06, + "loss": 0.1697, + "step": 28330 + }, + { + "epoch": 42.62, + "grad_norm": 5.401693820953369, + "learning_rate": 5.738345864661654e-06, + "loss": 0.2497, + "step": 28340 + }, + { + "epoch": 42.63, + "grad_norm": 4.120377540588379, + "learning_rate": 5.736842105263158e-06, + "loss": 0.285, + "step": 28350 + }, + { + "epoch": 42.65, + "grad_norm": 6.782698154449463, + "learning_rate": 5.735338345864662e-06, + "loss": 0.2505, + "step": 28360 + }, + { + "epoch": 42.66, + "grad_norm": 10.522151947021484, + "learning_rate": 5.733834586466166e-06, + "loss": 0.234, + "step": 28370 + }, + { + "epoch": 42.68, + "grad_norm": 6.4431233406066895, + "learning_rate": 5.732330827067669e-06, + "loss": 0.2463, + "step": 28380 + }, + { + "epoch": 42.69, + "grad_norm": 5.447474479675293, + "learning_rate": 5.730827067669173e-06, + "loss": 0.334, + "step": 28390 + }, + { + "epoch": 42.71, + "grad_norm": 7.193316459655762, + "learning_rate": 5.729323308270677e-06, + "loss": 0.1952, + "step": 28400 + }, + { + "epoch": 42.72, + "grad_norm": 3.5891122817993164, + "learning_rate": 5.727819548872181e-06, + "loss": 0.222, + "step": 28410 + }, + { + "epoch": 42.74, + "grad_norm": 2.9756336212158203, + "learning_rate": 5.726315789473685e-06, + "loss": 0.2087, + "step": 28420 + }, + { + "epoch": 42.75, + "grad_norm": 5.7030720710754395, + "learning_rate": 5.724812030075188e-06, + "loss": 0.2127, + "step": 28430 + }, + { + "epoch": 42.77, + "grad_norm": 4.419146537780762, + "learning_rate": 5.723308270676692e-06, + "loss": 0.1878, + "step": 28440 + }, + { + "epoch": 42.78, + "grad_norm": 5.313589096069336, + "learning_rate": 5.721804511278196e-06, + "loss": 0.2445, + "step": 28450 + }, + { + "epoch": 42.8, + "grad_norm": 5.905307769775391, + "learning_rate": 5.7203007518797e-06, + "loss": 0.2381, + "step": 28460 + }, + { + "epoch": 42.81, + "grad_norm": 5.510866641998291, + "learning_rate": 5.718796992481204e-06, + "loss": 0.2788, + "step": 28470 + }, + { + "epoch": 42.83, + "grad_norm": 5.849859237670898, + "learning_rate": 5.717293233082707e-06, + "loss": 0.2436, + "step": 28480 + }, + { + "epoch": 42.84, + "grad_norm": 5.243779182434082, + "learning_rate": 5.715789473684211e-06, + "loss": 0.1705, + "step": 28490 + }, + { + "epoch": 42.86, + "grad_norm": 3.643617868423462, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.3002, + "step": 28500 + }, + { + "epoch": 42.87, + "grad_norm": 3.605794906616211, + "learning_rate": 5.712781954887219e-06, + "loss": 0.1879, + "step": 28510 + }, + { + "epoch": 42.89, + "grad_norm": 4.405106067657471, + "learning_rate": 5.711278195488722e-06, + "loss": 0.222, + "step": 28520 + }, + { + "epoch": 42.9, + "grad_norm": 7.1118597984313965, + "learning_rate": 5.709774436090226e-06, + "loss": 0.3624, + "step": 28530 + }, + { + "epoch": 42.92, + "grad_norm": 5.815327167510986, + "learning_rate": 5.7082706766917294e-06, + "loss": 0.2162, + "step": 28540 + }, + { + "epoch": 42.93, + "grad_norm": 5.0367021560668945, + "learning_rate": 5.706766917293234e-06, + "loss": 0.2481, + "step": 28550 + }, + { + "epoch": 42.95, + "grad_norm": 6.6812005043029785, + "learning_rate": 5.705263157894737e-06, + "loss": 0.2977, + "step": 28560 + }, + { + "epoch": 42.96, + "grad_norm": 3.0472841262817383, + "learning_rate": 5.703759398496242e-06, + "loss": 0.225, + "step": 28570 + }, + { + "epoch": 42.98, + "grad_norm": 5.001314163208008, + "learning_rate": 5.702255639097744e-06, + "loss": 0.1936, + "step": 28580 + }, + { + "epoch": 42.99, + "grad_norm": 7.2746124267578125, + "learning_rate": 5.700751879699249e-06, + "loss": 0.2208, + "step": 28590 + }, + { + "epoch": 43.0, + "eval_accuracy": 0.9316, + "eval_loss": 0.30223846435546875, + "eval_runtime": 84.2035, + "eval_samples_per_second": 118.76, + "eval_steps_per_second": 0.475, + "step": 28595 + }, + { + "epoch": 43.01, + "grad_norm": 5.429502010345459, + "learning_rate": 5.699248120300752e-06, + "loss": 0.2629, + "step": 28600 + }, + { + "epoch": 43.02, + "grad_norm": 8.030689239501953, + "learning_rate": 5.697744360902257e-06, + "loss": 0.1887, + "step": 28610 + }, + { + "epoch": 43.04, + "grad_norm": 4.963591575622559, + "learning_rate": 5.696240601503759e-06, + "loss": 0.2608, + "step": 28620 + }, + { + "epoch": 43.05, + "grad_norm": 7.090287685394287, + "learning_rate": 5.694736842105264e-06, + "loss": 0.248, + "step": 28630 + }, + { + "epoch": 43.07, + "grad_norm": 5.958043575286865, + "learning_rate": 5.693233082706767e-06, + "loss": 0.3033, + "step": 28640 + }, + { + "epoch": 43.08, + "grad_norm": 6.555380821228027, + "learning_rate": 5.6917293233082715e-06, + "loss": 0.2399, + "step": 28650 + }, + { + "epoch": 43.1, + "grad_norm": 7.416017055511475, + "learning_rate": 5.690225563909775e-06, + "loss": 0.2513, + "step": 28660 + }, + { + "epoch": 43.11, + "grad_norm": 4.798736095428467, + "learning_rate": 5.688721804511279e-06, + "loss": 0.2721, + "step": 28670 + }, + { + "epoch": 43.13, + "grad_norm": 6.545383930206299, + "learning_rate": 5.687218045112782e-06, + "loss": 0.2962, + "step": 28680 + }, + { + "epoch": 43.14, + "grad_norm": 7.702247142791748, + "learning_rate": 5.6857142857142865e-06, + "loss": 0.2098, + "step": 28690 + }, + { + "epoch": 43.16, + "grad_norm": 2.8116698265075684, + "learning_rate": 5.68421052631579e-06, + "loss": 0.1915, + "step": 28700 + }, + { + "epoch": 43.17, + "grad_norm": 8.904080390930176, + "learning_rate": 5.682706766917294e-06, + "loss": 0.3113, + "step": 28710 + }, + { + "epoch": 43.19, + "grad_norm": 3.884316921234131, + "learning_rate": 5.681203007518797e-06, + "loss": 0.2209, + "step": 28720 + }, + { + "epoch": 43.2, + "grad_norm": 3.413797616958618, + "learning_rate": 5.679699248120301e-06, + "loss": 0.1643, + "step": 28730 + }, + { + "epoch": 43.22, + "grad_norm": 7.109938144683838, + "learning_rate": 5.678195488721805e-06, + "loss": 0.2486, + "step": 28740 + }, + { + "epoch": 43.23, + "grad_norm": 4.222686290740967, + "learning_rate": 5.676691729323309e-06, + "loss": 0.2572, + "step": 28750 + }, + { + "epoch": 43.25, + "grad_norm": 4.973549842834473, + "learning_rate": 5.675187969924813e-06, + "loss": 0.2302, + "step": 28760 + }, + { + "epoch": 43.26, + "grad_norm": 4.342545986175537, + "learning_rate": 5.673684210526317e-06, + "loss": 0.2825, + "step": 28770 + }, + { + "epoch": 43.28, + "grad_norm": 5.619150638580322, + "learning_rate": 5.67218045112782e-06, + "loss": 0.2352, + "step": 28780 + }, + { + "epoch": 43.29, + "grad_norm": 6.378529071807861, + "learning_rate": 5.670676691729323e-06, + "loss": 0.231, + "step": 28790 + }, + { + "epoch": 43.31, + "grad_norm": 3.949969530105591, + "learning_rate": 5.669172932330828e-06, + "loss": 0.2399, + "step": 28800 + }, + { + "epoch": 43.32, + "grad_norm": 3.982423782348633, + "learning_rate": 5.66766917293233e-06, + "loss": 0.1841, + "step": 28810 + }, + { + "epoch": 43.34, + "grad_norm": 3.037238359451294, + "learning_rate": 5.666165413533835e-06, + "loss": 0.2084, + "step": 28820 + }, + { + "epoch": 43.35, + "grad_norm": 5.667693614959717, + "learning_rate": 5.664661654135338e-06, + "loss": 0.2061, + "step": 28830 + }, + { + "epoch": 43.37, + "grad_norm": 5.317113876342773, + "learning_rate": 5.663157894736843e-06, + "loss": 0.2213, + "step": 28840 + }, + { + "epoch": 43.38, + "grad_norm": 7.288003921508789, + "learning_rate": 5.661654135338346e-06, + "loss": 0.3024, + "step": 28850 + }, + { + "epoch": 43.4, + "grad_norm": 6.291513919830322, + "learning_rate": 5.6601503759398505e-06, + "loss": 0.2002, + "step": 28860 + }, + { + "epoch": 43.41, + "grad_norm": 4.533992767333984, + "learning_rate": 5.658646616541353e-06, + "loss": 0.2687, + "step": 28870 + }, + { + "epoch": 43.43, + "grad_norm": 5.96859073638916, + "learning_rate": 5.6571428571428576e-06, + "loss": 0.2142, + "step": 28880 + }, + { + "epoch": 43.44, + "grad_norm": 5.588306903839111, + "learning_rate": 5.655639097744361e-06, + "loss": 0.2045, + "step": 28890 + }, + { + "epoch": 43.46, + "grad_norm": 3.9208312034606934, + "learning_rate": 5.6541353383458654e-06, + "loss": 0.3291, + "step": 28900 + }, + { + "epoch": 43.47, + "grad_norm": 4.575656414031982, + "learning_rate": 5.652631578947368e-06, + "loss": 0.203, + "step": 28910 + }, + { + "epoch": 43.49, + "grad_norm": 4.715184211730957, + "learning_rate": 5.6511278195488725e-06, + "loss": 0.2584, + "step": 28920 + }, + { + "epoch": 43.5, + "grad_norm": 7.345559597015381, + "learning_rate": 5.649624060150376e-06, + "loss": 0.2279, + "step": 28930 + }, + { + "epoch": 43.52, + "grad_norm": 8.647790908813477, + "learning_rate": 5.64812030075188e-06, + "loss": 0.286, + "step": 28940 + }, + { + "epoch": 43.53, + "grad_norm": 3.186676025390625, + "learning_rate": 5.646616541353384e-06, + "loss": 0.2159, + "step": 28950 + }, + { + "epoch": 43.55, + "grad_norm": 5.287961006164551, + "learning_rate": 5.645112781954888e-06, + "loss": 0.2934, + "step": 28960 + }, + { + "epoch": 43.56, + "grad_norm": 1.5112135410308838, + "learning_rate": 5.643609022556391e-06, + "loss": 0.2194, + "step": 28970 + }, + { + "epoch": 43.58, + "grad_norm": 4.162924289703369, + "learning_rate": 5.642105263157895e-06, + "loss": 0.1885, + "step": 28980 + }, + { + "epoch": 43.59, + "grad_norm": 6.321695327758789, + "learning_rate": 5.640601503759399e-06, + "loss": 0.1848, + "step": 28990 + }, + { + "epoch": 43.61, + "grad_norm": 7.07379674911499, + "learning_rate": 5.639097744360903e-06, + "loss": 0.2176, + "step": 29000 + }, + { + "epoch": 43.62, + "grad_norm": 2.475600242614746, + "learning_rate": 5.637593984962406e-06, + "loss": 0.2473, + "step": 29010 + }, + { + "epoch": 43.64, + "grad_norm": 2.369236946105957, + "learning_rate": 5.63609022556391e-06, + "loss": 0.2338, + "step": 29020 + }, + { + "epoch": 43.65, + "grad_norm": 8.563982963562012, + "learning_rate": 5.634586466165414e-06, + "loss": 0.2339, + "step": 29030 + }, + { + "epoch": 43.67, + "grad_norm": 11.370944023132324, + "learning_rate": 5.633082706766918e-06, + "loss": 0.2524, + "step": 29040 + }, + { + "epoch": 43.68, + "grad_norm": 8.391033172607422, + "learning_rate": 5.631578947368422e-06, + "loss": 0.2441, + "step": 29050 + }, + { + "epoch": 43.7, + "grad_norm": 4.559657096862793, + "learning_rate": 5.630075187969926e-06, + "loss": 0.2515, + "step": 29060 + }, + { + "epoch": 43.71, + "grad_norm": 4.1660637855529785, + "learning_rate": 5.628571428571429e-06, + "loss": 0.2495, + "step": 29070 + }, + { + "epoch": 43.73, + "grad_norm": 8.0914306640625, + "learning_rate": 5.627067669172933e-06, + "loss": 0.2166, + "step": 29080 + }, + { + "epoch": 43.74, + "grad_norm": 5.4428019523620605, + "learning_rate": 5.6255639097744365e-06, + "loss": 0.2574, + "step": 29090 + }, + { + "epoch": 43.76, + "grad_norm": 4.377140045166016, + "learning_rate": 5.624060150375941e-06, + "loss": 0.2215, + "step": 29100 + }, + { + "epoch": 43.77, + "grad_norm": 5.647352695465088, + "learning_rate": 5.6225563909774436e-06, + "loss": 0.2947, + "step": 29110 + }, + { + "epoch": 43.79, + "grad_norm": 4.882458209991455, + "learning_rate": 5.621052631578948e-06, + "loss": 0.2522, + "step": 29120 + }, + { + "epoch": 43.8, + "grad_norm": 6.157678604125977, + "learning_rate": 5.6195488721804515e-06, + "loss": 0.221, + "step": 29130 + }, + { + "epoch": 43.82, + "grad_norm": 6.887104511260986, + "learning_rate": 5.618045112781956e-06, + "loss": 0.2797, + "step": 29140 + }, + { + "epoch": 43.83, + "grad_norm": 3.868295907974243, + "learning_rate": 5.616541353383459e-06, + "loss": 0.2551, + "step": 29150 + }, + { + "epoch": 43.85, + "grad_norm": 9.585166931152344, + "learning_rate": 5.615037593984964e-06, + "loss": 0.2301, + "step": 29160 + }, + { + "epoch": 43.86, + "grad_norm": 5.948240756988525, + "learning_rate": 5.613533834586466e-06, + "loss": 0.2388, + "step": 29170 + }, + { + "epoch": 43.88, + "grad_norm": 5.998847961425781, + "learning_rate": 5.612030075187971e-06, + "loss": 0.2077, + "step": 29180 + }, + { + "epoch": 43.89, + "grad_norm": 5.52202844619751, + "learning_rate": 5.610526315789474e-06, + "loss": 0.1981, + "step": 29190 + }, + { + "epoch": 43.91, + "grad_norm": 3.2842657566070557, + "learning_rate": 5.609022556390979e-06, + "loss": 0.2744, + "step": 29200 + }, + { + "epoch": 43.92, + "grad_norm": 6.214591026306152, + "learning_rate": 5.607518796992481e-06, + "loss": 0.1858, + "step": 29210 + }, + { + "epoch": 43.94, + "grad_norm": 5.361661911010742, + "learning_rate": 5.606015037593986e-06, + "loss": 0.2499, + "step": 29220 + }, + { + "epoch": 43.95, + "grad_norm": 5.610089302062988, + "learning_rate": 5.604511278195489e-06, + "loss": 0.2287, + "step": 29230 + }, + { + "epoch": 43.97, + "grad_norm": 5.380805492401123, + "learning_rate": 5.6030075187969936e-06, + "loss": 0.2702, + "step": 29240 + }, + { + "epoch": 43.98, + "grad_norm": 3.860438823699951, + "learning_rate": 5.601503759398497e-06, + "loss": 0.2509, + "step": 29250 + }, + { + "epoch": 44.0, + "grad_norm": 0.10728010535240173, + "learning_rate": 5.600000000000001e-06, + "loss": 0.1808, + "step": 29260 + }, + { + "epoch": 44.0, + "eval_accuracy": 0.9304, + "eval_loss": 0.30674317479133606, + "eval_runtime": 85.2945, + "eval_samples_per_second": 117.241, + "eval_steps_per_second": 0.469, + "step": 29260 + }, + { + "epoch": 44.02, + "grad_norm": 7.593850612640381, + "learning_rate": 5.598496240601504e-06, + "loss": 0.2738, + "step": 29270 + }, + { + "epoch": 44.03, + "grad_norm": 6.34241247177124, + "learning_rate": 5.596992481203008e-06, + "loss": 0.2215, + "step": 29280 + }, + { + "epoch": 44.05, + "grad_norm": 5.706809997558594, + "learning_rate": 5.595488721804512e-06, + "loss": 0.277, + "step": 29290 + }, + { + "epoch": 44.06, + "grad_norm": 4.330235481262207, + "learning_rate": 5.593984962406015e-06, + "loss": 0.2421, + "step": 29300 + }, + { + "epoch": 44.08, + "grad_norm": 6.897051811218262, + "learning_rate": 5.592481203007519e-06, + "loss": 0.2199, + "step": 29310 + }, + { + "epoch": 44.09, + "grad_norm": 3.312359571456909, + "learning_rate": 5.5909774436090226e-06, + "loss": 0.1884, + "step": 29320 + }, + { + "epoch": 44.11, + "grad_norm": 4.363166809082031, + "learning_rate": 5.589473684210527e-06, + "loss": 0.255, + "step": 29330 + }, + { + "epoch": 44.12, + "grad_norm": 7.3857269287109375, + "learning_rate": 5.5879699248120304e-06, + "loss": 0.298, + "step": 29340 + }, + { + "epoch": 44.14, + "grad_norm": 4.645081520080566, + "learning_rate": 5.586466165413534e-06, + "loss": 0.318, + "step": 29350 + }, + { + "epoch": 44.15, + "grad_norm": 5.76023006439209, + "learning_rate": 5.5849624060150375e-06, + "loss": 0.2442, + "step": 29360 + }, + { + "epoch": 44.17, + "grad_norm": 3.3517050743103027, + "learning_rate": 5.583458646616542e-06, + "loss": 0.2442, + "step": 29370 + }, + { + "epoch": 44.18, + "grad_norm": 4.758605480194092, + "learning_rate": 5.581954887218045e-06, + "loss": 0.2328, + "step": 29380 + }, + { + "epoch": 44.2, + "grad_norm": 5.125278949737549, + "learning_rate": 5.58045112781955e-06, + "loss": 0.2316, + "step": 29390 + }, + { + "epoch": 44.21, + "grad_norm": 5.346681594848633, + "learning_rate": 5.578947368421052e-06, + "loss": 0.2119, + "step": 29400 + }, + { + "epoch": 44.23, + "grad_norm": 5.317344665527344, + "learning_rate": 5.577443609022557e-06, + "loss": 0.1829, + "step": 29410 + }, + { + "epoch": 44.24, + "grad_norm": 6.973268985748291, + "learning_rate": 5.57593984962406e-06, + "loss": 0.2207, + "step": 29420 + }, + { + "epoch": 44.26, + "grad_norm": 2.5319881439208984, + "learning_rate": 5.574436090225565e-06, + "loss": 0.2677, + "step": 29430 + }, + { + "epoch": 44.27, + "grad_norm": 1.6542171239852905, + "learning_rate": 5.572932330827068e-06, + "loss": 0.2092, + "step": 29440 + }, + { + "epoch": 44.29, + "grad_norm": 5.905990123748779, + "learning_rate": 5.571428571428572e-06, + "loss": 0.2059, + "step": 29450 + }, + { + "epoch": 44.3, + "grad_norm": 3.364076614379883, + "learning_rate": 5.569924812030075e-06, + "loss": 0.2847, + "step": 29460 + }, + { + "epoch": 44.32, + "grad_norm": 2.6462037563323975, + "learning_rate": 5.5684210526315796e-06, + "loss": 0.2227, + "step": 29470 + }, + { + "epoch": 44.33, + "grad_norm": 4.426711559295654, + "learning_rate": 5.566917293233083e-06, + "loss": 0.2758, + "step": 29480 + }, + { + "epoch": 44.35, + "grad_norm": 6.097959518432617, + "learning_rate": 5.5654135338345875e-06, + "loss": 0.256, + "step": 29490 + }, + { + "epoch": 44.36, + "grad_norm": 3.295834541320801, + "learning_rate": 5.56390977443609e-06, + "loss": 0.2586, + "step": 29500 + }, + { + "epoch": 44.38, + "grad_norm": 6.391618251800537, + "learning_rate": 5.5624060150375945e-06, + "loss": 0.2186, + "step": 29510 + }, + { + "epoch": 44.39, + "grad_norm": 5.297180652618408, + "learning_rate": 5.560902255639098e-06, + "loss": 0.2465, + "step": 29520 + }, + { + "epoch": 44.41, + "grad_norm": 3.471672773361206, + "learning_rate": 5.559398496240602e-06, + "loss": 0.2363, + "step": 29530 + }, + { + "epoch": 44.42, + "grad_norm": 3.7030515670776367, + "learning_rate": 5.557894736842105e-06, + "loss": 0.2308, + "step": 29540 + }, + { + "epoch": 44.44, + "grad_norm": 4.913259029388428, + "learning_rate": 5.556390977443609e-06, + "loss": 0.2459, + "step": 29550 + }, + { + "epoch": 44.45, + "grad_norm": 7.469844818115234, + "learning_rate": 5.554887218045113e-06, + "loss": 0.1795, + "step": 29560 + }, + { + "epoch": 44.47, + "grad_norm": 6.876651763916016, + "learning_rate": 5.553383458646617e-06, + "loss": 0.2296, + "step": 29570 + }, + { + "epoch": 44.48, + "grad_norm": 3.4335014820098877, + "learning_rate": 5.551879699248121e-06, + "loss": 0.2769, + "step": 29580 + }, + { + "epoch": 44.5, + "grad_norm": 5.788983345031738, + "learning_rate": 5.550375939849625e-06, + "loss": 0.2428, + "step": 29590 + }, + { + "epoch": 44.51, + "grad_norm": 4.837071895599365, + "learning_rate": 5.548872180451128e-06, + "loss": 0.2378, + "step": 29600 + }, + { + "epoch": 44.53, + "grad_norm": 4.5005202293396, + "learning_rate": 5.547368421052632e-06, + "loss": 0.2397, + "step": 29610 + }, + { + "epoch": 44.54, + "grad_norm": 4.948091983795166, + "learning_rate": 5.545864661654136e-06, + "loss": 0.2521, + "step": 29620 + }, + { + "epoch": 44.56, + "grad_norm": 7.149682998657227, + "learning_rate": 5.54436090225564e-06, + "loss": 0.2811, + "step": 29630 + }, + { + "epoch": 44.57, + "grad_norm": 4.339492321014404, + "learning_rate": 5.542857142857143e-06, + "loss": 0.19, + "step": 29640 + }, + { + "epoch": 44.59, + "grad_norm": 5.427370548248291, + "learning_rate": 5.541353383458647e-06, + "loss": 0.2379, + "step": 29650 + }, + { + "epoch": 44.6, + "grad_norm": 3.6944634914398193, + "learning_rate": 5.539849624060151e-06, + "loss": 0.183, + "step": 29660 + }, + { + "epoch": 44.62, + "grad_norm": 5.690896034240723, + "learning_rate": 5.538345864661655e-06, + "loss": 0.1527, + "step": 29670 + }, + { + "epoch": 44.63, + "grad_norm": 5.7550048828125, + "learning_rate": 5.5368421052631586e-06, + "loss": 0.1855, + "step": 29680 + }, + { + "epoch": 44.65, + "grad_norm": 2.685657024383545, + "learning_rate": 5.535338345864663e-06, + "loss": 0.2244, + "step": 29690 + }, + { + "epoch": 44.66, + "grad_norm": 3.263343334197998, + "learning_rate": 5.533834586466166e-06, + "loss": 0.2051, + "step": 29700 + }, + { + "epoch": 44.68, + "grad_norm": 5.3920183181762695, + "learning_rate": 5.53233082706767e-06, + "loss": 0.2987, + "step": 29710 + }, + { + "epoch": 44.69, + "grad_norm": 9.512574195861816, + "learning_rate": 5.5308270676691735e-06, + "loss": 0.2528, + "step": 29720 + }, + { + "epoch": 44.71, + "grad_norm": 8.392505645751953, + "learning_rate": 5.529323308270678e-06, + "loss": 0.2906, + "step": 29730 + }, + { + "epoch": 44.72, + "grad_norm": 5.764971733093262, + "learning_rate": 5.5278195488721805e-06, + "loss": 0.2531, + "step": 29740 + }, + { + "epoch": 44.74, + "grad_norm": 4.783633232116699, + "learning_rate": 5.526315789473685e-06, + "loss": 0.3142, + "step": 29750 + }, + { + "epoch": 44.75, + "grad_norm": 7.186029434204102, + "learning_rate": 5.524812030075188e-06, + "loss": 0.2501, + "step": 29760 + }, + { + "epoch": 44.77, + "grad_norm": 1.497518539428711, + "learning_rate": 5.523308270676693e-06, + "loss": 0.2384, + "step": 29770 + }, + { + "epoch": 44.78, + "grad_norm": 8.040397644042969, + "learning_rate": 5.521804511278196e-06, + "loss": 0.1935, + "step": 29780 + }, + { + "epoch": 44.8, + "grad_norm": 5.375740051269531, + "learning_rate": 5.520300751879699e-06, + "loss": 0.2086, + "step": 29790 + }, + { + "epoch": 44.81, + "grad_norm": 4.856134414672852, + "learning_rate": 5.518796992481203e-06, + "loss": 0.2516, + "step": 29800 + }, + { + "epoch": 44.83, + "grad_norm": 5.769529819488525, + "learning_rate": 5.517293233082707e-06, + "loss": 0.2667, + "step": 29810 + }, + { + "epoch": 44.84, + "grad_norm": 3.9014487266540527, + "learning_rate": 5.515789473684211e-06, + "loss": 0.2218, + "step": 29820 + }, + { + "epoch": 44.86, + "grad_norm": 3.6634175777435303, + "learning_rate": 5.514285714285714e-06, + "loss": 0.2317, + "step": 29830 + }, + { + "epoch": 44.87, + "grad_norm": 3.2795653343200684, + "learning_rate": 5.512781954887218e-06, + "loss": 0.2201, + "step": 29840 + }, + { + "epoch": 44.89, + "grad_norm": 3.049172878265381, + "learning_rate": 5.511278195488722e-06, + "loss": 0.1521, + "step": 29850 + }, + { + "epoch": 44.9, + "grad_norm": 3.333717107772827, + "learning_rate": 5.509774436090226e-06, + "loss": 0.272, + "step": 29860 + }, + { + "epoch": 44.92, + "grad_norm": 6.023979187011719, + "learning_rate": 5.50827067669173e-06, + "loss": 0.2422, + "step": 29870 + }, + { + "epoch": 44.93, + "grad_norm": 2.427889347076416, + "learning_rate": 5.506766917293234e-06, + "loss": 0.219, + "step": 29880 + }, + { + "epoch": 44.95, + "grad_norm": 7.4705729484558105, + "learning_rate": 5.505263157894737e-06, + "loss": 0.2594, + "step": 29890 + }, + { + "epoch": 44.96, + "grad_norm": 3.8348019123077393, + "learning_rate": 5.503759398496241e-06, + "loss": 0.2171, + "step": 29900 + }, + { + "epoch": 44.98, + "grad_norm": 6.202742576599121, + "learning_rate": 5.502255639097745e-06, + "loss": 0.3124, + "step": 29910 + }, + { + "epoch": 44.99, + "grad_norm": 6.531280517578125, + "learning_rate": 5.500751879699249e-06, + "loss": 0.2477, + "step": 29920 + }, + { + "epoch": 45.0, + "eval_accuracy": 0.9289, + "eval_loss": 0.30727890133857727, + "eval_runtime": 84.3468, + "eval_samples_per_second": 118.558, + "eval_steps_per_second": 0.474, + "step": 29925 + }, + { + "epoch": 45.01, + "grad_norm": 6.2103376388549805, + "learning_rate": 5.499248120300752e-06, + "loss": 0.3184, + "step": 29930 + }, + { + "epoch": 45.02, + "grad_norm": 4.898624897003174, + "learning_rate": 5.497744360902256e-06, + "loss": 0.2263, + "step": 29940 + }, + { + "epoch": 45.04, + "grad_norm": 5.601806640625, + "learning_rate": 5.4962406015037595e-06, + "loss": 0.1897, + "step": 29950 + }, + { + "epoch": 45.05, + "grad_norm": 3.2715799808502197, + "learning_rate": 5.494736842105264e-06, + "loss": 0.2605, + "step": 29960 + }, + { + "epoch": 45.07, + "grad_norm": 4.670425891876221, + "learning_rate": 5.493233082706767e-06, + "loss": 0.2316, + "step": 29970 + }, + { + "epoch": 45.08, + "grad_norm": 3.2674357891082764, + "learning_rate": 5.491729323308272e-06, + "loss": 0.1808, + "step": 29980 + }, + { + "epoch": 45.1, + "grad_norm": 4.537569522857666, + "learning_rate": 5.4902255639097744e-06, + "loss": 0.2665, + "step": 29990 + }, + { + "epoch": 45.11, + "grad_norm": 4.6132378578186035, + "learning_rate": 5.488721804511279e-06, + "loss": 0.2267, + "step": 30000 + }, + { + "epoch": 45.13, + "grad_norm": 6.702213764190674, + "learning_rate": 5.487218045112782e-06, + "loss": 0.2676, + "step": 30010 + }, + { + "epoch": 45.14, + "grad_norm": 5.990668296813965, + "learning_rate": 5.485714285714287e-06, + "loss": 0.265, + "step": 30020 + }, + { + "epoch": 45.16, + "grad_norm": 4.7114787101745605, + "learning_rate": 5.484210526315789e-06, + "loss": 0.204, + "step": 30030 + }, + { + "epoch": 45.17, + "grad_norm": 2.577725648880005, + "learning_rate": 5.482706766917294e-06, + "loss": 0.2601, + "step": 30040 + }, + { + "epoch": 45.19, + "grad_norm": 7.188079833984375, + "learning_rate": 5.481203007518797e-06, + "loss": 0.2447, + "step": 30050 + }, + { + "epoch": 45.2, + "grad_norm": 8.671704292297363, + "learning_rate": 5.479699248120302e-06, + "loss": 0.202, + "step": 30060 + }, + { + "epoch": 45.22, + "grad_norm": 1.6407883167266846, + "learning_rate": 5.478195488721805e-06, + "loss": 0.2504, + "step": 30070 + }, + { + "epoch": 45.23, + "grad_norm": 6.040456295013428, + "learning_rate": 5.476691729323309e-06, + "loss": 0.2701, + "step": 30080 + }, + { + "epoch": 45.25, + "grad_norm": 4.857377529144287, + "learning_rate": 5.475187969924812e-06, + "loss": 0.1846, + "step": 30090 + }, + { + "epoch": 45.26, + "grad_norm": 4.847814083099365, + "learning_rate": 5.4736842105263165e-06, + "loss": 0.2355, + "step": 30100 + }, + { + "epoch": 45.28, + "grad_norm": 5.133174896240234, + "learning_rate": 5.47218045112782e-06, + "loss": 0.2244, + "step": 30110 + }, + { + "epoch": 45.29, + "grad_norm": 4.437646389007568, + "learning_rate": 5.470676691729324e-06, + "loss": 0.2366, + "step": 30120 + }, + { + "epoch": 45.31, + "grad_norm": 9.677255630493164, + "learning_rate": 5.469172932330827e-06, + "loss": 0.2258, + "step": 30130 + }, + { + "epoch": 45.32, + "grad_norm": 10.691313743591309, + "learning_rate": 5.4676691729323314e-06, + "loss": 0.2529, + "step": 30140 + }, + { + "epoch": 45.34, + "grad_norm": 4.892699718475342, + "learning_rate": 5.466165413533835e-06, + "loss": 0.212, + "step": 30150 + }, + { + "epoch": 45.35, + "grad_norm": 8.477937698364258, + "learning_rate": 5.464661654135339e-06, + "loss": 0.2291, + "step": 30160 + }, + { + "epoch": 45.37, + "grad_norm": 4.592231273651123, + "learning_rate": 5.463157894736843e-06, + "loss": 0.2567, + "step": 30170 + }, + { + "epoch": 45.38, + "grad_norm": 4.459981918334961, + "learning_rate": 5.461654135338346e-06, + "loss": 0.2596, + "step": 30180 + }, + { + "epoch": 45.4, + "grad_norm": 6.071181774139404, + "learning_rate": 5.46015037593985e-06, + "loss": 0.2445, + "step": 30190 + }, + { + "epoch": 45.41, + "grad_norm": 5.071913242340088, + "learning_rate": 5.458646616541354e-06, + "loss": 0.208, + "step": 30200 + }, + { + "epoch": 45.43, + "grad_norm": 7.438906669616699, + "learning_rate": 5.457142857142858e-06, + "loss": 0.2321, + "step": 30210 + }, + { + "epoch": 45.44, + "grad_norm": 5.6083292961120605, + "learning_rate": 5.455639097744362e-06, + "loss": 0.1798, + "step": 30220 + }, + { + "epoch": 45.46, + "grad_norm": 5.303403377532959, + "learning_rate": 5.454135338345865e-06, + "loss": 0.2718, + "step": 30230 + }, + { + "epoch": 45.47, + "grad_norm": 4.844057083129883, + "learning_rate": 5.452631578947369e-06, + "loss": 0.1896, + "step": 30240 + }, + { + "epoch": 45.49, + "grad_norm": 4.131418228149414, + "learning_rate": 5.451127819548873e-06, + "loss": 0.2972, + "step": 30250 + }, + { + "epoch": 45.5, + "grad_norm": 4.064949035644531, + "learning_rate": 5.449624060150377e-06, + "loss": 0.2298, + "step": 30260 + }, + { + "epoch": 45.52, + "grad_norm": 4.775879859924316, + "learning_rate": 5.44812030075188e-06, + "loss": 0.2578, + "step": 30270 + }, + { + "epoch": 45.53, + "grad_norm": 8.634382247924805, + "learning_rate": 5.446616541353383e-06, + "loss": 0.2508, + "step": 30280 + }, + { + "epoch": 45.55, + "grad_norm": 2.0520710945129395, + "learning_rate": 5.445112781954888e-06, + "loss": 0.2033, + "step": 30290 + }, + { + "epoch": 45.56, + "grad_norm": 7.116695880889893, + "learning_rate": 5.443609022556391e-06, + "loss": 0.2528, + "step": 30300 + }, + { + "epoch": 45.58, + "grad_norm": 5.326464653015137, + "learning_rate": 5.4421052631578955e-06, + "loss": 0.2536, + "step": 30310 + }, + { + "epoch": 45.59, + "grad_norm": 10.817031860351562, + "learning_rate": 5.440601503759398e-06, + "loss": 0.2231, + "step": 30320 + }, + { + "epoch": 45.61, + "grad_norm": 4.3931965827941895, + "learning_rate": 5.4390977443609025e-06, + "loss": 0.261, + "step": 30330 + }, + { + "epoch": 45.62, + "grad_norm": 5.171022891998291, + "learning_rate": 5.437593984962406e-06, + "loss": 0.2064, + "step": 30340 + }, + { + "epoch": 45.64, + "grad_norm": 6.176837921142578, + "learning_rate": 5.4360902255639104e-06, + "loss": 0.2539, + "step": 30350 + }, + { + "epoch": 45.65, + "grad_norm": 5.6013078689575195, + "learning_rate": 5.434586466165413e-06, + "loss": 0.229, + "step": 30360 + }, + { + "epoch": 45.67, + "grad_norm": 7.163257122039795, + "learning_rate": 5.4330827067669175e-06, + "loss": 0.2193, + "step": 30370 + }, + { + "epoch": 45.68, + "grad_norm": 5.970539093017578, + "learning_rate": 5.431578947368421e-06, + "loss": 0.2238, + "step": 30380 + }, + { + "epoch": 45.7, + "grad_norm": 4.074548721313477, + "learning_rate": 5.430075187969925e-06, + "loss": 0.1859, + "step": 30390 + }, + { + "epoch": 45.71, + "grad_norm": 6.107884883880615, + "learning_rate": 5.428571428571429e-06, + "loss": 0.2406, + "step": 30400 + }, + { + "epoch": 45.73, + "grad_norm": 6.470170497894287, + "learning_rate": 5.427067669172933e-06, + "loss": 0.2258, + "step": 30410 + }, + { + "epoch": 45.74, + "grad_norm": 5.55424165725708, + "learning_rate": 5.425563909774436e-06, + "loss": 0.2494, + "step": 30420 + }, + { + "epoch": 45.76, + "grad_norm": 7.8290910720825195, + "learning_rate": 5.42406015037594e-06, + "loss": 0.1809, + "step": 30430 + }, + { + "epoch": 45.77, + "grad_norm": 3.04001784324646, + "learning_rate": 5.422556390977444e-06, + "loss": 0.2146, + "step": 30440 + }, + { + "epoch": 45.79, + "grad_norm": 4.655550479888916, + "learning_rate": 5.421052631578948e-06, + "loss": 0.2232, + "step": 30450 + }, + { + "epoch": 45.8, + "grad_norm": 5.950998306274414, + "learning_rate": 5.419548872180451e-06, + "loss": 0.275, + "step": 30460 + }, + { + "epoch": 45.82, + "grad_norm": 6.617697238922119, + "learning_rate": 5.418045112781955e-06, + "loss": 0.1975, + "step": 30470 + }, + { + "epoch": 45.83, + "grad_norm": 4.946775913238525, + "learning_rate": 5.416541353383459e-06, + "loss": 0.2631, + "step": 30480 + }, + { + "epoch": 45.85, + "grad_norm": 8.360345840454102, + "learning_rate": 5.415037593984963e-06, + "loss": 0.2695, + "step": 30490 + }, + { + "epoch": 45.86, + "grad_norm": 5.876543998718262, + "learning_rate": 5.413533834586467e-06, + "loss": 0.253, + "step": 30500 + }, + { + "epoch": 45.88, + "grad_norm": 5.3216047286987305, + "learning_rate": 5.412030075187971e-06, + "loss": 0.2594, + "step": 30510 + }, + { + "epoch": 45.89, + "grad_norm": 3.109912633895874, + "learning_rate": 5.410526315789474e-06, + "loss": 0.2005, + "step": 30520 + }, + { + "epoch": 45.91, + "grad_norm": 16.188390731811523, + "learning_rate": 5.409022556390978e-06, + "loss": 0.1955, + "step": 30530 + }, + { + "epoch": 45.92, + "grad_norm": 4.817110061645508, + "learning_rate": 5.4075187969924815e-06, + "loss": 0.2133, + "step": 30540 + }, + { + "epoch": 45.94, + "grad_norm": 6.045027732849121, + "learning_rate": 5.406015037593986e-06, + "loss": 0.2392, + "step": 30550 + }, + { + "epoch": 45.95, + "grad_norm": 3.955580234527588, + "learning_rate": 5.4045112781954886e-06, + "loss": 0.224, + "step": 30560 + }, + { + "epoch": 45.97, + "grad_norm": 2.896059036254883, + "learning_rate": 5.403007518796993e-06, + "loss": 0.2249, + "step": 30570 + }, + { + "epoch": 45.98, + "grad_norm": 5.959082126617432, + "learning_rate": 5.4015037593984964e-06, + "loss": 0.2038, + "step": 30580 + }, + { + "epoch": 46.0, + "grad_norm": 23.55235481262207, + "learning_rate": 5.400000000000001e-06, + "loss": 0.2059, + "step": 30590 + }, + { + "epoch": 46.0, + "eval_accuracy": 0.931, + "eval_loss": 0.30095645785331726, + "eval_runtime": 84.8003, + "eval_samples_per_second": 117.924, + "eval_steps_per_second": 0.472, + "step": 30590 + }, + { + "epoch": 46.02, + "grad_norm": 14.315238952636719, + "learning_rate": 5.398496240601504e-06, + "loss": 0.2133, + "step": 30600 + }, + { + "epoch": 46.03, + "grad_norm": 3.028569221496582, + "learning_rate": 5.396992481203009e-06, + "loss": 0.2082, + "step": 30610 + }, + { + "epoch": 46.05, + "grad_norm": 4.44739294052124, + "learning_rate": 5.395488721804511e-06, + "loss": 0.3014, + "step": 30620 + }, + { + "epoch": 46.06, + "grad_norm": 5.786952018737793, + "learning_rate": 5.393984962406016e-06, + "loss": 0.196, + "step": 30630 + }, + { + "epoch": 46.08, + "grad_norm": 6.879361152648926, + "learning_rate": 5.392481203007519e-06, + "loss": 0.2238, + "step": 30640 + }, + { + "epoch": 46.09, + "grad_norm": 2.7093942165374756, + "learning_rate": 5.390977443609024e-06, + "loss": 0.236, + "step": 30650 + }, + { + "epoch": 46.11, + "grad_norm": 4.412895679473877, + "learning_rate": 5.389473684210526e-06, + "loss": 0.214, + "step": 30660 + }, + { + "epoch": 46.12, + "grad_norm": 2.3530495166778564, + "learning_rate": 5.387969924812031e-06, + "loss": 0.2996, + "step": 30670 + }, + { + "epoch": 46.14, + "grad_norm": 5.1143107414245605, + "learning_rate": 5.386466165413534e-06, + "loss": 0.2465, + "step": 30680 + }, + { + "epoch": 46.15, + "grad_norm": 5.444381237030029, + "learning_rate": 5.3849624060150385e-06, + "loss": 0.2921, + "step": 30690 + }, + { + "epoch": 46.17, + "grad_norm": 6.554684162139893, + "learning_rate": 5.383458646616542e-06, + "loss": 0.2532, + "step": 30700 + }, + { + "epoch": 46.18, + "grad_norm": 6.188846588134766, + "learning_rate": 5.3819548872180464e-06, + "loss": 0.1924, + "step": 30710 + }, + { + "epoch": 46.2, + "grad_norm": 4.2928361892700195, + "learning_rate": 5.380451127819549e-06, + "loss": 0.2115, + "step": 30720 + }, + { + "epoch": 46.21, + "grad_norm": 1.6907163858413696, + "learning_rate": 5.3789473684210535e-06, + "loss": 0.3012, + "step": 30730 + }, + { + "epoch": 46.23, + "grad_norm": 8.911703109741211, + "learning_rate": 5.377443609022557e-06, + "loss": 0.2386, + "step": 30740 + }, + { + "epoch": 46.24, + "grad_norm": 12.857329368591309, + "learning_rate": 5.375939849624061e-06, + "loss": 0.2622, + "step": 30750 + }, + { + "epoch": 46.26, + "grad_norm": 4.299805641174316, + "learning_rate": 5.374436090225564e-06, + "loss": 0.2495, + "step": 30760 + }, + { + "epoch": 46.27, + "grad_norm": 4.013417720794678, + "learning_rate": 5.3729323308270675e-06, + "loss": 0.2414, + "step": 30770 + }, + { + "epoch": 46.29, + "grad_norm": 5.057952404022217, + "learning_rate": 5.371428571428572e-06, + "loss": 0.3043, + "step": 30780 + }, + { + "epoch": 46.3, + "grad_norm": 2.708482265472412, + "learning_rate": 5.3699248120300754e-06, + "loss": 0.2512, + "step": 30790 + }, + { + "epoch": 46.32, + "grad_norm": 4.2768988609313965, + "learning_rate": 5.36842105263158e-06, + "loss": 0.2791, + "step": 30800 + }, + { + "epoch": 46.33, + "grad_norm": 10.092656135559082, + "learning_rate": 5.3669172932330825e-06, + "loss": 0.2439, + "step": 30810 + }, + { + "epoch": 46.35, + "grad_norm": 7.980869293212891, + "learning_rate": 5.365413533834587e-06, + "loss": 0.2967, + "step": 30820 + }, + { + "epoch": 46.36, + "grad_norm": 6.0706000328063965, + "learning_rate": 5.36390977443609e-06, + "loss": 0.2448, + "step": 30830 + }, + { + "epoch": 46.38, + "grad_norm": 5.074240207672119, + "learning_rate": 5.362406015037595e-06, + "loss": 0.2367, + "step": 30840 + }, + { + "epoch": 46.39, + "grad_norm": 4.816855430603027, + "learning_rate": 5.360902255639097e-06, + "loss": 0.2318, + "step": 30850 + }, + { + "epoch": 46.41, + "grad_norm": 3.8746345043182373, + "learning_rate": 5.359398496240602e-06, + "loss": 0.2336, + "step": 30860 + }, + { + "epoch": 46.42, + "grad_norm": 5.22980260848999, + "learning_rate": 5.357894736842105e-06, + "loss": 0.2148, + "step": 30870 + }, + { + "epoch": 46.44, + "grad_norm": 6.546250343322754, + "learning_rate": 5.35639097744361e-06, + "loss": 0.2513, + "step": 30880 + }, + { + "epoch": 46.45, + "grad_norm": 3.120495319366455, + "learning_rate": 5.354887218045113e-06, + "loss": 0.2673, + "step": 30890 + }, + { + "epoch": 46.47, + "grad_norm": 4.755849838256836, + "learning_rate": 5.3533834586466175e-06, + "loss": 0.2706, + "step": 30900 + }, + { + "epoch": 46.48, + "grad_norm": 3.598883628845215, + "learning_rate": 5.35187969924812e-06, + "loss": 0.1639, + "step": 30910 + }, + { + "epoch": 46.5, + "grad_norm": 4.807301044464111, + "learning_rate": 5.3503759398496246e-06, + "loss": 0.2063, + "step": 30920 + }, + { + "epoch": 46.51, + "grad_norm": 4.723282337188721, + "learning_rate": 5.348872180451128e-06, + "loss": 0.1954, + "step": 30930 + }, + { + "epoch": 46.53, + "grad_norm": 6.898508071899414, + "learning_rate": 5.3473684210526325e-06, + "loss": 0.274, + "step": 30940 + }, + { + "epoch": 46.54, + "grad_norm": 1.9944006204605103, + "learning_rate": 5.345864661654135e-06, + "loss": 0.2548, + "step": 30950 + }, + { + "epoch": 46.56, + "grad_norm": 4.713189125061035, + "learning_rate": 5.3443609022556395e-06, + "loss": 0.2342, + "step": 30960 + }, + { + "epoch": 46.57, + "grad_norm": 5.438635349273682, + "learning_rate": 5.342857142857143e-06, + "loss": 0.2282, + "step": 30970 + }, + { + "epoch": 46.59, + "grad_norm": 4.57274055480957, + "learning_rate": 5.341353383458647e-06, + "loss": 0.2339, + "step": 30980 + }, + { + "epoch": 46.6, + "grad_norm": 6.225501537322998, + "learning_rate": 5.339849624060151e-06, + "loss": 0.2293, + "step": 30990 + }, + { + "epoch": 46.62, + "grad_norm": 10.704837799072266, + "learning_rate": 5.338345864661654e-06, + "loss": 0.2327, + "step": 31000 + }, + { + "epoch": 46.63, + "grad_norm": 7.582201957702637, + "learning_rate": 5.336842105263158e-06, + "loss": 0.2563, + "step": 31010 + }, + { + "epoch": 46.65, + "grad_norm": 6.811306476593018, + "learning_rate": 5.335338345864662e-06, + "loss": 0.198, + "step": 31020 + }, + { + "epoch": 46.66, + "grad_norm": 3.75140118598938, + "learning_rate": 5.333834586466166e-06, + "loss": 0.2295, + "step": 31030 + }, + { + "epoch": 46.68, + "grad_norm": 7.410831928253174, + "learning_rate": 5.33233082706767e-06, + "loss": 0.2624, + "step": 31040 + }, + { + "epoch": 46.69, + "grad_norm": 4.584934711456299, + "learning_rate": 5.330827067669173e-06, + "loss": 0.2611, + "step": 31050 + }, + { + "epoch": 46.71, + "grad_norm": 2.435068130493164, + "learning_rate": 5.329323308270677e-06, + "loss": 0.2045, + "step": 31060 + }, + { + "epoch": 46.72, + "grad_norm": 6.5712056159973145, + "learning_rate": 5.327819548872181e-06, + "loss": 0.1974, + "step": 31070 + }, + { + "epoch": 46.74, + "grad_norm": 6.2264862060546875, + "learning_rate": 5.326315789473685e-06, + "loss": 0.3578, + "step": 31080 + }, + { + "epoch": 46.75, + "grad_norm": 4.949437141418457, + "learning_rate": 5.324812030075189e-06, + "loss": 0.2336, + "step": 31090 + }, + { + "epoch": 46.77, + "grad_norm": 8.773530006408691, + "learning_rate": 5.323308270676692e-06, + "loss": 0.2793, + "step": 31100 + }, + { + "epoch": 46.78, + "grad_norm": 4.285604000091553, + "learning_rate": 5.321804511278196e-06, + "loss": 0.2367, + "step": 31110 + }, + { + "epoch": 46.8, + "grad_norm": 5.313040256500244, + "learning_rate": 5.3203007518797e-06, + "loss": 0.2804, + "step": 31120 + }, + { + "epoch": 46.81, + "grad_norm": 4.113051414489746, + "learning_rate": 5.3187969924812035e-06, + "loss": 0.2158, + "step": 31130 + }, + { + "epoch": 46.83, + "grad_norm": 3.403592824935913, + "learning_rate": 5.317293233082708e-06, + "loss": 0.2477, + "step": 31140 + }, + { + "epoch": 46.84, + "grad_norm": 9.54311466217041, + "learning_rate": 5.315789473684211e-06, + "loss": 0.2816, + "step": 31150 + }, + { + "epoch": 46.86, + "grad_norm": 11.368732452392578, + "learning_rate": 5.314285714285715e-06, + "loss": 0.2512, + "step": 31160 + }, + { + "epoch": 46.87, + "grad_norm": 8.554758071899414, + "learning_rate": 5.3127819548872185e-06, + "loss": 0.2527, + "step": 31170 + }, + { + "epoch": 46.89, + "grad_norm": 4.945856094360352, + "learning_rate": 5.311278195488723e-06, + "loss": 0.2287, + "step": 31180 + }, + { + "epoch": 46.9, + "grad_norm": 5.310131072998047, + "learning_rate": 5.3097744360902255e-06, + "loss": 0.2594, + "step": 31190 + }, + { + "epoch": 46.92, + "grad_norm": 5.681679725646973, + "learning_rate": 5.30827067669173e-06, + "loss": 0.2152, + "step": 31200 + }, + { + "epoch": 46.93, + "grad_norm": 7.65255069732666, + "learning_rate": 5.306766917293233e-06, + "loss": 0.2017, + "step": 31210 + }, + { + "epoch": 46.95, + "grad_norm": 7.68698787689209, + "learning_rate": 5.305263157894738e-06, + "loss": 0.2355, + "step": 31220 + }, + { + "epoch": 46.96, + "grad_norm": 6.632236003875732, + "learning_rate": 5.303759398496241e-06, + "loss": 0.2626, + "step": 31230 + }, + { + "epoch": 46.98, + "grad_norm": 3.1763651371002197, + "learning_rate": 5.302255639097746e-06, + "loss": 0.2013, + "step": 31240 + }, + { + "epoch": 46.99, + "grad_norm": 5.487941741943359, + "learning_rate": 5.300751879699248e-06, + "loss": 0.2156, + "step": 31250 + }, + { + "epoch": 47.0, + "eval_accuracy": 0.9318, + "eval_loss": 0.2919594645500183, + "eval_runtime": 84.7462, + "eval_samples_per_second": 117.999, + "eval_steps_per_second": 0.472, + "step": 31255 + }, + { + "epoch": 47.01, + "grad_norm": 4.854064464569092, + "learning_rate": 5.299248120300753e-06, + "loss": 0.2527, + "step": 31260 + }, + { + "epoch": 47.02, + "grad_norm": 6.271472454071045, + "learning_rate": 5.297744360902256e-06, + "loss": 0.191, + "step": 31270 + }, + { + "epoch": 47.04, + "grad_norm": 6.080266952514648, + "learning_rate": 5.296240601503759e-06, + "loss": 0.2479, + "step": 31280 + }, + { + "epoch": 47.05, + "grad_norm": 7.085132122039795, + "learning_rate": 5.294736842105263e-06, + "loss": 0.2624, + "step": 31290 + }, + { + "epoch": 47.07, + "grad_norm": 4.636129856109619, + "learning_rate": 5.293233082706767e-06, + "loss": 0.1893, + "step": 31300 + }, + { + "epoch": 47.08, + "grad_norm": 6.469307899475098, + "learning_rate": 5.291729323308271e-06, + "loss": 0.1865, + "step": 31310 + }, + { + "epoch": 47.1, + "grad_norm": 8.775065422058105, + "learning_rate": 5.290225563909775e-06, + "loss": 0.2459, + "step": 31320 + }, + { + "epoch": 47.11, + "grad_norm": 3.9796481132507324, + "learning_rate": 5.288721804511279e-06, + "loss": 0.257, + "step": 31330 + }, + { + "epoch": 47.13, + "grad_norm": 5.34701681137085, + "learning_rate": 5.287218045112782e-06, + "loss": 0.199, + "step": 31340 + }, + { + "epoch": 47.14, + "grad_norm": 6.631876468658447, + "learning_rate": 5.285714285714286e-06, + "loss": 0.2502, + "step": 31350 + }, + { + "epoch": 47.16, + "grad_norm": 4.332927227020264, + "learning_rate": 5.2842105263157896e-06, + "loss": 0.2145, + "step": 31360 + }, + { + "epoch": 47.17, + "grad_norm": 7.87563419342041, + "learning_rate": 5.282706766917294e-06, + "loss": 0.2484, + "step": 31370 + }, + { + "epoch": 47.19, + "grad_norm": 4.990164279937744, + "learning_rate": 5.281203007518797e-06, + "loss": 0.2739, + "step": 31380 + }, + { + "epoch": 47.2, + "grad_norm": 5.960522174835205, + "learning_rate": 5.279699248120301e-06, + "loss": 0.2388, + "step": 31390 + }, + { + "epoch": 47.22, + "grad_norm": 8.016762733459473, + "learning_rate": 5.2781954887218045e-06, + "loss": 0.2013, + "step": 31400 + }, + { + "epoch": 47.23, + "grad_norm": 4.126075744628906, + "learning_rate": 5.276691729323309e-06, + "loss": 0.2994, + "step": 31410 + }, + { + "epoch": 47.25, + "grad_norm": 4.315171241760254, + "learning_rate": 5.275187969924812e-06, + "loss": 0.3082, + "step": 31420 + }, + { + "epoch": 47.26, + "grad_norm": 6.842277526855469, + "learning_rate": 5.273684210526317e-06, + "loss": 0.2019, + "step": 31430 + }, + { + "epoch": 47.28, + "grad_norm": 16.402454376220703, + "learning_rate": 5.272180451127819e-06, + "loss": 0.3009, + "step": 31440 + }, + { + "epoch": 47.29, + "grad_norm": 4.734708309173584, + "learning_rate": 5.270676691729324e-06, + "loss": 0.1851, + "step": 31450 + }, + { + "epoch": 47.31, + "grad_norm": 7.005868434906006, + "learning_rate": 5.269172932330827e-06, + "loss": 0.2021, + "step": 31460 + }, + { + "epoch": 47.32, + "grad_norm": 4.445467948913574, + "learning_rate": 5.267669172932332e-06, + "loss": 0.2028, + "step": 31470 + }, + { + "epoch": 47.34, + "grad_norm": 3.294844627380371, + "learning_rate": 5.266165413533834e-06, + "loss": 0.2901, + "step": 31480 + }, + { + "epoch": 47.35, + "grad_norm": 5.847194194793701, + "learning_rate": 5.264661654135339e-06, + "loss": 0.2215, + "step": 31490 + }, + { + "epoch": 47.37, + "grad_norm": 3.2134320735931396, + "learning_rate": 5.263157894736842e-06, + "loss": 0.213, + "step": 31500 + }, + { + "epoch": 47.38, + "grad_norm": 7.5339035987854, + "learning_rate": 5.261654135338347e-06, + "loss": 0.2378, + "step": 31510 + }, + { + "epoch": 47.4, + "grad_norm": 3.7223424911499023, + "learning_rate": 5.26015037593985e-06, + "loss": 0.3221, + "step": 31520 + }, + { + "epoch": 47.41, + "grad_norm": 4.247413158416748, + "learning_rate": 5.2586466165413545e-06, + "loss": 0.274, + "step": 31530 + }, + { + "epoch": 47.43, + "grad_norm": 4.432199478149414, + "learning_rate": 5.257142857142857e-06, + "loss": 0.2672, + "step": 31540 + }, + { + "epoch": 47.44, + "grad_norm": 4.548630714416504, + "learning_rate": 5.2556390977443615e-06, + "loss": 0.2413, + "step": 31550 + }, + { + "epoch": 47.46, + "grad_norm": 5.084230899810791, + "learning_rate": 5.254135338345865e-06, + "loss": 0.233, + "step": 31560 + }, + { + "epoch": 47.47, + "grad_norm": 2.7033839225769043, + "learning_rate": 5.252631578947369e-06, + "loss": 0.2344, + "step": 31570 + }, + { + "epoch": 47.49, + "grad_norm": 6.172457218170166, + "learning_rate": 5.251127819548872e-06, + "loss": 0.2259, + "step": 31580 + }, + { + "epoch": 47.5, + "grad_norm": 4.346304893493652, + "learning_rate": 5.2496240601503764e-06, + "loss": 0.2755, + "step": 31590 + }, + { + "epoch": 47.52, + "grad_norm": 5.721127986907959, + "learning_rate": 5.24812030075188e-06, + "loss": 0.2132, + "step": 31600 + }, + { + "epoch": 47.53, + "grad_norm": 4.425881862640381, + "learning_rate": 5.246616541353384e-06, + "loss": 0.249, + "step": 31610 + }, + { + "epoch": 47.55, + "grad_norm": 5.007637977600098, + "learning_rate": 5.245112781954888e-06, + "loss": 0.2291, + "step": 31620 + }, + { + "epoch": 47.56, + "grad_norm": 5.64668083190918, + "learning_rate": 5.243609022556392e-06, + "loss": 0.2487, + "step": 31630 + }, + { + "epoch": 47.58, + "grad_norm": 4.099886417388916, + "learning_rate": 5.242105263157895e-06, + "loss": 0.2259, + "step": 31640 + }, + { + "epoch": 47.59, + "grad_norm": 4.3517584800720215, + "learning_rate": 5.240601503759399e-06, + "loss": 0.1817, + "step": 31650 + }, + { + "epoch": 47.61, + "grad_norm": 7.435219764709473, + "learning_rate": 5.239097744360903e-06, + "loss": 0.2105, + "step": 31660 + }, + { + "epoch": 47.62, + "grad_norm": 4.386735439300537, + "learning_rate": 5.237593984962407e-06, + "loss": 0.2167, + "step": 31670 + }, + { + "epoch": 47.64, + "grad_norm": 9.977165222167969, + "learning_rate": 5.23609022556391e-06, + "loss": 0.2578, + "step": 31680 + }, + { + "epoch": 47.65, + "grad_norm": 5.100271701812744, + "learning_rate": 5.234586466165414e-06, + "loss": 0.2037, + "step": 31690 + }, + { + "epoch": 47.67, + "grad_norm": 12.209273338317871, + "learning_rate": 5.233082706766918e-06, + "loss": 0.2373, + "step": 31700 + }, + { + "epoch": 47.68, + "grad_norm": 4.825228691101074, + "learning_rate": 5.231578947368422e-06, + "loss": 0.217, + "step": 31710 + }, + { + "epoch": 47.7, + "grad_norm": 4.916600227355957, + "learning_rate": 5.2300751879699256e-06, + "loss": 0.2243, + "step": 31720 + }, + { + "epoch": 47.71, + "grad_norm": 4.243865966796875, + "learning_rate": 5.22857142857143e-06, + "loss": 0.198, + "step": 31730 + }, + { + "epoch": 47.73, + "grad_norm": 4.085118293762207, + "learning_rate": 5.227067669172933e-06, + "loss": 0.2393, + "step": 31740 + }, + { + "epoch": 47.74, + "grad_norm": 3.169874906539917, + "learning_rate": 5.225563909774437e-06, + "loss": 0.227, + "step": 31750 + }, + { + "epoch": 47.76, + "grad_norm": 5.03513240814209, + "learning_rate": 5.2240601503759405e-06, + "loss": 0.1579, + "step": 31760 + }, + { + "epoch": 47.77, + "grad_norm": 4.764927864074707, + "learning_rate": 5.222556390977443e-06, + "loss": 0.2179, + "step": 31770 + }, + { + "epoch": 47.79, + "grad_norm": 3.114337205886841, + "learning_rate": 5.2210526315789475e-06, + "loss": 0.1934, + "step": 31780 + }, + { + "epoch": 47.8, + "grad_norm": 4.358081817626953, + "learning_rate": 5.219548872180451e-06, + "loss": 0.1997, + "step": 31790 + }, + { + "epoch": 47.82, + "grad_norm": 4.839754104614258, + "learning_rate": 5.218045112781955e-06, + "loss": 0.2273, + "step": 31800 + }, + { + "epoch": 47.83, + "grad_norm": 4.898540496826172, + "learning_rate": 5.216541353383459e-06, + "loss": 0.2084, + "step": 31810 + }, + { + "epoch": 47.85, + "grad_norm": 7.094751834869385, + "learning_rate": 5.215037593984963e-06, + "loss": 0.2072, + "step": 31820 + }, + { + "epoch": 47.86, + "grad_norm": 6.301358699798584, + "learning_rate": 5.213533834586466e-06, + "loss": 0.2076, + "step": 31830 + }, + { + "epoch": 47.88, + "grad_norm": 5.953322410583496, + "learning_rate": 5.21203007518797e-06, + "loss": 0.234, + "step": 31840 + }, + { + "epoch": 47.89, + "grad_norm": 4.481212615966797, + "learning_rate": 5.210526315789474e-06, + "loss": 0.2074, + "step": 31850 + }, + { + "epoch": 47.91, + "grad_norm": 7.783965587615967, + "learning_rate": 5.209022556390978e-06, + "loss": 0.2597, + "step": 31860 + }, + { + "epoch": 47.92, + "grad_norm": 10.910694122314453, + "learning_rate": 5.207518796992481e-06, + "loss": 0.2851, + "step": 31870 + }, + { + "epoch": 47.94, + "grad_norm": 1.9801486730575562, + "learning_rate": 5.206015037593985e-06, + "loss": 0.2154, + "step": 31880 + }, + { + "epoch": 47.95, + "grad_norm": 11.243240356445312, + "learning_rate": 5.204511278195489e-06, + "loss": 0.2158, + "step": 31890 + }, + { + "epoch": 47.97, + "grad_norm": 2.5445377826690674, + "learning_rate": 5.203007518796993e-06, + "loss": 0.2497, + "step": 31900 + }, + { + "epoch": 47.98, + "grad_norm": 5.46054744720459, + "learning_rate": 5.201503759398497e-06, + "loss": 0.226, + "step": 31910 + }, + { + "epoch": 48.0, + "grad_norm": 26.797359466552734, + "learning_rate": 5.2e-06, + "loss": 0.2719, + "step": 31920 + }, + { + "epoch": 48.0, + "eval_accuracy": 0.9311, + "eval_loss": 0.3056710362434387, + "eval_runtime": 84.8487, + "eval_samples_per_second": 117.857, + "eval_steps_per_second": 0.471, + "step": 31920 + }, + { + "epoch": 48.02, + "grad_norm": 3.771094799041748, + "learning_rate": 5.198496240601504e-06, + "loss": 0.197, + "step": 31930 + }, + { + "epoch": 48.03, + "grad_norm": 3.460973024368286, + "learning_rate": 5.196992481203008e-06, + "loss": 0.2012, + "step": 31940 + }, + { + "epoch": 48.05, + "grad_norm": 7.17149543762207, + "learning_rate": 5.195488721804512e-06, + "loss": 0.2799, + "step": 31950 + }, + { + "epoch": 48.06, + "grad_norm": 2.8762929439544678, + "learning_rate": 5.193984962406016e-06, + "loss": 0.2632, + "step": 31960 + }, + { + "epoch": 48.08, + "grad_norm": 5.614086627960205, + "learning_rate": 5.192481203007519e-06, + "loss": 0.205, + "step": 31970 + }, + { + "epoch": 48.09, + "grad_norm": 5.698775768280029, + "learning_rate": 5.190977443609023e-06, + "loss": 0.2031, + "step": 31980 + }, + { + "epoch": 48.11, + "grad_norm": 3.4972689151763916, + "learning_rate": 5.1894736842105265e-06, + "loss": 0.1935, + "step": 31990 + }, + { + "epoch": 48.12, + "grad_norm": 6.399228096008301, + "learning_rate": 5.187969924812031e-06, + "loss": 0.201, + "step": 32000 + }, + { + "epoch": 48.14, + "grad_norm": 6.805654048919678, + "learning_rate": 5.186466165413534e-06, + "loss": 0.1714, + "step": 32010 + }, + { + "epoch": 48.15, + "grad_norm": 4.721649646759033, + "learning_rate": 5.184962406015038e-06, + "loss": 0.2139, + "step": 32020 + }, + { + "epoch": 48.17, + "grad_norm": 5.464878082275391, + "learning_rate": 5.1834586466165414e-06, + "loss": 0.2137, + "step": 32030 + }, + { + "epoch": 48.18, + "grad_norm": 5.358616352081299, + "learning_rate": 5.181954887218046e-06, + "loss": 0.2196, + "step": 32040 + }, + { + "epoch": 48.2, + "grad_norm": 8.136012077331543, + "learning_rate": 5.180451127819549e-06, + "loss": 0.2861, + "step": 32050 + }, + { + "epoch": 48.21, + "grad_norm": 10.187384605407715, + "learning_rate": 5.178947368421054e-06, + "loss": 0.2223, + "step": 32060 + }, + { + "epoch": 48.23, + "grad_norm": 4.251842498779297, + "learning_rate": 5.177443609022556e-06, + "loss": 0.1949, + "step": 32070 + }, + { + "epoch": 48.24, + "grad_norm": 6.019373893737793, + "learning_rate": 5.175939849624061e-06, + "loss": 0.2512, + "step": 32080 + }, + { + "epoch": 48.26, + "grad_norm": 5.162561416625977, + "learning_rate": 5.174436090225564e-06, + "loss": 0.2408, + "step": 32090 + }, + { + "epoch": 48.27, + "grad_norm": 4.33914852142334, + "learning_rate": 5.172932330827069e-06, + "loss": 0.2901, + "step": 32100 + }, + { + "epoch": 48.29, + "grad_norm": 6.14069938659668, + "learning_rate": 5.171428571428571e-06, + "loss": 0.1849, + "step": 32110 + }, + { + "epoch": 48.3, + "grad_norm": 4.674006462097168, + "learning_rate": 5.169924812030076e-06, + "loss": 0.2483, + "step": 32120 + }, + { + "epoch": 48.32, + "grad_norm": 9.517473220825195, + "learning_rate": 5.168421052631579e-06, + "loss": 0.2329, + "step": 32130 + }, + { + "epoch": 48.33, + "grad_norm": 3.6502087116241455, + "learning_rate": 5.1669172932330835e-06, + "loss": 0.2357, + "step": 32140 + }, + { + "epoch": 48.35, + "grad_norm": 2.8939104080200195, + "learning_rate": 5.165413533834587e-06, + "loss": 0.265, + "step": 32150 + }, + { + "epoch": 48.36, + "grad_norm": 4.541179656982422, + "learning_rate": 5.163909774436091e-06, + "loss": 0.2007, + "step": 32160 + }, + { + "epoch": 48.38, + "grad_norm": 3.5995399951934814, + "learning_rate": 5.162406015037594e-06, + "loss": 0.1895, + "step": 32170 + }, + { + "epoch": 48.39, + "grad_norm": 4.528938293457031, + "learning_rate": 5.1609022556390985e-06, + "loss": 0.2574, + "step": 32180 + }, + { + "epoch": 48.41, + "grad_norm": 3.6735799312591553, + "learning_rate": 5.159398496240602e-06, + "loss": 0.2564, + "step": 32190 + }, + { + "epoch": 48.42, + "grad_norm": 7.794544219970703, + "learning_rate": 5.157894736842106e-06, + "loss": 0.2534, + "step": 32200 + }, + { + "epoch": 48.44, + "grad_norm": 9.559613227844238, + "learning_rate": 5.156390977443609e-06, + "loss": 0.2308, + "step": 32210 + }, + { + "epoch": 48.45, + "grad_norm": 5.519898414611816, + "learning_rate": 5.154887218045113e-06, + "loss": 0.2299, + "step": 32220 + }, + { + "epoch": 48.47, + "grad_norm": 5.717340469360352, + "learning_rate": 5.153383458646617e-06, + "loss": 0.2285, + "step": 32230 + }, + { + "epoch": 48.48, + "grad_norm": 4.257745265960693, + "learning_rate": 5.151879699248121e-06, + "loss": 0.2257, + "step": 32240 + }, + { + "epoch": 48.5, + "grad_norm": 3.741241693496704, + "learning_rate": 5.150375939849625e-06, + "loss": 0.2006, + "step": 32250 + }, + { + "epoch": 48.51, + "grad_norm": 4.753995418548584, + "learning_rate": 5.1488721804511275e-06, + "loss": 0.2896, + "step": 32260 + }, + { + "epoch": 48.53, + "grad_norm": 4.3221635818481445, + "learning_rate": 5.147368421052632e-06, + "loss": 0.2416, + "step": 32270 + }, + { + "epoch": 48.54, + "grad_norm": 6.452643394470215, + "learning_rate": 5.145864661654135e-06, + "loss": 0.2546, + "step": 32280 + }, + { + "epoch": 48.56, + "grad_norm": 10.680849075317383, + "learning_rate": 5.14436090225564e-06, + "loss": 0.2578, + "step": 32290 + }, + { + "epoch": 48.57, + "grad_norm": 10.565488815307617, + "learning_rate": 5.142857142857142e-06, + "loss": 0.2105, + "step": 32300 + }, + { + "epoch": 48.59, + "grad_norm": 6.964345455169678, + "learning_rate": 5.141353383458647e-06, + "loss": 0.2407, + "step": 32310 + }, + { + "epoch": 48.6, + "grad_norm": 5.355147838592529, + "learning_rate": 5.13984962406015e-06, + "loss": 0.122, + "step": 32320 + }, + { + "epoch": 48.62, + "grad_norm": 4.7337565422058105, + "learning_rate": 5.138345864661655e-06, + "loss": 0.2202, + "step": 32330 + }, + { + "epoch": 48.63, + "grad_norm": 3.4737660884857178, + "learning_rate": 5.136842105263158e-06, + "loss": 0.2382, + "step": 32340 + }, + { + "epoch": 48.65, + "grad_norm": 3.8572425842285156, + "learning_rate": 5.1353383458646625e-06, + "loss": 0.2735, + "step": 32350 + }, + { + "epoch": 48.66, + "grad_norm": 3.9931530952453613, + "learning_rate": 5.133834586466165e-06, + "loss": 0.2399, + "step": 32360 + }, + { + "epoch": 48.68, + "grad_norm": 6.738966941833496, + "learning_rate": 5.1323308270676696e-06, + "loss": 0.2595, + "step": 32370 + }, + { + "epoch": 48.69, + "grad_norm": 5.455244064331055, + "learning_rate": 5.130827067669173e-06, + "loss": 0.1811, + "step": 32380 + }, + { + "epoch": 48.71, + "grad_norm": 8.760961532592773, + "learning_rate": 5.1293233082706774e-06, + "loss": 0.2323, + "step": 32390 + }, + { + "epoch": 48.72, + "grad_norm": 1.5300565958023071, + "learning_rate": 5.12781954887218e-06, + "loss": 0.2225, + "step": 32400 + }, + { + "epoch": 48.74, + "grad_norm": 4.378961563110352, + "learning_rate": 5.1263157894736845e-06, + "loss": 0.1771, + "step": 32410 + }, + { + "epoch": 48.75, + "grad_norm": 5.027568340301514, + "learning_rate": 5.124812030075188e-06, + "loss": 0.1993, + "step": 32420 + }, + { + "epoch": 48.77, + "grad_norm": 3.989525556564331, + "learning_rate": 5.123308270676692e-06, + "loss": 0.2182, + "step": 32430 + }, + { + "epoch": 48.78, + "grad_norm": 5.23897123336792, + "learning_rate": 5.121804511278196e-06, + "loss": 0.1689, + "step": 32440 + }, + { + "epoch": 48.8, + "grad_norm": 6.119510173797607, + "learning_rate": 5.1203007518797e-06, + "loss": 0.1778, + "step": 32450 + }, + { + "epoch": 48.81, + "grad_norm": 5.041784286499023, + "learning_rate": 5.118796992481203e-06, + "loss": 0.2215, + "step": 32460 + }, + { + "epoch": 48.83, + "grad_norm": 4.170535564422607, + "learning_rate": 5.117293233082707e-06, + "loss": 0.1809, + "step": 32470 + }, + { + "epoch": 48.84, + "grad_norm": 5.3539204597473145, + "learning_rate": 5.115789473684211e-06, + "loss": 0.2089, + "step": 32480 + }, + { + "epoch": 48.86, + "grad_norm": 3.4489662647247314, + "learning_rate": 5.114285714285715e-06, + "loss": 0.1827, + "step": 32490 + }, + { + "epoch": 48.87, + "grad_norm": 3.438225507736206, + "learning_rate": 5.112781954887218e-06, + "loss": 0.205, + "step": 32500 + }, + { + "epoch": 48.89, + "grad_norm": 5.150193214416504, + "learning_rate": 5.111278195488722e-06, + "loss": 0.195, + "step": 32510 + }, + { + "epoch": 48.9, + "grad_norm": 7.413297176361084, + "learning_rate": 5.109774436090226e-06, + "loss": 0.2422, + "step": 32520 + }, + { + "epoch": 48.92, + "grad_norm": 4.774962902069092, + "learning_rate": 5.10827067669173e-06, + "loss": 0.2325, + "step": 32530 + }, + { + "epoch": 48.93, + "grad_norm": 6.498291969299316, + "learning_rate": 5.106766917293234e-06, + "loss": 0.257, + "step": 32540 + }, + { + "epoch": 48.95, + "grad_norm": 5.5614728927612305, + "learning_rate": 5.105263157894738e-06, + "loss": 0.18, + "step": 32550 + }, + { + "epoch": 48.96, + "grad_norm": 9.62303352355957, + "learning_rate": 5.103759398496241e-06, + "loss": 0.2386, + "step": 32560 + }, + { + "epoch": 48.98, + "grad_norm": 7.92848539352417, + "learning_rate": 5.102255639097745e-06, + "loss": 0.2292, + "step": 32570 + }, + { + "epoch": 48.99, + "grad_norm": 4.110299587249756, + "learning_rate": 5.1007518796992485e-06, + "loss": 0.2156, + "step": 32580 + }, + { + "epoch": 49.0, + "eval_accuracy": 0.9292, + "eval_loss": 0.31269803643226624, + "eval_runtime": 84.4748, + "eval_samples_per_second": 118.379, + "eval_steps_per_second": 0.474, + "step": 32585 + }, + { + "epoch": 49.01, + "grad_norm": 7.004157543182373, + "learning_rate": 5.099248120300753e-06, + "loss": 0.3448, + "step": 32590 + }, + { + "epoch": 49.02, + "grad_norm": 4.223759651184082, + "learning_rate": 5.097744360902256e-06, + "loss": 0.2868, + "step": 32600 + }, + { + "epoch": 49.04, + "grad_norm": 3.9704689979553223, + "learning_rate": 5.09624060150376e-06, + "loss": 0.2549, + "step": 32610 + }, + { + "epoch": 49.05, + "grad_norm": 3.7155163288116455, + "learning_rate": 5.0947368421052635e-06, + "loss": 0.2238, + "step": 32620 + }, + { + "epoch": 49.07, + "grad_norm": 4.680662631988525, + "learning_rate": 5.093233082706768e-06, + "loss": 0.2062, + "step": 32630 + }, + { + "epoch": 49.08, + "grad_norm": 4.773167610168457, + "learning_rate": 5.091729323308271e-06, + "loss": 0.1666, + "step": 32640 + }, + { + "epoch": 49.1, + "grad_norm": 8.678242683410645, + "learning_rate": 5.090225563909776e-06, + "loss": 0.2808, + "step": 32650 + }, + { + "epoch": 49.11, + "grad_norm": 5.068692207336426, + "learning_rate": 5.088721804511278e-06, + "loss": 0.2623, + "step": 32660 + }, + { + "epoch": 49.13, + "grad_norm": 2.034926414489746, + "learning_rate": 5.087218045112783e-06, + "loss": 0.2039, + "step": 32670 + }, + { + "epoch": 49.14, + "grad_norm": 7.186866283416748, + "learning_rate": 5.085714285714286e-06, + "loss": 0.2584, + "step": 32680 + }, + { + "epoch": 49.16, + "grad_norm": 7.276577472686768, + "learning_rate": 5.084210526315791e-06, + "loss": 0.1756, + "step": 32690 + }, + { + "epoch": 49.17, + "grad_norm": 3.856623888015747, + "learning_rate": 5.082706766917293e-06, + "loss": 0.1908, + "step": 32700 + }, + { + "epoch": 49.19, + "grad_norm": 4.449421405792236, + "learning_rate": 5.081203007518798e-06, + "loss": 0.233, + "step": 32710 + }, + { + "epoch": 49.2, + "grad_norm": 7.532309055328369, + "learning_rate": 5.079699248120301e-06, + "loss": 0.2067, + "step": 32720 + }, + { + "epoch": 49.22, + "grad_norm": 5.170470237731934, + "learning_rate": 5.0781954887218056e-06, + "loss": 0.2273, + "step": 32730 + }, + { + "epoch": 49.23, + "grad_norm": 3.9134700298309326, + "learning_rate": 5.076691729323309e-06, + "loss": 0.2791, + "step": 32740 + }, + { + "epoch": 49.25, + "grad_norm": 6.696217060089111, + "learning_rate": 5.075187969924813e-06, + "loss": 0.224, + "step": 32750 + }, + { + "epoch": 49.26, + "grad_norm": 9.15458869934082, + "learning_rate": 5.073684210526316e-06, + "loss": 0.2591, + "step": 32760 + }, + { + "epoch": 49.28, + "grad_norm": 3.095736026763916, + "learning_rate": 5.07218045112782e-06, + "loss": 0.1681, + "step": 32770 + }, + { + "epoch": 49.29, + "grad_norm": 5.608436107635498, + "learning_rate": 5.070676691729324e-06, + "loss": 0.2269, + "step": 32780 + }, + { + "epoch": 49.31, + "grad_norm": 5.287787437438965, + "learning_rate": 5.069172932330827e-06, + "loss": 0.2443, + "step": 32790 + }, + { + "epoch": 49.32, + "grad_norm": 8.475576400756836, + "learning_rate": 5.067669172932331e-06, + "loss": 0.3255, + "step": 32800 + }, + { + "epoch": 49.34, + "grad_norm": 3.967184066772461, + "learning_rate": 5.0661654135338346e-06, + "loss": 0.1712, + "step": 32810 + }, + { + "epoch": 49.35, + "grad_norm": 7.337131023406982, + "learning_rate": 5.064661654135339e-06, + "loss": 0.3166, + "step": 32820 + }, + { + "epoch": 49.37, + "grad_norm": 6.623044967651367, + "learning_rate": 5.0631578947368424e-06, + "loss": 0.3035, + "step": 32830 + }, + { + "epoch": 49.38, + "grad_norm": 5.275474548339844, + "learning_rate": 5.061654135338346e-06, + "loss": 0.2025, + "step": 32840 + }, + { + "epoch": 49.4, + "grad_norm": 6.363266468048096, + "learning_rate": 5.0601503759398495e-06, + "loss": 0.3033, + "step": 32850 + }, + { + "epoch": 49.41, + "grad_norm": 3.8887858390808105, + "learning_rate": 5.058646616541354e-06, + "loss": 0.2282, + "step": 32860 + }, + { + "epoch": 49.43, + "grad_norm": 7.158453941345215, + "learning_rate": 5.057142857142857e-06, + "loss": 0.2427, + "step": 32870 + }, + { + "epoch": 49.44, + "grad_norm": 5.235705375671387, + "learning_rate": 5.055639097744362e-06, + "loss": 0.3072, + "step": 32880 + }, + { + "epoch": 49.46, + "grad_norm": 3.328618049621582, + "learning_rate": 5.054135338345864e-06, + "loss": 0.2078, + "step": 32890 + }, + { + "epoch": 49.47, + "grad_norm": 7.1164398193359375, + "learning_rate": 5.052631578947369e-06, + "loss": 0.2707, + "step": 32900 + }, + { + "epoch": 49.49, + "grad_norm": 4.149364948272705, + "learning_rate": 5.051127819548872e-06, + "loss": 0.1759, + "step": 32910 + }, + { + "epoch": 49.5, + "grad_norm": 5.859124183654785, + "learning_rate": 5.049624060150377e-06, + "loss": 0.2588, + "step": 32920 + }, + { + "epoch": 49.52, + "grad_norm": 7.663355350494385, + "learning_rate": 5.04812030075188e-06, + "loss": 0.285, + "step": 32930 + }, + { + "epoch": 49.53, + "grad_norm": 5.694937229156494, + "learning_rate": 5.046616541353384e-06, + "loss": 0.2546, + "step": 32940 + }, + { + "epoch": 49.55, + "grad_norm": 7.81641960144043, + "learning_rate": 5.045112781954887e-06, + "loss": 0.1886, + "step": 32950 + }, + { + "epoch": 49.56, + "grad_norm": 4.236837387084961, + "learning_rate": 5.043609022556392e-06, + "loss": 0.2051, + "step": 32960 + }, + { + "epoch": 49.58, + "grad_norm": 4.034359931945801, + "learning_rate": 5.042105263157895e-06, + "loss": 0.1713, + "step": 32970 + }, + { + "epoch": 49.59, + "grad_norm": 10.613497734069824, + "learning_rate": 5.0406015037593995e-06, + "loss": 0.2747, + "step": 32980 + }, + { + "epoch": 49.61, + "grad_norm": 6.468116760253906, + "learning_rate": 5.039097744360902e-06, + "loss": 0.2624, + "step": 32990 + }, + { + "epoch": 49.62, + "grad_norm": 6.871288776397705, + "learning_rate": 5.0375939849624065e-06, + "loss": 0.2418, + "step": 33000 + }, + { + "epoch": 49.64, + "grad_norm": 5.56874942779541, + "learning_rate": 5.03609022556391e-06, + "loss": 0.2581, + "step": 33010 + }, + { + "epoch": 49.65, + "grad_norm": 4.636134147644043, + "learning_rate": 5.034586466165414e-06, + "loss": 0.2183, + "step": 33020 + }, + { + "epoch": 49.67, + "grad_norm": 3.762443780899048, + "learning_rate": 5.033082706766917e-06, + "loss": 0.2067, + "step": 33030 + }, + { + "epoch": 49.68, + "grad_norm": 2.3653151988983154, + "learning_rate": 5.0315789473684214e-06, + "loss": 0.1524, + "step": 33040 + }, + { + "epoch": 49.7, + "grad_norm": 1.23353111743927, + "learning_rate": 5.030075187969925e-06, + "loss": 0.2496, + "step": 33050 + }, + { + "epoch": 49.71, + "grad_norm": 6.81777811050415, + "learning_rate": 5.028571428571429e-06, + "loss": 0.2662, + "step": 33060 + }, + { + "epoch": 49.73, + "grad_norm": 3.0888495445251465, + "learning_rate": 5.027067669172933e-06, + "loss": 0.2782, + "step": 33070 + }, + { + "epoch": 49.74, + "grad_norm": 5.303954124450684, + "learning_rate": 5.025563909774437e-06, + "loss": 0.205, + "step": 33080 + }, + { + "epoch": 49.76, + "grad_norm": 3.523970603942871, + "learning_rate": 5.02406015037594e-06, + "loss": 0.1989, + "step": 33090 + }, + { + "epoch": 49.77, + "grad_norm": 5.378790378570557, + "learning_rate": 5.022556390977444e-06, + "loss": 0.2331, + "step": 33100 + }, + { + "epoch": 49.79, + "grad_norm": 5.677578926086426, + "learning_rate": 5.021052631578948e-06, + "loss": 0.2528, + "step": 33110 + }, + { + "epoch": 49.8, + "grad_norm": 9.107217788696289, + "learning_rate": 5.019548872180452e-06, + "loss": 0.1952, + "step": 33120 + }, + { + "epoch": 49.82, + "grad_norm": 4.993006706237793, + "learning_rate": 5.018045112781955e-06, + "loss": 0.1862, + "step": 33130 + }, + { + "epoch": 49.83, + "grad_norm": 5.515392780303955, + "learning_rate": 5.016541353383459e-06, + "loss": 0.2022, + "step": 33140 + }, + { + "epoch": 49.85, + "grad_norm": 8.87382698059082, + "learning_rate": 5.015037593984963e-06, + "loss": 0.2332, + "step": 33150 + }, + { + "epoch": 49.86, + "grad_norm": 6.892481327056885, + "learning_rate": 5.013533834586467e-06, + "loss": 0.1816, + "step": 33160 + }, + { + "epoch": 49.88, + "grad_norm": 2.5804250240325928, + "learning_rate": 5.0120300751879706e-06, + "loss": 0.223, + "step": 33170 + }, + { + "epoch": 49.89, + "grad_norm": 20.23832130432129, + "learning_rate": 5.010526315789475e-06, + "loss": 0.1953, + "step": 33180 + }, + { + "epoch": 49.91, + "grad_norm": 3.5330405235290527, + "learning_rate": 5.009022556390978e-06, + "loss": 0.2249, + "step": 33190 + }, + { + "epoch": 49.92, + "grad_norm": 7.592809677124023, + "learning_rate": 5.007518796992482e-06, + "loss": 0.3143, + "step": 33200 + }, + { + "epoch": 49.94, + "grad_norm": 5.459137916564941, + "learning_rate": 5.0060150375939855e-06, + "loss": 0.2856, + "step": 33210 + }, + { + "epoch": 49.95, + "grad_norm": 5.517751216888428, + "learning_rate": 5.00451127819549e-06, + "loss": 0.1942, + "step": 33220 + }, + { + "epoch": 49.97, + "grad_norm": 5.425515651702881, + "learning_rate": 5.0030075187969925e-06, + "loss": 0.207, + "step": 33230 + }, + { + "epoch": 49.98, + "grad_norm": 4.391520023345947, + "learning_rate": 5.001503759398497e-06, + "loss": 0.2186, + "step": 33240 + }, + { + "epoch": 50.0, + "grad_norm": 8.936457633972168, + "learning_rate": 5e-06, + "loss": 0.2562, + "step": 33250 + }, + { + "epoch": 50.0, + "eval_accuracy": 0.93, + "eval_loss": 0.31154853105545044, + "eval_runtime": 84.9326, + "eval_samples_per_second": 117.74, + "eval_steps_per_second": 0.471, + "step": 33250 + }, + { + "epoch": 50.02, + "grad_norm": 4.7076497077941895, + "learning_rate": 4.998496240601504e-06, + "loss": 0.2196, + "step": 33260 + }, + { + "epoch": 50.03, + "grad_norm": 3.4939560890197754, + "learning_rate": 4.996992481203008e-06, + "loss": 0.2139, + "step": 33270 + }, + { + "epoch": 50.05, + "grad_norm": 3.0447769165039062, + "learning_rate": 4.995488721804512e-06, + "loss": 0.1865, + "step": 33280 + }, + { + "epoch": 50.06, + "grad_norm": 2.098543405532837, + "learning_rate": 4.993984962406015e-06, + "loss": 0.2419, + "step": 33290 + }, + { + "epoch": 50.08, + "grad_norm": 5.257837772369385, + "learning_rate": 4.992481203007519e-06, + "loss": 0.2051, + "step": 33300 + }, + { + "epoch": 50.09, + "grad_norm": 4.075168609619141, + "learning_rate": 4.990977443609023e-06, + "loss": 0.2148, + "step": 33310 + }, + { + "epoch": 50.11, + "grad_norm": 3.889608144760132, + "learning_rate": 4.989473684210527e-06, + "loss": 0.2621, + "step": 33320 + }, + { + "epoch": 50.12, + "grad_norm": 5.086583614349365, + "learning_rate": 4.98796992481203e-06, + "loss": 0.2163, + "step": 33330 + }, + { + "epoch": 50.14, + "grad_norm": 4.367302894592285, + "learning_rate": 4.986466165413535e-06, + "loss": 0.2182, + "step": 33340 + }, + { + "epoch": 50.15, + "grad_norm": 4.624885559082031, + "learning_rate": 4.984962406015038e-06, + "loss": 0.2048, + "step": 33350 + }, + { + "epoch": 50.17, + "grad_norm": 5.809417247772217, + "learning_rate": 4.983458646616542e-06, + "loss": 0.2646, + "step": 33360 + }, + { + "epoch": 50.18, + "grad_norm": 2.8429479598999023, + "learning_rate": 4.981954887218046e-06, + "loss": 0.2445, + "step": 33370 + }, + { + "epoch": 50.2, + "grad_norm": 7.78208065032959, + "learning_rate": 4.9804511278195495e-06, + "loss": 0.186, + "step": 33380 + }, + { + "epoch": 50.21, + "grad_norm": 5.874340534210205, + "learning_rate": 4.978947368421053e-06, + "loss": 0.2056, + "step": 33390 + }, + { + "epoch": 50.23, + "grad_norm": 3.485530138015747, + "learning_rate": 4.977443609022557e-06, + "loss": 0.2269, + "step": 33400 + }, + { + "epoch": 50.24, + "grad_norm": 6.335367679595947, + "learning_rate": 4.975939849624061e-06, + "loss": 0.1929, + "step": 33410 + }, + { + "epoch": 50.26, + "grad_norm": 7.2522382736206055, + "learning_rate": 4.9744360902255645e-06, + "loss": 0.2561, + "step": 33420 + }, + { + "epoch": 50.27, + "grad_norm": 8.870294570922852, + "learning_rate": 4.972932330827068e-06, + "loss": 0.2507, + "step": 33430 + }, + { + "epoch": 50.29, + "grad_norm": 2.679324150085449, + "learning_rate": 4.971428571428572e-06, + "loss": 0.1811, + "step": 33440 + }, + { + "epoch": 50.3, + "grad_norm": 4.810848236083984, + "learning_rate": 4.969924812030076e-06, + "loss": 0.2152, + "step": 33450 + }, + { + "epoch": 50.32, + "grad_norm": 7.02167272567749, + "learning_rate": 4.968421052631579e-06, + "loss": 0.1854, + "step": 33460 + }, + { + "epoch": 50.33, + "grad_norm": 3.6129260063171387, + "learning_rate": 4.966917293233084e-06, + "loss": 0.234, + "step": 33470 + }, + { + "epoch": 50.35, + "grad_norm": 3.452362060546875, + "learning_rate": 4.965413533834587e-06, + "loss": 0.2666, + "step": 33480 + }, + { + "epoch": 50.36, + "grad_norm": 6.474914073944092, + "learning_rate": 4.963909774436091e-06, + "loss": 0.2418, + "step": 33490 + }, + { + "epoch": 50.38, + "grad_norm": 3.1944398880004883, + "learning_rate": 4.962406015037594e-06, + "loss": 0.2529, + "step": 33500 + }, + { + "epoch": 50.39, + "grad_norm": 4.8096699714660645, + "learning_rate": 4.960902255639098e-06, + "loss": 0.2176, + "step": 33510 + }, + { + "epoch": 50.41, + "grad_norm": 4.8568925857543945, + "learning_rate": 4.959398496240601e-06, + "loss": 0.1876, + "step": 33520 + }, + { + "epoch": 50.42, + "grad_norm": 5.153539657592773, + "learning_rate": 4.957894736842106e-06, + "loss": 0.2234, + "step": 33530 + }, + { + "epoch": 50.44, + "grad_norm": 12.457379341125488, + "learning_rate": 4.956390977443609e-06, + "loss": 0.2267, + "step": 33540 + }, + { + "epoch": 50.45, + "grad_norm": 3.840301990509033, + "learning_rate": 4.954887218045113e-06, + "loss": 0.231, + "step": 33550 + }, + { + "epoch": 50.47, + "grad_norm": 4.2671966552734375, + "learning_rate": 4.953383458646617e-06, + "loss": 0.2221, + "step": 33560 + }, + { + "epoch": 50.48, + "grad_norm": 1.5949358940124512, + "learning_rate": 4.951879699248121e-06, + "loss": 0.2137, + "step": 33570 + }, + { + "epoch": 50.5, + "grad_norm": 4.011662006378174, + "learning_rate": 4.950375939849624e-06, + "loss": 0.2406, + "step": 33580 + }, + { + "epoch": 50.51, + "grad_norm": 5.408310413360596, + "learning_rate": 4.948872180451128e-06, + "loss": 0.2115, + "step": 33590 + }, + { + "epoch": 50.53, + "grad_norm": 13.068920135498047, + "learning_rate": 4.947368421052632e-06, + "loss": 0.2343, + "step": 33600 + }, + { + "epoch": 50.54, + "grad_norm": 5.289875507354736, + "learning_rate": 4.9458646616541356e-06, + "loss": 0.212, + "step": 33610 + }, + { + "epoch": 50.56, + "grad_norm": 9.420212745666504, + "learning_rate": 4.944360902255639e-06, + "loss": 0.2435, + "step": 33620 + }, + { + "epoch": 50.57, + "grad_norm": 4.894474506378174, + "learning_rate": 4.9428571428571435e-06, + "loss": 0.1806, + "step": 33630 + }, + { + "epoch": 50.59, + "grad_norm": 5.293659687042236, + "learning_rate": 4.941353383458647e-06, + "loss": 0.2205, + "step": 33640 + }, + { + "epoch": 50.6, + "grad_norm": 3.308318853378296, + "learning_rate": 4.9398496240601505e-06, + "loss": 0.2025, + "step": 33650 + }, + { + "epoch": 50.62, + "grad_norm": 6.303762435913086, + "learning_rate": 4.938345864661655e-06, + "loss": 0.2399, + "step": 33660 + }, + { + "epoch": 50.63, + "grad_norm": 5.376105308532715, + "learning_rate": 4.936842105263158e-06, + "loss": 0.1792, + "step": 33670 + }, + { + "epoch": 50.65, + "grad_norm": 7.851215839385986, + "learning_rate": 4.935338345864662e-06, + "loss": 0.2101, + "step": 33680 + }, + { + "epoch": 50.66, + "grad_norm": 4.721893310546875, + "learning_rate": 4.933834586466165e-06, + "loss": 0.2835, + "step": 33690 + }, + { + "epoch": 50.68, + "grad_norm": 5.323988914489746, + "learning_rate": 4.93233082706767e-06, + "loss": 0.2572, + "step": 33700 + }, + { + "epoch": 50.69, + "grad_norm": 6.7880964279174805, + "learning_rate": 4.930827067669173e-06, + "loss": 0.2099, + "step": 33710 + }, + { + "epoch": 50.71, + "grad_norm": 4.452812194824219, + "learning_rate": 4.929323308270677e-06, + "loss": 0.2393, + "step": 33720 + }, + { + "epoch": 50.72, + "grad_norm": 3.799269676208496, + "learning_rate": 4.927819548872181e-06, + "loss": 0.1869, + "step": 33730 + }, + { + "epoch": 50.74, + "grad_norm": 4.9699273109436035, + "learning_rate": 4.926315789473685e-06, + "loss": 0.242, + "step": 33740 + }, + { + "epoch": 50.75, + "grad_norm": 3.6977415084838867, + "learning_rate": 4.924812030075188e-06, + "loss": 0.1882, + "step": 33750 + }, + { + "epoch": 50.77, + "grad_norm": 4.581582546234131, + "learning_rate": 4.923308270676692e-06, + "loss": 0.2257, + "step": 33760 + }, + { + "epoch": 50.78, + "grad_norm": 6.505821228027344, + "learning_rate": 4.921804511278196e-06, + "loss": 0.2212, + "step": 33770 + }, + { + "epoch": 50.8, + "grad_norm": 6.289919376373291, + "learning_rate": 4.9203007518797e-06, + "loss": 0.2553, + "step": 33780 + }, + { + "epoch": 50.81, + "grad_norm": 3.341113567352295, + "learning_rate": 4.918796992481203e-06, + "loss": 0.2567, + "step": 33790 + }, + { + "epoch": 50.83, + "grad_norm": 4.804263114929199, + "learning_rate": 4.9172932330827075e-06, + "loss": 0.2432, + "step": 33800 + }, + { + "epoch": 50.84, + "grad_norm": 6.457090854644775, + "learning_rate": 4.915789473684211e-06, + "loss": 0.223, + "step": 33810 + }, + { + "epoch": 50.86, + "grad_norm": 7.253601551055908, + "learning_rate": 4.9142857142857145e-06, + "loss": 0.2747, + "step": 33820 + }, + { + "epoch": 50.87, + "grad_norm": 4.229115962982178, + "learning_rate": 4.912781954887219e-06, + "loss": 0.2616, + "step": 33830 + }, + { + "epoch": 50.89, + "grad_norm": 3.4108991622924805, + "learning_rate": 4.9112781954887224e-06, + "loss": 0.2176, + "step": 33840 + }, + { + "epoch": 50.9, + "grad_norm": 6.427495002746582, + "learning_rate": 4.909774436090226e-06, + "loss": 0.1867, + "step": 33850 + }, + { + "epoch": 50.92, + "grad_norm": 4.823879718780518, + "learning_rate": 4.9082706766917295e-06, + "loss": 0.2758, + "step": 33860 + }, + { + "epoch": 50.93, + "grad_norm": 5.894002914428711, + "learning_rate": 4.906766917293234e-06, + "loss": 0.275, + "step": 33870 + }, + { + "epoch": 50.95, + "grad_norm": 7.533518314361572, + "learning_rate": 4.905263157894737e-06, + "loss": 0.2269, + "step": 33880 + }, + { + "epoch": 50.96, + "grad_norm": 6.328519344329834, + "learning_rate": 4.903759398496241e-06, + "loss": 0.2842, + "step": 33890 + }, + { + "epoch": 50.98, + "grad_norm": 2.4413256645202637, + "learning_rate": 4.902255639097745e-06, + "loss": 0.1786, + "step": 33900 + }, + { + "epoch": 50.99, + "grad_norm": 2.602782964706421, + "learning_rate": 4.900751879699249e-06, + "loss": 0.1847, + "step": 33910 + }, + { + "epoch": 51.0, + "eval_accuracy": 0.9311, + "eval_loss": 0.3058427572250366, + "eval_runtime": 84.9491, + "eval_samples_per_second": 117.718, + "eval_steps_per_second": 0.471, + "step": 33915 + }, + { + "epoch": 51.01, + "grad_norm": 12.357138633728027, + "learning_rate": 4.899248120300752e-06, + "loss": 0.1841, + "step": 33920 + }, + { + "epoch": 51.02, + "grad_norm": 6.953237056732178, + "learning_rate": 4.897744360902257e-06, + "loss": 0.2775, + "step": 33930 + }, + { + "epoch": 51.04, + "grad_norm": 4.370208263397217, + "learning_rate": 4.89624060150376e-06, + "loss": 0.216, + "step": 33940 + }, + { + "epoch": 51.05, + "grad_norm": 5.940341472625732, + "learning_rate": 4.894736842105264e-06, + "loss": 0.2888, + "step": 33950 + }, + { + "epoch": 51.07, + "grad_norm": 2.9402363300323486, + "learning_rate": 4.893233082706767e-06, + "loss": 0.2437, + "step": 33960 + }, + { + "epoch": 51.08, + "grad_norm": 6.118091583251953, + "learning_rate": 4.8917293233082716e-06, + "loss": 0.2558, + "step": 33970 + }, + { + "epoch": 51.1, + "grad_norm": 3.255457878112793, + "learning_rate": 4.890225563909775e-06, + "loss": 0.217, + "step": 33980 + }, + { + "epoch": 51.11, + "grad_norm": 6.439407825469971, + "learning_rate": 4.888721804511279e-06, + "loss": 0.2119, + "step": 33990 + }, + { + "epoch": 51.13, + "grad_norm": 5.628686904907227, + "learning_rate": 4.887218045112782e-06, + "loss": 0.2647, + "step": 34000 + }, + { + "epoch": 51.14, + "grad_norm": 3.908249616622925, + "learning_rate": 4.885714285714286e-06, + "loss": 0.221, + "step": 34010 + }, + { + "epoch": 51.16, + "grad_norm": 5.72270393371582, + "learning_rate": 4.88421052631579e-06, + "loss": 0.2116, + "step": 34020 + }, + { + "epoch": 51.17, + "grad_norm": 6.743198394775391, + "learning_rate": 4.8827067669172935e-06, + "loss": 0.2716, + "step": 34030 + }, + { + "epoch": 51.19, + "grad_norm": 6.313354969024658, + "learning_rate": 4.881203007518797e-06, + "loss": 0.242, + "step": 34040 + }, + { + "epoch": 51.2, + "grad_norm": 4.181947708129883, + "learning_rate": 4.8796992481203006e-06, + "loss": 0.2448, + "step": 34050 + }, + { + "epoch": 51.22, + "grad_norm": 7.440803050994873, + "learning_rate": 4.878195488721805e-06, + "loss": 0.2734, + "step": 34060 + }, + { + "epoch": 51.23, + "grad_norm": 7.924306869506836, + "learning_rate": 4.8766917293233085e-06, + "loss": 0.2438, + "step": 34070 + }, + { + "epoch": 51.25, + "grad_norm": 4.542608737945557, + "learning_rate": 4.875187969924812e-06, + "loss": 0.2291, + "step": 34080 + }, + { + "epoch": 51.26, + "grad_norm": 3.60923171043396, + "learning_rate": 4.873684210526316e-06, + "loss": 0.201, + "step": 34090 + }, + { + "epoch": 51.28, + "grad_norm": 7.753152847290039, + "learning_rate": 4.87218045112782e-06, + "loss": 0.2213, + "step": 34100 + }, + { + "epoch": 51.29, + "grad_norm": 4.770752429962158, + "learning_rate": 4.870676691729323e-06, + "loss": 0.1961, + "step": 34110 + }, + { + "epoch": 51.31, + "grad_norm": 4.9310479164123535, + "learning_rate": 4.869172932330828e-06, + "loss": 0.1574, + "step": 34120 + }, + { + "epoch": 51.32, + "grad_norm": 3.8556582927703857, + "learning_rate": 4.867669172932331e-06, + "loss": 0.2608, + "step": 34130 + }, + { + "epoch": 51.34, + "grad_norm": 6.298683166503906, + "learning_rate": 4.866165413533835e-06, + "loss": 0.2558, + "step": 34140 + }, + { + "epoch": 51.35, + "grad_norm": 3.553882122039795, + "learning_rate": 4.864661654135338e-06, + "loss": 0.152, + "step": 34150 + }, + { + "epoch": 51.37, + "grad_norm": 4.3429975509643555, + "learning_rate": 4.863157894736843e-06, + "loss": 0.2239, + "step": 34160 + }, + { + "epoch": 51.38, + "grad_norm": 3.7060399055480957, + "learning_rate": 4.861654135338346e-06, + "loss": 0.268, + "step": 34170 + }, + { + "epoch": 51.4, + "grad_norm": 4.588679313659668, + "learning_rate": 4.86015037593985e-06, + "loss": 0.2724, + "step": 34180 + }, + { + "epoch": 51.41, + "grad_norm": 3.715130567550659, + "learning_rate": 4.858646616541354e-06, + "loss": 0.1813, + "step": 34190 + }, + { + "epoch": 51.43, + "grad_norm": 6.059221267700195, + "learning_rate": 4.857142857142858e-06, + "loss": 0.201, + "step": 34200 + }, + { + "epoch": 51.44, + "grad_norm": 9.141858100891113, + "learning_rate": 4.855639097744361e-06, + "loss": 0.2786, + "step": 34210 + }, + { + "epoch": 51.46, + "grad_norm": 5.355362892150879, + "learning_rate": 4.854135338345865e-06, + "loss": 0.2175, + "step": 34220 + }, + { + "epoch": 51.47, + "grad_norm": 4.845487117767334, + "learning_rate": 4.852631578947369e-06, + "loss": 0.2597, + "step": 34230 + }, + { + "epoch": 51.49, + "grad_norm": 4.735893249511719, + "learning_rate": 4.8511278195488725e-06, + "loss": 0.2157, + "step": 34240 + }, + { + "epoch": 51.5, + "grad_norm": 5.093930244445801, + "learning_rate": 4.849624060150376e-06, + "loss": 0.1836, + "step": 34250 + }, + { + "epoch": 51.52, + "grad_norm": 1.877554178237915, + "learning_rate": 4.84812030075188e-06, + "loss": 0.1809, + "step": 34260 + }, + { + "epoch": 51.53, + "grad_norm": 5.213804721832275, + "learning_rate": 4.846616541353384e-06, + "loss": 0.2439, + "step": 34270 + }, + { + "epoch": 51.55, + "grad_norm": 2.9461185932159424, + "learning_rate": 4.8451127819548874e-06, + "loss": 0.1654, + "step": 34280 + }, + { + "epoch": 51.56, + "grad_norm": 3.598033905029297, + "learning_rate": 4.843609022556392e-06, + "loss": 0.1807, + "step": 34290 + }, + { + "epoch": 51.58, + "grad_norm": 3.246472120285034, + "learning_rate": 4.842105263157895e-06, + "loss": 0.2873, + "step": 34300 + }, + { + "epoch": 51.59, + "grad_norm": 7.0338921546936035, + "learning_rate": 4.840601503759399e-06, + "loss": 0.257, + "step": 34310 + }, + { + "epoch": 51.61, + "grad_norm": 7.394874095916748, + "learning_rate": 4.839097744360902e-06, + "loss": 0.1845, + "step": 34320 + }, + { + "epoch": 51.62, + "grad_norm": 5.90761661529541, + "learning_rate": 4.837593984962407e-06, + "loss": 0.2476, + "step": 34330 + }, + { + "epoch": 51.64, + "grad_norm": 2.5447800159454346, + "learning_rate": 4.83609022556391e-06, + "loss": 0.2252, + "step": 34340 + }, + { + "epoch": 51.65, + "grad_norm": 6.951369285583496, + "learning_rate": 4.834586466165414e-06, + "loss": 0.198, + "step": 34350 + }, + { + "epoch": 51.67, + "grad_norm": 8.201958656311035, + "learning_rate": 4.833082706766918e-06, + "loss": 0.2374, + "step": 34360 + }, + { + "epoch": 51.68, + "grad_norm": 7.837949752807617, + "learning_rate": 4.831578947368422e-06, + "loss": 0.1674, + "step": 34370 + }, + { + "epoch": 51.7, + "grad_norm": 5.410804748535156, + "learning_rate": 4.830075187969925e-06, + "loss": 0.2734, + "step": 34380 + }, + { + "epoch": 51.71, + "grad_norm": 3.385745048522949, + "learning_rate": 4.8285714285714295e-06, + "loss": 0.234, + "step": 34390 + }, + { + "epoch": 51.73, + "grad_norm": 7.365379810333252, + "learning_rate": 4.827067669172933e-06, + "loss": 0.2139, + "step": 34400 + }, + { + "epoch": 51.74, + "grad_norm": 6.774968147277832, + "learning_rate": 4.8255639097744366e-06, + "loss": 0.151, + "step": 34410 + }, + { + "epoch": 51.76, + "grad_norm": 5.092763900756836, + "learning_rate": 4.82406015037594e-06, + "loss": 0.2152, + "step": 34420 + }, + { + "epoch": 51.77, + "grad_norm": 3.2947609424591064, + "learning_rate": 4.8225563909774445e-06, + "loss": 0.2309, + "step": 34430 + }, + { + "epoch": 51.79, + "grad_norm": 4.706571102142334, + "learning_rate": 4.821052631578948e-06, + "loss": 0.2546, + "step": 34440 + }, + { + "epoch": 51.8, + "grad_norm": 8.115716934204102, + "learning_rate": 4.8195488721804515e-06, + "loss": 0.1969, + "step": 34450 + }, + { + "epoch": 51.82, + "grad_norm": 7.135550022125244, + "learning_rate": 4.818045112781956e-06, + "loss": 0.2594, + "step": 34460 + }, + { + "epoch": 51.83, + "grad_norm": 4.663880348205566, + "learning_rate": 4.816541353383459e-06, + "loss": 0.245, + "step": 34470 + }, + { + "epoch": 51.85, + "grad_norm": 5.021750450134277, + "learning_rate": 4.815037593984963e-06, + "loss": 0.2571, + "step": 34480 + }, + { + "epoch": 51.86, + "grad_norm": 3.6975669860839844, + "learning_rate": 4.813533834586466e-06, + "loss": 0.2923, + "step": 34490 + }, + { + "epoch": 51.88, + "grad_norm": 5.429826259613037, + "learning_rate": 4.81203007518797e-06, + "loss": 0.158, + "step": 34500 + }, + { + "epoch": 51.89, + "grad_norm": 5.443629264831543, + "learning_rate": 4.8105263157894735e-06, + "loss": 0.1788, + "step": 34510 + }, + { + "epoch": 51.91, + "grad_norm": 2.6010379791259766, + "learning_rate": 4.809022556390978e-06, + "loss": 0.2203, + "step": 34520 + }, + { + "epoch": 51.92, + "grad_norm": 2.72519588470459, + "learning_rate": 4.807518796992481e-06, + "loss": 0.2021, + "step": 34530 + }, + { + "epoch": 51.94, + "grad_norm": 7.415553569793701, + "learning_rate": 4.806015037593985e-06, + "loss": 0.2145, + "step": 34540 + }, + { + "epoch": 51.95, + "grad_norm": 3.181380033493042, + "learning_rate": 4.804511278195489e-06, + "loss": 0.1729, + "step": 34550 + }, + { + "epoch": 51.97, + "grad_norm": 4.509653091430664, + "learning_rate": 4.803007518796993e-06, + "loss": 0.1733, + "step": 34560 + }, + { + "epoch": 51.98, + "grad_norm": 2.9621808528900146, + "learning_rate": 4.801503759398496e-06, + "loss": 0.2322, + "step": 34570 + }, + { + "epoch": 52.0, + "grad_norm": 16.024185180664062, + "learning_rate": 4.800000000000001e-06, + "loss": 0.2453, + "step": 34580 + }, + { + "epoch": 52.0, + "eval_accuracy": 0.9308, + "eval_loss": 0.3179681897163391, + "eval_runtime": 84.5142, + "eval_samples_per_second": 118.323, + "eval_steps_per_second": 0.473, + "step": 34580 + }, + { + "epoch": 52.02, + "grad_norm": 4.019076347351074, + "learning_rate": 4.798496240601504e-06, + "loss": 0.1856, + "step": 34590 + }, + { + "epoch": 52.03, + "grad_norm": 4.832670211791992, + "learning_rate": 4.796992481203008e-06, + "loss": 0.1791, + "step": 34600 + }, + { + "epoch": 52.05, + "grad_norm": 3.983247756958008, + "learning_rate": 4.795488721804511e-06, + "loss": 0.2176, + "step": 34610 + }, + { + "epoch": 52.06, + "grad_norm": 5.2948832511901855, + "learning_rate": 4.7939849624060156e-06, + "loss": 0.2737, + "step": 34620 + }, + { + "epoch": 52.08, + "grad_norm": 3.9428210258483887, + "learning_rate": 4.792481203007519e-06, + "loss": 0.2457, + "step": 34630 + }, + { + "epoch": 52.09, + "grad_norm": 7.841516971588135, + "learning_rate": 4.790977443609023e-06, + "loss": 0.2125, + "step": 34640 + }, + { + "epoch": 52.11, + "grad_norm": 2.363363265991211, + "learning_rate": 4.789473684210527e-06, + "loss": 0.1784, + "step": 34650 + }, + { + "epoch": 52.12, + "grad_norm": 3.259340286254883, + "learning_rate": 4.7879699248120305e-06, + "loss": 0.2177, + "step": 34660 + }, + { + "epoch": 52.14, + "grad_norm": 2.5872132778167725, + "learning_rate": 4.786466165413534e-06, + "loss": 0.229, + "step": 34670 + }, + { + "epoch": 52.15, + "grad_norm": 6.223973274230957, + "learning_rate": 4.7849624060150375e-06, + "loss": 0.2612, + "step": 34680 + }, + { + "epoch": 52.17, + "grad_norm": 8.220977783203125, + "learning_rate": 4.783458646616542e-06, + "loss": 0.2316, + "step": 34690 + }, + { + "epoch": 52.18, + "grad_norm": 5.418039321899414, + "learning_rate": 4.781954887218045e-06, + "loss": 0.2063, + "step": 34700 + }, + { + "epoch": 52.2, + "grad_norm": 2.79838490486145, + "learning_rate": 4.780451127819549e-06, + "loss": 0.1965, + "step": 34710 + }, + { + "epoch": 52.21, + "grad_norm": 5.194334030151367, + "learning_rate": 4.778947368421053e-06, + "loss": 0.1817, + "step": 34720 + }, + { + "epoch": 52.23, + "grad_norm": 4.306404113769531, + "learning_rate": 4.777443609022557e-06, + "loss": 0.1943, + "step": 34730 + }, + { + "epoch": 52.24, + "grad_norm": 5.353394508361816, + "learning_rate": 4.77593984962406e-06, + "loss": 0.2019, + "step": 34740 + }, + { + "epoch": 52.26, + "grad_norm": 4.193369388580322, + "learning_rate": 4.774436090225565e-06, + "loss": 0.1593, + "step": 34750 + }, + { + "epoch": 52.27, + "grad_norm": 5.131227493286133, + "learning_rate": 4.772932330827068e-06, + "loss": 0.2845, + "step": 34760 + }, + { + "epoch": 52.29, + "grad_norm": 6.315420627593994, + "learning_rate": 4.771428571428572e-06, + "loss": 0.2163, + "step": 34770 + }, + { + "epoch": 52.3, + "grad_norm": 5.833438873291016, + "learning_rate": 4.769924812030075e-06, + "loss": 0.2064, + "step": 34780 + }, + { + "epoch": 52.32, + "grad_norm": 2.4818150997161865, + "learning_rate": 4.76842105263158e-06, + "loss": 0.2169, + "step": 34790 + }, + { + "epoch": 52.33, + "grad_norm": 5.627359390258789, + "learning_rate": 4.766917293233083e-06, + "loss": 0.2654, + "step": 34800 + }, + { + "epoch": 52.35, + "grad_norm": 4.044781684875488, + "learning_rate": 4.765413533834587e-06, + "loss": 0.233, + "step": 34810 + }, + { + "epoch": 52.36, + "grad_norm": 9.770264625549316, + "learning_rate": 4.763909774436091e-06, + "loss": 0.2018, + "step": 34820 + }, + { + "epoch": 52.38, + "grad_norm": 5.436629295349121, + "learning_rate": 4.7624060150375945e-06, + "loss": 0.1943, + "step": 34830 + }, + { + "epoch": 52.39, + "grad_norm": 8.624287605285645, + "learning_rate": 4.760902255639098e-06, + "loss": 0.2665, + "step": 34840 + }, + { + "epoch": 52.41, + "grad_norm": 5.233580589294434, + "learning_rate": 4.759398496240602e-06, + "loss": 0.1882, + "step": 34850 + }, + { + "epoch": 52.42, + "grad_norm": 5.307055950164795, + "learning_rate": 4.757894736842106e-06, + "loss": 0.2765, + "step": 34860 + }, + { + "epoch": 52.44, + "grad_norm": 4.9283928871154785, + "learning_rate": 4.7563909774436095e-06, + "loss": 0.2012, + "step": 34870 + }, + { + "epoch": 52.45, + "grad_norm": 7.545740604400635, + "learning_rate": 4.754887218045113e-06, + "loss": 0.1997, + "step": 34880 + }, + { + "epoch": 52.47, + "grad_norm": 4.995537757873535, + "learning_rate": 4.753383458646617e-06, + "loss": 0.2371, + "step": 34890 + }, + { + "epoch": 52.48, + "grad_norm": 4.122995376586914, + "learning_rate": 4.751879699248121e-06, + "loss": 0.2107, + "step": 34900 + }, + { + "epoch": 52.5, + "grad_norm": 6.755688667297363, + "learning_rate": 4.750375939849624e-06, + "loss": 0.1947, + "step": 34910 + }, + { + "epoch": 52.51, + "grad_norm": 3.243288278579712, + "learning_rate": 4.748872180451129e-06, + "loss": 0.2091, + "step": 34920 + }, + { + "epoch": 52.53, + "grad_norm": 4.718216896057129, + "learning_rate": 4.747368421052632e-06, + "loss": 0.2318, + "step": 34930 + }, + { + "epoch": 52.54, + "grad_norm": 6.7740936279296875, + "learning_rate": 4.745864661654136e-06, + "loss": 0.2014, + "step": 34940 + }, + { + "epoch": 52.56, + "grad_norm": 3.995626449584961, + "learning_rate": 4.744360902255639e-06, + "loss": 0.2062, + "step": 34950 + }, + { + "epoch": 52.57, + "grad_norm": 6.141635417938232, + "learning_rate": 4.742857142857144e-06, + "loss": 0.183, + "step": 34960 + }, + { + "epoch": 52.59, + "grad_norm": 7.161436557769775, + "learning_rate": 4.741353383458647e-06, + "loss": 0.2518, + "step": 34970 + }, + { + "epoch": 52.6, + "grad_norm": 2.2341339588165283, + "learning_rate": 4.739849624060151e-06, + "loss": 0.2516, + "step": 34980 + }, + { + "epoch": 52.62, + "grad_norm": 5.752125263214111, + "learning_rate": 4.738345864661654e-06, + "loss": 0.1559, + "step": 34990 + }, + { + "epoch": 52.63, + "grad_norm": 4.806941032409668, + "learning_rate": 4.736842105263158e-06, + "loss": 0.177, + "step": 35000 + }, + { + "epoch": 52.65, + "grad_norm": 2.0574851036071777, + "learning_rate": 4.735338345864662e-06, + "loss": 0.1954, + "step": 35010 + }, + { + "epoch": 52.66, + "grad_norm": 4.443716526031494, + "learning_rate": 4.733834586466166e-06, + "loss": 0.2045, + "step": 35020 + }, + { + "epoch": 52.68, + "grad_norm": 3.878716468811035, + "learning_rate": 4.732330827067669e-06, + "loss": 0.2095, + "step": 35030 + }, + { + "epoch": 52.69, + "grad_norm": 10.685707092285156, + "learning_rate": 4.7308270676691735e-06, + "loss": 0.2586, + "step": 35040 + }, + { + "epoch": 52.71, + "grad_norm": 3.904240369796753, + "learning_rate": 4.729323308270677e-06, + "loss": 0.3103, + "step": 35050 + }, + { + "epoch": 52.72, + "grad_norm": 4.6063337326049805, + "learning_rate": 4.7278195488721806e-06, + "loss": 0.1517, + "step": 35060 + }, + { + "epoch": 52.74, + "grad_norm": 4.317811489105225, + "learning_rate": 4.726315789473684e-06, + "loss": 0.216, + "step": 35070 + }, + { + "epoch": 52.75, + "grad_norm": 7.456090927124023, + "learning_rate": 4.7248120300751884e-06, + "loss": 0.2438, + "step": 35080 + }, + { + "epoch": 52.77, + "grad_norm": 7.284855842590332, + "learning_rate": 4.723308270676692e-06, + "loss": 0.2263, + "step": 35090 + }, + { + "epoch": 52.78, + "grad_norm": 4.406165599822998, + "learning_rate": 4.7218045112781955e-06, + "loss": 0.181, + "step": 35100 + }, + { + "epoch": 52.8, + "grad_norm": 2.9364101886749268, + "learning_rate": 4.7203007518797e-06, + "loss": 0.2351, + "step": 35110 + }, + { + "epoch": 52.81, + "grad_norm": 5.638926029205322, + "learning_rate": 4.718796992481203e-06, + "loss": 0.2289, + "step": 35120 + }, + { + "epoch": 52.83, + "grad_norm": 4.295120716094971, + "learning_rate": 4.717293233082707e-06, + "loss": 0.211, + "step": 35130 + }, + { + "epoch": 52.84, + "grad_norm": 4.422874450683594, + "learning_rate": 4.71578947368421e-06, + "loss": 0.1947, + "step": 35140 + }, + { + "epoch": 52.86, + "grad_norm": 0.5836885571479797, + "learning_rate": 4.714285714285715e-06, + "loss": 0.2299, + "step": 35150 + }, + { + "epoch": 52.87, + "grad_norm": 4.698862552642822, + "learning_rate": 4.712781954887218e-06, + "loss": 0.1864, + "step": 35160 + }, + { + "epoch": 52.89, + "grad_norm": 4.577108860015869, + "learning_rate": 4.711278195488722e-06, + "loss": 0.2257, + "step": 35170 + }, + { + "epoch": 52.9, + "grad_norm": 7.971583843231201, + "learning_rate": 4.709774436090226e-06, + "loss": 0.2093, + "step": 35180 + }, + { + "epoch": 52.92, + "grad_norm": 6.064690589904785, + "learning_rate": 4.70827067669173e-06, + "loss": 0.2444, + "step": 35190 + }, + { + "epoch": 52.93, + "grad_norm": 7.225642681121826, + "learning_rate": 4.706766917293233e-06, + "loss": 0.2325, + "step": 35200 + }, + { + "epoch": 52.95, + "grad_norm": 4.481410503387451, + "learning_rate": 4.705263157894738e-06, + "loss": 0.2357, + "step": 35210 + }, + { + "epoch": 52.96, + "grad_norm": 4.95440673828125, + "learning_rate": 4.703759398496241e-06, + "loss": 0.2526, + "step": 35220 + }, + { + "epoch": 52.98, + "grad_norm": 3.8696534633636475, + "learning_rate": 4.702255639097745e-06, + "loss": 0.2009, + "step": 35230 + }, + { + "epoch": 52.99, + "grad_norm": 6.271843910217285, + "learning_rate": 4.700751879699248e-06, + "loss": 0.2763, + "step": 35240 + }, + { + "epoch": 53.0, + "eval_accuracy": 0.932, + "eval_loss": 0.30757245421409607, + "eval_runtime": 84.7331, + "eval_samples_per_second": 118.018, + "eval_steps_per_second": 0.472, + "step": 35245 + }, + { + "epoch": 53.01, + "grad_norm": 8.736787796020508, + "learning_rate": 4.6992481203007525e-06, + "loss": 0.2478, + "step": 35250 + }, + { + "epoch": 53.02, + "grad_norm": 6.983564853668213, + "learning_rate": 4.697744360902256e-06, + "loss": 0.2756, + "step": 35260 + }, + { + "epoch": 53.04, + "grad_norm": 3.659640073776245, + "learning_rate": 4.6962406015037595e-06, + "loss": 0.2843, + "step": 35270 + }, + { + "epoch": 53.05, + "grad_norm": 6.107889175415039, + "learning_rate": 4.694736842105264e-06, + "loss": 0.2212, + "step": 35280 + }, + { + "epoch": 53.07, + "grad_norm": 3.515962600708008, + "learning_rate": 4.693233082706767e-06, + "loss": 0.199, + "step": 35290 + }, + { + "epoch": 53.08, + "grad_norm": 4.900839328765869, + "learning_rate": 4.691729323308271e-06, + "loss": 0.2398, + "step": 35300 + }, + { + "epoch": 53.1, + "grad_norm": 4.189992904663086, + "learning_rate": 4.690225563909775e-06, + "loss": 0.2255, + "step": 35310 + }, + { + "epoch": 53.11, + "grad_norm": 4.84755277633667, + "learning_rate": 4.688721804511279e-06, + "loss": 0.1594, + "step": 35320 + }, + { + "epoch": 53.13, + "grad_norm": 4.748797416687012, + "learning_rate": 4.687218045112782e-06, + "loss": 0.1627, + "step": 35330 + }, + { + "epoch": 53.14, + "grad_norm": 9.220650672912598, + "learning_rate": 4.685714285714286e-06, + "loss": 0.2756, + "step": 35340 + }, + { + "epoch": 53.16, + "grad_norm": 4.247942924499512, + "learning_rate": 4.68421052631579e-06, + "loss": 0.1987, + "step": 35350 + }, + { + "epoch": 53.17, + "grad_norm": 4.994369983673096, + "learning_rate": 4.682706766917294e-06, + "loss": 0.1973, + "step": 35360 + }, + { + "epoch": 53.19, + "grad_norm": 5.42191743850708, + "learning_rate": 4.681203007518797e-06, + "loss": 0.2074, + "step": 35370 + }, + { + "epoch": 53.2, + "grad_norm": 3.5923995971679688, + "learning_rate": 4.679699248120302e-06, + "loss": 0.2612, + "step": 35380 + }, + { + "epoch": 53.22, + "grad_norm": 5.668188095092773, + "learning_rate": 4.678195488721805e-06, + "loss": 0.2088, + "step": 35390 + }, + { + "epoch": 53.23, + "grad_norm": 4.510677337646484, + "learning_rate": 4.676691729323309e-06, + "loss": 0.2987, + "step": 35400 + }, + { + "epoch": 53.25, + "grad_norm": 6.8448028564453125, + "learning_rate": 4.675187969924812e-06, + "loss": 0.236, + "step": 35410 + }, + { + "epoch": 53.26, + "grad_norm": 5.6076130867004395, + "learning_rate": 4.6736842105263166e-06, + "loss": 0.183, + "step": 35420 + }, + { + "epoch": 53.28, + "grad_norm": 5.029841423034668, + "learning_rate": 4.67218045112782e-06, + "loss": 0.1961, + "step": 35430 + }, + { + "epoch": 53.29, + "grad_norm": 4.591315746307373, + "learning_rate": 4.670676691729324e-06, + "loss": 0.2227, + "step": 35440 + }, + { + "epoch": 53.31, + "grad_norm": 8.496804237365723, + "learning_rate": 4.669172932330828e-06, + "loss": 0.2237, + "step": 35450 + }, + { + "epoch": 53.32, + "grad_norm": 4.822425842285156, + "learning_rate": 4.6676691729323315e-06, + "loss": 0.2348, + "step": 35460 + }, + { + "epoch": 53.34, + "grad_norm": 3.9777705669403076, + "learning_rate": 4.666165413533835e-06, + "loss": 0.2115, + "step": 35470 + }, + { + "epoch": 53.35, + "grad_norm": 16.29306983947754, + "learning_rate": 4.664661654135339e-06, + "loss": 0.1931, + "step": 35480 + }, + { + "epoch": 53.37, + "grad_norm": 2.485229730606079, + "learning_rate": 4.663157894736842e-06, + "loss": 0.1755, + "step": 35490 + }, + { + "epoch": 53.38, + "grad_norm": 5.529495716094971, + "learning_rate": 4.661654135338346e-06, + "loss": 0.2145, + "step": 35500 + }, + { + "epoch": 53.4, + "grad_norm": 5.058990478515625, + "learning_rate": 4.66015037593985e-06, + "loss": 0.2215, + "step": 35510 + }, + { + "epoch": 53.41, + "grad_norm": 3.2668142318725586, + "learning_rate": 4.6586466165413534e-06, + "loss": 0.2098, + "step": 35520 + }, + { + "epoch": 53.43, + "grad_norm": 8.407674789428711, + "learning_rate": 4.657142857142857e-06, + "loss": 0.1849, + "step": 35530 + }, + { + "epoch": 53.44, + "grad_norm": 5.4570488929748535, + "learning_rate": 4.655639097744361e-06, + "loss": 0.2515, + "step": 35540 + }, + { + "epoch": 53.46, + "grad_norm": 4.499083042144775, + "learning_rate": 4.654135338345865e-06, + "loss": 0.2259, + "step": 35550 + }, + { + "epoch": 53.47, + "grad_norm": 9.1441068649292, + "learning_rate": 4.652631578947368e-06, + "loss": 0.2769, + "step": 35560 + }, + { + "epoch": 53.49, + "grad_norm": 5.146740436553955, + "learning_rate": 4.651127819548873e-06, + "loss": 0.2152, + "step": 35570 + }, + { + "epoch": 53.5, + "grad_norm": 7.009411811828613, + "learning_rate": 4.649624060150376e-06, + "loss": 0.2264, + "step": 35580 + }, + { + "epoch": 53.52, + "grad_norm": 6.005060195922852, + "learning_rate": 4.64812030075188e-06, + "loss": 0.1889, + "step": 35590 + }, + { + "epoch": 53.53, + "grad_norm": 8.248251914978027, + "learning_rate": 4.646616541353383e-06, + "loss": 0.2329, + "step": 35600 + }, + { + "epoch": 53.55, + "grad_norm": 6.950926303863525, + "learning_rate": 4.645112781954888e-06, + "loss": 0.2437, + "step": 35610 + }, + { + "epoch": 53.56, + "grad_norm": 6.0409836769104, + "learning_rate": 4.643609022556391e-06, + "loss": 0.1981, + "step": 35620 + }, + { + "epoch": 53.58, + "grad_norm": 6.35344123840332, + "learning_rate": 4.642105263157895e-06, + "loss": 0.2629, + "step": 35630 + }, + { + "epoch": 53.59, + "grad_norm": 4.371424674987793, + "learning_rate": 4.640601503759399e-06, + "loss": 0.2816, + "step": 35640 + }, + { + "epoch": 53.61, + "grad_norm": 6.9506659507751465, + "learning_rate": 4.639097744360903e-06, + "loss": 0.2337, + "step": 35650 + }, + { + "epoch": 53.62, + "grad_norm": 4.818626403808594, + "learning_rate": 4.637593984962406e-06, + "loss": 0.1822, + "step": 35660 + }, + { + "epoch": 53.64, + "grad_norm": 5.629130840301514, + "learning_rate": 4.6360902255639105e-06, + "loss": 0.1786, + "step": 35670 + }, + { + "epoch": 53.65, + "grad_norm": 5.757498264312744, + "learning_rate": 4.634586466165414e-06, + "loss": 0.1709, + "step": 35680 + }, + { + "epoch": 53.67, + "grad_norm": 6.2977800369262695, + "learning_rate": 4.6330827067669175e-06, + "loss": 0.2847, + "step": 35690 + }, + { + "epoch": 53.68, + "grad_norm": 4.801835536956787, + "learning_rate": 4.631578947368421e-06, + "loss": 0.2441, + "step": 35700 + }, + { + "epoch": 53.7, + "grad_norm": 4.978909015655518, + "learning_rate": 4.630075187969925e-06, + "loss": 0.2422, + "step": 35710 + }, + { + "epoch": 53.71, + "grad_norm": 2.3563828468322754, + "learning_rate": 4.628571428571429e-06, + "loss": 0.2588, + "step": 35720 + }, + { + "epoch": 53.73, + "grad_norm": 5.150228977203369, + "learning_rate": 4.6270676691729324e-06, + "loss": 0.2531, + "step": 35730 + }, + { + "epoch": 53.74, + "grad_norm": 6.409964084625244, + "learning_rate": 4.625563909774437e-06, + "loss": 0.1561, + "step": 35740 + }, + { + "epoch": 53.76, + "grad_norm": 4.846547603607178, + "learning_rate": 4.62406015037594e-06, + "loss": 0.2398, + "step": 35750 + }, + { + "epoch": 53.77, + "grad_norm": 5.468544006347656, + "learning_rate": 4.622556390977444e-06, + "loss": 0.2449, + "step": 35760 + }, + { + "epoch": 53.79, + "grad_norm": 5.312195301055908, + "learning_rate": 4.621052631578948e-06, + "loss": 0.2073, + "step": 35770 + }, + { + "epoch": 53.8, + "grad_norm": 7.502721309661865, + "learning_rate": 4.619548872180452e-06, + "loss": 0.246, + "step": 35780 + }, + { + "epoch": 53.82, + "grad_norm": 3.2691874504089355, + "learning_rate": 4.618045112781955e-06, + "loss": 0.2128, + "step": 35790 + }, + { + "epoch": 53.83, + "grad_norm": 8.228704452514648, + "learning_rate": 4.616541353383459e-06, + "loss": 0.2276, + "step": 35800 + }, + { + "epoch": 53.85, + "grad_norm": 5.924352169036865, + "learning_rate": 4.615037593984963e-06, + "loss": 0.219, + "step": 35810 + }, + { + "epoch": 53.86, + "grad_norm": 3.2145702838897705, + "learning_rate": 4.613533834586467e-06, + "loss": 0.235, + "step": 35820 + }, + { + "epoch": 53.88, + "grad_norm": 8.739072799682617, + "learning_rate": 4.61203007518797e-06, + "loss": 0.1665, + "step": 35830 + }, + { + "epoch": 53.89, + "grad_norm": 3.9934237003326416, + "learning_rate": 4.6105263157894745e-06, + "loss": 0.1886, + "step": 35840 + }, + { + "epoch": 53.91, + "grad_norm": 6.60872745513916, + "learning_rate": 4.609022556390978e-06, + "loss": 0.2398, + "step": 35850 + }, + { + "epoch": 53.92, + "grad_norm": 7.813724994659424, + "learning_rate": 4.6075187969924816e-06, + "loss": 0.2371, + "step": 35860 + }, + { + "epoch": 53.94, + "grad_norm": 5.628241539001465, + "learning_rate": 4.606015037593985e-06, + "loss": 0.238, + "step": 35870 + }, + { + "epoch": 53.95, + "grad_norm": 4.604690074920654, + "learning_rate": 4.6045112781954894e-06, + "loss": 0.2523, + "step": 35880 + }, + { + "epoch": 53.97, + "grad_norm": 3.7630534172058105, + "learning_rate": 4.603007518796993e-06, + "loss": 0.2467, + "step": 35890 + }, + { + "epoch": 53.98, + "grad_norm": 5.215484142303467, + "learning_rate": 4.6015037593984965e-06, + "loss": 0.2155, + "step": 35900 + }, + { + "epoch": 54.0, + "grad_norm": 1.31098210811615, + "learning_rate": 4.600000000000001e-06, + "loss": 0.1876, + "step": 35910 + }, + { + "epoch": 54.0, + "eval_accuracy": 0.9318, + "eval_loss": 0.3097255229949951, + "eval_runtime": 84.7816, + "eval_samples_per_second": 117.95, + "eval_steps_per_second": 0.472, + "step": 35910 + }, + { + "epoch": 54.02, + "grad_norm": 1.5609022378921509, + "learning_rate": 4.598496240601504e-06, + "loss": 0.1557, + "step": 35920 + }, + { + "epoch": 54.03, + "grad_norm": 5.817564487457275, + "learning_rate": 4.596992481203008e-06, + "loss": 0.2565, + "step": 35930 + }, + { + "epoch": 54.05, + "grad_norm": 4.572742938995361, + "learning_rate": 4.595488721804512e-06, + "loss": 0.2119, + "step": 35940 + }, + { + "epoch": 54.06, + "grad_norm": 6.757458686828613, + "learning_rate": 4.593984962406016e-06, + "loss": 0.2227, + "step": 35950 + }, + { + "epoch": 54.08, + "grad_norm": 5.000956058502197, + "learning_rate": 4.592481203007519e-06, + "loss": 0.212, + "step": 35960 + }, + { + "epoch": 54.09, + "grad_norm": 9.407301902770996, + "learning_rate": 4.590977443609023e-06, + "loss": 0.2416, + "step": 35970 + }, + { + "epoch": 54.11, + "grad_norm": 5.10260534286499, + "learning_rate": 4.589473684210526e-06, + "loss": 0.2637, + "step": 35980 + }, + { + "epoch": 54.12, + "grad_norm": 6.334413528442383, + "learning_rate": 4.58796992481203e-06, + "loss": 0.2182, + "step": 35990 + }, + { + "epoch": 54.14, + "grad_norm": 3.6286683082580566, + "learning_rate": 4.586466165413534e-06, + "loss": 0.1799, + "step": 36000 + }, + { + "epoch": 54.15, + "grad_norm": 4.822414398193359, + "learning_rate": 4.584962406015038e-06, + "loss": 0.278, + "step": 36010 + }, + { + "epoch": 54.17, + "grad_norm": 9.361292839050293, + "learning_rate": 4.583458646616541e-06, + "loss": 0.2302, + "step": 36020 + }, + { + "epoch": 54.18, + "grad_norm": 5.843459129333496, + "learning_rate": 4.581954887218046e-06, + "loss": 0.2186, + "step": 36030 + }, + { + "epoch": 54.2, + "grad_norm": 3.1918785572052, + "learning_rate": 4.580451127819549e-06, + "loss": 0.2655, + "step": 36040 + }, + { + "epoch": 54.21, + "grad_norm": 5.561599254608154, + "learning_rate": 4.578947368421053e-06, + "loss": 0.2058, + "step": 36050 + }, + { + "epoch": 54.23, + "grad_norm": 2.6801440715789795, + "learning_rate": 4.577443609022556e-06, + "loss": 0.2265, + "step": 36060 + }, + { + "epoch": 54.24, + "grad_norm": 5.820893287658691, + "learning_rate": 4.5759398496240605e-06, + "loss": 0.2562, + "step": 36070 + }, + { + "epoch": 54.26, + "grad_norm": 5.376248836517334, + "learning_rate": 4.574436090225564e-06, + "loss": 0.233, + "step": 36080 + }, + { + "epoch": 54.27, + "grad_norm": 3.018950939178467, + "learning_rate": 4.572932330827068e-06, + "loss": 0.2185, + "step": 36090 + }, + { + "epoch": 54.29, + "grad_norm": 7.458950996398926, + "learning_rate": 4.571428571428572e-06, + "loss": 0.2014, + "step": 36100 + }, + { + "epoch": 54.3, + "grad_norm": 4.388574600219727, + "learning_rate": 4.5699248120300755e-06, + "loss": 0.2267, + "step": 36110 + }, + { + "epoch": 54.32, + "grad_norm": 4.121464252471924, + "learning_rate": 4.568421052631579e-06, + "loss": 0.3017, + "step": 36120 + }, + { + "epoch": 54.33, + "grad_norm": 3.9616527557373047, + "learning_rate": 4.566917293233083e-06, + "loss": 0.2357, + "step": 36130 + }, + { + "epoch": 54.35, + "grad_norm": 7.739973545074463, + "learning_rate": 4.565413533834587e-06, + "loss": 0.2267, + "step": 36140 + }, + { + "epoch": 54.36, + "grad_norm": 5.042054176330566, + "learning_rate": 4.56390977443609e-06, + "loss": 0.2686, + "step": 36150 + }, + { + "epoch": 54.38, + "grad_norm": 6.037046909332275, + "learning_rate": 4.562406015037594e-06, + "loss": 0.1939, + "step": 36160 + }, + { + "epoch": 54.39, + "grad_norm": 6.330253601074219, + "learning_rate": 4.560902255639098e-06, + "loss": 0.2152, + "step": 36170 + }, + { + "epoch": 54.41, + "grad_norm": 5.244846820831299, + "learning_rate": 4.559398496240602e-06, + "loss": 0.1767, + "step": 36180 + }, + { + "epoch": 54.42, + "grad_norm": 4.092870712280273, + "learning_rate": 4.557894736842105e-06, + "loss": 0.2417, + "step": 36190 + }, + { + "epoch": 54.44, + "grad_norm": 4.611246585845947, + "learning_rate": 4.55639097744361e-06, + "loss": 0.2388, + "step": 36200 + }, + { + "epoch": 54.45, + "grad_norm": 5.9267377853393555, + "learning_rate": 4.554887218045113e-06, + "loss": 0.2705, + "step": 36210 + }, + { + "epoch": 54.47, + "grad_norm": 3.914341926574707, + "learning_rate": 4.553383458646617e-06, + "loss": 0.2293, + "step": 36220 + }, + { + "epoch": 54.48, + "grad_norm": 6.353694438934326, + "learning_rate": 4.551879699248121e-06, + "loss": 0.2363, + "step": 36230 + }, + { + "epoch": 54.5, + "grad_norm": 5.587188720703125, + "learning_rate": 4.550375939849625e-06, + "loss": 0.2194, + "step": 36240 + }, + { + "epoch": 54.51, + "grad_norm": 7.160801887512207, + "learning_rate": 4.548872180451128e-06, + "loss": 0.2358, + "step": 36250 + }, + { + "epoch": 54.53, + "grad_norm": 4.975449085235596, + "learning_rate": 4.547368421052632e-06, + "loss": 0.2003, + "step": 36260 + }, + { + "epoch": 54.54, + "grad_norm": 5.509252071380615, + "learning_rate": 4.545864661654136e-06, + "loss": 0.2053, + "step": 36270 + }, + { + "epoch": 54.56, + "grad_norm": 6.666650772094727, + "learning_rate": 4.5443609022556395e-06, + "loss": 0.1576, + "step": 36280 + }, + { + "epoch": 54.57, + "grad_norm": 7.263132095336914, + "learning_rate": 4.542857142857143e-06, + "loss": 0.1459, + "step": 36290 + }, + { + "epoch": 54.59, + "grad_norm": 4.0889410972595215, + "learning_rate": 4.541353383458647e-06, + "loss": 0.1864, + "step": 36300 + }, + { + "epoch": 54.6, + "grad_norm": 5.318135738372803, + "learning_rate": 4.539849624060151e-06, + "loss": 0.258, + "step": 36310 + }, + { + "epoch": 54.62, + "grad_norm": 5.723468780517578, + "learning_rate": 4.5383458646616544e-06, + "loss": 0.2035, + "step": 36320 + }, + { + "epoch": 54.63, + "grad_norm": 5.231049060821533, + "learning_rate": 4.536842105263158e-06, + "loss": 0.1719, + "step": 36330 + }, + { + "epoch": 54.65, + "grad_norm": 5.921431541442871, + "learning_rate": 4.535338345864662e-06, + "loss": 0.3196, + "step": 36340 + }, + { + "epoch": 54.66, + "grad_norm": 5.602309703826904, + "learning_rate": 4.533834586466166e-06, + "loss": 0.2069, + "step": 36350 + }, + { + "epoch": 54.68, + "grad_norm": 8.991955757141113, + "learning_rate": 4.532330827067669e-06, + "loss": 0.2646, + "step": 36360 + }, + { + "epoch": 54.69, + "grad_norm": 3.8575797080993652, + "learning_rate": 4.530827067669174e-06, + "loss": 0.1887, + "step": 36370 + }, + { + "epoch": 54.71, + "grad_norm": 3.4165918827056885, + "learning_rate": 4.529323308270677e-06, + "loss": 0.1746, + "step": 36380 + }, + { + "epoch": 54.72, + "grad_norm": 5.644247531890869, + "learning_rate": 4.527819548872181e-06, + "loss": 0.2347, + "step": 36390 + }, + { + "epoch": 54.74, + "grad_norm": 6.519909381866455, + "learning_rate": 4.526315789473685e-06, + "loss": 0.232, + "step": 36400 + }, + { + "epoch": 54.75, + "grad_norm": 3.5762217044830322, + "learning_rate": 4.524812030075189e-06, + "loss": 0.2537, + "step": 36410 + }, + { + "epoch": 54.77, + "grad_norm": 4.4489898681640625, + "learning_rate": 4.523308270676692e-06, + "loss": 0.2041, + "step": 36420 + }, + { + "epoch": 54.78, + "grad_norm": 5.16048526763916, + "learning_rate": 4.521804511278196e-06, + "loss": 0.1966, + "step": 36430 + }, + { + "epoch": 54.8, + "grad_norm": 4.274946689605713, + "learning_rate": 4.5203007518797e-06, + "loss": 0.2945, + "step": 36440 + }, + { + "epoch": 54.81, + "grad_norm": 4.338449954986572, + "learning_rate": 4.518796992481204e-06, + "loss": 0.1972, + "step": 36450 + }, + { + "epoch": 54.83, + "grad_norm": 5.914998531341553, + "learning_rate": 4.517293233082707e-06, + "loss": 0.1903, + "step": 36460 + }, + { + "epoch": 54.84, + "grad_norm": 23.506519317626953, + "learning_rate": 4.5157894736842115e-06, + "loss": 0.2072, + "step": 36470 + }, + { + "epoch": 54.86, + "grad_norm": 5.0130743980407715, + "learning_rate": 4.514285714285714e-06, + "loss": 0.2302, + "step": 36480 + }, + { + "epoch": 54.87, + "grad_norm": 3.10636568069458, + "learning_rate": 4.5127819548872185e-06, + "loss": 0.1711, + "step": 36490 + }, + { + "epoch": 54.89, + "grad_norm": 6.681315898895264, + "learning_rate": 4.511278195488722e-06, + "loss": 0.2502, + "step": 36500 + }, + { + "epoch": 54.9, + "grad_norm": 5.002006530761719, + "learning_rate": 4.5097744360902255e-06, + "loss": 0.1971, + "step": 36510 + }, + { + "epoch": 54.92, + "grad_norm": 1.6818307638168335, + "learning_rate": 4.508270676691729e-06, + "loss": 0.1604, + "step": 36520 + }, + { + "epoch": 54.93, + "grad_norm": 5.976077556610107, + "learning_rate": 4.5067669172932334e-06, + "loss": 0.2416, + "step": 36530 + }, + { + "epoch": 54.95, + "grad_norm": 4.581597805023193, + "learning_rate": 4.505263157894737e-06, + "loss": 0.2433, + "step": 36540 + }, + { + "epoch": 54.96, + "grad_norm": 6.989867210388184, + "learning_rate": 4.5037593984962405e-06, + "loss": 0.2687, + "step": 36550 + }, + { + "epoch": 54.98, + "grad_norm": 8.435478210449219, + "learning_rate": 4.502255639097745e-06, + "loss": 0.2427, + "step": 36560 + }, + { + "epoch": 54.99, + "grad_norm": 5.939743518829346, + "learning_rate": 4.500751879699248e-06, + "loss": 0.1774, + "step": 36570 + }, + { + "epoch": 55.0, + "eval_accuracy": 0.9321, + "eval_loss": 0.3104659616947174, + "eval_runtime": 84.6339, + "eval_samples_per_second": 118.156, + "eval_steps_per_second": 0.473, + "step": 36575 + }, + { + "epoch": 55.01, + "grad_norm": 4.729255676269531, + "learning_rate": 4.499248120300752e-06, + "loss": 0.4104, + "step": 36580 + }, + { + "epoch": 55.02, + "grad_norm": 4.213466644287109, + "learning_rate": 4.497744360902256e-06, + "loss": 0.24, + "step": 36590 + }, + { + "epoch": 55.04, + "grad_norm": 2.702681064605713, + "learning_rate": 4.49624060150376e-06, + "loss": 0.1929, + "step": 36600 + }, + { + "epoch": 55.05, + "grad_norm": 5.444637775421143, + "learning_rate": 4.494736842105263e-06, + "loss": 0.2168, + "step": 36610 + }, + { + "epoch": 55.07, + "grad_norm": 8.909059524536133, + "learning_rate": 4.493233082706767e-06, + "loss": 0.2456, + "step": 36620 + }, + { + "epoch": 55.08, + "grad_norm": 3.999340772628784, + "learning_rate": 4.491729323308271e-06, + "loss": 0.1867, + "step": 36630 + }, + { + "epoch": 55.1, + "grad_norm": 4.647693157196045, + "learning_rate": 4.490225563909775e-06, + "loss": 0.2448, + "step": 36640 + }, + { + "epoch": 55.11, + "grad_norm": 4.264559745788574, + "learning_rate": 4.488721804511278e-06, + "loss": 0.1637, + "step": 36650 + }, + { + "epoch": 55.13, + "grad_norm": 2.194603204727173, + "learning_rate": 4.4872180451127826e-06, + "loss": 0.2602, + "step": 36660 + }, + { + "epoch": 55.14, + "grad_norm": 3.7559118270874023, + "learning_rate": 4.485714285714286e-06, + "loss": 0.2043, + "step": 36670 + }, + { + "epoch": 55.16, + "grad_norm": 3.0917916297912598, + "learning_rate": 4.48421052631579e-06, + "loss": 0.1839, + "step": 36680 + }, + { + "epoch": 55.17, + "grad_norm": 4.948066234588623, + "learning_rate": 4.482706766917294e-06, + "loss": 0.2476, + "step": 36690 + }, + { + "epoch": 55.19, + "grad_norm": 5.164941310882568, + "learning_rate": 4.4812030075187975e-06, + "loss": 0.1977, + "step": 36700 + }, + { + "epoch": 55.2, + "grad_norm": 4.678724765777588, + "learning_rate": 4.479699248120301e-06, + "loss": 0.2576, + "step": 36710 + }, + { + "epoch": 55.22, + "grad_norm": 8.130949020385742, + "learning_rate": 4.4781954887218045e-06, + "loss": 0.232, + "step": 36720 + }, + { + "epoch": 55.23, + "grad_norm": 4.381587505340576, + "learning_rate": 4.476691729323309e-06, + "loss": 0.2102, + "step": 36730 + }, + { + "epoch": 55.25, + "grad_norm": 3.894073724746704, + "learning_rate": 4.475187969924812e-06, + "loss": 0.2271, + "step": 36740 + }, + { + "epoch": 55.26, + "grad_norm": 7.002061367034912, + "learning_rate": 4.473684210526316e-06, + "loss": 0.1887, + "step": 36750 + }, + { + "epoch": 55.28, + "grad_norm": 5.8444132804870605, + "learning_rate": 4.47218045112782e-06, + "loss": 0.2588, + "step": 36760 + }, + { + "epoch": 55.29, + "grad_norm": 3.5753517150878906, + "learning_rate": 4.470676691729324e-06, + "loss": 0.2312, + "step": 36770 + }, + { + "epoch": 55.31, + "grad_norm": 7.695021629333496, + "learning_rate": 4.469172932330827e-06, + "loss": 0.2307, + "step": 36780 + }, + { + "epoch": 55.32, + "grad_norm": 4.55169153213501, + "learning_rate": 4.467669172932331e-06, + "loss": 0.2242, + "step": 36790 + }, + { + "epoch": 55.34, + "grad_norm": 2.349003553390503, + "learning_rate": 4.466165413533835e-06, + "loss": 0.2035, + "step": 36800 + }, + { + "epoch": 55.35, + "grad_norm": 5.827582359313965, + "learning_rate": 4.464661654135339e-06, + "loss": 0.2916, + "step": 36810 + }, + { + "epoch": 55.37, + "grad_norm": 3.1232826709747314, + "learning_rate": 4.463157894736842e-06, + "loss": 0.231, + "step": 36820 + }, + { + "epoch": 55.38, + "grad_norm": 6.336644172668457, + "learning_rate": 4.461654135338347e-06, + "loss": 0.2344, + "step": 36830 + }, + { + "epoch": 55.4, + "grad_norm": 4.629255771636963, + "learning_rate": 4.46015037593985e-06, + "loss": 0.2577, + "step": 36840 + }, + { + "epoch": 55.41, + "grad_norm": 5.478154182434082, + "learning_rate": 4.458646616541354e-06, + "loss": 0.2311, + "step": 36850 + }, + { + "epoch": 55.43, + "grad_norm": 7.846320629119873, + "learning_rate": 4.457142857142858e-06, + "loss": 0.247, + "step": 36860 + }, + { + "epoch": 55.44, + "grad_norm": 3.461475133895874, + "learning_rate": 4.4556390977443615e-06, + "loss": 0.1488, + "step": 36870 + }, + { + "epoch": 55.46, + "grad_norm": 3.4155285358428955, + "learning_rate": 4.454135338345865e-06, + "loss": 0.1368, + "step": 36880 + }, + { + "epoch": 55.47, + "grad_norm": 2.114795684814453, + "learning_rate": 4.452631578947369e-06, + "loss": 0.1632, + "step": 36890 + }, + { + "epoch": 55.49, + "grad_norm": 1.200542688369751, + "learning_rate": 4.451127819548873e-06, + "loss": 0.1591, + "step": 36900 + }, + { + "epoch": 55.5, + "grad_norm": 4.999506950378418, + "learning_rate": 4.4496240601503765e-06, + "loss": 0.1666, + "step": 36910 + }, + { + "epoch": 55.52, + "grad_norm": 3.8099682331085205, + "learning_rate": 4.44812030075188e-06, + "loss": 0.1601, + "step": 36920 + }, + { + "epoch": 55.53, + "grad_norm": 7.439540386199951, + "learning_rate": 4.446616541353384e-06, + "loss": 0.2179, + "step": 36930 + }, + { + "epoch": 55.55, + "grad_norm": 4.367668151855469, + "learning_rate": 4.445112781954888e-06, + "loss": 0.1956, + "step": 36940 + }, + { + "epoch": 55.56, + "grad_norm": 5.904849529266357, + "learning_rate": 4.443609022556391e-06, + "loss": 0.2132, + "step": 36950 + }, + { + "epoch": 55.58, + "grad_norm": 9.836445808410645, + "learning_rate": 4.442105263157896e-06, + "loss": 0.2255, + "step": 36960 + }, + { + "epoch": 55.59, + "grad_norm": 2.742265224456787, + "learning_rate": 4.440601503759399e-06, + "loss": 0.2005, + "step": 36970 + }, + { + "epoch": 55.61, + "grad_norm": 6.371535778045654, + "learning_rate": 4.439097744360902e-06, + "loss": 0.1944, + "step": 36980 + }, + { + "epoch": 55.62, + "grad_norm": 3.961942195892334, + "learning_rate": 4.437593984962406e-06, + "loss": 0.1885, + "step": 36990 + }, + { + "epoch": 55.64, + "grad_norm": 7.4699320793151855, + "learning_rate": 4.43609022556391e-06, + "loss": 0.2383, + "step": 37000 + }, + { + "epoch": 55.65, + "grad_norm": 2.479861259460449, + "learning_rate": 4.434586466165413e-06, + "loss": 0.2797, + "step": 37010 + }, + { + "epoch": 55.67, + "grad_norm": 4.376903057098389, + "learning_rate": 4.433082706766918e-06, + "loss": 0.1898, + "step": 37020 + }, + { + "epoch": 55.68, + "grad_norm": 5.335219860076904, + "learning_rate": 4.431578947368421e-06, + "loss": 0.2382, + "step": 37030 + }, + { + "epoch": 55.7, + "grad_norm": 4.678366184234619, + "learning_rate": 4.430075187969925e-06, + "loss": 0.2029, + "step": 37040 + }, + { + "epoch": 55.71, + "grad_norm": 5.352789878845215, + "learning_rate": 4.428571428571429e-06, + "loss": 0.2286, + "step": 37050 + }, + { + "epoch": 55.73, + "grad_norm": 5.189038276672363, + "learning_rate": 4.427067669172933e-06, + "loss": 0.1941, + "step": 37060 + }, + { + "epoch": 55.74, + "grad_norm": 10.683388710021973, + "learning_rate": 4.425563909774436e-06, + "loss": 0.195, + "step": 37070 + }, + { + "epoch": 55.76, + "grad_norm": 6.178273677825928, + "learning_rate": 4.42406015037594e-06, + "loss": 0.2207, + "step": 37080 + }, + { + "epoch": 55.77, + "grad_norm": 3.819103479385376, + "learning_rate": 4.422556390977444e-06, + "loss": 0.2128, + "step": 37090 + }, + { + "epoch": 55.79, + "grad_norm": 6.3040452003479, + "learning_rate": 4.4210526315789476e-06, + "loss": 0.1738, + "step": 37100 + }, + { + "epoch": 55.8, + "grad_norm": 5.171235084533691, + "learning_rate": 4.419548872180451e-06, + "loss": 0.2041, + "step": 37110 + }, + { + "epoch": 55.82, + "grad_norm": 5.310617446899414, + "learning_rate": 4.4180451127819555e-06, + "loss": 0.2201, + "step": 37120 + }, + { + "epoch": 55.83, + "grad_norm": 7.987985610961914, + "learning_rate": 4.416541353383459e-06, + "loss": 0.1631, + "step": 37130 + }, + { + "epoch": 55.85, + "grad_norm": 3.634016275405884, + "learning_rate": 4.4150375939849625e-06, + "loss": 0.248, + "step": 37140 + }, + { + "epoch": 55.86, + "grad_norm": 7.253101348876953, + "learning_rate": 4.413533834586467e-06, + "loss": 0.2631, + "step": 37150 + }, + { + "epoch": 55.88, + "grad_norm": 5.290616989135742, + "learning_rate": 4.41203007518797e-06, + "loss": 0.1595, + "step": 37160 + }, + { + "epoch": 55.89, + "grad_norm": 7.694485187530518, + "learning_rate": 4.410526315789474e-06, + "loss": 0.2274, + "step": 37170 + }, + { + "epoch": 55.91, + "grad_norm": 4.015926837921143, + "learning_rate": 4.409022556390977e-06, + "loss": 0.1889, + "step": 37180 + }, + { + "epoch": 55.92, + "grad_norm": 6.907248020172119, + "learning_rate": 4.407518796992482e-06, + "loss": 0.2389, + "step": 37190 + }, + { + "epoch": 55.94, + "grad_norm": 9.580093383789062, + "learning_rate": 4.406015037593985e-06, + "loss": 0.1979, + "step": 37200 + }, + { + "epoch": 55.95, + "grad_norm": 5.89567756652832, + "learning_rate": 4.404511278195489e-06, + "loss": 0.2167, + "step": 37210 + }, + { + "epoch": 55.97, + "grad_norm": 6.980398178100586, + "learning_rate": 4.403007518796993e-06, + "loss": 0.2273, + "step": 37220 + }, + { + "epoch": 55.98, + "grad_norm": 4.1732869148254395, + "learning_rate": 4.401503759398497e-06, + "loss": 0.217, + "step": 37230 + }, + { + "epoch": 56.0, + "grad_norm": 0.04293264448642731, + "learning_rate": 4.4e-06, + "loss": 0.2011, + "step": 37240 + }, + { + "epoch": 56.0, + "eval_accuracy": 0.9337, + "eval_loss": 0.3107641935348511, + "eval_runtime": 84.6401, + "eval_samples_per_second": 118.147, + "eval_steps_per_second": 0.473, + "step": 37240 + }, + { + "epoch": 56.02, + "grad_norm": 2.9971933364868164, + "learning_rate": 4.398496240601504e-06, + "loss": 0.1546, + "step": 37250 + }, + { + "epoch": 56.03, + "grad_norm": 4.23447322845459, + "learning_rate": 4.396992481203008e-06, + "loss": 0.2525, + "step": 37260 + }, + { + "epoch": 56.05, + "grad_norm": 2.2622056007385254, + "learning_rate": 4.395488721804512e-06, + "loss": 0.192, + "step": 37270 + }, + { + "epoch": 56.06, + "grad_norm": 6.475707054138184, + "learning_rate": 4.393984962406015e-06, + "loss": 0.2287, + "step": 37280 + }, + { + "epoch": 56.08, + "grad_norm": 5.559025764465332, + "learning_rate": 4.3924812030075195e-06, + "loss": 0.2248, + "step": 37290 + }, + { + "epoch": 56.09, + "grad_norm": 8.06891918182373, + "learning_rate": 4.390977443609023e-06, + "loss": 0.1641, + "step": 37300 + }, + { + "epoch": 56.11, + "grad_norm": 3.4583325386047363, + "learning_rate": 4.3894736842105266e-06, + "loss": 0.231, + "step": 37310 + }, + { + "epoch": 56.12, + "grad_norm": 11.369089126586914, + "learning_rate": 4.387969924812031e-06, + "loss": 0.2368, + "step": 37320 + }, + { + "epoch": 56.14, + "grad_norm": 3.740903854370117, + "learning_rate": 4.3864661654135344e-06, + "loss": 0.1868, + "step": 37330 + }, + { + "epoch": 56.15, + "grad_norm": 7.389920234680176, + "learning_rate": 4.384962406015038e-06, + "loss": 0.2224, + "step": 37340 + }, + { + "epoch": 56.17, + "grad_norm": 4.443047046661377, + "learning_rate": 4.3834586466165415e-06, + "loss": 0.2308, + "step": 37350 + }, + { + "epoch": 56.18, + "grad_norm": 5.819294452667236, + "learning_rate": 4.381954887218046e-06, + "loss": 0.1937, + "step": 37360 + }, + { + "epoch": 56.2, + "grad_norm": 4.005828857421875, + "learning_rate": 4.380451127819549e-06, + "loss": 0.2269, + "step": 37370 + }, + { + "epoch": 56.21, + "grad_norm": 9.527501106262207, + "learning_rate": 4.378947368421053e-06, + "loss": 0.2721, + "step": 37380 + }, + { + "epoch": 56.23, + "grad_norm": 5.382596492767334, + "learning_rate": 4.377443609022557e-06, + "loss": 0.2143, + "step": 37390 + }, + { + "epoch": 56.24, + "grad_norm": 8.085970878601074, + "learning_rate": 4.375939849624061e-06, + "loss": 0.1549, + "step": 37400 + }, + { + "epoch": 56.26, + "grad_norm": 7.813641548156738, + "learning_rate": 4.374436090225564e-06, + "loss": 0.1926, + "step": 37410 + }, + { + "epoch": 56.27, + "grad_norm": 5.974503993988037, + "learning_rate": 4.372932330827069e-06, + "loss": 0.2217, + "step": 37420 + }, + { + "epoch": 56.29, + "grad_norm": 3.7494096755981445, + "learning_rate": 4.371428571428572e-06, + "loss": 0.1835, + "step": 37430 + }, + { + "epoch": 56.3, + "grad_norm": 8.034648895263672, + "learning_rate": 4.369924812030076e-06, + "loss": 0.2319, + "step": 37440 + }, + { + "epoch": 56.32, + "grad_norm": 5.222734451293945, + "learning_rate": 4.368421052631579e-06, + "loss": 0.2468, + "step": 37450 + }, + { + "epoch": 56.33, + "grad_norm": 2.54164981842041, + "learning_rate": 4.3669172932330836e-06, + "loss": 0.1801, + "step": 37460 + }, + { + "epoch": 56.35, + "grad_norm": 8.72768783569336, + "learning_rate": 4.365413533834586e-06, + "loss": 0.2092, + "step": 37470 + }, + { + "epoch": 56.36, + "grad_norm": 10.628849983215332, + "learning_rate": 4.363909774436091e-06, + "loss": 0.2025, + "step": 37480 + }, + { + "epoch": 56.38, + "grad_norm": 5.263908863067627, + "learning_rate": 4.362406015037594e-06, + "loss": 0.1637, + "step": 37490 + }, + { + "epoch": 56.39, + "grad_norm": 3.4017181396484375, + "learning_rate": 4.360902255639098e-06, + "loss": 0.2772, + "step": 37500 + }, + { + "epoch": 56.41, + "grad_norm": 6.786311626434326, + "learning_rate": 4.359398496240602e-06, + "loss": 0.1987, + "step": 37510 + }, + { + "epoch": 56.42, + "grad_norm": 6.281602382659912, + "learning_rate": 4.3578947368421055e-06, + "loss": 0.1707, + "step": 37520 + }, + { + "epoch": 56.44, + "grad_norm": 6.129687786102295, + "learning_rate": 4.356390977443609e-06, + "loss": 0.2591, + "step": 37530 + }, + { + "epoch": 56.45, + "grad_norm": 4.573347568511963, + "learning_rate": 4.3548872180451126e-06, + "loss": 0.2142, + "step": 37540 + }, + { + "epoch": 56.47, + "grad_norm": 4.783934593200684, + "learning_rate": 4.353383458646617e-06, + "loss": 0.1747, + "step": 37550 + }, + { + "epoch": 56.48, + "grad_norm": 3.693443775177002, + "learning_rate": 4.3518796992481205e-06, + "loss": 0.2222, + "step": 37560 + }, + { + "epoch": 56.5, + "grad_norm": 5.919257640838623, + "learning_rate": 4.350375939849624e-06, + "loss": 0.2443, + "step": 37570 + }, + { + "epoch": 56.51, + "grad_norm": 9.530694007873535, + "learning_rate": 4.348872180451128e-06, + "loss": 0.1908, + "step": 37580 + }, + { + "epoch": 56.53, + "grad_norm": 5.846238613128662, + "learning_rate": 4.347368421052632e-06, + "loss": 0.2141, + "step": 37590 + }, + { + "epoch": 56.54, + "grad_norm": 5.0612616539001465, + "learning_rate": 4.345864661654135e-06, + "loss": 0.1802, + "step": 37600 + }, + { + "epoch": 56.56, + "grad_norm": 3.5078976154327393, + "learning_rate": 4.34436090225564e-06, + "loss": 0.1856, + "step": 37610 + }, + { + "epoch": 56.57, + "grad_norm": 2.6494715213775635, + "learning_rate": 4.342857142857143e-06, + "loss": 0.2228, + "step": 37620 + }, + { + "epoch": 56.59, + "grad_norm": 7.06261682510376, + "learning_rate": 4.341353383458647e-06, + "loss": 0.1996, + "step": 37630 + }, + { + "epoch": 56.6, + "grad_norm": 4.200351238250732, + "learning_rate": 4.33984962406015e-06, + "loss": 0.1868, + "step": 37640 + }, + { + "epoch": 56.62, + "grad_norm": 3.3782460689544678, + "learning_rate": 4.338345864661655e-06, + "loss": 0.1933, + "step": 37650 + }, + { + "epoch": 56.63, + "grad_norm": 5.649930953979492, + "learning_rate": 4.336842105263158e-06, + "loss": 0.1826, + "step": 37660 + }, + { + "epoch": 56.65, + "grad_norm": 5.642304420471191, + "learning_rate": 4.335338345864662e-06, + "loss": 0.232, + "step": 37670 + }, + { + "epoch": 56.66, + "grad_norm": 4.194146156311035, + "learning_rate": 4.333834586466166e-06, + "loss": 0.232, + "step": 37680 + }, + { + "epoch": 56.68, + "grad_norm": 6.466404438018799, + "learning_rate": 4.33233082706767e-06, + "loss": 0.184, + "step": 37690 + }, + { + "epoch": 56.69, + "grad_norm": 6.861640930175781, + "learning_rate": 4.330827067669173e-06, + "loss": 0.2036, + "step": 37700 + }, + { + "epoch": 56.71, + "grad_norm": 5.0166850090026855, + "learning_rate": 4.329323308270677e-06, + "loss": 0.2537, + "step": 37710 + }, + { + "epoch": 56.72, + "grad_norm": 6.62353515625, + "learning_rate": 4.327819548872181e-06, + "loss": 0.2708, + "step": 37720 + }, + { + "epoch": 56.74, + "grad_norm": 5.707888603210449, + "learning_rate": 4.3263157894736845e-06, + "loss": 0.162, + "step": 37730 + }, + { + "epoch": 56.75, + "grad_norm": 6.166172981262207, + "learning_rate": 4.324812030075188e-06, + "loss": 0.2149, + "step": 37740 + }, + { + "epoch": 56.77, + "grad_norm": 8.481252670288086, + "learning_rate": 4.323308270676692e-06, + "loss": 0.2423, + "step": 37750 + }, + { + "epoch": 56.78, + "grad_norm": 1.8157340288162231, + "learning_rate": 4.321804511278196e-06, + "loss": 0.2519, + "step": 37760 + }, + { + "epoch": 56.8, + "grad_norm": 6.047143459320068, + "learning_rate": 4.3203007518796994e-06, + "loss": 0.3102, + "step": 37770 + }, + { + "epoch": 56.81, + "grad_norm": 3.9835569858551025, + "learning_rate": 4.318796992481204e-06, + "loss": 0.2141, + "step": 37780 + }, + { + "epoch": 56.83, + "grad_norm": 3.2040421962738037, + "learning_rate": 4.317293233082707e-06, + "loss": 0.1938, + "step": 37790 + }, + { + "epoch": 56.84, + "grad_norm": 3.5100557804107666, + "learning_rate": 4.315789473684211e-06, + "loss": 0.1956, + "step": 37800 + }, + { + "epoch": 56.86, + "grad_norm": 12.860550880432129, + "learning_rate": 4.314285714285714e-06, + "loss": 0.2704, + "step": 37810 + }, + { + "epoch": 56.87, + "grad_norm": 4.295982360839844, + "learning_rate": 4.312781954887219e-06, + "loss": 0.2478, + "step": 37820 + }, + { + "epoch": 56.89, + "grad_norm": 5.491456031799316, + "learning_rate": 4.311278195488722e-06, + "loss": 0.2482, + "step": 37830 + }, + { + "epoch": 56.9, + "grad_norm": 3.705702781677246, + "learning_rate": 4.309774436090226e-06, + "loss": 0.189, + "step": 37840 + }, + { + "epoch": 56.92, + "grad_norm": 9.076353073120117, + "learning_rate": 4.30827067669173e-06, + "loss": 0.2324, + "step": 37850 + }, + { + "epoch": 56.93, + "grad_norm": 5.757368564605713, + "learning_rate": 4.306766917293234e-06, + "loss": 0.2676, + "step": 37860 + }, + { + "epoch": 56.95, + "grad_norm": 5.485102653503418, + "learning_rate": 4.305263157894737e-06, + "loss": 0.1747, + "step": 37870 + }, + { + "epoch": 56.96, + "grad_norm": 3.859330892562866, + "learning_rate": 4.3037593984962415e-06, + "loss": 0.2029, + "step": 37880 + }, + { + "epoch": 56.98, + "grad_norm": 2.6711108684539795, + "learning_rate": 4.302255639097745e-06, + "loss": 0.2083, + "step": 37890 + }, + { + "epoch": 56.99, + "grad_norm": 5.341872692108154, + "learning_rate": 4.3007518796992486e-06, + "loss": 0.2142, + "step": 37900 + }, + { + "epoch": 57.0, + "eval_accuracy": 0.9312, + "eval_loss": 0.3190965950489044, + "eval_runtime": 84.8921, + "eval_samples_per_second": 117.797, + "eval_steps_per_second": 0.471, + "step": 37905 + }, + { + "epoch": 57.01, + "grad_norm": 3.6653482913970947, + "learning_rate": 4.299248120300752e-06, + "loss": 0.2008, + "step": 37910 + }, + { + "epoch": 57.02, + "grad_norm": 3.9756722450256348, + "learning_rate": 4.2977443609022565e-06, + "loss": 0.1872, + "step": 37920 + }, + { + "epoch": 57.04, + "grad_norm": 9.079606056213379, + "learning_rate": 4.29624060150376e-06, + "loss": 0.1913, + "step": 37930 + }, + { + "epoch": 57.05, + "grad_norm": 5.466242790222168, + "learning_rate": 4.2947368421052635e-06, + "loss": 0.2364, + "step": 37940 + }, + { + "epoch": 57.07, + "grad_norm": 1.4235931634902954, + "learning_rate": 4.293233082706768e-06, + "loss": 0.2255, + "step": 37950 + }, + { + "epoch": 57.08, + "grad_norm": 13.605971336364746, + "learning_rate": 4.291729323308271e-06, + "loss": 0.2454, + "step": 37960 + }, + { + "epoch": 57.1, + "grad_norm": 5.704050064086914, + "learning_rate": 4.290225563909775e-06, + "loss": 0.2411, + "step": 37970 + }, + { + "epoch": 57.11, + "grad_norm": 4.244338035583496, + "learning_rate": 4.288721804511278e-06, + "loss": 0.2289, + "step": 37980 + }, + { + "epoch": 57.13, + "grad_norm": 5.691931247711182, + "learning_rate": 4.287218045112782e-06, + "loss": 0.2031, + "step": 37990 + }, + { + "epoch": 57.14, + "grad_norm": 8.121678352355957, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.2434, + "step": 38000 + }, + { + "epoch": 57.16, + "grad_norm": 2.5804800987243652, + "learning_rate": 4.28421052631579e-06, + "loss": 0.1663, + "step": 38010 + }, + { + "epoch": 57.17, + "grad_norm": 6.460157871246338, + "learning_rate": 4.282706766917293e-06, + "loss": 0.1966, + "step": 38020 + }, + { + "epoch": 57.19, + "grad_norm": 6.010727405548096, + "learning_rate": 4.281203007518797e-06, + "loss": 0.2547, + "step": 38030 + }, + { + "epoch": 57.2, + "grad_norm": 3.152200937271118, + "learning_rate": 4.279699248120301e-06, + "loss": 0.1743, + "step": 38040 + }, + { + "epoch": 57.22, + "grad_norm": 6.710914134979248, + "learning_rate": 4.278195488721805e-06, + "loss": 0.2268, + "step": 38050 + }, + { + "epoch": 57.23, + "grad_norm": 2.6717779636383057, + "learning_rate": 4.276691729323308e-06, + "loss": 0.1521, + "step": 38060 + }, + { + "epoch": 57.25, + "grad_norm": 5.592037200927734, + "learning_rate": 4.275187969924813e-06, + "loss": 0.2159, + "step": 38070 + }, + { + "epoch": 57.26, + "grad_norm": 3.0837130546569824, + "learning_rate": 4.273684210526316e-06, + "loss": 0.2232, + "step": 38080 + }, + { + "epoch": 57.28, + "grad_norm": 5.526956558227539, + "learning_rate": 4.27218045112782e-06, + "loss": 0.1866, + "step": 38090 + }, + { + "epoch": 57.29, + "grad_norm": 5.77144718170166, + "learning_rate": 4.270676691729323e-06, + "loss": 0.2533, + "step": 38100 + }, + { + "epoch": 57.31, + "grad_norm": 3.7679331302642822, + "learning_rate": 4.2691729323308276e-06, + "loss": 0.2217, + "step": 38110 + }, + { + "epoch": 57.32, + "grad_norm": 19.891773223876953, + "learning_rate": 4.267669172932331e-06, + "loss": 0.1851, + "step": 38120 + }, + { + "epoch": 57.34, + "grad_norm": 8.34205436706543, + "learning_rate": 4.266165413533835e-06, + "loss": 0.2048, + "step": 38130 + }, + { + "epoch": 57.35, + "grad_norm": 5.428603172302246, + "learning_rate": 4.264661654135339e-06, + "loss": 0.2122, + "step": 38140 + }, + { + "epoch": 57.37, + "grad_norm": 3.5390725135803223, + "learning_rate": 4.2631578947368425e-06, + "loss": 0.2041, + "step": 38150 + }, + { + "epoch": 57.38, + "grad_norm": 4.302058219909668, + "learning_rate": 4.261654135338346e-06, + "loss": 0.1994, + "step": 38160 + }, + { + "epoch": 57.4, + "grad_norm": 16.373668670654297, + "learning_rate": 4.2601503759398495e-06, + "loss": 0.2482, + "step": 38170 + }, + { + "epoch": 57.41, + "grad_norm": 3.5577502250671387, + "learning_rate": 4.258646616541354e-06, + "loss": 0.1791, + "step": 38180 + }, + { + "epoch": 57.43, + "grad_norm": 3.964195728302002, + "learning_rate": 4.257142857142857e-06, + "loss": 0.2346, + "step": 38190 + }, + { + "epoch": 57.44, + "grad_norm": 4.708043575286865, + "learning_rate": 4.255639097744361e-06, + "loss": 0.1822, + "step": 38200 + }, + { + "epoch": 57.46, + "grad_norm": 4.237730503082275, + "learning_rate": 4.254135338345865e-06, + "loss": 0.1897, + "step": 38210 + }, + { + "epoch": 57.47, + "grad_norm": 4.254096508026123, + "learning_rate": 4.252631578947369e-06, + "loss": 0.2153, + "step": 38220 + }, + { + "epoch": 57.49, + "grad_norm": 2.041449546813965, + "learning_rate": 4.251127819548872e-06, + "loss": 0.1745, + "step": 38230 + }, + { + "epoch": 57.5, + "grad_norm": 2.9774394035339355, + "learning_rate": 4.249624060150377e-06, + "loss": 0.2481, + "step": 38240 + }, + { + "epoch": 57.52, + "grad_norm": 2.6556615829467773, + "learning_rate": 4.24812030075188e-06, + "loss": 0.1992, + "step": 38250 + }, + { + "epoch": 57.53, + "grad_norm": 4.096647262573242, + "learning_rate": 4.246616541353384e-06, + "loss": 0.2644, + "step": 38260 + }, + { + "epoch": 57.55, + "grad_norm": 4.746277809143066, + "learning_rate": 4.245112781954887e-06, + "loss": 0.1919, + "step": 38270 + }, + { + "epoch": 57.56, + "grad_norm": 5.280014514923096, + "learning_rate": 4.243609022556392e-06, + "loss": 0.2576, + "step": 38280 + }, + { + "epoch": 57.58, + "grad_norm": 6.924928188323975, + "learning_rate": 4.242105263157895e-06, + "loss": 0.1854, + "step": 38290 + }, + { + "epoch": 57.59, + "grad_norm": 3.799520969390869, + "learning_rate": 4.240601503759399e-06, + "loss": 0.2675, + "step": 38300 + }, + { + "epoch": 57.61, + "grad_norm": 5.440141677856445, + "learning_rate": 4.239097744360903e-06, + "loss": 0.1975, + "step": 38310 + }, + { + "epoch": 57.62, + "grad_norm": 5.276878356933594, + "learning_rate": 4.2375939849624065e-06, + "loss": 0.2251, + "step": 38320 + }, + { + "epoch": 57.64, + "grad_norm": 5.117806434631348, + "learning_rate": 4.23609022556391e-06, + "loss": 0.1818, + "step": 38330 + }, + { + "epoch": 57.65, + "grad_norm": 5.137388229370117, + "learning_rate": 4.2345864661654144e-06, + "loss": 0.2429, + "step": 38340 + }, + { + "epoch": 57.67, + "grad_norm": 1.6786266565322876, + "learning_rate": 4.233082706766918e-06, + "loss": 0.2236, + "step": 38350 + }, + { + "epoch": 57.68, + "grad_norm": 4.657053470611572, + "learning_rate": 4.2315789473684215e-06, + "loss": 0.2304, + "step": 38360 + }, + { + "epoch": 57.7, + "grad_norm": 6.737263202667236, + "learning_rate": 4.230075187969925e-06, + "loss": 0.1869, + "step": 38370 + }, + { + "epoch": 57.71, + "grad_norm": 8.260579109191895, + "learning_rate": 4.228571428571429e-06, + "loss": 0.2162, + "step": 38380 + }, + { + "epoch": 57.73, + "grad_norm": 4.183199405670166, + "learning_rate": 4.227067669172933e-06, + "loss": 0.2047, + "step": 38390 + }, + { + "epoch": 57.74, + "grad_norm": 6.769223213195801, + "learning_rate": 4.225563909774436e-06, + "loss": 0.1686, + "step": 38400 + }, + { + "epoch": 57.76, + "grad_norm": 6.208756923675537, + "learning_rate": 4.224060150375941e-06, + "loss": 0.2128, + "step": 38410 + }, + { + "epoch": 57.77, + "grad_norm": 5.576897144317627, + "learning_rate": 4.222556390977444e-06, + "loss": 0.1921, + "step": 38420 + }, + { + "epoch": 57.79, + "grad_norm": 6.998033046722412, + "learning_rate": 4.221052631578948e-06, + "loss": 0.2155, + "step": 38430 + }, + { + "epoch": 57.8, + "grad_norm": 4.374655246734619, + "learning_rate": 4.219548872180451e-06, + "loss": 0.2439, + "step": 38440 + }, + { + "epoch": 57.82, + "grad_norm": 3.7079293727874756, + "learning_rate": 4.218045112781956e-06, + "loss": 0.228, + "step": 38450 + }, + { + "epoch": 57.83, + "grad_norm": 7.11881685256958, + "learning_rate": 4.216541353383459e-06, + "loss": 0.2366, + "step": 38460 + }, + { + "epoch": 57.85, + "grad_norm": 11.91295051574707, + "learning_rate": 4.215037593984963e-06, + "loss": 0.2144, + "step": 38470 + }, + { + "epoch": 57.86, + "grad_norm": 4.202861309051514, + "learning_rate": 4.213533834586466e-06, + "loss": 0.2088, + "step": 38480 + }, + { + "epoch": 57.88, + "grad_norm": 2.2428081035614014, + "learning_rate": 4.21203007518797e-06, + "loss": 0.2238, + "step": 38490 + }, + { + "epoch": 57.89, + "grad_norm": 6.293793678283691, + "learning_rate": 4.210526315789474e-06, + "loss": 0.246, + "step": 38500 + }, + { + "epoch": 57.91, + "grad_norm": 3.7611231803894043, + "learning_rate": 4.209022556390978e-06, + "loss": 0.2574, + "step": 38510 + }, + { + "epoch": 57.92, + "grad_norm": 3.548879623413086, + "learning_rate": 4.207518796992481e-06, + "loss": 0.2064, + "step": 38520 + }, + { + "epoch": 57.94, + "grad_norm": 3.5769736766815186, + "learning_rate": 4.2060150375939855e-06, + "loss": 0.1324, + "step": 38530 + }, + { + "epoch": 57.95, + "grad_norm": 6.342837810516357, + "learning_rate": 4.204511278195489e-06, + "loss": 0.1634, + "step": 38540 + }, + { + "epoch": 57.97, + "grad_norm": 3.722198724746704, + "learning_rate": 4.2030075187969926e-06, + "loss": 0.2034, + "step": 38550 + }, + { + "epoch": 57.98, + "grad_norm": 6.514225482940674, + "learning_rate": 4.201503759398496e-06, + "loss": 0.2175, + "step": 38560 + }, + { + "epoch": 58.0, + "grad_norm": 0.08241364359855652, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.1931, + "step": 38570 + }, + { + "epoch": 58.0, + "eval_accuracy": 0.9299, + "eval_loss": 0.3219141960144043, + "eval_runtime": 84.8061, + "eval_samples_per_second": 117.916, + "eval_steps_per_second": 0.472, + "step": 38570 + }, + { + "epoch": 58.02, + "grad_norm": 3.8991994857788086, + "learning_rate": 4.198496240601504e-06, + "loss": 0.2143, + "step": 38580 + }, + { + "epoch": 58.03, + "grad_norm": 4.180047035217285, + "learning_rate": 4.1969924812030075e-06, + "loss": 0.1819, + "step": 38590 + }, + { + "epoch": 58.05, + "grad_norm": 9.162282943725586, + "learning_rate": 4.195488721804512e-06, + "loss": 0.2224, + "step": 38600 + }, + { + "epoch": 58.06, + "grad_norm": 3.6437289714813232, + "learning_rate": 4.193984962406015e-06, + "loss": 0.1961, + "step": 38610 + }, + { + "epoch": 58.08, + "grad_norm": 5.206214904785156, + "learning_rate": 4.192481203007519e-06, + "loss": 0.2267, + "step": 38620 + }, + { + "epoch": 58.09, + "grad_norm": 7.128195762634277, + "learning_rate": 4.190977443609022e-06, + "loss": 0.2716, + "step": 38630 + }, + { + "epoch": 58.11, + "grad_norm": 6.48333740234375, + "learning_rate": 4.189473684210527e-06, + "loss": 0.2381, + "step": 38640 + }, + { + "epoch": 58.12, + "grad_norm": 6.768774509429932, + "learning_rate": 4.18796992481203e-06, + "loss": 0.2204, + "step": 38650 + }, + { + "epoch": 58.14, + "grad_norm": 7.444092273712158, + "learning_rate": 4.186466165413534e-06, + "loss": 0.2563, + "step": 38660 + }, + { + "epoch": 58.15, + "grad_norm": 5.427529335021973, + "learning_rate": 4.184962406015038e-06, + "loss": 0.1636, + "step": 38670 + }, + { + "epoch": 58.17, + "grad_norm": 9.827468872070312, + "learning_rate": 4.183458646616542e-06, + "loss": 0.2042, + "step": 38680 + }, + { + "epoch": 58.18, + "grad_norm": 7.735494613647461, + "learning_rate": 4.181954887218045e-06, + "loss": 0.2047, + "step": 38690 + }, + { + "epoch": 58.2, + "grad_norm": 4.068072319030762, + "learning_rate": 4.18045112781955e-06, + "loss": 0.2105, + "step": 38700 + }, + { + "epoch": 58.21, + "grad_norm": 5.502895832061768, + "learning_rate": 4.178947368421053e-06, + "loss": 0.2376, + "step": 38710 + }, + { + "epoch": 58.23, + "grad_norm": 4.636054515838623, + "learning_rate": 4.177443609022557e-06, + "loss": 0.2079, + "step": 38720 + }, + { + "epoch": 58.24, + "grad_norm": 3.9590959548950195, + "learning_rate": 4.17593984962406e-06, + "loss": 0.2144, + "step": 38730 + }, + { + "epoch": 58.26, + "grad_norm": 4.759368896484375, + "learning_rate": 4.1744360902255645e-06, + "loss": 0.1763, + "step": 38740 + }, + { + "epoch": 58.27, + "grad_norm": 4.7535400390625, + "learning_rate": 4.172932330827068e-06, + "loss": 0.1848, + "step": 38750 + }, + { + "epoch": 58.29, + "grad_norm": 8.80599308013916, + "learning_rate": 4.1714285714285715e-06, + "loss": 0.2202, + "step": 38760 + }, + { + "epoch": 58.3, + "grad_norm": 5.612436771392822, + "learning_rate": 4.169924812030076e-06, + "loss": 0.1748, + "step": 38770 + }, + { + "epoch": 58.32, + "grad_norm": 5.2577972412109375, + "learning_rate": 4.1684210526315794e-06, + "loss": 0.2463, + "step": 38780 + }, + { + "epoch": 58.33, + "grad_norm": 6.162819862365723, + "learning_rate": 4.166917293233083e-06, + "loss": 0.1727, + "step": 38790 + }, + { + "epoch": 58.35, + "grad_norm": 5.589300155639648, + "learning_rate": 4.165413533834587e-06, + "loss": 0.2035, + "step": 38800 + }, + { + "epoch": 58.36, + "grad_norm": 3.496182680130005, + "learning_rate": 4.163909774436091e-06, + "loss": 0.1862, + "step": 38810 + }, + { + "epoch": 58.38, + "grad_norm": 7.816127777099609, + "learning_rate": 4.162406015037594e-06, + "loss": 0.1733, + "step": 38820 + }, + { + "epoch": 58.39, + "grad_norm": 8.98507308959961, + "learning_rate": 4.160902255639098e-06, + "loss": 0.1806, + "step": 38830 + }, + { + "epoch": 58.41, + "grad_norm": 3.1957809925079346, + "learning_rate": 4.159398496240602e-06, + "loss": 0.1946, + "step": 38840 + }, + { + "epoch": 58.42, + "grad_norm": 2.860567331314087, + "learning_rate": 4.157894736842106e-06, + "loss": 0.2619, + "step": 38850 + }, + { + "epoch": 58.44, + "grad_norm": 6.364305019378662, + "learning_rate": 4.156390977443609e-06, + "loss": 0.2311, + "step": 38860 + }, + { + "epoch": 58.45, + "grad_norm": 8.90013599395752, + "learning_rate": 4.154887218045114e-06, + "loss": 0.2202, + "step": 38870 + }, + { + "epoch": 58.47, + "grad_norm": 6.431805610656738, + "learning_rate": 4.153383458646617e-06, + "loss": 0.1814, + "step": 38880 + }, + { + "epoch": 58.48, + "grad_norm": 7.369435787200928, + "learning_rate": 4.151879699248121e-06, + "loss": 0.2696, + "step": 38890 + }, + { + "epoch": 58.5, + "grad_norm": 4.004574775695801, + "learning_rate": 4.150375939849624e-06, + "loss": 0.1867, + "step": 38900 + }, + { + "epoch": 58.51, + "grad_norm": 3.348543643951416, + "learning_rate": 4.1488721804511286e-06, + "loss": 0.1993, + "step": 38910 + }, + { + "epoch": 58.53, + "grad_norm": 3.0820324420928955, + "learning_rate": 4.147368421052632e-06, + "loss": 0.2131, + "step": 38920 + }, + { + "epoch": 58.54, + "grad_norm": 5.479022026062012, + "learning_rate": 4.145864661654136e-06, + "loss": 0.255, + "step": 38930 + }, + { + "epoch": 58.56, + "grad_norm": 5.253530979156494, + "learning_rate": 4.14436090225564e-06, + "loss": 0.1944, + "step": 38940 + }, + { + "epoch": 58.57, + "grad_norm": 3.170219659805298, + "learning_rate": 4.1428571428571435e-06, + "loss": 0.1768, + "step": 38950 + }, + { + "epoch": 58.59, + "grad_norm": 3.0692005157470703, + "learning_rate": 4.141353383458647e-06, + "loss": 0.2054, + "step": 38960 + }, + { + "epoch": 58.6, + "grad_norm": 5.293381214141846, + "learning_rate": 4.1398496240601505e-06, + "loss": 0.2317, + "step": 38970 + }, + { + "epoch": 58.62, + "grad_norm": 5.903608322143555, + "learning_rate": 4.138345864661654e-06, + "loss": 0.2098, + "step": 38980 + }, + { + "epoch": 58.63, + "grad_norm": 6.2776665687561035, + "learning_rate": 4.136842105263158e-06, + "loss": 0.2591, + "step": 38990 + }, + { + "epoch": 58.65, + "grad_norm": 1.9607787132263184, + "learning_rate": 4.135338345864662e-06, + "loss": 0.1817, + "step": 39000 + }, + { + "epoch": 58.66, + "grad_norm": 4.821249961853027, + "learning_rate": 4.1338345864661654e-06, + "loss": 0.1306, + "step": 39010 + }, + { + "epoch": 58.68, + "grad_norm": 4.712565898895264, + "learning_rate": 4.132330827067669e-06, + "loss": 0.2086, + "step": 39020 + }, + { + "epoch": 58.69, + "grad_norm": 4.460330486297607, + "learning_rate": 4.130827067669173e-06, + "loss": 0.1441, + "step": 39030 + }, + { + "epoch": 58.71, + "grad_norm": 4.247945308685303, + "learning_rate": 4.129323308270677e-06, + "loss": 0.2307, + "step": 39040 + }, + { + "epoch": 58.72, + "grad_norm": 5.8864521980285645, + "learning_rate": 4.12781954887218e-06, + "loss": 0.2461, + "step": 39050 + }, + { + "epoch": 58.74, + "grad_norm": 4.723229885101318, + "learning_rate": 4.126315789473685e-06, + "loss": 0.2235, + "step": 39060 + }, + { + "epoch": 58.75, + "grad_norm": 3.2411322593688965, + "learning_rate": 4.124812030075188e-06, + "loss": 0.1957, + "step": 39070 + }, + { + "epoch": 58.77, + "grad_norm": 7.346851348876953, + "learning_rate": 4.123308270676692e-06, + "loss": 0.2102, + "step": 39080 + }, + { + "epoch": 58.78, + "grad_norm": 5.536585330963135, + "learning_rate": 4.121804511278195e-06, + "loss": 0.2596, + "step": 39090 + }, + { + "epoch": 58.8, + "grad_norm": 2.5167481899261475, + "learning_rate": 4.1203007518797e-06, + "loss": 0.1739, + "step": 39100 + }, + { + "epoch": 58.81, + "grad_norm": 2.2827956676483154, + "learning_rate": 4.118796992481203e-06, + "loss": 0.2394, + "step": 39110 + }, + { + "epoch": 58.83, + "grad_norm": 3.863255262374878, + "learning_rate": 4.117293233082707e-06, + "loss": 0.205, + "step": 39120 + }, + { + "epoch": 58.84, + "grad_norm": 5.798130035400391, + "learning_rate": 4.115789473684211e-06, + "loss": 0.2017, + "step": 39130 + }, + { + "epoch": 58.86, + "grad_norm": 3.510673761367798, + "learning_rate": 4.114285714285715e-06, + "loss": 0.2036, + "step": 39140 + }, + { + "epoch": 58.87, + "grad_norm": 4.811961650848389, + "learning_rate": 4.112781954887218e-06, + "loss": 0.2651, + "step": 39150 + }, + { + "epoch": 58.89, + "grad_norm": 6.230356216430664, + "learning_rate": 4.1112781954887225e-06, + "loss": 0.1561, + "step": 39160 + }, + { + "epoch": 58.9, + "grad_norm": 5.7233757972717285, + "learning_rate": 4.109774436090226e-06, + "loss": 0.1541, + "step": 39170 + }, + { + "epoch": 58.92, + "grad_norm": 6.532334327697754, + "learning_rate": 4.1082706766917295e-06, + "loss": 0.2049, + "step": 39180 + }, + { + "epoch": 58.93, + "grad_norm": 4.839334011077881, + "learning_rate": 4.106766917293233e-06, + "loss": 0.1846, + "step": 39190 + }, + { + "epoch": 58.95, + "grad_norm": 8.426265716552734, + "learning_rate": 4.105263157894737e-06, + "loss": 0.2213, + "step": 39200 + }, + { + "epoch": 58.96, + "grad_norm": 3.9503211975097656, + "learning_rate": 4.103759398496241e-06, + "loss": 0.1891, + "step": 39210 + }, + { + "epoch": 58.98, + "grad_norm": 5.404723167419434, + "learning_rate": 4.1022556390977444e-06, + "loss": 0.2554, + "step": 39220 + }, + { + "epoch": 58.99, + "grad_norm": 2.1365013122558594, + "learning_rate": 4.100751879699249e-06, + "loss": 0.2328, + "step": 39230 + }, + { + "epoch": 59.0, + "eval_accuracy": 0.9316, + "eval_loss": 0.315520316362381, + "eval_runtime": 84.5641, + "eval_samples_per_second": 118.253, + "eval_steps_per_second": 0.473, + "step": 39235 + }, + { + "epoch": 59.01, + "grad_norm": 6.348912239074707, + "learning_rate": 4.099248120300752e-06, + "loss": 0.174, + "step": 39240 + }, + { + "epoch": 59.02, + "grad_norm": 7.103587627410889, + "learning_rate": 4.097744360902256e-06, + "loss": 0.2624, + "step": 39250 + }, + { + "epoch": 59.04, + "grad_norm": 3.9258017539978027, + "learning_rate": 4.09624060150376e-06, + "loss": 0.2086, + "step": 39260 + }, + { + "epoch": 59.05, + "grad_norm": 4.404173851013184, + "learning_rate": 4.094736842105264e-06, + "loss": 0.2327, + "step": 39270 + }, + { + "epoch": 59.07, + "grad_norm": 4.899421691894531, + "learning_rate": 4.093233082706767e-06, + "loss": 0.2527, + "step": 39280 + }, + { + "epoch": 59.08, + "grad_norm": 5.5442094802856445, + "learning_rate": 4.091729323308271e-06, + "loss": 0.1896, + "step": 39290 + }, + { + "epoch": 59.1, + "grad_norm": 3.816484212875366, + "learning_rate": 4.090225563909775e-06, + "loss": 0.2282, + "step": 39300 + }, + { + "epoch": 59.11, + "grad_norm": 7.135293006896973, + "learning_rate": 4.088721804511279e-06, + "loss": 0.217, + "step": 39310 + }, + { + "epoch": 59.13, + "grad_norm": 7.170220375061035, + "learning_rate": 4.087218045112782e-06, + "loss": 0.2658, + "step": 39320 + }, + { + "epoch": 59.14, + "grad_norm": 3.617730140686035, + "learning_rate": 4.0857142857142865e-06, + "loss": 0.1469, + "step": 39330 + }, + { + "epoch": 59.16, + "grad_norm": 3.9617254734039307, + "learning_rate": 4.08421052631579e-06, + "loss": 0.2034, + "step": 39340 + }, + { + "epoch": 59.17, + "grad_norm": 5.964526653289795, + "learning_rate": 4.0827067669172936e-06, + "loss": 0.2128, + "step": 39350 + }, + { + "epoch": 59.19, + "grad_norm": 5.290602684020996, + "learning_rate": 4.081203007518797e-06, + "loss": 0.2284, + "step": 39360 + }, + { + "epoch": 59.2, + "grad_norm": 9.3060941696167, + "learning_rate": 4.0796992481203015e-06, + "loss": 0.2395, + "step": 39370 + }, + { + "epoch": 59.22, + "grad_norm": 7.939350128173828, + "learning_rate": 4.078195488721805e-06, + "loss": 0.2119, + "step": 39380 + }, + { + "epoch": 59.23, + "grad_norm": 5.00067138671875, + "learning_rate": 4.0766917293233085e-06, + "loss": 0.1631, + "step": 39390 + }, + { + "epoch": 59.25, + "grad_norm": 5.9795918464660645, + "learning_rate": 4.075187969924813e-06, + "loss": 0.1553, + "step": 39400 + }, + { + "epoch": 59.26, + "grad_norm": 2.5546774864196777, + "learning_rate": 4.073684210526316e-06, + "loss": 0.159, + "step": 39410 + }, + { + "epoch": 59.28, + "grad_norm": 8.247179985046387, + "learning_rate": 4.07218045112782e-06, + "loss": 0.2191, + "step": 39420 + }, + { + "epoch": 59.29, + "grad_norm": 5.384758949279785, + "learning_rate": 4.070676691729324e-06, + "loss": 0.1586, + "step": 39430 + }, + { + "epoch": 59.31, + "grad_norm": 2.1114683151245117, + "learning_rate": 4.069172932330828e-06, + "loss": 0.2198, + "step": 39440 + }, + { + "epoch": 59.32, + "grad_norm": 7.196309566497803, + "learning_rate": 4.067669172932331e-06, + "loss": 0.2132, + "step": 39450 + }, + { + "epoch": 59.34, + "grad_norm": 12.441351890563965, + "learning_rate": 4.066165413533835e-06, + "loss": 0.1764, + "step": 39460 + }, + { + "epoch": 59.35, + "grad_norm": 4.480091094970703, + "learning_rate": 4.064661654135338e-06, + "loss": 0.2407, + "step": 39470 + }, + { + "epoch": 59.37, + "grad_norm": 4.517475605010986, + "learning_rate": 4.063157894736842e-06, + "loss": 0.2106, + "step": 39480 + }, + { + "epoch": 59.38, + "grad_norm": 6.213492393493652, + "learning_rate": 4.061654135338346e-06, + "loss": 0.2005, + "step": 39490 + }, + { + "epoch": 59.4, + "grad_norm": 3.382683515548706, + "learning_rate": 4.06015037593985e-06, + "loss": 0.17, + "step": 39500 + }, + { + "epoch": 59.41, + "grad_norm": 5.262118816375732, + "learning_rate": 4.058646616541353e-06, + "loss": 0.2167, + "step": 39510 + }, + { + "epoch": 59.43, + "grad_norm": 8.032793998718262, + "learning_rate": 4.057142857142858e-06, + "loss": 0.2287, + "step": 39520 + }, + { + "epoch": 59.44, + "grad_norm": 5.3855767250061035, + "learning_rate": 4.055639097744361e-06, + "loss": 0.1747, + "step": 39530 + }, + { + "epoch": 59.46, + "grad_norm": 8.112833976745605, + "learning_rate": 4.054135338345865e-06, + "loss": 0.1867, + "step": 39540 + }, + { + "epoch": 59.47, + "grad_norm": 4.0277910232543945, + "learning_rate": 4.052631578947368e-06, + "loss": 0.2311, + "step": 39550 + }, + { + "epoch": 59.49, + "grad_norm": 2.4710066318511963, + "learning_rate": 4.0511278195488725e-06, + "loss": 0.1935, + "step": 39560 + }, + { + "epoch": 59.5, + "grad_norm": 7.679681301116943, + "learning_rate": 4.049624060150376e-06, + "loss": 0.2504, + "step": 39570 + }, + { + "epoch": 59.52, + "grad_norm": 8.625081062316895, + "learning_rate": 4.04812030075188e-06, + "loss": 0.1737, + "step": 39580 + }, + { + "epoch": 59.53, + "grad_norm": 4.943390846252441, + "learning_rate": 4.046616541353384e-06, + "loss": 0.2392, + "step": 39590 + }, + { + "epoch": 59.55, + "grad_norm": 3.67423677444458, + "learning_rate": 4.0451127819548875e-06, + "loss": 0.1376, + "step": 39600 + }, + { + "epoch": 59.56, + "grad_norm": 5.872762203216553, + "learning_rate": 4.043609022556391e-06, + "loss": 0.22, + "step": 39610 + }, + { + "epoch": 59.58, + "grad_norm": 6.167865753173828, + "learning_rate": 4.042105263157895e-06, + "loss": 0.2386, + "step": 39620 + }, + { + "epoch": 59.59, + "grad_norm": 3.7467803955078125, + "learning_rate": 4.040601503759399e-06, + "loss": 0.2012, + "step": 39630 + }, + { + "epoch": 59.61, + "grad_norm": 4.501016139984131, + "learning_rate": 4.039097744360902e-06, + "loss": 0.2285, + "step": 39640 + }, + { + "epoch": 59.62, + "grad_norm": 5.832193851470947, + "learning_rate": 4.037593984962406e-06, + "loss": 0.1822, + "step": 39650 + }, + { + "epoch": 59.64, + "grad_norm": 3.999112844467163, + "learning_rate": 4.03609022556391e-06, + "loss": 0.2316, + "step": 39660 + }, + { + "epoch": 59.65, + "grad_norm": 9.898731231689453, + "learning_rate": 4.034586466165414e-06, + "loss": 0.2284, + "step": 39670 + }, + { + "epoch": 59.67, + "grad_norm": 4.816281318664551, + "learning_rate": 4.033082706766917e-06, + "loss": 0.168, + "step": 39680 + }, + { + "epoch": 59.68, + "grad_norm": 6.9781293869018555, + "learning_rate": 4.031578947368422e-06, + "loss": 0.2007, + "step": 39690 + }, + { + "epoch": 59.7, + "grad_norm": 4.193832874298096, + "learning_rate": 4.030075187969925e-06, + "loss": 0.2435, + "step": 39700 + }, + { + "epoch": 59.71, + "grad_norm": 3.802783966064453, + "learning_rate": 4.028571428571429e-06, + "loss": 0.2144, + "step": 39710 + }, + { + "epoch": 59.73, + "grad_norm": 2.7991650104522705, + "learning_rate": 4.027067669172933e-06, + "loss": 0.1264, + "step": 39720 + }, + { + "epoch": 59.74, + "grad_norm": 4.3173394203186035, + "learning_rate": 4.025563909774437e-06, + "loss": 0.1968, + "step": 39730 + }, + { + "epoch": 59.76, + "grad_norm": 4.434994220733643, + "learning_rate": 4.02406015037594e-06, + "loss": 0.2191, + "step": 39740 + }, + { + "epoch": 59.77, + "grad_norm": 3.0349013805389404, + "learning_rate": 4.022556390977444e-06, + "loss": 0.1819, + "step": 39750 + }, + { + "epoch": 59.79, + "grad_norm": 3.8737924098968506, + "learning_rate": 4.021052631578948e-06, + "loss": 0.2129, + "step": 39760 + }, + { + "epoch": 59.8, + "grad_norm": 6.809603214263916, + "learning_rate": 4.0195488721804515e-06, + "loss": 0.2261, + "step": 39770 + }, + { + "epoch": 59.82, + "grad_norm": 1.9871443510055542, + "learning_rate": 4.018045112781955e-06, + "loss": 0.1694, + "step": 39780 + }, + { + "epoch": 59.83, + "grad_norm": 7.0583977699279785, + "learning_rate": 4.016541353383459e-06, + "loss": 0.2135, + "step": 39790 + }, + { + "epoch": 59.85, + "grad_norm": 5.820276737213135, + "learning_rate": 4.015037593984963e-06, + "loss": 0.2536, + "step": 39800 + }, + { + "epoch": 59.86, + "grad_norm": 6.490072250366211, + "learning_rate": 4.0135338345864665e-06, + "loss": 0.1756, + "step": 39810 + }, + { + "epoch": 59.88, + "grad_norm": 7.442599296569824, + "learning_rate": 4.01203007518797e-06, + "loss": 0.1695, + "step": 39820 + }, + { + "epoch": 59.89, + "grad_norm": 5.296566486358643, + "learning_rate": 4.010526315789474e-06, + "loss": 0.2458, + "step": 39830 + }, + { + "epoch": 59.91, + "grad_norm": 13.77005672454834, + "learning_rate": 4.009022556390978e-06, + "loss": 0.218, + "step": 39840 + }, + { + "epoch": 59.92, + "grad_norm": 4.534656524658203, + "learning_rate": 4.007518796992481e-06, + "loss": 0.2051, + "step": 39850 + }, + { + "epoch": 59.94, + "grad_norm": 6.368296146392822, + "learning_rate": 4.006015037593986e-06, + "loss": 0.1526, + "step": 39860 + }, + { + "epoch": 59.95, + "grad_norm": 7.145541667938232, + "learning_rate": 4.004511278195489e-06, + "loss": 0.2447, + "step": 39870 + }, + { + "epoch": 59.97, + "grad_norm": 4.66719388961792, + "learning_rate": 4.003007518796993e-06, + "loss": 0.259, + "step": 39880 + }, + { + "epoch": 59.98, + "grad_norm": 3.6382246017456055, + "learning_rate": 4.001503759398497e-06, + "loss": 0.1915, + "step": 39890 + }, + { + "epoch": 60.0, + "grad_norm": 0.5675032138824463, + "learning_rate": 4.000000000000001e-06, + "loss": 0.145, + "step": 39900 + }, + { + "epoch": 60.0, + "eval_accuracy": 0.9295, + "eval_loss": 0.3215762674808502, + "eval_runtime": 85.0976, + "eval_samples_per_second": 117.512, + "eval_steps_per_second": 0.47, + "step": 39900 + }, + { + "epoch": 60.02, + "grad_norm": 2.28545880317688, + "learning_rate": 3.998496240601504e-06, + "loss": 0.1901, + "step": 39910 + }, + { + "epoch": 60.03, + "grad_norm": 15.001397132873535, + "learning_rate": 3.996992481203008e-06, + "loss": 0.1951, + "step": 39920 + }, + { + "epoch": 60.05, + "grad_norm": 6.06882905960083, + "learning_rate": 3.995488721804512e-06, + "loss": 0.1432, + "step": 39930 + }, + { + "epoch": 60.06, + "grad_norm": 3.984029769897461, + "learning_rate": 3.993984962406016e-06, + "loss": 0.2015, + "step": 39940 + }, + { + "epoch": 60.08, + "grad_norm": 17.75435447692871, + "learning_rate": 3.992481203007519e-06, + "loss": 0.2074, + "step": 39950 + }, + { + "epoch": 60.09, + "grad_norm": 6.104353427886963, + "learning_rate": 3.990977443609023e-06, + "loss": 0.2112, + "step": 39960 + }, + { + "epoch": 60.11, + "grad_norm": 13.67701530456543, + "learning_rate": 3.989473684210526e-06, + "loss": 0.226, + "step": 39970 + }, + { + "epoch": 60.12, + "grad_norm": 3.5879509449005127, + "learning_rate": 3.9879699248120305e-06, + "loss": 0.176, + "step": 39980 + }, + { + "epoch": 60.14, + "grad_norm": 4.916759490966797, + "learning_rate": 3.986466165413534e-06, + "loss": 0.1977, + "step": 39990 + }, + { + "epoch": 60.15, + "grad_norm": 7.997819423675537, + "learning_rate": 3.9849624060150376e-06, + "loss": 0.199, + "step": 40000 + }, + { + "epoch": 60.17, + "grad_norm": 3.076766014099121, + "learning_rate": 3.983458646616541e-06, + "loss": 0.1479, + "step": 40010 + }, + { + "epoch": 60.18, + "grad_norm": 6.796036720275879, + "learning_rate": 3.9819548872180454e-06, + "loss": 0.2257, + "step": 40020 + }, + { + "epoch": 60.2, + "grad_norm": 6.836452007293701, + "learning_rate": 3.980451127819549e-06, + "loss": 0.2594, + "step": 40030 + }, + { + "epoch": 60.21, + "grad_norm": 7.558508396148682, + "learning_rate": 3.9789473684210525e-06, + "loss": 0.2029, + "step": 40040 + }, + { + "epoch": 60.23, + "grad_norm": 7.763191223144531, + "learning_rate": 3.977443609022557e-06, + "loss": 0.1931, + "step": 40050 + }, + { + "epoch": 60.24, + "grad_norm": 6.8963799476623535, + "learning_rate": 3.97593984962406e-06, + "loss": 0.2369, + "step": 40060 + }, + { + "epoch": 60.26, + "grad_norm": 6.124290943145752, + "learning_rate": 3.974436090225564e-06, + "loss": 0.2332, + "step": 40070 + }, + { + "epoch": 60.27, + "grad_norm": 2.4686286449432373, + "learning_rate": 3.972932330827068e-06, + "loss": 0.1754, + "step": 40080 + }, + { + "epoch": 60.29, + "grad_norm": 4.793791770935059, + "learning_rate": 3.971428571428572e-06, + "loss": 0.2353, + "step": 40090 + }, + { + "epoch": 60.3, + "grad_norm": 2.7970879077911377, + "learning_rate": 3.969924812030075e-06, + "loss": 0.1597, + "step": 40100 + }, + { + "epoch": 60.32, + "grad_norm": 6.253334045410156, + "learning_rate": 3.968421052631579e-06, + "loss": 0.2115, + "step": 40110 + }, + { + "epoch": 60.33, + "grad_norm": 7.9991631507873535, + "learning_rate": 3.966917293233083e-06, + "loss": 0.2242, + "step": 40120 + }, + { + "epoch": 60.35, + "grad_norm": 7.119922637939453, + "learning_rate": 3.965413533834587e-06, + "loss": 0.2398, + "step": 40130 + }, + { + "epoch": 60.36, + "grad_norm": 2.1998627185821533, + "learning_rate": 3.96390977443609e-06, + "loss": 0.1425, + "step": 40140 + }, + { + "epoch": 60.38, + "grad_norm": 5.503266334533691, + "learning_rate": 3.9624060150375946e-06, + "loss": 0.1403, + "step": 40150 + }, + { + "epoch": 60.39, + "grad_norm": 4.746212959289551, + "learning_rate": 3.960902255639098e-06, + "loss": 0.2247, + "step": 40160 + }, + { + "epoch": 60.41, + "grad_norm": 3.6244451999664307, + "learning_rate": 3.959398496240602e-06, + "loss": 0.2011, + "step": 40170 + }, + { + "epoch": 60.42, + "grad_norm": 2.700594663619995, + "learning_rate": 3.957894736842106e-06, + "loss": 0.1956, + "step": 40180 + }, + { + "epoch": 60.44, + "grad_norm": 8.941556930541992, + "learning_rate": 3.9563909774436095e-06, + "loss": 0.2188, + "step": 40190 + }, + { + "epoch": 60.45, + "grad_norm": 3.507028818130493, + "learning_rate": 3.954887218045113e-06, + "loss": 0.191, + "step": 40200 + }, + { + "epoch": 60.47, + "grad_norm": 3.7977683544158936, + "learning_rate": 3.9533834586466165e-06, + "loss": 0.194, + "step": 40210 + }, + { + "epoch": 60.48, + "grad_norm": 4.61478328704834, + "learning_rate": 3.951879699248121e-06, + "loss": 0.2135, + "step": 40220 + }, + { + "epoch": 60.5, + "grad_norm": 3.350219488143921, + "learning_rate": 3.950375939849624e-06, + "loss": 0.224, + "step": 40230 + }, + { + "epoch": 60.51, + "grad_norm": 4.675546169281006, + "learning_rate": 3.948872180451128e-06, + "loss": 0.1851, + "step": 40240 + }, + { + "epoch": 60.53, + "grad_norm": 4.169593811035156, + "learning_rate": 3.947368421052632e-06, + "loss": 0.2605, + "step": 40250 + }, + { + "epoch": 60.54, + "grad_norm": 4.376767635345459, + "learning_rate": 3.945864661654136e-06, + "loss": 0.2116, + "step": 40260 + }, + { + "epoch": 60.56, + "grad_norm": 3.2285714149475098, + "learning_rate": 3.944360902255639e-06, + "loss": 0.1881, + "step": 40270 + }, + { + "epoch": 60.57, + "grad_norm": 3.8466668128967285, + "learning_rate": 3.942857142857143e-06, + "loss": 0.2534, + "step": 40280 + }, + { + "epoch": 60.59, + "grad_norm": 8.687283515930176, + "learning_rate": 3.941353383458647e-06, + "loss": 0.2319, + "step": 40290 + }, + { + "epoch": 60.6, + "grad_norm": 3.930408239364624, + "learning_rate": 3.939849624060151e-06, + "loss": 0.2331, + "step": 40300 + }, + { + "epoch": 60.62, + "grad_norm": 3.52876615524292, + "learning_rate": 3.938345864661654e-06, + "loss": 0.1579, + "step": 40310 + }, + { + "epoch": 60.63, + "grad_norm": 8.303112030029297, + "learning_rate": 3.936842105263159e-06, + "loss": 0.1893, + "step": 40320 + }, + { + "epoch": 60.65, + "grad_norm": 6.3138957023620605, + "learning_rate": 3.935338345864662e-06, + "loss": 0.192, + "step": 40330 + }, + { + "epoch": 60.66, + "grad_norm": 3.4053993225097656, + "learning_rate": 3.933834586466166e-06, + "loss": 0.1935, + "step": 40340 + }, + { + "epoch": 60.68, + "grad_norm": 9.262184143066406, + "learning_rate": 3.93233082706767e-06, + "loss": 0.1957, + "step": 40350 + }, + { + "epoch": 60.69, + "grad_norm": 5.6493754386901855, + "learning_rate": 3.9308270676691736e-06, + "loss": 0.2533, + "step": 40360 + }, + { + "epoch": 60.71, + "grad_norm": 11.651237487792969, + "learning_rate": 3.929323308270677e-06, + "loss": 0.1964, + "step": 40370 + }, + { + "epoch": 60.72, + "grad_norm": 3.903280019760132, + "learning_rate": 3.927819548872181e-06, + "loss": 0.1883, + "step": 40380 + }, + { + "epoch": 60.74, + "grad_norm": 1.6574941873550415, + "learning_rate": 3.926315789473685e-06, + "loss": 0.2372, + "step": 40390 + }, + { + "epoch": 60.75, + "grad_norm": 6.501513957977295, + "learning_rate": 3.9248120300751885e-06, + "loss": 0.13, + "step": 40400 + }, + { + "epoch": 60.77, + "grad_norm": 5.910683631896973, + "learning_rate": 3.923308270676692e-06, + "loss": 0.2353, + "step": 40410 + }, + { + "epoch": 60.78, + "grad_norm": 1.645150065422058, + "learning_rate": 3.921804511278196e-06, + "loss": 0.2189, + "step": 40420 + }, + { + "epoch": 60.8, + "grad_norm": 5.449909210205078, + "learning_rate": 3.9203007518797e-06, + "loss": 0.1899, + "step": 40430 + }, + { + "epoch": 60.81, + "grad_norm": 4.566460609436035, + "learning_rate": 3.918796992481203e-06, + "loss": 0.2828, + "step": 40440 + }, + { + "epoch": 60.83, + "grad_norm": 4.585579872131348, + "learning_rate": 3.917293233082707e-06, + "loss": 0.1729, + "step": 40450 + }, + { + "epoch": 60.84, + "grad_norm": 6.5645976066589355, + "learning_rate": 3.9157894736842104e-06, + "loss": 0.164, + "step": 40460 + }, + { + "epoch": 60.86, + "grad_norm": 9.433652877807617, + "learning_rate": 3.914285714285714e-06, + "loss": 0.219, + "step": 40470 + }, + { + "epoch": 60.87, + "grad_norm": 5.390902042388916, + "learning_rate": 3.912781954887218e-06, + "loss": 0.2299, + "step": 40480 + }, + { + "epoch": 60.89, + "grad_norm": 3.7667274475097656, + "learning_rate": 3.911278195488722e-06, + "loss": 0.2021, + "step": 40490 + }, + { + "epoch": 60.9, + "grad_norm": 4.511600017547607, + "learning_rate": 3.909774436090225e-06, + "loss": 0.2136, + "step": 40500 + }, + { + "epoch": 60.92, + "grad_norm": 3.638052225112915, + "learning_rate": 3.90827067669173e-06, + "loss": 0.1652, + "step": 40510 + }, + { + "epoch": 60.93, + "grad_norm": 13.318207740783691, + "learning_rate": 3.906766917293233e-06, + "loss": 0.1892, + "step": 40520 + }, + { + "epoch": 60.95, + "grad_norm": 4.776786804199219, + "learning_rate": 3.905263157894737e-06, + "loss": 0.2, + "step": 40530 + }, + { + "epoch": 60.96, + "grad_norm": 3.423015832901001, + "learning_rate": 3.903759398496241e-06, + "loss": 0.2601, + "step": 40540 + }, + { + "epoch": 60.98, + "grad_norm": 4.982937812805176, + "learning_rate": 3.902255639097745e-06, + "loss": 0.1368, + "step": 40550 + }, + { + "epoch": 60.99, + "grad_norm": 7.498082637786865, + "learning_rate": 3.900751879699248e-06, + "loss": 0.2804, + "step": 40560 + }, + { + "epoch": 61.0, + "eval_accuracy": 0.9298, + "eval_loss": 0.3252774775028229, + "eval_runtime": 84.231, + "eval_samples_per_second": 118.721, + "eval_steps_per_second": 0.475, + "step": 40565 + }, + { + "epoch": 61.01, + "grad_norm": 2.947619676589966, + "learning_rate": 3.899248120300752e-06, + "loss": 0.2347, + "step": 40570 + }, + { + "epoch": 61.02, + "grad_norm": 4.865323543548584, + "learning_rate": 3.897744360902256e-06, + "loss": 0.1525, + "step": 40580 + }, + { + "epoch": 61.04, + "grad_norm": 4.381734848022461, + "learning_rate": 3.8962406015037596e-06, + "loss": 0.2192, + "step": 40590 + }, + { + "epoch": 61.05, + "grad_norm": 4.15067195892334, + "learning_rate": 3.894736842105263e-06, + "loss": 0.2132, + "step": 40600 + }, + { + "epoch": 61.07, + "grad_norm": 4.439324855804443, + "learning_rate": 3.8932330827067675e-06, + "loss": 0.2321, + "step": 40610 + }, + { + "epoch": 61.08, + "grad_norm": 3.797804594039917, + "learning_rate": 3.891729323308271e-06, + "loss": 0.1685, + "step": 40620 + }, + { + "epoch": 61.1, + "grad_norm": 5.394155502319336, + "learning_rate": 3.8902255639097745e-06, + "loss": 0.1848, + "step": 40630 + }, + { + "epoch": 61.11, + "grad_norm": 6.317572116851807, + "learning_rate": 3.888721804511279e-06, + "loss": 0.2337, + "step": 40640 + }, + { + "epoch": 61.13, + "grad_norm": 4.338418483734131, + "learning_rate": 3.887218045112782e-06, + "loss": 0.1896, + "step": 40650 + }, + { + "epoch": 61.14, + "grad_norm": 4.074448585510254, + "learning_rate": 3.885714285714286e-06, + "loss": 0.2793, + "step": 40660 + }, + { + "epoch": 61.16, + "grad_norm": 7.455972671508789, + "learning_rate": 3.884210526315789e-06, + "loss": 0.2104, + "step": 40670 + }, + { + "epoch": 61.17, + "grad_norm": 6.7889204025268555, + "learning_rate": 3.882706766917294e-06, + "loss": 0.2767, + "step": 40680 + }, + { + "epoch": 61.19, + "grad_norm": 0.9154432415962219, + "learning_rate": 3.881203007518797e-06, + "loss": 0.1906, + "step": 40690 + }, + { + "epoch": 61.2, + "grad_norm": 5.013433933258057, + "learning_rate": 3.879699248120301e-06, + "loss": 0.1908, + "step": 40700 + }, + { + "epoch": 61.22, + "grad_norm": 4.357288360595703, + "learning_rate": 3.878195488721805e-06, + "loss": 0.1872, + "step": 40710 + }, + { + "epoch": 61.23, + "grad_norm": 4.3976569175720215, + "learning_rate": 3.876691729323309e-06, + "loss": 0.1649, + "step": 40720 + }, + { + "epoch": 61.25, + "grad_norm": 5.771651744842529, + "learning_rate": 3.875187969924812e-06, + "loss": 0.2027, + "step": 40730 + }, + { + "epoch": 61.26, + "grad_norm": 4.635252475738525, + "learning_rate": 3.873684210526316e-06, + "loss": 0.2115, + "step": 40740 + }, + { + "epoch": 61.28, + "grad_norm": 3.256643295288086, + "learning_rate": 3.87218045112782e-06, + "loss": 0.2126, + "step": 40750 + }, + { + "epoch": 61.29, + "grad_norm": 5.223315715789795, + "learning_rate": 3.870676691729324e-06, + "loss": 0.2077, + "step": 40760 + }, + { + "epoch": 61.31, + "grad_norm": 4.067598819732666, + "learning_rate": 3.869172932330827e-06, + "loss": 0.1596, + "step": 40770 + }, + { + "epoch": 61.32, + "grad_norm": 3.9475274085998535, + "learning_rate": 3.8676691729323315e-06, + "loss": 0.2226, + "step": 40780 + }, + { + "epoch": 61.34, + "grad_norm": 5.894591331481934, + "learning_rate": 3.866165413533835e-06, + "loss": 0.2033, + "step": 40790 + }, + { + "epoch": 61.35, + "grad_norm": 7.091558933258057, + "learning_rate": 3.8646616541353386e-06, + "loss": 0.234, + "step": 40800 + }, + { + "epoch": 61.37, + "grad_norm": 7.475659370422363, + "learning_rate": 3.863157894736843e-06, + "loss": 0.1912, + "step": 40810 + }, + { + "epoch": 61.38, + "grad_norm": 4.879849910736084, + "learning_rate": 3.8616541353383464e-06, + "loss": 0.1841, + "step": 40820 + }, + { + "epoch": 61.4, + "grad_norm": 5.782186031341553, + "learning_rate": 3.86015037593985e-06, + "loss": 0.2058, + "step": 40830 + }, + { + "epoch": 61.41, + "grad_norm": 5.1282196044921875, + "learning_rate": 3.8586466165413535e-06, + "loss": 0.2359, + "step": 40840 + }, + { + "epoch": 61.43, + "grad_norm": 5.245487213134766, + "learning_rate": 3.857142857142858e-06, + "loss": 0.2142, + "step": 40850 + }, + { + "epoch": 61.44, + "grad_norm": 5.231210708618164, + "learning_rate": 3.855639097744361e-06, + "loss": 0.2342, + "step": 40860 + }, + { + "epoch": 61.46, + "grad_norm": 2.8753762245178223, + "learning_rate": 3.854135338345865e-06, + "loss": 0.2064, + "step": 40870 + }, + { + "epoch": 61.47, + "grad_norm": 10.435833930969238, + "learning_rate": 3.852631578947369e-06, + "loss": 0.1609, + "step": 40880 + }, + { + "epoch": 61.49, + "grad_norm": 6.82497501373291, + "learning_rate": 3.851127819548873e-06, + "loss": 0.237, + "step": 40890 + }, + { + "epoch": 61.5, + "grad_norm": 3.6825904846191406, + "learning_rate": 3.849624060150376e-06, + "loss": 0.2074, + "step": 40900 + }, + { + "epoch": 61.52, + "grad_norm": 2.7955589294433594, + "learning_rate": 3.848120300751881e-06, + "loss": 0.1649, + "step": 40910 + }, + { + "epoch": 61.53, + "grad_norm": 5.636986255645752, + "learning_rate": 3.846616541353384e-06, + "loss": 0.1815, + "step": 40920 + }, + { + "epoch": 61.55, + "grad_norm": 5.697868347167969, + "learning_rate": 3.845112781954888e-06, + "loss": 0.1894, + "step": 40930 + }, + { + "epoch": 61.56, + "grad_norm": 4.8334808349609375, + "learning_rate": 3.843609022556391e-06, + "loss": 0.2018, + "step": 40940 + }, + { + "epoch": 61.58, + "grad_norm": 6.514235496520996, + "learning_rate": 3.842105263157895e-06, + "loss": 0.2102, + "step": 40950 + }, + { + "epoch": 61.59, + "grad_norm": 4.935739994049072, + "learning_rate": 3.840601503759398e-06, + "loss": 0.1594, + "step": 40960 + }, + { + "epoch": 61.61, + "grad_norm": 14.143031120300293, + "learning_rate": 3.839097744360903e-06, + "loss": 0.2094, + "step": 40970 + }, + { + "epoch": 61.62, + "grad_norm": 4.090781211853027, + "learning_rate": 3.837593984962406e-06, + "loss": 0.2485, + "step": 40980 + }, + { + "epoch": 61.64, + "grad_norm": 9.650825500488281, + "learning_rate": 3.83609022556391e-06, + "loss": 0.1868, + "step": 40990 + }, + { + "epoch": 61.65, + "grad_norm": 3.9108171463012695, + "learning_rate": 3.834586466165414e-06, + "loss": 0.2519, + "step": 41000 + }, + { + "epoch": 61.67, + "grad_norm": 3.324169874191284, + "learning_rate": 3.8330827067669175e-06, + "loss": 0.1668, + "step": 41010 + }, + { + "epoch": 61.68, + "grad_norm": 6.214301586151123, + "learning_rate": 3.831578947368421e-06, + "loss": 0.2764, + "step": 41020 + }, + { + "epoch": 61.7, + "grad_norm": 4.068943500518799, + "learning_rate": 3.830075187969925e-06, + "loss": 0.2038, + "step": 41030 + }, + { + "epoch": 61.71, + "grad_norm": 3.677924633026123, + "learning_rate": 3.828571428571429e-06, + "loss": 0.1743, + "step": 41040 + }, + { + "epoch": 61.73, + "grad_norm": 4.874640464782715, + "learning_rate": 3.8270676691729325e-06, + "loss": 0.2046, + "step": 41050 + }, + { + "epoch": 61.74, + "grad_norm": 4.538558006286621, + "learning_rate": 3.825563909774436e-06, + "loss": 0.2542, + "step": 41060 + }, + { + "epoch": 61.76, + "grad_norm": 7.119689464569092, + "learning_rate": 3.82406015037594e-06, + "loss": 0.1539, + "step": 41070 + }, + { + "epoch": 61.77, + "grad_norm": 6.170871257781982, + "learning_rate": 3.822556390977444e-06, + "loss": 0.3122, + "step": 41080 + }, + { + "epoch": 61.79, + "grad_norm": 4.091243267059326, + "learning_rate": 3.821052631578947e-06, + "loss": 0.2069, + "step": 41090 + }, + { + "epoch": 61.8, + "grad_norm": 5.074096202850342, + "learning_rate": 3.819548872180452e-06, + "loss": 0.2046, + "step": 41100 + }, + { + "epoch": 61.82, + "grad_norm": 6.762053966522217, + "learning_rate": 3.818045112781955e-06, + "loss": 0.1954, + "step": 41110 + }, + { + "epoch": 61.83, + "grad_norm": 3.479602098464966, + "learning_rate": 3.816541353383459e-06, + "loss": 0.2208, + "step": 41120 + }, + { + "epoch": 61.85, + "grad_norm": 4.815489768981934, + "learning_rate": 3.815037593984962e-06, + "loss": 0.191, + "step": 41130 + }, + { + "epoch": 61.86, + "grad_norm": 3.5152673721313477, + "learning_rate": 3.8135338345864663e-06, + "loss": 0.245, + "step": 41140 + }, + { + "epoch": 61.88, + "grad_norm": 2.4947726726531982, + "learning_rate": 3.81203007518797e-06, + "loss": 0.2218, + "step": 41150 + }, + { + "epoch": 61.89, + "grad_norm": 3.991682291030884, + "learning_rate": 3.810526315789474e-06, + "loss": 0.166, + "step": 41160 + }, + { + "epoch": 61.91, + "grad_norm": 5.845560550689697, + "learning_rate": 3.8090225563909777e-06, + "loss": 0.1897, + "step": 41170 + }, + { + "epoch": 61.92, + "grad_norm": 4.220813274383545, + "learning_rate": 3.8075187969924816e-06, + "loss": 0.1668, + "step": 41180 + }, + { + "epoch": 61.94, + "grad_norm": 6.980546951293945, + "learning_rate": 3.806015037593985e-06, + "loss": 0.185, + "step": 41190 + }, + { + "epoch": 61.95, + "grad_norm": 4.129945278167725, + "learning_rate": 3.804511278195489e-06, + "loss": 0.2469, + "step": 41200 + }, + { + "epoch": 61.97, + "grad_norm": 2.966998815536499, + "learning_rate": 3.803007518796993e-06, + "loss": 0.2203, + "step": 41210 + }, + { + "epoch": 61.98, + "grad_norm": 4.130844593048096, + "learning_rate": 3.8015037593984965e-06, + "loss": 0.2189, + "step": 41220 + }, + { + "epoch": 62.0, + "grad_norm": 19.200584411621094, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.1696, + "step": 41230 + }, + { + "epoch": 62.0, + "eval_accuracy": 0.9315, + "eval_loss": 0.30856576561927795, + "eval_runtime": 84.694, + "eval_samples_per_second": 118.072, + "eval_steps_per_second": 0.472, + "step": 41230 + }, + { + "epoch": 62.02, + "grad_norm": 3.957939386367798, + "learning_rate": 3.798496240601504e-06, + "loss": 0.1619, + "step": 41240 + }, + { + "epoch": 62.03, + "grad_norm": 5.774256229400635, + "learning_rate": 3.796992481203008e-06, + "loss": 0.1798, + "step": 41250 + }, + { + "epoch": 62.05, + "grad_norm": 4.3362274169921875, + "learning_rate": 3.795488721804512e-06, + "loss": 0.1837, + "step": 41260 + }, + { + "epoch": 62.06, + "grad_norm": 5.490597248077393, + "learning_rate": 3.7939849624060154e-06, + "loss": 0.168, + "step": 41270 + }, + { + "epoch": 62.08, + "grad_norm": 5.736985683441162, + "learning_rate": 3.7924812030075193e-06, + "loss": 0.2156, + "step": 41280 + }, + { + "epoch": 62.09, + "grad_norm": 5.470156192779541, + "learning_rate": 3.790977443609023e-06, + "loss": 0.1872, + "step": 41290 + }, + { + "epoch": 62.11, + "grad_norm": 6.704098701477051, + "learning_rate": 3.789473684210527e-06, + "loss": 0.2517, + "step": 41300 + }, + { + "epoch": 62.12, + "grad_norm": 3.4402177333831787, + "learning_rate": 3.7879699248120303e-06, + "loss": 0.1928, + "step": 41310 + }, + { + "epoch": 62.14, + "grad_norm": 4.056411266326904, + "learning_rate": 3.7864661654135343e-06, + "loss": 0.2138, + "step": 41320 + }, + { + "epoch": 62.15, + "grad_norm": 6.119357109069824, + "learning_rate": 3.784962406015038e-06, + "loss": 0.2381, + "step": 41330 + }, + { + "epoch": 62.17, + "grad_norm": 6.0787787437438965, + "learning_rate": 3.7834586466165417e-06, + "loss": 0.1763, + "step": 41340 + }, + { + "epoch": 62.18, + "grad_norm": 5.722217082977295, + "learning_rate": 3.7819548872180457e-06, + "loss": 0.1505, + "step": 41350 + }, + { + "epoch": 62.2, + "grad_norm": 1.9974137544631958, + "learning_rate": 3.780451127819549e-06, + "loss": 0.1912, + "step": 41360 + }, + { + "epoch": 62.21, + "grad_norm": 3.360246181488037, + "learning_rate": 3.778947368421053e-06, + "loss": 0.2063, + "step": 41370 + }, + { + "epoch": 62.23, + "grad_norm": 2.2372615337371826, + "learning_rate": 3.777443609022557e-06, + "loss": 0.1898, + "step": 41380 + }, + { + "epoch": 62.24, + "grad_norm": 7.954653263092041, + "learning_rate": 3.7759398496240606e-06, + "loss": 0.1837, + "step": 41390 + }, + { + "epoch": 62.26, + "grad_norm": 4.030917644500732, + "learning_rate": 3.7744360902255645e-06, + "loss": 0.206, + "step": 41400 + }, + { + "epoch": 62.27, + "grad_norm": 6.891762733459473, + "learning_rate": 3.772932330827068e-06, + "loss": 0.2181, + "step": 41410 + }, + { + "epoch": 62.29, + "grad_norm": 6.047714710235596, + "learning_rate": 3.771428571428572e-06, + "loss": 0.2758, + "step": 41420 + }, + { + "epoch": 62.3, + "grad_norm": 3.0663442611694336, + "learning_rate": 3.769924812030076e-06, + "loss": 0.1705, + "step": 41430 + }, + { + "epoch": 62.32, + "grad_norm": 5.566664218902588, + "learning_rate": 3.768421052631579e-06, + "loss": 0.1676, + "step": 41440 + }, + { + "epoch": 62.33, + "grad_norm": 3.831843614578247, + "learning_rate": 3.7669172932330825e-06, + "loss": 0.1544, + "step": 41450 + }, + { + "epoch": 62.35, + "grad_norm": 3.094122886657715, + "learning_rate": 3.7654135338345865e-06, + "loss": 0.2025, + "step": 41460 + }, + { + "epoch": 62.36, + "grad_norm": 10.858261108398438, + "learning_rate": 3.7639097744360904e-06, + "loss": 0.2037, + "step": 41470 + }, + { + "epoch": 62.38, + "grad_norm": 3.284749746322632, + "learning_rate": 3.762406015037594e-06, + "loss": 0.177, + "step": 41480 + }, + { + "epoch": 62.39, + "grad_norm": 5.909909248352051, + "learning_rate": 3.760902255639098e-06, + "loss": 0.1802, + "step": 41490 + }, + { + "epoch": 62.41, + "grad_norm": 2.958390474319458, + "learning_rate": 3.7593984962406014e-06, + "loss": 0.1886, + "step": 41500 + }, + { + "epoch": 62.42, + "grad_norm": 5.828741550445557, + "learning_rate": 3.7578947368421053e-06, + "loss": 0.2374, + "step": 41510 + }, + { + "epoch": 62.44, + "grad_norm": 6.119723320007324, + "learning_rate": 3.7563909774436093e-06, + "loss": 0.1787, + "step": 41520 + }, + { + "epoch": 62.45, + "grad_norm": 5.897652626037598, + "learning_rate": 3.754887218045113e-06, + "loss": 0.1704, + "step": 41530 + }, + { + "epoch": 62.47, + "grad_norm": 3.6232078075408936, + "learning_rate": 3.7533834586466168e-06, + "loss": 0.2112, + "step": 41540 + }, + { + "epoch": 62.48, + "grad_norm": 4.357728958129883, + "learning_rate": 3.7518796992481203e-06, + "loss": 0.1745, + "step": 41550 + }, + { + "epoch": 62.5, + "grad_norm": 3.9227254390716553, + "learning_rate": 3.7503759398496242e-06, + "loss": 0.1279, + "step": 41560 + }, + { + "epoch": 62.51, + "grad_norm": 4.716411113739014, + "learning_rate": 3.748872180451128e-06, + "loss": 0.1731, + "step": 41570 + }, + { + "epoch": 62.53, + "grad_norm": 5.446840763092041, + "learning_rate": 3.7473684210526317e-06, + "loss": 0.173, + "step": 41580 + }, + { + "epoch": 62.54, + "grad_norm": 9.927755355834961, + "learning_rate": 3.7458646616541356e-06, + "loss": 0.2158, + "step": 41590 + }, + { + "epoch": 62.56, + "grad_norm": 4.404961585998535, + "learning_rate": 3.744360902255639e-06, + "loss": 0.2027, + "step": 41600 + }, + { + "epoch": 62.57, + "grad_norm": 8.573219299316406, + "learning_rate": 3.742857142857143e-06, + "loss": 0.2426, + "step": 41610 + }, + { + "epoch": 62.59, + "grad_norm": 5.887312889099121, + "learning_rate": 3.741353383458647e-06, + "loss": 0.249, + "step": 41620 + }, + { + "epoch": 62.6, + "grad_norm": 8.018643379211426, + "learning_rate": 3.7398496240601505e-06, + "loss": 0.1449, + "step": 41630 + }, + { + "epoch": 62.62, + "grad_norm": 5.178997993469238, + "learning_rate": 3.7383458646616545e-06, + "loss": 0.2288, + "step": 41640 + }, + { + "epoch": 62.63, + "grad_norm": 4.393047332763672, + "learning_rate": 3.736842105263158e-06, + "loss": 0.2121, + "step": 41650 + }, + { + "epoch": 62.65, + "grad_norm": 4.626613616943359, + "learning_rate": 3.735338345864662e-06, + "loss": 0.1793, + "step": 41660 + }, + { + "epoch": 62.66, + "grad_norm": 2.9352829456329346, + "learning_rate": 3.733834586466166e-06, + "loss": 0.1641, + "step": 41670 + }, + { + "epoch": 62.68, + "grad_norm": 7.237253665924072, + "learning_rate": 3.7323308270676694e-06, + "loss": 0.1669, + "step": 41680 + }, + { + "epoch": 62.69, + "grad_norm": 7.4211835861206055, + "learning_rate": 3.7308270676691734e-06, + "loss": 0.1607, + "step": 41690 + }, + { + "epoch": 62.71, + "grad_norm": 5.203456401824951, + "learning_rate": 3.729323308270677e-06, + "loss": 0.1946, + "step": 41700 + }, + { + "epoch": 62.72, + "grad_norm": 6.299890995025635, + "learning_rate": 3.727819548872181e-06, + "loss": 0.17, + "step": 41710 + }, + { + "epoch": 62.74, + "grad_norm": 6.215569972991943, + "learning_rate": 3.7263157894736848e-06, + "loss": 0.1552, + "step": 41720 + }, + { + "epoch": 62.75, + "grad_norm": 6.16441011428833, + "learning_rate": 3.7248120300751883e-06, + "loss": 0.1511, + "step": 41730 + }, + { + "epoch": 62.77, + "grad_norm": 2.391936779022217, + "learning_rate": 3.7233082706766922e-06, + "loss": 0.2533, + "step": 41740 + }, + { + "epoch": 62.78, + "grad_norm": 5.521732807159424, + "learning_rate": 3.7218045112781957e-06, + "loss": 0.2338, + "step": 41750 + }, + { + "epoch": 62.8, + "grad_norm": 2.640953779220581, + "learning_rate": 3.7203007518796997e-06, + "loss": 0.2111, + "step": 41760 + }, + { + "epoch": 62.81, + "grad_norm": 4.677870273590088, + "learning_rate": 3.718796992481203e-06, + "loss": 0.2166, + "step": 41770 + }, + { + "epoch": 62.83, + "grad_norm": 4.7918291091918945, + "learning_rate": 3.717293233082707e-06, + "loss": 0.2068, + "step": 41780 + }, + { + "epoch": 62.84, + "grad_norm": 4.125340938568115, + "learning_rate": 3.715789473684211e-06, + "loss": 0.1522, + "step": 41790 + }, + { + "epoch": 62.86, + "grad_norm": 5.1403117179870605, + "learning_rate": 3.7142857142857146e-06, + "loss": 0.2198, + "step": 41800 + }, + { + "epoch": 62.87, + "grad_norm": 4.696916580200195, + "learning_rate": 3.7127819548872185e-06, + "loss": 0.1715, + "step": 41810 + }, + { + "epoch": 62.89, + "grad_norm": 3.3158819675445557, + "learning_rate": 3.711278195488722e-06, + "loss": 0.172, + "step": 41820 + }, + { + "epoch": 62.9, + "grad_norm": 6.180699825286865, + "learning_rate": 3.709774436090226e-06, + "loss": 0.2321, + "step": 41830 + }, + { + "epoch": 62.92, + "grad_norm": 5.126304626464844, + "learning_rate": 3.70827067669173e-06, + "loss": 0.1684, + "step": 41840 + }, + { + "epoch": 62.93, + "grad_norm": 7.246421813964844, + "learning_rate": 3.7067669172932335e-06, + "loss": 0.1933, + "step": 41850 + }, + { + "epoch": 62.95, + "grad_norm": 4.105726718902588, + "learning_rate": 3.7052631578947374e-06, + "loss": 0.2189, + "step": 41860 + }, + { + "epoch": 62.96, + "grad_norm": 3.8485569953918457, + "learning_rate": 3.703759398496241e-06, + "loss": 0.1698, + "step": 41870 + }, + { + "epoch": 62.98, + "grad_norm": 3.669081211090088, + "learning_rate": 3.702255639097745e-06, + "loss": 0.1472, + "step": 41880 + }, + { + "epoch": 62.99, + "grad_norm": 8.504827499389648, + "learning_rate": 3.700751879699249e-06, + "loss": 0.2194, + "step": 41890 + }, + { + "epoch": 63.0, + "eval_accuracy": 0.9313, + "eval_loss": 0.31697988510131836, + "eval_runtime": 84.413, + "eval_samples_per_second": 118.465, + "eval_steps_per_second": 0.474, + "step": 41895 + }, + { + "epoch": 63.01, + "grad_norm": 3.0113203525543213, + "learning_rate": 3.6992481203007523e-06, + "loss": 0.3075, + "step": 41900 + }, + { + "epoch": 63.02, + "grad_norm": 5.44133186340332, + "learning_rate": 3.6977443609022563e-06, + "loss": 0.2535, + "step": 41910 + }, + { + "epoch": 63.04, + "grad_norm": 4.054553985595703, + "learning_rate": 3.69624060150376e-06, + "loss": 0.1859, + "step": 41920 + }, + { + "epoch": 63.05, + "grad_norm": 2.3150360584259033, + "learning_rate": 3.6947368421052637e-06, + "loss": 0.1969, + "step": 41930 + }, + { + "epoch": 63.07, + "grad_norm": 5.139898777008057, + "learning_rate": 3.693233082706767e-06, + "loss": 0.1992, + "step": 41940 + }, + { + "epoch": 63.08, + "grad_norm": 3.828563928604126, + "learning_rate": 3.6917293233082708e-06, + "loss": 0.2596, + "step": 41950 + }, + { + "epoch": 63.1, + "grad_norm": 5.204103469848633, + "learning_rate": 3.6902255639097743e-06, + "loss": 0.1468, + "step": 41960 + }, + { + "epoch": 63.11, + "grad_norm": 2.3387629985809326, + "learning_rate": 3.6887218045112782e-06, + "loss": 0.1782, + "step": 41970 + }, + { + "epoch": 63.13, + "grad_norm": 5.02309513092041, + "learning_rate": 3.687218045112782e-06, + "loss": 0.2029, + "step": 41980 + }, + { + "epoch": 63.14, + "grad_norm": 6.977344512939453, + "learning_rate": 3.6857142857142857e-06, + "loss": 0.1883, + "step": 41990 + }, + { + "epoch": 63.16, + "grad_norm": 6.206759929656982, + "learning_rate": 3.6842105263157896e-06, + "loss": 0.2007, + "step": 42000 + }, + { + "epoch": 63.17, + "grad_norm": 7.947546482086182, + "learning_rate": 3.682706766917293e-06, + "loss": 0.2051, + "step": 42010 + }, + { + "epoch": 63.19, + "grad_norm": 5.88115119934082, + "learning_rate": 3.681203007518797e-06, + "loss": 0.168, + "step": 42020 + }, + { + "epoch": 63.2, + "grad_norm": 5.9370198249816895, + "learning_rate": 3.679699248120301e-06, + "loss": 0.1566, + "step": 42030 + }, + { + "epoch": 63.22, + "grad_norm": 4.835606098175049, + "learning_rate": 3.6781954887218046e-06, + "loss": 0.1499, + "step": 42040 + }, + { + "epoch": 63.23, + "grad_norm": 4.858092308044434, + "learning_rate": 3.6766917293233085e-06, + "loss": 0.1637, + "step": 42050 + }, + { + "epoch": 63.25, + "grad_norm": 3.878875970840454, + "learning_rate": 3.675187969924812e-06, + "loss": 0.2207, + "step": 42060 + }, + { + "epoch": 63.26, + "grad_norm": 4.815737724304199, + "learning_rate": 3.673684210526316e-06, + "loss": 0.1478, + "step": 42070 + }, + { + "epoch": 63.28, + "grad_norm": 5.654510498046875, + "learning_rate": 3.67218045112782e-06, + "loss": 0.1988, + "step": 42080 + }, + { + "epoch": 63.29, + "grad_norm": 7.474091053009033, + "learning_rate": 3.6706766917293234e-06, + "loss": 0.1832, + "step": 42090 + }, + { + "epoch": 63.31, + "grad_norm": 8.080961227416992, + "learning_rate": 3.6691729323308274e-06, + "loss": 0.1956, + "step": 42100 + }, + { + "epoch": 63.32, + "grad_norm": 5.121028423309326, + "learning_rate": 3.667669172932331e-06, + "loss": 0.1844, + "step": 42110 + }, + { + "epoch": 63.34, + "grad_norm": 5.617638111114502, + "learning_rate": 3.666165413533835e-06, + "loss": 0.2037, + "step": 42120 + }, + { + "epoch": 63.35, + "grad_norm": 5.1372270584106445, + "learning_rate": 3.6646616541353388e-06, + "loss": 0.171, + "step": 42130 + }, + { + "epoch": 63.37, + "grad_norm": 5.457404136657715, + "learning_rate": 3.6631578947368423e-06, + "loss": 0.1537, + "step": 42140 + }, + { + "epoch": 63.38, + "grad_norm": 9.514487266540527, + "learning_rate": 3.6616541353383462e-06, + "loss": 0.2131, + "step": 42150 + }, + { + "epoch": 63.4, + "grad_norm": 6.3367414474487305, + "learning_rate": 3.6601503759398498e-06, + "loss": 0.1886, + "step": 42160 + }, + { + "epoch": 63.41, + "grad_norm": 7.155886650085449, + "learning_rate": 3.6586466165413537e-06, + "loss": 0.2358, + "step": 42170 + }, + { + "epoch": 63.43, + "grad_norm": 4.044269561767578, + "learning_rate": 3.6571428571428576e-06, + "loss": 0.271, + "step": 42180 + }, + { + "epoch": 63.44, + "grad_norm": 7.966317653656006, + "learning_rate": 3.655639097744361e-06, + "loss": 0.182, + "step": 42190 + }, + { + "epoch": 63.46, + "grad_norm": 8.241473197937012, + "learning_rate": 3.654135338345865e-06, + "loss": 0.2768, + "step": 42200 + }, + { + "epoch": 63.47, + "grad_norm": 5.769531726837158, + "learning_rate": 3.6526315789473686e-06, + "loss": 0.2628, + "step": 42210 + }, + { + "epoch": 63.49, + "grad_norm": 4.371389865875244, + "learning_rate": 3.6511278195488726e-06, + "loss": 0.2603, + "step": 42220 + }, + { + "epoch": 63.5, + "grad_norm": 3.9458630084991455, + "learning_rate": 3.649624060150376e-06, + "loss": 0.1757, + "step": 42230 + }, + { + "epoch": 63.52, + "grad_norm": 8.014843940734863, + "learning_rate": 3.64812030075188e-06, + "loss": 0.1979, + "step": 42240 + }, + { + "epoch": 63.53, + "grad_norm": 4.720882892608643, + "learning_rate": 3.646616541353384e-06, + "loss": 0.2039, + "step": 42250 + }, + { + "epoch": 63.55, + "grad_norm": 13.893559455871582, + "learning_rate": 3.6451127819548875e-06, + "loss": 0.2065, + "step": 42260 + }, + { + "epoch": 63.56, + "grad_norm": 5.4362406730651855, + "learning_rate": 3.6436090225563914e-06, + "loss": 0.169, + "step": 42270 + }, + { + "epoch": 63.58, + "grad_norm": 4.523813247680664, + "learning_rate": 3.642105263157895e-06, + "loss": 0.2152, + "step": 42280 + }, + { + "epoch": 63.59, + "grad_norm": 2.5966227054595947, + "learning_rate": 3.640601503759399e-06, + "loss": 0.1696, + "step": 42290 + }, + { + "epoch": 63.61, + "grad_norm": 5.089998245239258, + "learning_rate": 3.639097744360903e-06, + "loss": 0.1484, + "step": 42300 + }, + { + "epoch": 63.62, + "grad_norm": 5.716888427734375, + "learning_rate": 3.6375939849624064e-06, + "loss": 0.1763, + "step": 42310 + }, + { + "epoch": 63.64, + "grad_norm": 5.487515926361084, + "learning_rate": 3.6360902255639103e-06, + "loss": 0.1831, + "step": 42320 + }, + { + "epoch": 63.65, + "grad_norm": 3.316849708557129, + "learning_rate": 3.634586466165414e-06, + "loss": 0.2312, + "step": 42330 + }, + { + "epoch": 63.67, + "grad_norm": 5.662327766418457, + "learning_rate": 3.6330827067669178e-06, + "loss": 0.2383, + "step": 42340 + }, + { + "epoch": 63.68, + "grad_norm": 6.571043968200684, + "learning_rate": 3.6315789473684217e-06, + "loss": 0.2039, + "step": 42350 + }, + { + "epoch": 63.7, + "grad_norm": 5.3168044090271, + "learning_rate": 3.6300751879699252e-06, + "loss": 0.1692, + "step": 42360 + }, + { + "epoch": 63.71, + "grad_norm": 3.067420482635498, + "learning_rate": 3.628571428571429e-06, + "loss": 0.1852, + "step": 42370 + }, + { + "epoch": 63.73, + "grad_norm": 8.511592864990234, + "learning_rate": 3.6270676691729327e-06, + "loss": 0.1966, + "step": 42380 + }, + { + "epoch": 63.74, + "grad_norm": 6.0220232009887695, + "learning_rate": 3.6255639097744366e-06, + "loss": 0.1325, + "step": 42390 + }, + { + "epoch": 63.76, + "grad_norm": 5.685608386993408, + "learning_rate": 3.6240601503759406e-06, + "loss": 0.228, + "step": 42400 + }, + { + "epoch": 63.77, + "grad_norm": 6.7746052742004395, + "learning_rate": 3.622556390977444e-06, + "loss": 0.2469, + "step": 42410 + }, + { + "epoch": 63.79, + "grad_norm": 5.100920677185059, + "learning_rate": 3.621052631578948e-06, + "loss": 0.1907, + "step": 42420 + }, + { + "epoch": 63.8, + "grad_norm": 3.674767255783081, + "learning_rate": 3.6195488721804515e-06, + "loss": 0.1922, + "step": 42430 + }, + { + "epoch": 63.82, + "grad_norm": 7.852967739105225, + "learning_rate": 3.618045112781955e-06, + "loss": 0.1973, + "step": 42440 + }, + { + "epoch": 63.83, + "grad_norm": 3.643383741378784, + "learning_rate": 3.6165413533834586e-06, + "loss": 0.1633, + "step": 42450 + }, + { + "epoch": 63.85, + "grad_norm": 5.850439071655273, + "learning_rate": 3.6150375939849625e-06, + "loss": 0.1915, + "step": 42460 + }, + { + "epoch": 63.86, + "grad_norm": 4.816995620727539, + "learning_rate": 3.613533834586466e-06, + "loss": 0.2416, + "step": 42470 + }, + { + "epoch": 63.88, + "grad_norm": 4.024276256561279, + "learning_rate": 3.61203007518797e-06, + "loss": 0.207, + "step": 42480 + }, + { + "epoch": 63.89, + "grad_norm": 4.65853214263916, + "learning_rate": 3.610526315789474e-06, + "loss": 0.1428, + "step": 42490 + }, + { + "epoch": 63.91, + "grad_norm": 1.6227343082427979, + "learning_rate": 3.6090225563909775e-06, + "loss": 0.2439, + "step": 42500 + }, + { + "epoch": 63.92, + "grad_norm": 5.130610466003418, + "learning_rate": 3.6075187969924814e-06, + "loss": 0.2491, + "step": 42510 + }, + { + "epoch": 63.94, + "grad_norm": 7.856945991516113, + "learning_rate": 3.606015037593985e-06, + "loss": 0.2532, + "step": 42520 + }, + { + "epoch": 63.95, + "grad_norm": 3.6116113662719727, + "learning_rate": 3.604511278195489e-06, + "loss": 0.258, + "step": 42530 + }, + { + "epoch": 63.97, + "grad_norm": 6.224160671234131, + "learning_rate": 3.603007518796993e-06, + "loss": 0.2729, + "step": 42540 + }, + { + "epoch": 63.98, + "grad_norm": 5.5140156745910645, + "learning_rate": 3.6015037593984963e-06, + "loss": 0.1669, + "step": 42550 + }, + { + "epoch": 64.0, + "grad_norm": 13.927995681762695, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.2297, + "step": 42560 + }, + { + "epoch": 64.0, + "eval_accuracy": 0.9293, + "eval_loss": 0.3231370151042938, + "eval_runtime": 85.3183, + "eval_samples_per_second": 117.208, + "eval_steps_per_second": 0.469, + "step": 42560 + }, + { + "epoch": 64.02, + "grad_norm": 8.015310287475586, + "learning_rate": 3.5984962406015038e-06, + "loss": 0.2013, + "step": 42570 + }, + { + "epoch": 64.03, + "grad_norm": 7.057667255401611, + "learning_rate": 3.5969924812030077e-06, + "loss": 0.2238, + "step": 42580 + }, + { + "epoch": 64.05, + "grad_norm": 4.543213367462158, + "learning_rate": 3.5954887218045117e-06, + "loss": 0.1907, + "step": 42590 + }, + { + "epoch": 64.06, + "grad_norm": 2.699389934539795, + "learning_rate": 3.593984962406015e-06, + "loss": 0.1875, + "step": 42600 + }, + { + "epoch": 64.08, + "grad_norm": 10.956419944763184, + "learning_rate": 3.592481203007519e-06, + "loss": 0.2209, + "step": 42610 + }, + { + "epoch": 64.09, + "grad_norm": 3.574812173843384, + "learning_rate": 3.5909774436090226e-06, + "loss": 0.2168, + "step": 42620 + }, + { + "epoch": 64.11, + "grad_norm": 4.676037311553955, + "learning_rate": 3.5894736842105266e-06, + "loss": 0.1879, + "step": 42630 + }, + { + "epoch": 64.12, + "grad_norm": 3.858914375305176, + "learning_rate": 3.5879699248120305e-06, + "loss": 0.2086, + "step": 42640 + }, + { + "epoch": 64.14, + "grad_norm": 4.243492126464844, + "learning_rate": 3.586466165413534e-06, + "loss": 0.2135, + "step": 42650 + }, + { + "epoch": 64.15, + "grad_norm": 5.530261039733887, + "learning_rate": 3.584962406015038e-06, + "loss": 0.2014, + "step": 42660 + }, + { + "epoch": 64.17, + "grad_norm": 5.095816135406494, + "learning_rate": 3.5834586466165415e-06, + "loss": 0.1919, + "step": 42670 + }, + { + "epoch": 64.18, + "grad_norm": 2.273388624191284, + "learning_rate": 3.5819548872180455e-06, + "loss": 0.15, + "step": 42680 + }, + { + "epoch": 64.2, + "grad_norm": 5.889043807983398, + "learning_rate": 3.580451127819549e-06, + "loss": 0.2731, + "step": 42690 + }, + { + "epoch": 64.21, + "grad_norm": 3.004957437515259, + "learning_rate": 3.578947368421053e-06, + "loss": 0.2121, + "step": 42700 + }, + { + "epoch": 64.23, + "grad_norm": 3.732741594314575, + "learning_rate": 3.577443609022557e-06, + "loss": 0.1757, + "step": 42710 + }, + { + "epoch": 64.24, + "grad_norm": 4.3243584632873535, + "learning_rate": 3.5759398496240604e-06, + "loss": 0.2344, + "step": 42720 + }, + { + "epoch": 64.26, + "grad_norm": 4.092397689819336, + "learning_rate": 3.5744360902255643e-06, + "loss": 0.1653, + "step": 42730 + }, + { + "epoch": 64.27, + "grad_norm": 5.2698187828063965, + "learning_rate": 3.572932330827068e-06, + "loss": 0.19, + "step": 42740 + }, + { + "epoch": 64.29, + "grad_norm": 7.864371299743652, + "learning_rate": 3.5714285714285718e-06, + "loss": 0.1977, + "step": 42750 + }, + { + "epoch": 64.3, + "grad_norm": 2.6256632804870605, + "learning_rate": 3.5699248120300757e-06, + "loss": 0.2334, + "step": 42760 + }, + { + "epoch": 64.32, + "grad_norm": 2.710421085357666, + "learning_rate": 3.5684210526315792e-06, + "loss": 0.1919, + "step": 42770 + }, + { + "epoch": 64.33, + "grad_norm": 5.321908950805664, + "learning_rate": 3.566917293233083e-06, + "loss": 0.1552, + "step": 42780 + }, + { + "epoch": 64.35, + "grad_norm": 6.268770694732666, + "learning_rate": 3.5654135338345867e-06, + "loss": 0.1363, + "step": 42790 + }, + { + "epoch": 64.36, + "grad_norm": 4.892660617828369, + "learning_rate": 3.5639097744360906e-06, + "loss": 0.1603, + "step": 42800 + }, + { + "epoch": 64.38, + "grad_norm": 5.775477409362793, + "learning_rate": 3.5624060150375946e-06, + "loss": 0.2558, + "step": 42810 + }, + { + "epoch": 64.39, + "grad_norm": 7.529040813446045, + "learning_rate": 3.560902255639098e-06, + "loss": 0.2444, + "step": 42820 + }, + { + "epoch": 64.41, + "grad_norm": 2.752485513687134, + "learning_rate": 3.559398496240602e-06, + "loss": 0.2147, + "step": 42830 + }, + { + "epoch": 64.42, + "grad_norm": 7.767008304595947, + "learning_rate": 3.5578947368421056e-06, + "loss": 0.2318, + "step": 42840 + }, + { + "epoch": 64.44, + "grad_norm": 6.706613063812256, + "learning_rate": 3.5563909774436095e-06, + "loss": 0.192, + "step": 42850 + }, + { + "epoch": 64.45, + "grad_norm": 3.0275185108184814, + "learning_rate": 3.5548872180451135e-06, + "loss": 0.1605, + "step": 42860 + }, + { + "epoch": 64.47, + "grad_norm": 4.424815654754639, + "learning_rate": 3.553383458646617e-06, + "loss": 0.1919, + "step": 42870 + }, + { + "epoch": 64.48, + "grad_norm": 4.253983020782471, + "learning_rate": 3.551879699248121e-06, + "loss": 0.2079, + "step": 42880 + }, + { + "epoch": 64.5, + "grad_norm": 4.817580699920654, + "learning_rate": 3.5503759398496244e-06, + "loss": 0.1557, + "step": 42890 + }, + { + "epoch": 64.51, + "grad_norm": 10.810401916503906, + "learning_rate": 3.5488721804511284e-06, + "loss": 0.1925, + "step": 42900 + }, + { + "epoch": 64.53, + "grad_norm": 4.242228984832764, + "learning_rate": 3.5473684210526323e-06, + "loss": 0.1776, + "step": 42910 + }, + { + "epoch": 64.54, + "grad_norm": 3.9990408420562744, + "learning_rate": 3.545864661654136e-06, + "loss": 0.1627, + "step": 42920 + }, + { + "epoch": 64.56, + "grad_norm": 5.8364105224609375, + "learning_rate": 3.544360902255639e-06, + "loss": 0.1859, + "step": 42930 + }, + { + "epoch": 64.57, + "grad_norm": 8.85914134979248, + "learning_rate": 3.542857142857143e-06, + "loss": 0.2357, + "step": 42940 + }, + { + "epoch": 64.59, + "grad_norm": 6.398134708404541, + "learning_rate": 3.541353383458647e-06, + "loss": 0.2228, + "step": 42950 + }, + { + "epoch": 64.6, + "grad_norm": 1.9391348361968994, + "learning_rate": 3.5398496240601503e-06, + "loss": 0.1869, + "step": 42960 + }, + { + "epoch": 64.62, + "grad_norm": 4.14783239364624, + "learning_rate": 3.5383458646616543e-06, + "loss": 0.2012, + "step": 42970 + }, + { + "epoch": 64.63, + "grad_norm": 2.7045633792877197, + "learning_rate": 3.536842105263158e-06, + "loss": 0.1754, + "step": 42980 + }, + { + "epoch": 64.65, + "grad_norm": 4.491335391998291, + "learning_rate": 3.5353383458646617e-06, + "loss": 0.2195, + "step": 42990 + }, + { + "epoch": 64.66, + "grad_norm": 5.1978678703308105, + "learning_rate": 3.5338345864661657e-06, + "loss": 0.1891, + "step": 43000 + }, + { + "epoch": 64.68, + "grad_norm": 1.4553602933883667, + "learning_rate": 3.532330827067669e-06, + "loss": 0.1871, + "step": 43010 + }, + { + "epoch": 64.69, + "grad_norm": 3.4015984535217285, + "learning_rate": 3.530827067669173e-06, + "loss": 0.2434, + "step": 43020 + }, + { + "epoch": 64.71, + "grad_norm": 6.651217937469482, + "learning_rate": 3.5293233082706767e-06, + "loss": 0.1889, + "step": 43030 + }, + { + "epoch": 64.72, + "grad_norm": 4.7721848487854, + "learning_rate": 3.5278195488721806e-06, + "loss": 0.222, + "step": 43040 + }, + { + "epoch": 64.74, + "grad_norm": 2.7029495239257812, + "learning_rate": 3.5263157894736846e-06, + "loss": 0.1711, + "step": 43050 + }, + { + "epoch": 64.75, + "grad_norm": 11.2900972366333, + "learning_rate": 3.524812030075188e-06, + "loss": 0.19, + "step": 43060 + }, + { + "epoch": 64.77, + "grad_norm": 2.7460989952087402, + "learning_rate": 3.523308270676692e-06, + "loss": 0.213, + "step": 43070 + }, + { + "epoch": 64.78, + "grad_norm": 3.8076796531677246, + "learning_rate": 3.5218045112781955e-06, + "loss": 0.1665, + "step": 43080 + }, + { + "epoch": 64.8, + "grad_norm": 4.072009086608887, + "learning_rate": 3.5203007518796995e-06, + "loss": 0.1745, + "step": 43090 + }, + { + "epoch": 64.81, + "grad_norm": 2.836899995803833, + "learning_rate": 3.5187969924812034e-06, + "loss": 0.1815, + "step": 43100 + }, + { + "epoch": 64.83, + "grad_norm": 6.04483699798584, + "learning_rate": 3.517293233082707e-06, + "loss": 0.1649, + "step": 43110 + }, + { + "epoch": 64.84, + "grad_norm": 3.882234811782837, + "learning_rate": 3.515789473684211e-06, + "loss": 0.1793, + "step": 43120 + }, + { + "epoch": 64.86, + "grad_norm": 6.106405735015869, + "learning_rate": 3.5142857142857144e-06, + "loss": 0.2961, + "step": 43130 + }, + { + "epoch": 64.87, + "grad_norm": 3.823786973953247, + "learning_rate": 3.5127819548872183e-06, + "loss": 0.1846, + "step": 43140 + }, + { + "epoch": 64.89, + "grad_norm": 4.050282955169678, + "learning_rate": 3.511278195488722e-06, + "loss": 0.2784, + "step": 43150 + }, + { + "epoch": 64.9, + "grad_norm": 4.701742649078369, + "learning_rate": 3.509774436090226e-06, + "loss": 0.1875, + "step": 43160 + }, + { + "epoch": 64.92, + "grad_norm": 6.7218708992004395, + "learning_rate": 3.5082706766917297e-06, + "loss": 0.1865, + "step": 43170 + }, + { + "epoch": 64.93, + "grad_norm": 4.6365132331848145, + "learning_rate": 3.5067669172932333e-06, + "loss": 0.2224, + "step": 43180 + }, + { + "epoch": 64.95, + "grad_norm": 8.348788261413574, + "learning_rate": 3.505263157894737e-06, + "loss": 0.2309, + "step": 43190 + }, + { + "epoch": 64.96, + "grad_norm": 6.804657936096191, + "learning_rate": 3.5037593984962407e-06, + "loss": 0.1802, + "step": 43200 + }, + { + "epoch": 64.98, + "grad_norm": 6.6085405349731445, + "learning_rate": 3.5022556390977447e-06, + "loss": 0.2321, + "step": 43210 + }, + { + "epoch": 64.99, + "grad_norm": 6.796005725860596, + "learning_rate": 3.5007518796992486e-06, + "loss": 0.2108, + "step": 43220 + }, + { + "epoch": 65.0, + "eval_accuracy": 0.9313, + "eval_loss": 0.31611478328704834, + "eval_runtime": 84.6549, + "eval_samples_per_second": 118.127, + "eval_steps_per_second": 0.473, + "step": 43225 + }, + { + "epoch": 65.01, + "grad_norm": 33.686763763427734, + "learning_rate": 3.499248120300752e-06, + "loss": 0.2085, + "step": 43230 + }, + { + "epoch": 65.02, + "grad_norm": 4.142588138580322, + "learning_rate": 3.497744360902256e-06, + "loss": 0.1216, + "step": 43240 + }, + { + "epoch": 65.04, + "grad_norm": 4.3597612380981445, + "learning_rate": 3.4962406015037596e-06, + "loss": 0.202, + "step": 43250 + }, + { + "epoch": 65.05, + "grad_norm": 5.954403877258301, + "learning_rate": 3.4947368421052635e-06, + "loss": 0.1543, + "step": 43260 + }, + { + "epoch": 65.07, + "grad_norm": 4.9531474113464355, + "learning_rate": 3.4932330827067675e-06, + "loss": 0.2382, + "step": 43270 + }, + { + "epoch": 65.08, + "grad_norm": 4.433995723724365, + "learning_rate": 3.491729323308271e-06, + "loss": 0.2213, + "step": 43280 + }, + { + "epoch": 65.1, + "grad_norm": 9.919368743896484, + "learning_rate": 3.490225563909775e-06, + "loss": 0.2256, + "step": 43290 + }, + { + "epoch": 65.11, + "grad_norm": 1.1881593465805054, + "learning_rate": 3.4887218045112785e-06, + "loss": 0.2056, + "step": 43300 + }, + { + "epoch": 65.13, + "grad_norm": 5.632315635681152, + "learning_rate": 3.4872180451127824e-06, + "loss": 0.2989, + "step": 43310 + }, + { + "epoch": 65.14, + "grad_norm": 7.14915132522583, + "learning_rate": 3.4857142857142863e-06, + "loss": 0.2547, + "step": 43320 + }, + { + "epoch": 65.16, + "grad_norm": 3.911741018295288, + "learning_rate": 3.48421052631579e-06, + "loss": 0.2429, + "step": 43330 + }, + { + "epoch": 65.17, + "grad_norm": 2.1838467121124268, + "learning_rate": 3.482706766917294e-06, + "loss": 0.1845, + "step": 43340 + }, + { + "epoch": 65.19, + "grad_norm": 3.118250846862793, + "learning_rate": 3.4812030075187973e-06, + "loss": 0.2137, + "step": 43350 + }, + { + "epoch": 65.2, + "grad_norm": 4.089922904968262, + "learning_rate": 3.4796992481203013e-06, + "loss": 0.1868, + "step": 43360 + }, + { + "epoch": 65.22, + "grad_norm": 5.602779388427734, + "learning_rate": 3.478195488721805e-06, + "loss": 0.17, + "step": 43370 + }, + { + "epoch": 65.23, + "grad_norm": 8.60183334350586, + "learning_rate": 3.4766917293233087e-06, + "loss": 0.2105, + "step": 43380 + }, + { + "epoch": 65.25, + "grad_norm": 5.107520580291748, + "learning_rate": 3.4751879699248127e-06, + "loss": 0.2588, + "step": 43390 + }, + { + "epoch": 65.26, + "grad_norm": 10.866013526916504, + "learning_rate": 3.473684210526316e-06, + "loss": 0.1978, + "step": 43400 + }, + { + "epoch": 65.28, + "grad_norm": 4.005733489990234, + "learning_rate": 3.47218045112782e-06, + "loss": 0.1811, + "step": 43410 + }, + { + "epoch": 65.29, + "grad_norm": 3.853426694869995, + "learning_rate": 3.470676691729324e-06, + "loss": 0.2217, + "step": 43420 + }, + { + "epoch": 65.31, + "grad_norm": 5.65993070602417, + "learning_rate": 3.469172932330827e-06, + "loss": 0.216, + "step": 43430 + }, + { + "epoch": 65.32, + "grad_norm": 3.9332337379455566, + "learning_rate": 3.4676691729323307e-06, + "loss": 0.2144, + "step": 43440 + }, + { + "epoch": 65.34, + "grad_norm": 5.378814697265625, + "learning_rate": 3.4661654135338346e-06, + "loss": 0.1719, + "step": 43450 + }, + { + "epoch": 65.35, + "grad_norm": 4.357859134674072, + "learning_rate": 3.4646616541353386e-06, + "loss": 0.1897, + "step": 43460 + }, + { + "epoch": 65.37, + "grad_norm": 3.1881186962127686, + "learning_rate": 3.463157894736842e-06, + "loss": 0.1678, + "step": 43470 + }, + { + "epoch": 65.38, + "grad_norm": 3.442866325378418, + "learning_rate": 3.461654135338346e-06, + "loss": 0.1805, + "step": 43480 + }, + { + "epoch": 65.4, + "grad_norm": 9.554397583007812, + "learning_rate": 3.4601503759398496e-06, + "loss": 0.178, + "step": 43490 + }, + { + "epoch": 65.41, + "grad_norm": 3.902580976486206, + "learning_rate": 3.4586466165413535e-06, + "loss": 0.1855, + "step": 43500 + }, + { + "epoch": 65.43, + "grad_norm": 5.2838850021362305, + "learning_rate": 3.4571428571428574e-06, + "loss": 0.2158, + "step": 43510 + }, + { + "epoch": 65.44, + "grad_norm": 3.7892587184906006, + "learning_rate": 3.455639097744361e-06, + "loss": 0.1871, + "step": 43520 + }, + { + "epoch": 65.46, + "grad_norm": 3.122152090072632, + "learning_rate": 3.454135338345865e-06, + "loss": 0.142, + "step": 43530 + }, + { + "epoch": 65.47, + "grad_norm": 5.339186191558838, + "learning_rate": 3.4526315789473684e-06, + "loss": 0.1896, + "step": 43540 + }, + { + "epoch": 65.49, + "grad_norm": 6.880535125732422, + "learning_rate": 3.4511278195488724e-06, + "loss": 0.238, + "step": 43550 + }, + { + "epoch": 65.5, + "grad_norm": 4.716340065002441, + "learning_rate": 3.4496240601503763e-06, + "loss": 0.1584, + "step": 43560 + }, + { + "epoch": 65.52, + "grad_norm": 5.00018835067749, + "learning_rate": 3.44812030075188e-06, + "loss": 0.211, + "step": 43570 + }, + { + "epoch": 65.53, + "grad_norm": 2.4130773544311523, + "learning_rate": 3.4466165413533838e-06, + "loss": 0.2621, + "step": 43580 + }, + { + "epoch": 65.55, + "grad_norm": 0.7229984998703003, + "learning_rate": 3.4451127819548873e-06, + "loss": 0.213, + "step": 43590 + }, + { + "epoch": 65.56, + "grad_norm": 7.815069675445557, + "learning_rate": 3.4436090225563912e-06, + "loss": 0.2066, + "step": 43600 + }, + { + "epoch": 65.58, + "grad_norm": 4.840019702911377, + "learning_rate": 3.4421052631578947e-06, + "loss": 0.182, + "step": 43610 + }, + { + "epoch": 65.59, + "grad_norm": 5.94028377532959, + "learning_rate": 3.4406015037593987e-06, + "loss": 0.1918, + "step": 43620 + }, + { + "epoch": 65.61, + "grad_norm": 5.4512939453125, + "learning_rate": 3.4390977443609026e-06, + "loss": 0.2402, + "step": 43630 + }, + { + "epoch": 65.62, + "grad_norm": 4.47932243347168, + "learning_rate": 3.437593984962406e-06, + "loss": 0.206, + "step": 43640 + }, + { + "epoch": 65.64, + "grad_norm": 5.5348310470581055, + "learning_rate": 3.43609022556391e-06, + "loss": 0.238, + "step": 43650 + }, + { + "epoch": 65.65, + "grad_norm": 6.362865924835205, + "learning_rate": 3.4345864661654136e-06, + "loss": 0.1752, + "step": 43660 + }, + { + "epoch": 65.67, + "grad_norm": 6.699802875518799, + "learning_rate": 3.4330827067669176e-06, + "loss": 0.2295, + "step": 43670 + }, + { + "epoch": 65.68, + "grad_norm": 2.2860307693481445, + "learning_rate": 3.4315789473684215e-06, + "loss": 0.1659, + "step": 43680 + }, + { + "epoch": 65.7, + "grad_norm": 2.297809362411499, + "learning_rate": 3.430075187969925e-06, + "loss": 0.1341, + "step": 43690 + }, + { + "epoch": 65.71, + "grad_norm": 7.806978702545166, + "learning_rate": 3.428571428571429e-06, + "loss": 0.1756, + "step": 43700 + }, + { + "epoch": 65.73, + "grad_norm": 8.060872077941895, + "learning_rate": 3.4270676691729325e-06, + "loss": 0.208, + "step": 43710 + }, + { + "epoch": 65.74, + "grad_norm": 4.351926803588867, + "learning_rate": 3.4255639097744364e-06, + "loss": 0.1765, + "step": 43720 + }, + { + "epoch": 65.76, + "grad_norm": 6.094818592071533, + "learning_rate": 3.4240601503759404e-06, + "loss": 0.219, + "step": 43730 + }, + { + "epoch": 65.77, + "grad_norm": 5.638463497161865, + "learning_rate": 3.422556390977444e-06, + "loss": 0.2396, + "step": 43740 + }, + { + "epoch": 65.79, + "grad_norm": 3.789339303970337, + "learning_rate": 3.421052631578948e-06, + "loss": 0.1845, + "step": 43750 + }, + { + "epoch": 65.8, + "grad_norm": 7.909036636352539, + "learning_rate": 3.4195488721804513e-06, + "loss": 0.2344, + "step": 43760 + }, + { + "epoch": 65.82, + "grad_norm": 6.027281761169434, + "learning_rate": 3.4180451127819553e-06, + "loss": 0.1474, + "step": 43770 + }, + { + "epoch": 65.83, + "grad_norm": 5.84333610534668, + "learning_rate": 3.4165413533834592e-06, + "loss": 0.147, + "step": 43780 + }, + { + "epoch": 65.85, + "grad_norm": 8.227249145507812, + "learning_rate": 3.4150375939849627e-06, + "loss": 0.2314, + "step": 43790 + }, + { + "epoch": 65.86, + "grad_norm": 3.2717232704162598, + "learning_rate": 3.4135338345864667e-06, + "loss": 0.2419, + "step": 43800 + }, + { + "epoch": 65.88, + "grad_norm": 5.8105926513671875, + "learning_rate": 3.41203007518797e-06, + "loss": 0.2375, + "step": 43810 + }, + { + "epoch": 65.89, + "grad_norm": 3.668238878250122, + "learning_rate": 3.410526315789474e-06, + "loss": 0.234, + "step": 43820 + }, + { + "epoch": 65.91, + "grad_norm": 5.884802341461182, + "learning_rate": 3.409022556390978e-06, + "loss": 0.1602, + "step": 43830 + }, + { + "epoch": 65.92, + "grad_norm": 3.720730781555176, + "learning_rate": 3.4075187969924816e-06, + "loss": 0.2016, + "step": 43840 + }, + { + "epoch": 65.94, + "grad_norm": 3.9387388229370117, + "learning_rate": 3.4060150375939856e-06, + "loss": 0.1904, + "step": 43850 + }, + { + "epoch": 65.95, + "grad_norm": 7.595775604248047, + "learning_rate": 3.404511278195489e-06, + "loss": 0.1867, + "step": 43860 + }, + { + "epoch": 65.97, + "grad_norm": 4.808709621429443, + "learning_rate": 3.403007518796993e-06, + "loss": 0.2034, + "step": 43870 + }, + { + "epoch": 65.98, + "grad_norm": 5.746549606323242, + "learning_rate": 3.401503759398497e-06, + "loss": 0.1736, + "step": 43880 + }, + { + "epoch": 66.0, + "grad_norm": 0.12739986181259155, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.1696, + "step": 43890 + }, + { + "epoch": 66.0, + "eval_accuracy": 0.929, + "eval_loss": 0.32690301537513733, + "eval_runtime": 84.7083, + "eval_samples_per_second": 118.052, + "eval_steps_per_second": 0.472, + "step": 43890 + }, + { + "epoch": 66.02, + "grad_norm": 7.90481424331665, + "learning_rate": 3.3984962406015044e-06, + "loss": 0.1554, + "step": 43900 + }, + { + "epoch": 66.03, + "grad_norm": 6.900293350219727, + "learning_rate": 3.396992481203008e-06, + "loss": 0.1998, + "step": 43910 + }, + { + "epoch": 66.05, + "grad_norm": 5.227355003356934, + "learning_rate": 3.3954887218045115e-06, + "loss": 0.1637, + "step": 43920 + }, + { + "epoch": 66.06, + "grad_norm": 3.3653359413146973, + "learning_rate": 3.393984962406015e-06, + "loss": 0.1715, + "step": 43930 + }, + { + "epoch": 66.08, + "grad_norm": 7.483799934387207, + "learning_rate": 3.392481203007519e-06, + "loss": 0.1775, + "step": 43940 + }, + { + "epoch": 66.09, + "grad_norm": 6.125503063201904, + "learning_rate": 3.3909774436090224e-06, + "loss": 0.2052, + "step": 43950 + }, + { + "epoch": 66.11, + "grad_norm": 2.947721004486084, + "learning_rate": 3.3894736842105264e-06, + "loss": 0.1686, + "step": 43960 + }, + { + "epoch": 66.12, + "grad_norm": 4.430096626281738, + "learning_rate": 3.3879699248120303e-06, + "loss": 0.1511, + "step": 43970 + }, + { + "epoch": 66.14, + "grad_norm": 4.3853349685668945, + "learning_rate": 3.386466165413534e-06, + "loss": 0.2122, + "step": 43980 + }, + { + "epoch": 66.15, + "grad_norm": 3.0524966716766357, + "learning_rate": 3.384962406015038e-06, + "loss": 0.138, + "step": 43990 + }, + { + "epoch": 66.17, + "grad_norm": 2.938044786453247, + "learning_rate": 3.3834586466165413e-06, + "loss": 0.2325, + "step": 44000 + }, + { + "epoch": 66.18, + "grad_norm": 4.645559787750244, + "learning_rate": 3.3819548872180453e-06, + "loss": 0.2521, + "step": 44010 + }, + { + "epoch": 66.2, + "grad_norm": 7.084641456604004, + "learning_rate": 3.380451127819549e-06, + "loss": 0.2237, + "step": 44020 + }, + { + "epoch": 66.21, + "grad_norm": 8.430667877197266, + "learning_rate": 3.3789473684210527e-06, + "loss": 0.2012, + "step": 44030 + }, + { + "epoch": 66.23, + "grad_norm": 2.169158697128296, + "learning_rate": 3.3774436090225567e-06, + "loss": 0.1534, + "step": 44040 + }, + { + "epoch": 66.24, + "grad_norm": 4.5548553466796875, + "learning_rate": 3.37593984962406e-06, + "loss": 0.2439, + "step": 44050 + }, + { + "epoch": 66.26, + "grad_norm": 3.5269806385040283, + "learning_rate": 3.374436090225564e-06, + "loss": 0.2261, + "step": 44060 + }, + { + "epoch": 66.27, + "grad_norm": 4.3473334312438965, + "learning_rate": 3.3729323308270676e-06, + "loss": 0.1643, + "step": 44070 + }, + { + "epoch": 66.29, + "grad_norm": 1.9924315214157104, + "learning_rate": 3.3714285714285716e-06, + "loss": 0.1739, + "step": 44080 + }, + { + "epoch": 66.3, + "grad_norm": 4.501370906829834, + "learning_rate": 3.3699248120300755e-06, + "loss": 0.2115, + "step": 44090 + }, + { + "epoch": 66.32, + "grad_norm": 9.435755729675293, + "learning_rate": 3.368421052631579e-06, + "loss": 0.1906, + "step": 44100 + }, + { + "epoch": 66.33, + "grad_norm": 5.539870738983154, + "learning_rate": 3.366917293233083e-06, + "loss": 0.1738, + "step": 44110 + }, + { + "epoch": 66.35, + "grad_norm": 6.538062572479248, + "learning_rate": 3.3654135338345865e-06, + "loss": 0.1997, + "step": 44120 + }, + { + "epoch": 66.36, + "grad_norm": 3.478639602661133, + "learning_rate": 3.3639097744360904e-06, + "loss": 0.2232, + "step": 44130 + }, + { + "epoch": 66.38, + "grad_norm": 3.3659911155700684, + "learning_rate": 3.3624060150375944e-06, + "loss": 0.1294, + "step": 44140 + }, + { + "epoch": 66.39, + "grad_norm": 1.8332750797271729, + "learning_rate": 3.360902255639098e-06, + "loss": 0.171, + "step": 44150 + }, + { + "epoch": 66.41, + "grad_norm": 6.612475872039795, + "learning_rate": 3.359398496240602e-06, + "loss": 0.2437, + "step": 44160 + }, + { + "epoch": 66.42, + "grad_norm": 4.539286136627197, + "learning_rate": 3.3578947368421054e-06, + "loss": 0.1853, + "step": 44170 + }, + { + "epoch": 66.44, + "grad_norm": 6.16352653503418, + "learning_rate": 3.3563909774436093e-06, + "loss": 0.2118, + "step": 44180 + }, + { + "epoch": 66.45, + "grad_norm": 6.134418487548828, + "learning_rate": 3.3548872180451133e-06, + "loss": 0.2087, + "step": 44190 + }, + { + "epoch": 66.47, + "grad_norm": 3.634429454803467, + "learning_rate": 3.3533834586466168e-06, + "loss": 0.1724, + "step": 44200 + }, + { + "epoch": 66.48, + "grad_norm": 3.4543323516845703, + "learning_rate": 3.3518796992481207e-06, + "loss": 0.1575, + "step": 44210 + }, + { + "epoch": 66.5, + "grad_norm": 4.991121768951416, + "learning_rate": 3.3503759398496242e-06, + "loss": 0.2694, + "step": 44220 + }, + { + "epoch": 66.51, + "grad_norm": 5.8151936531066895, + "learning_rate": 3.348872180451128e-06, + "loss": 0.2191, + "step": 44230 + }, + { + "epoch": 66.53, + "grad_norm": 4.985229969024658, + "learning_rate": 3.347368421052632e-06, + "loss": 0.2083, + "step": 44240 + }, + { + "epoch": 66.54, + "grad_norm": 4.52158260345459, + "learning_rate": 3.3458646616541356e-06, + "loss": 0.2116, + "step": 44250 + }, + { + "epoch": 66.56, + "grad_norm": 5.498661518096924, + "learning_rate": 3.3443609022556396e-06, + "loss": 0.2242, + "step": 44260 + }, + { + "epoch": 66.57, + "grad_norm": 4.108017444610596, + "learning_rate": 3.342857142857143e-06, + "loss": 0.2415, + "step": 44270 + }, + { + "epoch": 66.59, + "grad_norm": 7.99500846862793, + "learning_rate": 3.341353383458647e-06, + "loss": 0.2235, + "step": 44280 + }, + { + "epoch": 66.6, + "grad_norm": 3.367448329925537, + "learning_rate": 3.339849624060151e-06, + "loss": 0.1975, + "step": 44290 + }, + { + "epoch": 66.62, + "grad_norm": 3.148461103439331, + "learning_rate": 3.3383458646616545e-06, + "loss": 0.1706, + "step": 44300 + }, + { + "epoch": 66.63, + "grad_norm": 5.184333324432373, + "learning_rate": 3.3368421052631584e-06, + "loss": 0.158, + "step": 44310 + }, + { + "epoch": 66.65, + "grad_norm": 8.876405715942383, + "learning_rate": 3.335338345864662e-06, + "loss": 0.2529, + "step": 44320 + }, + { + "epoch": 66.66, + "grad_norm": 1.495922565460205, + "learning_rate": 3.333834586466166e-06, + "loss": 0.1659, + "step": 44330 + }, + { + "epoch": 66.68, + "grad_norm": 4.976419925689697, + "learning_rate": 3.3323308270676694e-06, + "loss": 0.2705, + "step": 44340 + }, + { + "epoch": 66.69, + "grad_norm": 8.409575462341309, + "learning_rate": 3.3308270676691734e-06, + "loss": 0.237, + "step": 44350 + }, + { + "epoch": 66.71, + "grad_norm": 11.252154350280762, + "learning_rate": 3.3293233082706773e-06, + "loss": 0.225, + "step": 44360 + }, + { + "epoch": 66.72, + "grad_norm": 6.9660820960998535, + "learning_rate": 3.327819548872181e-06, + "loss": 0.1612, + "step": 44370 + }, + { + "epoch": 66.74, + "grad_norm": 7.809027671813965, + "learning_rate": 3.3263157894736848e-06, + "loss": 0.1657, + "step": 44380 + }, + { + "epoch": 66.75, + "grad_norm": 4.254601955413818, + "learning_rate": 3.3248120300751883e-06, + "loss": 0.1746, + "step": 44390 + }, + { + "epoch": 66.77, + "grad_norm": 7.9498291015625, + "learning_rate": 3.3233082706766922e-06, + "loss": 0.1704, + "step": 44400 + }, + { + "epoch": 66.78, + "grad_norm": 2.0477476119995117, + "learning_rate": 3.321804511278196e-06, + "loss": 0.1805, + "step": 44410 + }, + { + "epoch": 66.8, + "grad_norm": 4.7251715660095215, + "learning_rate": 3.3203007518796993e-06, + "loss": 0.1856, + "step": 44420 + }, + { + "epoch": 66.81, + "grad_norm": 3.7293553352355957, + "learning_rate": 3.3187969924812032e-06, + "loss": 0.1233, + "step": 44430 + }, + { + "epoch": 66.83, + "grad_norm": 6.081485748291016, + "learning_rate": 3.3172932330827067e-06, + "loss": 0.1719, + "step": 44440 + }, + { + "epoch": 66.84, + "grad_norm": 5.9813079833984375, + "learning_rate": 3.3157894736842107e-06, + "loss": 0.186, + "step": 44450 + }, + { + "epoch": 66.86, + "grad_norm": 6.273433208465576, + "learning_rate": 3.314285714285714e-06, + "loss": 0.236, + "step": 44460 + }, + { + "epoch": 66.87, + "grad_norm": 6.025407314300537, + "learning_rate": 3.312781954887218e-06, + "loss": 0.2297, + "step": 44470 + }, + { + "epoch": 66.89, + "grad_norm": 4.0798659324646, + "learning_rate": 3.3112781954887217e-06, + "loss": 0.1626, + "step": 44480 + }, + { + "epoch": 66.9, + "grad_norm": 1.6640973091125488, + "learning_rate": 3.3097744360902256e-06, + "loss": 0.1482, + "step": 44490 + }, + { + "epoch": 66.92, + "grad_norm": 5.056227207183838, + "learning_rate": 3.3082706766917295e-06, + "loss": 0.2078, + "step": 44500 + }, + { + "epoch": 66.93, + "grad_norm": 4.699299335479736, + "learning_rate": 3.306766917293233e-06, + "loss": 0.2021, + "step": 44510 + }, + { + "epoch": 66.95, + "grad_norm": 2.141939163208008, + "learning_rate": 3.305263157894737e-06, + "loss": 0.1897, + "step": 44520 + }, + { + "epoch": 66.96, + "grad_norm": 4.3747687339782715, + "learning_rate": 3.3037593984962405e-06, + "loss": 0.2014, + "step": 44530 + }, + { + "epoch": 66.98, + "grad_norm": 5.673415660858154, + "learning_rate": 3.3022556390977445e-06, + "loss": 0.2335, + "step": 44540 + }, + { + "epoch": 66.99, + "grad_norm": 2.832839250564575, + "learning_rate": 3.3007518796992484e-06, + "loss": 0.1946, + "step": 44550 + }, + { + "epoch": 67.0, + "eval_accuracy": 0.9302, + "eval_loss": 0.330706924200058, + "eval_runtime": 84.1739, + "eval_samples_per_second": 118.802, + "eval_steps_per_second": 0.475, + "step": 44555 + }, + { + "epoch": 67.01, + "grad_norm": 5.633582592010498, + "learning_rate": 3.299248120300752e-06, + "loss": 0.1437, + "step": 44560 + }, + { + "epoch": 67.02, + "grad_norm": 3.9651787281036377, + "learning_rate": 3.297744360902256e-06, + "loss": 0.157, + "step": 44570 + }, + { + "epoch": 67.04, + "grad_norm": 3.220197916030884, + "learning_rate": 3.2962406015037594e-06, + "loss": 0.1883, + "step": 44580 + }, + { + "epoch": 67.05, + "grad_norm": 6.965760707855225, + "learning_rate": 3.2947368421052633e-06, + "loss": 0.2133, + "step": 44590 + }, + { + "epoch": 67.07, + "grad_norm": 7.967811107635498, + "learning_rate": 3.2932330827067673e-06, + "loss": 0.1972, + "step": 44600 + }, + { + "epoch": 67.08, + "grad_norm": 8.779460906982422, + "learning_rate": 3.291729323308271e-06, + "loss": 0.2013, + "step": 44610 + }, + { + "epoch": 67.1, + "grad_norm": 4.243956565856934, + "learning_rate": 3.2902255639097747e-06, + "loss": 0.1606, + "step": 44620 + }, + { + "epoch": 67.11, + "grad_norm": 6.18154764175415, + "learning_rate": 3.2887218045112783e-06, + "loss": 0.231, + "step": 44630 + }, + { + "epoch": 67.13, + "grad_norm": 4.825216293334961, + "learning_rate": 3.287218045112782e-06, + "loss": 0.2034, + "step": 44640 + }, + { + "epoch": 67.14, + "grad_norm": 4.287315845489502, + "learning_rate": 3.285714285714286e-06, + "loss": 0.1463, + "step": 44650 + }, + { + "epoch": 67.16, + "grad_norm": 4.091511249542236, + "learning_rate": 3.2842105263157897e-06, + "loss": 0.1423, + "step": 44660 + }, + { + "epoch": 67.17, + "grad_norm": 4.6609978675842285, + "learning_rate": 3.2827067669172936e-06, + "loss": 0.2448, + "step": 44670 + }, + { + "epoch": 67.19, + "grad_norm": 7.273349761962891, + "learning_rate": 3.281203007518797e-06, + "loss": 0.1618, + "step": 44680 + }, + { + "epoch": 67.2, + "grad_norm": 6.079645156860352, + "learning_rate": 3.279699248120301e-06, + "loss": 0.2034, + "step": 44690 + }, + { + "epoch": 67.22, + "grad_norm": 4.420186996459961, + "learning_rate": 3.278195488721805e-06, + "loss": 0.1605, + "step": 44700 + }, + { + "epoch": 67.23, + "grad_norm": 2.5898821353912354, + "learning_rate": 3.2766917293233085e-06, + "loss": 0.1425, + "step": 44710 + }, + { + "epoch": 67.25, + "grad_norm": 5.7673468589782715, + "learning_rate": 3.2751879699248125e-06, + "loss": 0.1487, + "step": 44720 + }, + { + "epoch": 67.26, + "grad_norm": 4.787065505981445, + "learning_rate": 3.273684210526316e-06, + "loss": 0.2058, + "step": 44730 + }, + { + "epoch": 67.28, + "grad_norm": 8.583816528320312, + "learning_rate": 3.27218045112782e-06, + "loss": 0.194, + "step": 44740 + }, + { + "epoch": 67.29, + "grad_norm": 6.30866813659668, + "learning_rate": 3.270676691729324e-06, + "loss": 0.2036, + "step": 44750 + }, + { + "epoch": 67.31, + "grad_norm": 3.9293291568756104, + "learning_rate": 3.2691729323308274e-06, + "loss": 0.1755, + "step": 44760 + }, + { + "epoch": 67.32, + "grad_norm": 6.662651538848877, + "learning_rate": 3.2676691729323313e-06, + "loss": 0.1482, + "step": 44770 + }, + { + "epoch": 67.34, + "grad_norm": 4.203287124633789, + "learning_rate": 3.266165413533835e-06, + "loss": 0.263, + "step": 44780 + }, + { + "epoch": 67.35, + "grad_norm": 5.568451881408691, + "learning_rate": 3.264661654135339e-06, + "loss": 0.1302, + "step": 44790 + }, + { + "epoch": 67.37, + "grad_norm": 11.907421112060547, + "learning_rate": 3.2631578947368423e-06, + "loss": 0.2896, + "step": 44800 + }, + { + "epoch": 67.38, + "grad_norm": 7.300894737243652, + "learning_rate": 3.2616541353383463e-06, + "loss": 0.2324, + "step": 44810 + }, + { + "epoch": 67.4, + "grad_norm": 0.4205104112625122, + "learning_rate": 3.26015037593985e-06, + "loss": 0.1475, + "step": 44820 + }, + { + "epoch": 67.41, + "grad_norm": 2.742250919342041, + "learning_rate": 3.2586466165413537e-06, + "loss": 0.223, + "step": 44830 + }, + { + "epoch": 67.43, + "grad_norm": 4.337477684020996, + "learning_rate": 3.2571428571428577e-06, + "loss": 0.1887, + "step": 44840 + }, + { + "epoch": 67.44, + "grad_norm": 6.059717178344727, + "learning_rate": 3.255639097744361e-06, + "loss": 0.2286, + "step": 44850 + }, + { + "epoch": 67.46, + "grad_norm": 8.457310676574707, + "learning_rate": 3.254135338345865e-06, + "loss": 0.1812, + "step": 44860 + }, + { + "epoch": 67.47, + "grad_norm": 2.976374626159668, + "learning_rate": 3.252631578947369e-06, + "loss": 0.2129, + "step": 44870 + }, + { + "epoch": 67.49, + "grad_norm": 5.421755790710449, + "learning_rate": 3.2511278195488726e-06, + "loss": 0.1674, + "step": 44880 + }, + { + "epoch": 67.5, + "grad_norm": 7.67271614074707, + "learning_rate": 3.2496240601503765e-06, + "loss": 0.1708, + "step": 44890 + }, + { + "epoch": 67.52, + "grad_norm": 7.604538440704346, + "learning_rate": 3.24812030075188e-06, + "loss": 0.205, + "step": 44900 + }, + { + "epoch": 67.53, + "grad_norm": 5.754815578460693, + "learning_rate": 3.246616541353384e-06, + "loss": 0.1922, + "step": 44910 + }, + { + "epoch": 67.55, + "grad_norm": 5.512721061706543, + "learning_rate": 3.245112781954887e-06, + "loss": 0.1671, + "step": 44920 + }, + { + "epoch": 67.56, + "grad_norm": 5.614846706390381, + "learning_rate": 3.243609022556391e-06, + "loss": 0.1837, + "step": 44930 + }, + { + "epoch": 67.58, + "grad_norm": 6.0121612548828125, + "learning_rate": 3.2421052631578945e-06, + "loss": 0.221, + "step": 44940 + }, + { + "epoch": 67.59, + "grad_norm": 4.721285820007324, + "learning_rate": 3.2406015037593985e-06, + "loss": 0.1704, + "step": 44950 + }, + { + "epoch": 67.61, + "grad_norm": 5.494776248931885, + "learning_rate": 3.2390977443609024e-06, + "loss": 0.2032, + "step": 44960 + }, + { + "epoch": 67.62, + "grad_norm": 3.387678861618042, + "learning_rate": 3.237593984962406e-06, + "loss": 0.2162, + "step": 44970 + }, + { + "epoch": 67.64, + "grad_norm": 5.2446417808532715, + "learning_rate": 3.23609022556391e-06, + "loss": 0.138, + "step": 44980 + }, + { + "epoch": 67.65, + "grad_norm": 6.1764349937438965, + "learning_rate": 3.2345864661654134e-06, + "loss": 0.1532, + "step": 44990 + }, + { + "epoch": 67.67, + "grad_norm": 4.519616603851318, + "learning_rate": 3.2330827067669174e-06, + "loss": 0.1948, + "step": 45000 + }, + { + "epoch": 67.68, + "grad_norm": 7.864477634429932, + "learning_rate": 3.2315789473684213e-06, + "loss": 0.1927, + "step": 45010 + }, + { + "epoch": 67.7, + "grad_norm": 2.657796859741211, + "learning_rate": 3.230075187969925e-06, + "loss": 0.1907, + "step": 45020 + }, + { + "epoch": 67.71, + "grad_norm": 6.408271312713623, + "learning_rate": 3.2285714285714288e-06, + "loss": 0.1843, + "step": 45030 + }, + { + "epoch": 67.73, + "grad_norm": 4.096652984619141, + "learning_rate": 3.2270676691729323e-06, + "loss": 0.174, + "step": 45040 + }, + { + "epoch": 67.74, + "grad_norm": 4.657118797302246, + "learning_rate": 3.2255639097744362e-06, + "loss": 0.1888, + "step": 45050 + }, + { + "epoch": 67.76, + "grad_norm": 4.9201860427856445, + "learning_rate": 3.22406015037594e-06, + "loss": 0.2123, + "step": 45060 + }, + { + "epoch": 67.77, + "grad_norm": 5.590874195098877, + "learning_rate": 3.2225563909774437e-06, + "loss": 0.1487, + "step": 45070 + }, + { + "epoch": 67.79, + "grad_norm": 7.1157050132751465, + "learning_rate": 3.2210526315789476e-06, + "loss": 0.193, + "step": 45080 + }, + { + "epoch": 67.8, + "grad_norm": 2.877906560897827, + "learning_rate": 3.219548872180451e-06, + "loss": 0.1419, + "step": 45090 + }, + { + "epoch": 67.82, + "grad_norm": 5.331236839294434, + "learning_rate": 3.218045112781955e-06, + "loss": 0.1581, + "step": 45100 + }, + { + "epoch": 67.83, + "grad_norm": 6.2138991355896, + "learning_rate": 3.216541353383459e-06, + "loss": 0.201, + "step": 45110 + }, + { + "epoch": 67.85, + "grad_norm": 3.055180072784424, + "learning_rate": 3.2150375939849625e-06, + "loss": 0.1932, + "step": 45120 + }, + { + "epoch": 67.86, + "grad_norm": 5.881607532501221, + "learning_rate": 3.2135338345864665e-06, + "loss": 0.2327, + "step": 45130 + }, + { + "epoch": 67.88, + "grad_norm": 7.188892841339111, + "learning_rate": 3.21203007518797e-06, + "loss": 0.1651, + "step": 45140 + }, + { + "epoch": 67.89, + "grad_norm": 3.549654960632324, + "learning_rate": 3.210526315789474e-06, + "loss": 0.2572, + "step": 45150 + }, + { + "epoch": 67.91, + "grad_norm": 9.034676551818848, + "learning_rate": 3.209022556390978e-06, + "loss": 0.1358, + "step": 45160 + }, + { + "epoch": 67.92, + "grad_norm": 4.870662689208984, + "learning_rate": 3.2075187969924814e-06, + "loss": 0.207, + "step": 45170 + }, + { + "epoch": 67.94, + "grad_norm": 1.617182731628418, + "learning_rate": 3.2060150375939854e-06, + "loss": 0.2525, + "step": 45180 + }, + { + "epoch": 67.95, + "grad_norm": 2.620441198348999, + "learning_rate": 3.204511278195489e-06, + "loss": 0.1688, + "step": 45190 + }, + { + "epoch": 67.97, + "grad_norm": 9.695724487304688, + "learning_rate": 3.203007518796993e-06, + "loss": 0.233, + "step": 45200 + }, + { + "epoch": 67.98, + "grad_norm": 6.005362033843994, + "learning_rate": 3.2015037593984968e-06, + "loss": 0.1813, + "step": 45210 + }, + { + "epoch": 68.0, + "grad_norm": 0.01271333172917366, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.1492, + "step": 45220 + }, + { + "epoch": 68.0, + "eval_accuracy": 0.9296, + "eval_loss": 0.32478708028793335, + "eval_runtime": 84.6456, + "eval_samples_per_second": 118.14, + "eval_steps_per_second": 0.473, + "step": 45220 + }, + { + "epoch": 68.02, + "grad_norm": 9.229415893554688, + "learning_rate": 3.1984962406015042e-06, + "loss": 0.2163, + "step": 45230 + }, + { + "epoch": 68.03, + "grad_norm": 5.0831298828125, + "learning_rate": 3.1969924812030077e-06, + "loss": 0.2027, + "step": 45240 + }, + { + "epoch": 68.05, + "grad_norm": 4.618161678314209, + "learning_rate": 3.1954887218045117e-06, + "loss": 0.2085, + "step": 45250 + }, + { + "epoch": 68.06, + "grad_norm": 2.447524070739746, + "learning_rate": 3.193984962406015e-06, + "loss": 0.1593, + "step": 45260 + }, + { + "epoch": 68.08, + "grad_norm": 5.367290496826172, + "learning_rate": 3.192481203007519e-06, + "loss": 0.1527, + "step": 45270 + }, + { + "epoch": 68.09, + "grad_norm": 3.5726702213287354, + "learning_rate": 3.190977443609023e-06, + "loss": 0.1963, + "step": 45280 + }, + { + "epoch": 68.11, + "grad_norm": 6.074667930603027, + "learning_rate": 3.1894736842105266e-06, + "loss": 0.2226, + "step": 45290 + }, + { + "epoch": 68.12, + "grad_norm": 6.223268508911133, + "learning_rate": 3.1879699248120305e-06, + "loss": 0.277, + "step": 45300 + }, + { + "epoch": 68.14, + "grad_norm": 4.411190509796143, + "learning_rate": 3.186466165413534e-06, + "loss": 0.1662, + "step": 45310 + }, + { + "epoch": 68.15, + "grad_norm": 5.630961894989014, + "learning_rate": 3.184962406015038e-06, + "loss": 0.2317, + "step": 45320 + }, + { + "epoch": 68.17, + "grad_norm": 6.117316246032715, + "learning_rate": 3.183458646616542e-06, + "loss": 0.2044, + "step": 45330 + }, + { + "epoch": 68.18, + "grad_norm": 6.058127403259277, + "learning_rate": 3.1819548872180455e-06, + "loss": 0.1925, + "step": 45340 + }, + { + "epoch": 68.2, + "grad_norm": 4.601078033447266, + "learning_rate": 3.1804511278195494e-06, + "loss": 0.2433, + "step": 45350 + }, + { + "epoch": 68.21, + "grad_norm": 11.11681842803955, + "learning_rate": 3.178947368421053e-06, + "loss": 0.2035, + "step": 45360 + }, + { + "epoch": 68.23, + "grad_norm": 4.8060503005981445, + "learning_rate": 3.177443609022557e-06, + "loss": 0.1838, + "step": 45370 + }, + { + "epoch": 68.24, + "grad_norm": 3.692517042160034, + "learning_rate": 3.175939849624061e-06, + "loss": 0.1765, + "step": 45380 + }, + { + "epoch": 68.26, + "grad_norm": 7.551518440246582, + "learning_rate": 3.1744360902255643e-06, + "loss": 0.1897, + "step": 45390 + }, + { + "epoch": 68.27, + "grad_norm": 4.985076427459717, + "learning_rate": 3.1729323308270683e-06, + "loss": 0.216, + "step": 45400 + }, + { + "epoch": 68.29, + "grad_norm": 8.353325843811035, + "learning_rate": 3.1714285714285714e-06, + "loss": 0.196, + "step": 45410 + }, + { + "epoch": 68.3, + "grad_norm": 6.800642490386963, + "learning_rate": 3.1699248120300753e-06, + "loss": 0.1798, + "step": 45420 + }, + { + "epoch": 68.32, + "grad_norm": 7.222949028015137, + "learning_rate": 3.168421052631579e-06, + "loss": 0.1988, + "step": 45430 + }, + { + "epoch": 68.33, + "grad_norm": 7.158170700073242, + "learning_rate": 3.1669172932330828e-06, + "loss": 0.2191, + "step": 45440 + }, + { + "epoch": 68.35, + "grad_norm": 5.271254062652588, + "learning_rate": 3.1654135338345863e-06, + "loss": 0.2169, + "step": 45450 + }, + { + "epoch": 68.36, + "grad_norm": 2.8725428581237793, + "learning_rate": 3.1639097744360902e-06, + "loss": 0.2091, + "step": 45460 + }, + { + "epoch": 68.38, + "grad_norm": 6.836148262023926, + "learning_rate": 3.162406015037594e-06, + "loss": 0.2319, + "step": 45470 + }, + { + "epoch": 68.39, + "grad_norm": 3.735705614089966, + "learning_rate": 3.1609022556390977e-06, + "loss": 0.2037, + "step": 45480 + }, + { + "epoch": 68.41, + "grad_norm": 6.039109706878662, + "learning_rate": 3.1593984962406016e-06, + "loss": 0.2016, + "step": 45490 + }, + { + "epoch": 68.42, + "grad_norm": 6.477905750274658, + "learning_rate": 3.157894736842105e-06, + "loss": 0.185, + "step": 45500 + }, + { + "epoch": 68.44, + "grad_norm": 5.0428900718688965, + "learning_rate": 3.156390977443609e-06, + "loss": 0.192, + "step": 45510 + }, + { + "epoch": 68.45, + "grad_norm": 4.29931640625, + "learning_rate": 3.154887218045113e-06, + "loss": 0.1585, + "step": 45520 + }, + { + "epoch": 68.47, + "grad_norm": 4.26162052154541, + "learning_rate": 3.1533834586466166e-06, + "loss": 0.1979, + "step": 45530 + }, + { + "epoch": 68.48, + "grad_norm": 6.1737380027771, + "learning_rate": 3.1518796992481205e-06, + "loss": 0.1783, + "step": 45540 + }, + { + "epoch": 68.5, + "grad_norm": 3.1564579010009766, + "learning_rate": 3.150375939849624e-06, + "loss": 0.1443, + "step": 45550 + }, + { + "epoch": 68.51, + "grad_norm": 2.920849323272705, + "learning_rate": 3.148872180451128e-06, + "loss": 0.2208, + "step": 45560 + }, + { + "epoch": 68.53, + "grad_norm": 4.952502727508545, + "learning_rate": 3.147368421052632e-06, + "loss": 0.1836, + "step": 45570 + }, + { + "epoch": 68.54, + "grad_norm": 2.473483085632324, + "learning_rate": 3.1458646616541354e-06, + "loss": 0.1324, + "step": 45580 + }, + { + "epoch": 68.56, + "grad_norm": 5.73502779006958, + "learning_rate": 3.1443609022556394e-06, + "loss": 0.2821, + "step": 45590 + }, + { + "epoch": 68.57, + "grad_norm": 3.3458425998687744, + "learning_rate": 3.142857142857143e-06, + "loss": 0.1789, + "step": 45600 + }, + { + "epoch": 68.59, + "grad_norm": 5.300927639007568, + "learning_rate": 3.141353383458647e-06, + "loss": 0.2018, + "step": 45610 + }, + { + "epoch": 68.6, + "grad_norm": 6.269550323486328, + "learning_rate": 3.1398496240601508e-06, + "loss": 0.1991, + "step": 45620 + }, + { + "epoch": 68.62, + "grad_norm": 6.622176170349121, + "learning_rate": 3.1383458646616543e-06, + "loss": 0.1805, + "step": 45630 + }, + { + "epoch": 68.63, + "grad_norm": 7.759119987487793, + "learning_rate": 3.1368421052631582e-06, + "loss": 0.2455, + "step": 45640 + }, + { + "epoch": 68.65, + "grad_norm": 5.007221221923828, + "learning_rate": 3.1353383458646618e-06, + "loss": 0.1756, + "step": 45650 + }, + { + "epoch": 68.66, + "grad_norm": 5.451401710510254, + "learning_rate": 3.1338345864661657e-06, + "loss": 0.2058, + "step": 45660 + }, + { + "epoch": 68.68, + "grad_norm": 0.3662077784538269, + "learning_rate": 3.1323308270676696e-06, + "loss": 0.2024, + "step": 45670 + }, + { + "epoch": 68.69, + "grad_norm": 7.444847583770752, + "learning_rate": 3.130827067669173e-06, + "loss": 0.2119, + "step": 45680 + }, + { + "epoch": 68.71, + "grad_norm": 4.96779203414917, + "learning_rate": 3.129323308270677e-06, + "loss": 0.2708, + "step": 45690 + }, + { + "epoch": 68.72, + "grad_norm": 5.92564582824707, + "learning_rate": 3.1278195488721806e-06, + "loss": 0.1653, + "step": 45700 + }, + { + "epoch": 68.74, + "grad_norm": 5.052578449249268, + "learning_rate": 3.1263157894736846e-06, + "loss": 0.1841, + "step": 45710 + }, + { + "epoch": 68.75, + "grad_norm": 3.4545114040374756, + "learning_rate": 3.124812030075188e-06, + "loss": 0.2163, + "step": 45720 + }, + { + "epoch": 68.77, + "grad_norm": 5.480828762054443, + "learning_rate": 3.123308270676692e-06, + "loss": 0.1242, + "step": 45730 + }, + { + "epoch": 68.78, + "grad_norm": 4.088112831115723, + "learning_rate": 3.121804511278196e-06, + "loss": 0.1112, + "step": 45740 + }, + { + "epoch": 68.8, + "grad_norm": 4.465242862701416, + "learning_rate": 3.1203007518796995e-06, + "loss": 0.2095, + "step": 45750 + }, + { + "epoch": 68.81, + "grad_norm": 7.828632831573486, + "learning_rate": 3.1187969924812034e-06, + "loss": 0.246, + "step": 45760 + }, + { + "epoch": 68.83, + "grad_norm": 4.115504741668701, + "learning_rate": 3.117293233082707e-06, + "loss": 0.1527, + "step": 45770 + }, + { + "epoch": 68.84, + "grad_norm": 4.330153465270996, + "learning_rate": 3.115789473684211e-06, + "loss": 0.1638, + "step": 45780 + }, + { + "epoch": 68.86, + "grad_norm": 5.463741302490234, + "learning_rate": 3.114285714285715e-06, + "loss": 0.2238, + "step": 45790 + }, + { + "epoch": 68.87, + "grad_norm": 7.970693111419678, + "learning_rate": 3.1127819548872184e-06, + "loss": 0.1802, + "step": 45800 + }, + { + "epoch": 68.89, + "grad_norm": 4.3561577796936035, + "learning_rate": 3.1112781954887223e-06, + "loss": 0.2137, + "step": 45810 + }, + { + "epoch": 68.9, + "grad_norm": 5.0850958824157715, + "learning_rate": 3.109774436090226e-06, + "loss": 0.1676, + "step": 45820 + }, + { + "epoch": 68.92, + "grad_norm": 4.8231730461120605, + "learning_rate": 3.1082706766917298e-06, + "loss": 0.1825, + "step": 45830 + }, + { + "epoch": 68.93, + "grad_norm": 3.689068555831909, + "learning_rate": 3.1067669172932337e-06, + "loss": 0.1906, + "step": 45840 + }, + { + "epoch": 68.95, + "grad_norm": 3.177267074584961, + "learning_rate": 3.1052631578947372e-06, + "loss": 0.1653, + "step": 45850 + }, + { + "epoch": 68.96, + "grad_norm": 3.5612058639526367, + "learning_rate": 3.103759398496241e-06, + "loss": 0.1703, + "step": 45860 + }, + { + "epoch": 68.98, + "grad_norm": 7.400056838989258, + "learning_rate": 3.1022556390977447e-06, + "loss": 0.1732, + "step": 45870 + }, + { + "epoch": 68.99, + "grad_norm": 8.14755916595459, + "learning_rate": 3.1007518796992486e-06, + "loss": 0.223, + "step": 45880 + }, + { + "epoch": 69.0, + "eval_accuracy": 0.9293, + "eval_loss": 0.33156681060791016, + "eval_runtime": 84.519, + "eval_samples_per_second": 118.317, + "eval_steps_per_second": 0.473, + "step": 45885 + }, + { + "epoch": 69.01, + "grad_norm": 4.614025592803955, + "learning_rate": 3.0992481203007526e-06, + "loss": 0.1577, + "step": 45890 + }, + { + "epoch": 69.02, + "grad_norm": 5.442816257476807, + "learning_rate": 3.097744360902256e-06, + "loss": 0.2265, + "step": 45900 + }, + { + "epoch": 69.04, + "grad_norm": 5.297948837280273, + "learning_rate": 3.096240601503759e-06, + "loss": 0.199, + "step": 45910 + }, + { + "epoch": 69.05, + "grad_norm": 5.042107582092285, + "learning_rate": 3.094736842105263e-06, + "loss": 0.1949, + "step": 45920 + }, + { + "epoch": 69.07, + "grad_norm": 6.529865264892578, + "learning_rate": 3.093233082706767e-06, + "loss": 0.2015, + "step": 45930 + }, + { + "epoch": 69.08, + "grad_norm": 6.556023120880127, + "learning_rate": 3.0917293233082706e-06, + "loss": 0.2051, + "step": 45940 + }, + { + "epoch": 69.1, + "grad_norm": 5.027461051940918, + "learning_rate": 3.0902255639097745e-06, + "loss": 0.1894, + "step": 45950 + }, + { + "epoch": 69.11, + "grad_norm": 3.367723226547241, + "learning_rate": 3.088721804511278e-06, + "loss": 0.1466, + "step": 45960 + }, + { + "epoch": 69.13, + "grad_norm": 4.521204471588135, + "learning_rate": 3.087218045112782e-06, + "loss": 0.1885, + "step": 45970 + }, + { + "epoch": 69.14, + "grad_norm": 5.166673183441162, + "learning_rate": 3.085714285714286e-06, + "loss": 0.1593, + "step": 45980 + }, + { + "epoch": 69.16, + "grad_norm": 4.3414812088012695, + "learning_rate": 3.0842105263157895e-06, + "loss": 0.1804, + "step": 45990 + }, + { + "epoch": 69.17, + "grad_norm": 6.790599822998047, + "learning_rate": 3.0827067669172934e-06, + "loss": 0.177, + "step": 46000 + }, + { + "epoch": 69.19, + "grad_norm": 5.932426452636719, + "learning_rate": 3.081203007518797e-06, + "loss": 0.2503, + "step": 46010 + }, + { + "epoch": 69.2, + "grad_norm": 3.4542813301086426, + "learning_rate": 3.079699248120301e-06, + "loss": 0.1616, + "step": 46020 + }, + { + "epoch": 69.22, + "grad_norm": 3.6076695919036865, + "learning_rate": 3.078195488721805e-06, + "loss": 0.1794, + "step": 46030 + }, + { + "epoch": 69.23, + "grad_norm": 6.4292378425598145, + "learning_rate": 3.0766917293233083e-06, + "loss": 0.1487, + "step": 46040 + }, + { + "epoch": 69.25, + "grad_norm": 7.210880279541016, + "learning_rate": 3.0751879699248123e-06, + "loss": 0.1703, + "step": 46050 + }, + { + "epoch": 69.26, + "grad_norm": 2.0395233631134033, + "learning_rate": 3.0736842105263158e-06, + "loss": 0.1636, + "step": 46060 + }, + { + "epoch": 69.28, + "grad_norm": 9.782295227050781, + "learning_rate": 3.0721804511278197e-06, + "loss": 0.1979, + "step": 46070 + }, + { + "epoch": 69.29, + "grad_norm": 3.0649471282958984, + "learning_rate": 3.0706766917293237e-06, + "loss": 0.2242, + "step": 46080 + }, + { + "epoch": 69.31, + "grad_norm": 7.512526035308838, + "learning_rate": 3.069172932330827e-06, + "loss": 0.2796, + "step": 46090 + }, + { + "epoch": 69.32, + "grad_norm": 3.8779592514038086, + "learning_rate": 3.067669172932331e-06, + "loss": 0.1872, + "step": 46100 + }, + { + "epoch": 69.34, + "grad_norm": 4.834461212158203, + "learning_rate": 3.0661654135338346e-06, + "loss": 0.1784, + "step": 46110 + }, + { + "epoch": 69.35, + "grad_norm": 4.335732460021973, + "learning_rate": 3.0646616541353386e-06, + "loss": 0.1715, + "step": 46120 + }, + { + "epoch": 69.37, + "grad_norm": 8.79086971282959, + "learning_rate": 3.0631578947368425e-06, + "loss": 0.2193, + "step": 46130 + }, + { + "epoch": 69.38, + "grad_norm": 3.5167338848114014, + "learning_rate": 3.061654135338346e-06, + "loss": 0.1857, + "step": 46140 + }, + { + "epoch": 69.4, + "grad_norm": 3.6358115673065186, + "learning_rate": 3.06015037593985e-06, + "loss": 0.1648, + "step": 46150 + }, + { + "epoch": 69.41, + "grad_norm": 4.419709205627441, + "learning_rate": 3.0586466165413535e-06, + "loss": 0.1463, + "step": 46160 + }, + { + "epoch": 69.43, + "grad_norm": 12.194480895996094, + "learning_rate": 3.0571428571428575e-06, + "loss": 0.2125, + "step": 46170 + }, + { + "epoch": 69.44, + "grad_norm": 6.218225002288818, + "learning_rate": 3.055639097744361e-06, + "loss": 0.1879, + "step": 46180 + }, + { + "epoch": 69.46, + "grad_norm": 4.367229461669922, + "learning_rate": 3.054135338345865e-06, + "loss": 0.1527, + "step": 46190 + }, + { + "epoch": 69.47, + "grad_norm": 4.397371768951416, + "learning_rate": 3.052631578947369e-06, + "loss": 0.214, + "step": 46200 + }, + { + "epoch": 69.49, + "grad_norm": 9.216353416442871, + "learning_rate": 3.0511278195488724e-06, + "loss": 0.2037, + "step": 46210 + }, + { + "epoch": 69.5, + "grad_norm": 4.541748523712158, + "learning_rate": 3.0496240601503763e-06, + "loss": 0.1943, + "step": 46220 + }, + { + "epoch": 69.52, + "grad_norm": 2.6720430850982666, + "learning_rate": 3.04812030075188e-06, + "loss": 0.1699, + "step": 46230 + }, + { + "epoch": 69.53, + "grad_norm": 3.862180471420288, + "learning_rate": 3.0466165413533838e-06, + "loss": 0.1799, + "step": 46240 + }, + { + "epoch": 69.55, + "grad_norm": 3.2997727394104004, + "learning_rate": 3.0451127819548877e-06, + "loss": 0.1758, + "step": 46250 + }, + { + "epoch": 69.56, + "grad_norm": 6.742743015289307, + "learning_rate": 3.0436090225563912e-06, + "loss": 0.1675, + "step": 46260 + }, + { + "epoch": 69.58, + "grad_norm": 3.4689953327178955, + "learning_rate": 3.042105263157895e-06, + "loss": 0.1566, + "step": 46270 + }, + { + "epoch": 69.59, + "grad_norm": 4.367392063140869, + "learning_rate": 3.0406015037593987e-06, + "loss": 0.1839, + "step": 46280 + }, + { + "epoch": 69.61, + "grad_norm": 2.263700008392334, + "learning_rate": 3.0390977443609027e-06, + "loss": 0.1933, + "step": 46290 + }, + { + "epoch": 69.62, + "grad_norm": 7.104362487792969, + "learning_rate": 3.0375939849624066e-06, + "loss": 0.1841, + "step": 46300 + }, + { + "epoch": 69.64, + "grad_norm": 7.7885236740112305, + "learning_rate": 3.03609022556391e-06, + "loss": 0.2721, + "step": 46310 + }, + { + "epoch": 69.65, + "grad_norm": 5.928685665130615, + "learning_rate": 3.034586466165414e-06, + "loss": 0.2017, + "step": 46320 + }, + { + "epoch": 69.67, + "grad_norm": 3.4155795574188232, + "learning_rate": 3.0330827067669176e-06, + "loss": 0.1821, + "step": 46330 + }, + { + "epoch": 69.68, + "grad_norm": 4.222193717956543, + "learning_rate": 3.0315789473684215e-06, + "loss": 0.2059, + "step": 46340 + }, + { + "epoch": 69.7, + "grad_norm": 6.882046222686768, + "learning_rate": 3.0300751879699255e-06, + "loss": 0.1614, + "step": 46350 + }, + { + "epoch": 69.71, + "grad_norm": 4.649013996124268, + "learning_rate": 3.028571428571429e-06, + "loss": 0.2039, + "step": 46360 + }, + { + "epoch": 69.73, + "grad_norm": 3.898035764694214, + "learning_rate": 3.027067669172933e-06, + "loss": 0.1677, + "step": 46370 + }, + { + "epoch": 69.74, + "grad_norm": 4.432783603668213, + "learning_rate": 3.0255639097744364e-06, + "loss": 0.1762, + "step": 46380 + }, + { + "epoch": 69.76, + "grad_norm": 8.104802131652832, + "learning_rate": 3.0240601503759404e-06, + "loss": 0.1619, + "step": 46390 + }, + { + "epoch": 69.77, + "grad_norm": 4.149446487426758, + "learning_rate": 3.0225563909774443e-06, + "loss": 0.1904, + "step": 46400 + }, + { + "epoch": 69.79, + "grad_norm": 6.308215141296387, + "learning_rate": 3.0210526315789474e-06, + "loss": 0.1996, + "step": 46410 + }, + { + "epoch": 69.8, + "grad_norm": 8.179604530334473, + "learning_rate": 3.019548872180451e-06, + "loss": 0.2209, + "step": 46420 + }, + { + "epoch": 69.82, + "grad_norm": 5.956761837005615, + "learning_rate": 3.018045112781955e-06, + "loss": 0.1432, + "step": 46430 + }, + { + "epoch": 69.83, + "grad_norm": 5.909665584564209, + "learning_rate": 3.016541353383459e-06, + "loss": 0.2528, + "step": 46440 + }, + { + "epoch": 69.85, + "grad_norm": 6.1077399253845215, + "learning_rate": 3.0150375939849623e-06, + "loss": 0.2055, + "step": 46450 + }, + { + "epoch": 69.86, + "grad_norm": 4.353606700897217, + "learning_rate": 3.0135338345864663e-06, + "loss": 0.2351, + "step": 46460 + }, + { + "epoch": 69.88, + "grad_norm": 5.350019931793213, + "learning_rate": 3.01203007518797e-06, + "loss": 0.1732, + "step": 46470 + }, + { + "epoch": 69.89, + "grad_norm": 3.917721748352051, + "learning_rate": 3.0105263157894737e-06, + "loss": 0.1791, + "step": 46480 + }, + { + "epoch": 69.91, + "grad_norm": 10.580587387084961, + "learning_rate": 3.0090225563909777e-06, + "loss": 0.1899, + "step": 46490 + }, + { + "epoch": 69.92, + "grad_norm": 4.388562202453613, + "learning_rate": 3.007518796992481e-06, + "loss": 0.2287, + "step": 46500 + }, + { + "epoch": 69.94, + "grad_norm": 5.693699359893799, + "learning_rate": 3.006015037593985e-06, + "loss": 0.2259, + "step": 46510 + }, + { + "epoch": 69.95, + "grad_norm": 3.3817298412323, + "learning_rate": 3.0045112781954887e-06, + "loss": 0.2412, + "step": 46520 + }, + { + "epoch": 69.97, + "grad_norm": 1.847433090209961, + "learning_rate": 3.0030075187969926e-06, + "loss": 0.1723, + "step": 46530 + }, + { + "epoch": 69.98, + "grad_norm": 6.4717254638671875, + "learning_rate": 3.0015037593984966e-06, + "loss": 0.2032, + "step": 46540 + }, + { + "epoch": 70.0, + "grad_norm": 0.9074813723564148, + "learning_rate": 3e-06, + "loss": 0.1738, + "step": 46550 + }, + { + "epoch": 70.0, + "eval_accuracy": 0.9295, + "eval_loss": 0.3248044550418854, + "eval_runtime": 84.7278, + "eval_samples_per_second": 118.025, + "eval_steps_per_second": 0.472, + "step": 46550 + }, + { + "epoch": 70.02, + "grad_norm": 5.463174819946289, + "learning_rate": 2.998496240601504e-06, + "loss": 0.2036, + "step": 46560 + }, + { + "epoch": 70.03, + "grad_norm": 2.8487234115600586, + "learning_rate": 2.9969924812030075e-06, + "loss": 0.1619, + "step": 46570 + }, + { + "epoch": 70.05, + "grad_norm": 4.683668613433838, + "learning_rate": 2.9954887218045115e-06, + "loss": 0.2471, + "step": 46580 + }, + { + "epoch": 70.06, + "grad_norm": 5.568604946136475, + "learning_rate": 2.9939849624060154e-06, + "loss": 0.2199, + "step": 46590 + }, + { + "epoch": 70.08, + "grad_norm": 8.11655330657959, + "learning_rate": 2.992481203007519e-06, + "loss": 0.1753, + "step": 46600 + }, + { + "epoch": 70.09, + "grad_norm": 7.468130588531494, + "learning_rate": 2.990977443609023e-06, + "loss": 0.1809, + "step": 46610 + }, + { + "epoch": 70.11, + "grad_norm": 6.2125115394592285, + "learning_rate": 2.9894736842105264e-06, + "loss": 0.2751, + "step": 46620 + }, + { + "epoch": 70.12, + "grad_norm": 3.6229348182678223, + "learning_rate": 2.9879699248120303e-06, + "loss": 0.1723, + "step": 46630 + }, + { + "epoch": 70.14, + "grad_norm": 4.364720344543457, + "learning_rate": 2.986466165413534e-06, + "loss": 0.1679, + "step": 46640 + }, + { + "epoch": 70.15, + "grad_norm": 6.14267110824585, + "learning_rate": 2.984962406015038e-06, + "loss": 0.2083, + "step": 46650 + }, + { + "epoch": 70.17, + "grad_norm": 3.9673428535461426, + "learning_rate": 2.9834586466165418e-06, + "loss": 0.1551, + "step": 46660 + }, + { + "epoch": 70.18, + "grad_norm": 4.621110916137695, + "learning_rate": 2.9819548872180453e-06, + "loss": 0.1569, + "step": 46670 + }, + { + "epoch": 70.2, + "grad_norm": 8.371132850646973, + "learning_rate": 2.9804511278195492e-06, + "loss": 0.1963, + "step": 46680 + }, + { + "epoch": 70.21, + "grad_norm": 5.156669616699219, + "learning_rate": 2.9789473684210527e-06, + "loss": 0.2096, + "step": 46690 + }, + { + "epoch": 70.23, + "grad_norm": 4.981484413146973, + "learning_rate": 2.9774436090225567e-06, + "loss": 0.1313, + "step": 46700 + }, + { + "epoch": 70.24, + "grad_norm": 5.359274387359619, + "learning_rate": 2.9759398496240606e-06, + "loss": 0.183, + "step": 46710 + }, + { + "epoch": 70.26, + "grad_norm": 6.65374231338501, + "learning_rate": 2.974436090225564e-06, + "loss": 0.2083, + "step": 46720 + }, + { + "epoch": 70.27, + "grad_norm": 7.133225917816162, + "learning_rate": 2.972932330827068e-06, + "loss": 0.2509, + "step": 46730 + }, + { + "epoch": 70.29, + "grad_norm": 4.245519638061523, + "learning_rate": 2.9714285714285716e-06, + "loss": 0.1871, + "step": 46740 + }, + { + "epoch": 70.3, + "grad_norm": 5.218483924865723, + "learning_rate": 2.9699248120300755e-06, + "loss": 0.1595, + "step": 46750 + }, + { + "epoch": 70.32, + "grad_norm": 6.059344291687012, + "learning_rate": 2.9684210526315795e-06, + "loss": 0.1771, + "step": 46760 + }, + { + "epoch": 70.33, + "grad_norm": 4.6386237144470215, + "learning_rate": 2.966917293233083e-06, + "loss": 0.1868, + "step": 46770 + }, + { + "epoch": 70.35, + "grad_norm": 4.3146257400512695, + "learning_rate": 2.965413533834587e-06, + "loss": 0.1941, + "step": 46780 + }, + { + "epoch": 70.36, + "grad_norm": 4.2247395515441895, + "learning_rate": 2.9639097744360905e-06, + "loss": 0.2796, + "step": 46790 + }, + { + "epoch": 70.38, + "grad_norm": 4.537256717681885, + "learning_rate": 2.9624060150375944e-06, + "loss": 0.1675, + "step": 46800 + }, + { + "epoch": 70.39, + "grad_norm": 10.793876647949219, + "learning_rate": 2.9609022556390983e-06, + "loss": 0.1738, + "step": 46810 + }, + { + "epoch": 70.41, + "grad_norm": 3.7752058506011963, + "learning_rate": 2.959398496240602e-06, + "loss": 0.1864, + "step": 46820 + }, + { + "epoch": 70.42, + "grad_norm": 5.952646732330322, + "learning_rate": 2.957894736842106e-06, + "loss": 0.1994, + "step": 46830 + }, + { + "epoch": 70.44, + "grad_norm": 2.576373815536499, + "learning_rate": 2.9563909774436093e-06, + "loss": 0.1891, + "step": 46840 + }, + { + "epoch": 70.45, + "grad_norm": 3.7196900844573975, + "learning_rate": 2.9548872180451133e-06, + "loss": 0.2391, + "step": 46850 + }, + { + "epoch": 70.47, + "grad_norm": 4.4083571434021, + "learning_rate": 2.9533834586466172e-06, + "loss": 0.2241, + "step": 46860 + }, + { + "epoch": 70.48, + "grad_norm": 4.769841194152832, + "learning_rate": 2.9518796992481207e-06, + "loss": 0.1591, + "step": 46870 + }, + { + "epoch": 70.5, + "grad_norm": 4.998278617858887, + "learning_rate": 2.9503759398496247e-06, + "loss": 0.1785, + "step": 46880 + }, + { + "epoch": 70.51, + "grad_norm": 4.37769889831543, + "learning_rate": 2.948872180451128e-06, + "loss": 0.16, + "step": 46890 + }, + { + "epoch": 70.53, + "grad_norm": 7.736121654510498, + "learning_rate": 2.9473684210526317e-06, + "loss": 0.2141, + "step": 46900 + }, + { + "epoch": 70.54, + "grad_norm": 8.18359375, + "learning_rate": 2.9458646616541352e-06, + "loss": 0.2537, + "step": 46910 + }, + { + "epoch": 70.56, + "grad_norm": 7.136547088623047, + "learning_rate": 2.944360902255639e-06, + "loss": 0.1986, + "step": 46920 + }, + { + "epoch": 70.57, + "grad_norm": 2.9580116271972656, + "learning_rate": 2.9428571428571427e-06, + "loss": 0.1481, + "step": 46930 + }, + { + "epoch": 70.59, + "grad_norm": 5.023125171661377, + "learning_rate": 2.9413533834586466e-06, + "loss": 0.2149, + "step": 46940 + }, + { + "epoch": 70.6, + "grad_norm": 5.102907657623291, + "learning_rate": 2.9398496240601506e-06, + "loss": 0.1215, + "step": 46950 + }, + { + "epoch": 70.62, + "grad_norm": 3.1318459510803223, + "learning_rate": 2.938345864661654e-06, + "loss": 0.1812, + "step": 46960 + }, + { + "epoch": 70.63, + "grad_norm": 3.511244058609009, + "learning_rate": 2.936842105263158e-06, + "loss": 0.1735, + "step": 46970 + }, + { + "epoch": 70.65, + "grad_norm": 8.335837364196777, + "learning_rate": 2.9353383458646616e-06, + "loss": 0.1951, + "step": 46980 + }, + { + "epoch": 70.66, + "grad_norm": 3.936018228530884, + "learning_rate": 2.9338345864661655e-06, + "loss": 0.2294, + "step": 46990 + }, + { + "epoch": 70.68, + "grad_norm": 6.189547538757324, + "learning_rate": 2.9323308270676694e-06, + "loss": 0.2205, + "step": 47000 + }, + { + "epoch": 70.69, + "grad_norm": 3.3733437061309814, + "learning_rate": 2.930827067669173e-06, + "loss": 0.1982, + "step": 47010 + }, + { + "epoch": 70.71, + "grad_norm": 3.5287833213806152, + "learning_rate": 2.929323308270677e-06, + "loss": 0.1972, + "step": 47020 + }, + { + "epoch": 70.72, + "grad_norm": 21.65901756286621, + "learning_rate": 2.9278195488721804e-06, + "loss": 0.2244, + "step": 47030 + }, + { + "epoch": 70.74, + "grad_norm": 7.8496904373168945, + "learning_rate": 2.9263157894736844e-06, + "loss": 0.114, + "step": 47040 + }, + { + "epoch": 70.75, + "grad_norm": 3.459660053253174, + "learning_rate": 2.9248120300751883e-06, + "loss": 0.2043, + "step": 47050 + }, + { + "epoch": 70.77, + "grad_norm": 6.128682613372803, + "learning_rate": 2.923308270676692e-06, + "loss": 0.1774, + "step": 47060 + }, + { + "epoch": 70.78, + "grad_norm": 6.694797515869141, + "learning_rate": 2.9218045112781958e-06, + "loss": 0.1901, + "step": 47070 + }, + { + "epoch": 70.8, + "grad_norm": 7.470389366149902, + "learning_rate": 2.9203007518796993e-06, + "loss": 0.1957, + "step": 47080 + }, + { + "epoch": 70.81, + "grad_norm": 5.193848133087158, + "learning_rate": 2.9187969924812032e-06, + "loss": 0.1428, + "step": 47090 + }, + { + "epoch": 70.83, + "grad_norm": 5.4394049644470215, + "learning_rate": 2.9172932330827068e-06, + "loss": 0.1873, + "step": 47100 + }, + { + "epoch": 70.84, + "grad_norm": 5.622503280639648, + "learning_rate": 2.9157894736842107e-06, + "loss": 0.2147, + "step": 47110 + }, + { + "epoch": 70.86, + "grad_norm": 9.180618286132812, + "learning_rate": 2.9142857142857146e-06, + "loss": 0.1662, + "step": 47120 + }, + { + "epoch": 70.87, + "grad_norm": 3.7462944984436035, + "learning_rate": 2.912781954887218e-06, + "loss": 0.1925, + "step": 47130 + }, + { + "epoch": 70.89, + "grad_norm": 6.142997741699219, + "learning_rate": 2.911278195488722e-06, + "loss": 0.1398, + "step": 47140 + }, + { + "epoch": 70.9, + "grad_norm": 7.028275966644287, + "learning_rate": 2.9097744360902256e-06, + "loss": 0.2052, + "step": 47150 + }, + { + "epoch": 70.92, + "grad_norm": 4.399960517883301, + "learning_rate": 2.9082706766917296e-06, + "loss": 0.2089, + "step": 47160 + }, + { + "epoch": 70.93, + "grad_norm": 2.562202215194702, + "learning_rate": 2.9067669172932335e-06, + "loss": 0.1564, + "step": 47170 + }, + { + "epoch": 70.95, + "grad_norm": 9.725334167480469, + "learning_rate": 2.905263157894737e-06, + "loss": 0.1702, + "step": 47180 + }, + { + "epoch": 70.96, + "grad_norm": 4.6251654624938965, + "learning_rate": 2.903759398496241e-06, + "loss": 0.247, + "step": 47190 + }, + { + "epoch": 70.98, + "grad_norm": 5.6048760414123535, + "learning_rate": 2.9022556390977445e-06, + "loss": 0.2587, + "step": 47200 + }, + { + "epoch": 70.99, + "grad_norm": 2.857663631439209, + "learning_rate": 2.9007518796992484e-06, + "loss": 0.2251, + "step": 47210 + }, + { + "epoch": 71.0, + "eval_accuracy": 0.9305, + "eval_loss": 0.32967016100883484, + "eval_runtime": 84.8297, + "eval_samples_per_second": 117.883, + "eval_steps_per_second": 0.472, + "step": 47215 + }, + { + "epoch": 71.01, + "grad_norm": 3.354151487350464, + "learning_rate": 2.8992481203007524e-06, + "loss": 0.1378, + "step": 47220 + }, + { + "epoch": 71.02, + "grad_norm": 14.883716583251953, + "learning_rate": 2.897744360902256e-06, + "loss": 0.1888, + "step": 47230 + }, + { + "epoch": 71.04, + "grad_norm": 3.659508228302002, + "learning_rate": 2.89624060150376e-06, + "loss": 0.1611, + "step": 47240 + }, + { + "epoch": 71.05, + "grad_norm": 4.597053527832031, + "learning_rate": 2.8947368421052634e-06, + "loss": 0.1918, + "step": 47250 + }, + { + "epoch": 71.07, + "grad_norm": 4.022745132446289, + "learning_rate": 2.8932330827067673e-06, + "loss": 0.2231, + "step": 47260 + }, + { + "epoch": 71.08, + "grad_norm": 3.9845337867736816, + "learning_rate": 2.8917293233082712e-06, + "loss": 0.2327, + "step": 47270 + }, + { + "epoch": 71.1, + "grad_norm": 6.1547698974609375, + "learning_rate": 2.8902255639097748e-06, + "loss": 0.2559, + "step": 47280 + }, + { + "epoch": 71.11, + "grad_norm": 10.55440616607666, + "learning_rate": 2.8887218045112787e-06, + "loss": 0.166, + "step": 47290 + }, + { + "epoch": 71.13, + "grad_norm": 4.419111251831055, + "learning_rate": 2.8872180451127822e-06, + "loss": 0.1644, + "step": 47300 + }, + { + "epoch": 71.14, + "grad_norm": 2.8730411529541016, + "learning_rate": 2.885714285714286e-06, + "loss": 0.1481, + "step": 47310 + }, + { + "epoch": 71.16, + "grad_norm": 4.056236267089844, + "learning_rate": 2.88421052631579e-06, + "loss": 0.2006, + "step": 47320 + }, + { + "epoch": 71.17, + "grad_norm": 5.246281623840332, + "learning_rate": 2.8827067669172936e-06, + "loss": 0.2015, + "step": 47330 + }, + { + "epoch": 71.19, + "grad_norm": 4.46673059463501, + "learning_rate": 2.8812030075187976e-06, + "loss": 0.1138, + "step": 47340 + }, + { + "epoch": 71.2, + "grad_norm": 9.421850204467773, + "learning_rate": 2.879699248120301e-06, + "loss": 0.2566, + "step": 47350 + }, + { + "epoch": 71.22, + "grad_norm": 5.661787033081055, + "learning_rate": 2.878195488721805e-06, + "loss": 0.1804, + "step": 47360 + }, + { + "epoch": 71.23, + "grad_norm": 4.693019390106201, + "learning_rate": 2.876691729323309e-06, + "loss": 0.1765, + "step": 47370 + }, + { + "epoch": 71.25, + "grad_norm": 6.069882392883301, + "learning_rate": 2.8751879699248125e-06, + "loss": 0.173, + "step": 47380 + }, + { + "epoch": 71.26, + "grad_norm": 8.419574737548828, + "learning_rate": 2.8736842105263164e-06, + "loss": 0.1686, + "step": 47390 + }, + { + "epoch": 71.28, + "grad_norm": 2.8149406909942627, + "learning_rate": 2.8721804511278195e-06, + "loss": 0.2335, + "step": 47400 + }, + { + "epoch": 71.29, + "grad_norm": 5.66928243637085, + "learning_rate": 2.8706766917293235e-06, + "loss": 0.2031, + "step": 47410 + }, + { + "epoch": 71.31, + "grad_norm": 3.403899908065796, + "learning_rate": 2.869172932330827e-06, + "loss": 0.2124, + "step": 47420 + }, + { + "epoch": 71.32, + "grad_norm": 5.2177910804748535, + "learning_rate": 2.867669172932331e-06, + "loss": 0.1582, + "step": 47430 + }, + { + "epoch": 71.34, + "grad_norm": 5.742595195770264, + "learning_rate": 2.8661654135338344e-06, + "loss": 0.186, + "step": 47440 + }, + { + "epoch": 71.35, + "grad_norm": 4.816529273986816, + "learning_rate": 2.8646616541353384e-06, + "loss": 0.1572, + "step": 47450 + }, + { + "epoch": 71.37, + "grad_norm": 4.075284004211426, + "learning_rate": 2.8631578947368423e-06, + "loss": 0.1586, + "step": 47460 + }, + { + "epoch": 71.38, + "grad_norm": 5.9553446769714355, + "learning_rate": 2.861654135338346e-06, + "loss": 0.1871, + "step": 47470 + }, + { + "epoch": 71.4, + "grad_norm": 5.976673126220703, + "learning_rate": 2.86015037593985e-06, + "loss": 0.1897, + "step": 47480 + }, + { + "epoch": 71.41, + "grad_norm": 3.2416350841522217, + "learning_rate": 2.8586466165413533e-06, + "loss": 0.1402, + "step": 47490 + }, + { + "epoch": 71.43, + "grad_norm": 3.216466188430786, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.1308, + "step": 47500 + }, + { + "epoch": 71.44, + "grad_norm": 3.74507737159729, + "learning_rate": 2.855639097744361e-06, + "loss": 0.2048, + "step": 47510 + }, + { + "epoch": 71.46, + "grad_norm": 2.735323905944824, + "learning_rate": 2.8541353383458647e-06, + "loss": 0.2007, + "step": 47520 + }, + { + "epoch": 71.47, + "grad_norm": 4.694336414337158, + "learning_rate": 2.8526315789473687e-06, + "loss": 0.1947, + "step": 47530 + }, + { + "epoch": 71.49, + "grad_norm": 4.027325630187988, + "learning_rate": 2.851127819548872e-06, + "loss": 0.1442, + "step": 47540 + }, + { + "epoch": 71.5, + "grad_norm": 4.482545852661133, + "learning_rate": 2.849624060150376e-06, + "loss": 0.1949, + "step": 47550 + }, + { + "epoch": 71.52, + "grad_norm": 21.274612426757812, + "learning_rate": 2.8481203007518796e-06, + "loss": 0.2099, + "step": 47560 + }, + { + "epoch": 71.53, + "grad_norm": 7.941802978515625, + "learning_rate": 2.8466165413533836e-06, + "loss": 0.1769, + "step": 47570 + }, + { + "epoch": 71.55, + "grad_norm": 11.027046203613281, + "learning_rate": 2.8451127819548875e-06, + "loss": 0.2497, + "step": 47580 + }, + { + "epoch": 71.56, + "grad_norm": 1.5711476802825928, + "learning_rate": 2.843609022556391e-06, + "loss": 0.1957, + "step": 47590 + }, + { + "epoch": 71.58, + "grad_norm": 8.369176864624023, + "learning_rate": 2.842105263157895e-06, + "loss": 0.2131, + "step": 47600 + }, + { + "epoch": 71.59, + "grad_norm": 4.709144592285156, + "learning_rate": 2.8406015037593985e-06, + "loss": 0.2316, + "step": 47610 + }, + { + "epoch": 71.61, + "grad_norm": 4.679172992706299, + "learning_rate": 2.8390977443609024e-06, + "loss": 0.1916, + "step": 47620 + }, + { + "epoch": 71.62, + "grad_norm": 7.5052924156188965, + "learning_rate": 2.8375939849624064e-06, + "loss": 0.1613, + "step": 47630 + }, + { + "epoch": 71.64, + "grad_norm": 6.649926662445068, + "learning_rate": 2.83609022556391e-06, + "loss": 0.1848, + "step": 47640 + }, + { + "epoch": 71.65, + "grad_norm": 5.853745937347412, + "learning_rate": 2.834586466165414e-06, + "loss": 0.1762, + "step": 47650 + }, + { + "epoch": 71.67, + "grad_norm": 6.767156600952148, + "learning_rate": 2.8330827067669174e-06, + "loss": 0.1358, + "step": 47660 + }, + { + "epoch": 71.68, + "grad_norm": 2.320805311203003, + "learning_rate": 2.8315789473684213e-06, + "loss": 0.1576, + "step": 47670 + }, + { + "epoch": 71.7, + "grad_norm": 6.782162666320801, + "learning_rate": 2.8300751879699253e-06, + "loss": 0.2299, + "step": 47680 + }, + { + "epoch": 71.71, + "grad_norm": 5.48624324798584, + "learning_rate": 2.8285714285714288e-06, + "loss": 0.1981, + "step": 47690 + }, + { + "epoch": 71.73, + "grad_norm": 4.4011335372924805, + "learning_rate": 2.8270676691729327e-06, + "loss": 0.1662, + "step": 47700 + }, + { + "epoch": 71.74, + "grad_norm": 2.359865188598633, + "learning_rate": 2.8255639097744362e-06, + "loss": 0.1419, + "step": 47710 + }, + { + "epoch": 71.76, + "grad_norm": 6.685113430023193, + "learning_rate": 2.82406015037594e-06, + "loss": 0.2208, + "step": 47720 + }, + { + "epoch": 71.77, + "grad_norm": 4.598508358001709, + "learning_rate": 2.822556390977444e-06, + "loss": 0.1872, + "step": 47730 + }, + { + "epoch": 71.79, + "grad_norm": 4.389933109283447, + "learning_rate": 2.8210526315789476e-06, + "loss": 0.196, + "step": 47740 + }, + { + "epoch": 71.8, + "grad_norm": 3.9596188068389893, + "learning_rate": 2.8195488721804516e-06, + "loss": 0.121, + "step": 47750 + }, + { + "epoch": 71.82, + "grad_norm": 1.1457980871200562, + "learning_rate": 2.818045112781955e-06, + "loss": 0.2009, + "step": 47760 + }, + { + "epoch": 71.83, + "grad_norm": 6.121349811553955, + "learning_rate": 2.816541353383459e-06, + "loss": 0.1835, + "step": 47770 + }, + { + "epoch": 71.85, + "grad_norm": 4.898597717285156, + "learning_rate": 2.815037593984963e-06, + "loss": 0.1345, + "step": 47780 + }, + { + "epoch": 71.86, + "grad_norm": 10.096803665161133, + "learning_rate": 2.8135338345864665e-06, + "loss": 0.1851, + "step": 47790 + }, + { + "epoch": 71.88, + "grad_norm": 5.386133193969727, + "learning_rate": 2.8120300751879705e-06, + "loss": 0.1939, + "step": 47800 + }, + { + "epoch": 71.89, + "grad_norm": 4.9606428146362305, + "learning_rate": 2.810526315789474e-06, + "loss": 0.1756, + "step": 47810 + }, + { + "epoch": 71.91, + "grad_norm": 3.927384853363037, + "learning_rate": 2.809022556390978e-06, + "loss": 0.1958, + "step": 47820 + }, + { + "epoch": 71.92, + "grad_norm": 1.8455919027328491, + "learning_rate": 2.807518796992482e-06, + "loss": 0.1686, + "step": 47830 + }, + { + "epoch": 71.94, + "grad_norm": 4.488922119140625, + "learning_rate": 2.8060150375939854e-06, + "loss": 0.1532, + "step": 47840 + }, + { + "epoch": 71.95, + "grad_norm": 5.330088138580322, + "learning_rate": 2.8045112781954893e-06, + "loss": 0.1574, + "step": 47850 + }, + { + "epoch": 71.97, + "grad_norm": 5.4682207107543945, + "learning_rate": 2.803007518796993e-06, + "loss": 0.1734, + "step": 47860 + }, + { + "epoch": 71.98, + "grad_norm": 6.902002811431885, + "learning_rate": 2.8015037593984968e-06, + "loss": 0.1673, + "step": 47870 + }, + { + "epoch": 72.0, + "grad_norm": 0.017349636182188988, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.1518, + "step": 47880 + }, + { + "epoch": 72.0, + "eval_accuracy": 0.9311, + "eval_loss": 0.3322134017944336, + "eval_runtime": 84.7025, + "eval_samples_per_second": 118.06, + "eval_steps_per_second": 0.472, + "step": 47880 + }, + { + "epoch": 72.02, + "grad_norm": 2.0250072479248047, + "learning_rate": 2.798496240601504e-06, + "loss": 0.2221, + "step": 47890 + }, + { + "epoch": 72.03, + "grad_norm": 3.505276679992676, + "learning_rate": 2.7969924812030073e-06, + "loss": 0.1402, + "step": 47900 + }, + { + "epoch": 72.05, + "grad_norm": 4.723272323608398, + "learning_rate": 2.7954887218045113e-06, + "loss": 0.1832, + "step": 47910 + }, + { + "epoch": 72.06, + "grad_norm": 11.436491012573242, + "learning_rate": 2.7939849624060152e-06, + "loss": 0.1595, + "step": 47920 + }, + { + "epoch": 72.08, + "grad_norm": 8.044598579406738, + "learning_rate": 2.7924812030075187e-06, + "loss": 0.2156, + "step": 47930 + }, + { + "epoch": 72.09, + "grad_norm": 5.603691577911377, + "learning_rate": 2.7909774436090227e-06, + "loss": 0.1677, + "step": 47940 + }, + { + "epoch": 72.11, + "grad_norm": 3.3360657691955566, + "learning_rate": 2.789473684210526e-06, + "loss": 0.1723, + "step": 47950 + }, + { + "epoch": 72.12, + "grad_norm": 5.758209228515625, + "learning_rate": 2.78796992481203e-06, + "loss": 0.1905, + "step": 47960 + }, + { + "epoch": 72.14, + "grad_norm": 6.080915451049805, + "learning_rate": 2.786466165413534e-06, + "loss": 0.2027, + "step": 47970 + }, + { + "epoch": 72.15, + "grad_norm": 4.1375017166137695, + "learning_rate": 2.7849624060150376e-06, + "loss": 0.1241, + "step": 47980 + }, + { + "epoch": 72.17, + "grad_norm": 4.435542106628418, + "learning_rate": 2.7834586466165415e-06, + "loss": 0.2314, + "step": 47990 + }, + { + "epoch": 72.18, + "grad_norm": 3.8110594749450684, + "learning_rate": 2.781954887218045e-06, + "loss": 0.1544, + "step": 48000 + }, + { + "epoch": 72.2, + "grad_norm": 3.3765087127685547, + "learning_rate": 2.780451127819549e-06, + "loss": 0.2048, + "step": 48010 + }, + { + "epoch": 72.21, + "grad_norm": 6.244101047515869, + "learning_rate": 2.7789473684210525e-06, + "loss": 0.1541, + "step": 48020 + }, + { + "epoch": 72.23, + "grad_norm": 8.180159568786621, + "learning_rate": 2.7774436090225565e-06, + "loss": 0.1783, + "step": 48030 + }, + { + "epoch": 72.24, + "grad_norm": 2.5639536380767822, + "learning_rate": 2.7759398496240604e-06, + "loss": 0.1643, + "step": 48040 + }, + { + "epoch": 72.26, + "grad_norm": 3.273380756378174, + "learning_rate": 2.774436090225564e-06, + "loss": 0.1452, + "step": 48050 + }, + { + "epoch": 72.27, + "grad_norm": 5.514414310455322, + "learning_rate": 2.772932330827068e-06, + "loss": 0.1401, + "step": 48060 + }, + { + "epoch": 72.29, + "grad_norm": 6.009876251220703, + "learning_rate": 2.7714285714285714e-06, + "loss": 0.1812, + "step": 48070 + }, + { + "epoch": 72.3, + "grad_norm": 1.5807594060897827, + "learning_rate": 2.7699248120300753e-06, + "loss": 0.1392, + "step": 48080 + }, + { + "epoch": 72.32, + "grad_norm": 4.397557258605957, + "learning_rate": 2.7684210526315793e-06, + "loss": 0.1882, + "step": 48090 + }, + { + "epoch": 72.33, + "grad_norm": 5.294480323791504, + "learning_rate": 2.766917293233083e-06, + "loss": 0.2203, + "step": 48100 + }, + { + "epoch": 72.35, + "grad_norm": 2.36199688911438, + "learning_rate": 2.7654135338345867e-06, + "loss": 0.2092, + "step": 48110 + }, + { + "epoch": 72.36, + "grad_norm": 6.017412185668945, + "learning_rate": 2.7639097744360903e-06, + "loss": 0.1849, + "step": 48120 + }, + { + "epoch": 72.38, + "grad_norm": 6.914893627166748, + "learning_rate": 2.762406015037594e-06, + "loss": 0.1619, + "step": 48130 + }, + { + "epoch": 72.39, + "grad_norm": 3.912938117980957, + "learning_rate": 2.760902255639098e-06, + "loss": 0.2099, + "step": 48140 + }, + { + "epoch": 72.41, + "grad_norm": 4.893369197845459, + "learning_rate": 2.7593984962406017e-06, + "loss": 0.245, + "step": 48150 + }, + { + "epoch": 72.42, + "grad_norm": 3.4922471046447754, + "learning_rate": 2.7578947368421056e-06, + "loss": 0.1452, + "step": 48160 + }, + { + "epoch": 72.44, + "grad_norm": 10.81135082244873, + "learning_rate": 2.756390977443609e-06, + "loss": 0.1703, + "step": 48170 + }, + { + "epoch": 72.45, + "grad_norm": 5.498162269592285, + "learning_rate": 2.754887218045113e-06, + "loss": 0.1733, + "step": 48180 + }, + { + "epoch": 72.47, + "grad_norm": 3.2235593795776367, + "learning_rate": 2.753383458646617e-06, + "loss": 0.2167, + "step": 48190 + }, + { + "epoch": 72.48, + "grad_norm": 7.933281421661377, + "learning_rate": 2.7518796992481205e-06, + "loss": 0.2088, + "step": 48200 + }, + { + "epoch": 72.5, + "grad_norm": 3.2224316596984863, + "learning_rate": 2.7503759398496245e-06, + "loss": 0.1841, + "step": 48210 + }, + { + "epoch": 72.51, + "grad_norm": 5.602688312530518, + "learning_rate": 2.748872180451128e-06, + "loss": 0.1924, + "step": 48220 + }, + { + "epoch": 72.53, + "grad_norm": 4.668603897094727, + "learning_rate": 2.747368421052632e-06, + "loss": 0.2254, + "step": 48230 + }, + { + "epoch": 72.54, + "grad_norm": 5.575139045715332, + "learning_rate": 2.745864661654136e-06, + "loss": 0.1475, + "step": 48240 + }, + { + "epoch": 72.56, + "grad_norm": 5.640378475189209, + "learning_rate": 2.7443609022556394e-06, + "loss": 0.2488, + "step": 48250 + }, + { + "epoch": 72.57, + "grad_norm": 5.799993515014648, + "learning_rate": 2.7428571428571433e-06, + "loss": 0.206, + "step": 48260 + }, + { + "epoch": 72.59, + "grad_norm": 4.525552272796631, + "learning_rate": 2.741353383458647e-06, + "loss": 0.1442, + "step": 48270 + }, + { + "epoch": 72.6, + "grad_norm": 6.230297565460205, + "learning_rate": 2.739849624060151e-06, + "loss": 0.2157, + "step": 48280 + }, + { + "epoch": 72.62, + "grad_norm": 4.961235046386719, + "learning_rate": 2.7383458646616543e-06, + "loss": 0.1658, + "step": 48290 + }, + { + "epoch": 72.63, + "grad_norm": 6.751160621643066, + "learning_rate": 2.7368421052631583e-06, + "loss": 0.2332, + "step": 48300 + }, + { + "epoch": 72.65, + "grad_norm": 4.099827766418457, + "learning_rate": 2.735338345864662e-06, + "loss": 0.185, + "step": 48310 + }, + { + "epoch": 72.66, + "grad_norm": 6.973018646240234, + "learning_rate": 2.7338345864661657e-06, + "loss": 0.1865, + "step": 48320 + }, + { + "epoch": 72.68, + "grad_norm": 4.904829978942871, + "learning_rate": 2.7323308270676697e-06, + "loss": 0.1717, + "step": 48330 + }, + { + "epoch": 72.69, + "grad_norm": 7.860152721405029, + "learning_rate": 2.730827067669173e-06, + "loss": 0.1862, + "step": 48340 + }, + { + "epoch": 72.71, + "grad_norm": 5.617115020751953, + "learning_rate": 2.729323308270677e-06, + "loss": 0.2077, + "step": 48350 + }, + { + "epoch": 72.72, + "grad_norm": 4.507359504699707, + "learning_rate": 2.727819548872181e-06, + "loss": 0.2003, + "step": 48360 + }, + { + "epoch": 72.74, + "grad_norm": 4.982139587402344, + "learning_rate": 2.7263157894736846e-06, + "loss": 0.237, + "step": 48370 + }, + { + "epoch": 72.75, + "grad_norm": 4.0217604637146, + "learning_rate": 2.7248120300751885e-06, + "loss": 0.1453, + "step": 48380 + }, + { + "epoch": 72.77, + "grad_norm": 7.618423938751221, + "learning_rate": 2.7233082706766916e-06, + "loss": 0.1914, + "step": 48390 + }, + { + "epoch": 72.78, + "grad_norm": 6.787216663360596, + "learning_rate": 2.7218045112781956e-06, + "loss": 0.1451, + "step": 48400 + }, + { + "epoch": 72.8, + "grad_norm": 4.273850440979004, + "learning_rate": 2.720300751879699e-06, + "loss": 0.2088, + "step": 48410 + }, + { + "epoch": 72.81, + "grad_norm": 5.214813709259033, + "learning_rate": 2.718796992481203e-06, + "loss": 0.1597, + "step": 48420 + }, + { + "epoch": 72.83, + "grad_norm": 4.873306751251221, + "learning_rate": 2.7172932330827066e-06, + "loss": 0.1827, + "step": 48430 + }, + { + "epoch": 72.84, + "grad_norm": 5.74524450302124, + "learning_rate": 2.7157894736842105e-06, + "loss": 0.1795, + "step": 48440 + }, + { + "epoch": 72.86, + "grad_norm": 6.21406364440918, + "learning_rate": 2.7142857142857144e-06, + "loss": 0.2171, + "step": 48450 + }, + { + "epoch": 72.87, + "grad_norm": 3.394968032836914, + "learning_rate": 2.712781954887218e-06, + "loss": 0.1758, + "step": 48460 + }, + { + "epoch": 72.89, + "grad_norm": 2.936285972595215, + "learning_rate": 2.711278195488722e-06, + "loss": 0.2162, + "step": 48470 + }, + { + "epoch": 72.9, + "grad_norm": 2.2436184883117676, + "learning_rate": 2.7097744360902254e-06, + "loss": 0.1331, + "step": 48480 + }, + { + "epoch": 72.92, + "grad_norm": 4.331397533416748, + "learning_rate": 2.7082706766917294e-06, + "loss": 0.1934, + "step": 48490 + }, + { + "epoch": 72.93, + "grad_norm": 8.824311256408691, + "learning_rate": 2.7067669172932333e-06, + "loss": 0.2538, + "step": 48500 + }, + { + "epoch": 72.95, + "grad_norm": 7.463961124420166, + "learning_rate": 2.705263157894737e-06, + "loss": 0.212, + "step": 48510 + }, + { + "epoch": 72.96, + "grad_norm": 9.955108642578125, + "learning_rate": 2.7037593984962408e-06, + "loss": 0.2191, + "step": 48520 + }, + { + "epoch": 72.98, + "grad_norm": 3.3155603408813477, + "learning_rate": 2.7022556390977443e-06, + "loss": 0.1842, + "step": 48530 + }, + { + "epoch": 72.99, + "grad_norm": 3.515291929244995, + "learning_rate": 2.7007518796992482e-06, + "loss": 0.1914, + "step": 48540 + }, + { + "epoch": 73.0, + "eval_accuracy": 0.931, + "eval_loss": 0.3263280391693115, + "eval_runtime": 84.3768, + "eval_samples_per_second": 118.516, + "eval_steps_per_second": 0.474, + "step": 48545 + }, + { + "epoch": 73.01, + "grad_norm": 5.640552520751953, + "learning_rate": 2.699248120300752e-06, + "loss": 0.1428, + "step": 48550 + }, + { + "epoch": 73.02, + "grad_norm": 4.772902965545654, + "learning_rate": 2.6977443609022557e-06, + "loss": 0.1821, + "step": 48560 + }, + { + "epoch": 73.04, + "grad_norm": 3.3709876537323, + "learning_rate": 2.6962406015037596e-06, + "loss": 0.1939, + "step": 48570 + }, + { + "epoch": 73.05, + "grad_norm": 8.709583282470703, + "learning_rate": 2.694736842105263e-06, + "loss": 0.24, + "step": 48580 + }, + { + "epoch": 73.07, + "grad_norm": 3.556995391845703, + "learning_rate": 2.693233082706767e-06, + "loss": 0.1939, + "step": 48590 + }, + { + "epoch": 73.08, + "grad_norm": 4.488801002502441, + "learning_rate": 2.691729323308271e-06, + "loss": 0.1767, + "step": 48600 + }, + { + "epoch": 73.1, + "grad_norm": 3.0920093059539795, + "learning_rate": 2.6902255639097746e-06, + "loss": 0.148, + "step": 48610 + }, + { + "epoch": 73.11, + "grad_norm": 7.643060207366943, + "learning_rate": 2.6887218045112785e-06, + "loss": 0.1924, + "step": 48620 + }, + { + "epoch": 73.13, + "grad_norm": 11.217390060424805, + "learning_rate": 2.687218045112782e-06, + "loss": 0.2318, + "step": 48630 + }, + { + "epoch": 73.14, + "grad_norm": 8.313175201416016, + "learning_rate": 2.685714285714286e-06, + "loss": 0.1629, + "step": 48640 + }, + { + "epoch": 73.16, + "grad_norm": 4.154397010803223, + "learning_rate": 2.68421052631579e-06, + "loss": 0.1958, + "step": 48650 + }, + { + "epoch": 73.17, + "grad_norm": 3.842160224914551, + "learning_rate": 2.6827067669172934e-06, + "loss": 0.1451, + "step": 48660 + }, + { + "epoch": 73.19, + "grad_norm": 6.915648460388184, + "learning_rate": 2.6812030075187974e-06, + "loss": 0.1452, + "step": 48670 + }, + { + "epoch": 73.2, + "grad_norm": 4.855998992919922, + "learning_rate": 2.679699248120301e-06, + "loss": 0.1455, + "step": 48680 + }, + { + "epoch": 73.22, + "grad_norm": 1.8087424039840698, + "learning_rate": 2.678195488721805e-06, + "loss": 0.1554, + "step": 48690 + }, + { + "epoch": 73.23, + "grad_norm": 1.4605119228363037, + "learning_rate": 2.6766917293233088e-06, + "loss": 0.2174, + "step": 48700 + }, + { + "epoch": 73.25, + "grad_norm": 6.26193380355835, + "learning_rate": 2.6751879699248123e-06, + "loss": 0.2441, + "step": 48710 + }, + { + "epoch": 73.26, + "grad_norm": 6.614605903625488, + "learning_rate": 2.6736842105263162e-06, + "loss": 0.2272, + "step": 48720 + }, + { + "epoch": 73.28, + "grad_norm": 3.6685211658477783, + "learning_rate": 2.6721804511278197e-06, + "loss": 0.1562, + "step": 48730 + }, + { + "epoch": 73.29, + "grad_norm": 4.120753288269043, + "learning_rate": 2.6706766917293237e-06, + "loss": 0.1567, + "step": 48740 + }, + { + "epoch": 73.31, + "grad_norm": 16.986003875732422, + "learning_rate": 2.669172932330827e-06, + "loss": 0.2109, + "step": 48750 + }, + { + "epoch": 73.32, + "grad_norm": 6.729957580566406, + "learning_rate": 2.667669172932331e-06, + "loss": 0.2327, + "step": 48760 + }, + { + "epoch": 73.34, + "grad_norm": 3.7136998176574707, + "learning_rate": 2.666165413533835e-06, + "loss": 0.2161, + "step": 48770 + }, + { + "epoch": 73.35, + "grad_norm": 3.925506591796875, + "learning_rate": 2.6646616541353386e-06, + "loss": 0.2049, + "step": 48780 + }, + { + "epoch": 73.37, + "grad_norm": 11.8861665725708, + "learning_rate": 2.6631578947368426e-06, + "loss": 0.1432, + "step": 48790 + }, + { + "epoch": 73.38, + "grad_norm": 4.450233459472656, + "learning_rate": 2.661654135338346e-06, + "loss": 0.1958, + "step": 48800 + }, + { + "epoch": 73.4, + "grad_norm": 7.333054542541504, + "learning_rate": 2.66015037593985e-06, + "loss": 0.2231, + "step": 48810 + }, + { + "epoch": 73.41, + "grad_norm": 5.88569974899292, + "learning_rate": 2.658646616541354e-06, + "loss": 0.1502, + "step": 48820 + }, + { + "epoch": 73.43, + "grad_norm": 4.52285099029541, + "learning_rate": 2.6571428571428575e-06, + "loss": 0.2144, + "step": 48830 + }, + { + "epoch": 73.44, + "grad_norm": 6.34915828704834, + "learning_rate": 2.6556390977443614e-06, + "loss": 0.221, + "step": 48840 + }, + { + "epoch": 73.46, + "grad_norm": 2.9954657554626465, + "learning_rate": 2.654135338345865e-06, + "loss": 0.1306, + "step": 48850 + }, + { + "epoch": 73.47, + "grad_norm": 4.723332405090332, + "learning_rate": 2.652631578947369e-06, + "loss": 0.1725, + "step": 48860 + }, + { + "epoch": 73.49, + "grad_norm": 5.571535587310791, + "learning_rate": 2.651127819548873e-06, + "loss": 0.2305, + "step": 48870 + }, + { + "epoch": 73.5, + "grad_norm": 6.174119472503662, + "learning_rate": 2.6496240601503763e-06, + "loss": 0.2203, + "step": 48880 + }, + { + "epoch": 73.52, + "grad_norm": 3.709364175796509, + "learning_rate": 2.6481203007518794e-06, + "loss": 0.1775, + "step": 48890 + }, + { + "epoch": 73.53, + "grad_norm": 7.268237590789795, + "learning_rate": 2.6466165413533834e-06, + "loss": 0.212, + "step": 48900 + }, + { + "epoch": 73.55, + "grad_norm": 1.238796591758728, + "learning_rate": 2.6451127819548873e-06, + "loss": 0.1747, + "step": 48910 + }, + { + "epoch": 73.56, + "grad_norm": 3.121783971786499, + "learning_rate": 2.643609022556391e-06, + "loss": 0.1938, + "step": 48920 + }, + { + "epoch": 73.58, + "grad_norm": 4.7542500495910645, + "learning_rate": 2.6421052631578948e-06, + "loss": 0.2038, + "step": 48930 + }, + { + "epoch": 73.59, + "grad_norm": 5.120677471160889, + "learning_rate": 2.6406015037593983e-06, + "loss": 0.2355, + "step": 48940 + }, + { + "epoch": 73.61, + "grad_norm": 6.299097537994385, + "learning_rate": 2.6390977443609022e-06, + "loss": 0.2069, + "step": 48950 + }, + { + "epoch": 73.62, + "grad_norm": 4.0084428787231445, + "learning_rate": 2.637593984962406e-06, + "loss": 0.1697, + "step": 48960 + }, + { + "epoch": 73.64, + "grad_norm": 3.608576536178589, + "learning_rate": 2.6360902255639097e-06, + "loss": 0.1726, + "step": 48970 + }, + { + "epoch": 73.65, + "grad_norm": 5.771646499633789, + "learning_rate": 2.6345864661654137e-06, + "loss": 0.2716, + "step": 48980 + }, + { + "epoch": 73.67, + "grad_norm": 8.164382934570312, + "learning_rate": 2.633082706766917e-06, + "loss": 0.1862, + "step": 48990 + }, + { + "epoch": 73.68, + "grad_norm": 5.360281467437744, + "learning_rate": 2.631578947368421e-06, + "loss": 0.163, + "step": 49000 + }, + { + "epoch": 73.7, + "grad_norm": 5.156652450561523, + "learning_rate": 2.630075187969925e-06, + "loss": 0.1928, + "step": 49010 + }, + { + "epoch": 73.71, + "grad_norm": 3.7978944778442383, + "learning_rate": 2.6285714285714286e-06, + "loss": 0.1318, + "step": 49020 + }, + { + "epoch": 73.73, + "grad_norm": 7.672252655029297, + "learning_rate": 2.6270676691729325e-06, + "loss": 0.2079, + "step": 49030 + }, + { + "epoch": 73.74, + "grad_norm": 4.497259616851807, + "learning_rate": 2.625563909774436e-06, + "loss": 0.2074, + "step": 49040 + }, + { + "epoch": 73.76, + "grad_norm": 4.420539855957031, + "learning_rate": 2.62406015037594e-06, + "loss": 0.1372, + "step": 49050 + }, + { + "epoch": 73.77, + "grad_norm": 4.705691814422607, + "learning_rate": 2.622556390977444e-06, + "loss": 0.2247, + "step": 49060 + }, + { + "epoch": 73.79, + "grad_norm": 4.325020790100098, + "learning_rate": 2.6210526315789474e-06, + "loss": 0.1768, + "step": 49070 + }, + { + "epoch": 73.8, + "grad_norm": 4.0322442054748535, + "learning_rate": 2.6195488721804514e-06, + "loss": 0.1707, + "step": 49080 + }, + { + "epoch": 73.82, + "grad_norm": 7.42966890335083, + "learning_rate": 2.618045112781955e-06, + "loss": 0.2521, + "step": 49090 + }, + { + "epoch": 73.83, + "grad_norm": 4.630422115325928, + "learning_rate": 2.616541353383459e-06, + "loss": 0.2168, + "step": 49100 + }, + { + "epoch": 73.85, + "grad_norm": 5.983313083648682, + "learning_rate": 2.6150375939849628e-06, + "loss": 0.2102, + "step": 49110 + }, + { + "epoch": 73.86, + "grad_norm": 2.551830291748047, + "learning_rate": 2.6135338345864663e-06, + "loss": 0.1586, + "step": 49120 + }, + { + "epoch": 73.88, + "grad_norm": 6.646661281585693, + "learning_rate": 2.6120300751879702e-06, + "loss": 0.2198, + "step": 49130 + }, + { + "epoch": 73.89, + "grad_norm": 5.7124176025390625, + "learning_rate": 2.6105263157894738e-06, + "loss": 0.2081, + "step": 49140 + }, + { + "epoch": 73.91, + "grad_norm": 15.754138946533203, + "learning_rate": 2.6090225563909777e-06, + "loss": 0.1661, + "step": 49150 + }, + { + "epoch": 73.92, + "grad_norm": 5.884487152099609, + "learning_rate": 2.6075187969924817e-06, + "loss": 0.1548, + "step": 49160 + }, + { + "epoch": 73.94, + "grad_norm": 1.3139573335647583, + "learning_rate": 2.606015037593985e-06, + "loss": 0.1773, + "step": 49170 + }, + { + "epoch": 73.95, + "grad_norm": 4.792730808258057, + "learning_rate": 2.604511278195489e-06, + "loss": 0.1666, + "step": 49180 + }, + { + "epoch": 73.97, + "grad_norm": 1.9457217454910278, + "learning_rate": 2.6030075187969926e-06, + "loss": 0.1575, + "step": 49190 + }, + { + "epoch": 73.98, + "grad_norm": 6.806911468505859, + "learning_rate": 2.6015037593984966e-06, + "loss": 0.1794, + "step": 49200 + }, + { + "epoch": 74.0, + "grad_norm": 0.026680290699005127, + "learning_rate": 2.6e-06, + "loss": 0.2097, + "step": 49210 + }, + { + "epoch": 74.0, + "eval_accuracy": 0.9294, + "eval_loss": 0.33669278025627136, + "eval_runtime": 84.9311, + "eval_samples_per_second": 117.743, + "eval_steps_per_second": 0.471, + "step": 49210 + }, + { + "epoch": 74.02, + "grad_norm": 6.460815906524658, + "learning_rate": 2.598496240601504e-06, + "loss": 0.1717, + "step": 49220 + }, + { + "epoch": 74.03, + "grad_norm": 5.829145908355713, + "learning_rate": 2.596992481203008e-06, + "loss": 0.2016, + "step": 49230 + }, + { + "epoch": 74.05, + "grad_norm": 11.389643669128418, + "learning_rate": 2.5954887218045115e-06, + "loss": 0.2195, + "step": 49240 + }, + { + "epoch": 74.06, + "grad_norm": 2.792567253112793, + "learning_rate": 2.5939849624060154e-06, + "loss": 0.1553, + "step": 49250 + }, + { + "epoch": 74.08, + "grad_norm": 3.7517435550689697, + "learning_rate": 2.592481203007519e-06, + "loss": 0.1597, + "step": 49260 + }, + { + "epoch": 74.09, + "grad_norm": 4.992012023925781, + "learning_rate": 2.590977443609023e-06, + "loss": 0.1487, + "step": 49270 + }, + { + "epoch": 74.11, + "grad_norm": 4.339962482452393, + "learning_rate": 2.589473684210527e-06, + "loss": 0.1671, + "step": 49280 + }, + { + "epoch": 74.12, + "grad_norm": 2.0285701751708984, + "learning_rate": 2.5879699248120304e-06, + "loss": 0.1598, + "step": 49290 + }, + { + "epoch": 74.14, + "grad_norm": 3.2112770080566406, + "learning_rate": 2.5864661654135343e-06, + "loss": 0.1299, + "step": 49300 + }, + { + "epoch": 74.15, + "grad_norm": 3.4635910987854004, + "learning_rate": 2.584962406015038e-06, + "loss": 0.1812, + "step": 49310 + }, + { + "epoch": 74.17, + "grad_norm": 8.604695320129395, + "learning_rate": 2.5834586466165418e-06, + "loss": 0.2481, + "step": 49320 + }, + { + "epoch": 74.18, + "grad_norm": 3.5391902923583984, + "learning_rate": 2.5819548872180457e-06, + "loss": 0.1636, + "step": 49330 + }, + { + "epoch": 74.2, + "grad_norm": 1.695279836654663, + "learning_rate": 2.5804511278195492e-06, + "loss": 0.1739, + "step": 49340 + }, + { + "epoch": 74.21, + "grad_norm": 4.957259178161621, + "learning_rate": 2.578947368421053e-06, + "loss": 0.1369, + "step": 49350 + }, + { + "epoch": 74.23, + "grad_norm": 6.202530860900879, + "learning_rate": 2.5774436090225567e-06, + "loss": 0.2257, + "step": 49360 + }, + { + "epoch": 74.24, + "grad_norm": 7.947466850280762, + "learning_rate": 2.5759398496240606e-06, + "loss": 0.1961, + "step": 49370 + }, + { + "epoch": 74.26, + "grad_norm": 6.451323509216309, + "learning_rate": 2.5744360902255637e-06, + "loss": 0.2195, + "step": 49380 + }, + { + "epoch": 74.27, + "grad_norm": 5.297338008880615, + "learning_rate": 2.5729323308270677e-06, + "loss": 0.1799, + "step": 49390 + }, + { + "epoch": 74.29, + "grad_norm": 4.922117233276367, + "learning_rate": 2.571428571428571e-06, + "loss": 0.2173, + "step": 49400 + }, + { + "epoch": 74.3, + "grad_norm": 1.2803465127944946, + "learning_rate": 2.569924812030075e-06, + "loss": 0.1658, + "step": 49410 + }, + { + "epoch": 74.32, + "grad_norm": 2.5603795051574707, + "learning_rate": 2.568421052631579e-06, + "loss": 0.1427, + "step": 49420 + }, + { + "epoch": 74.33, + "grad_norm": 5.846049785614014, + "learning_rate": 2.5669172932330826e-06, + "loss": 0.2481, + "step": 49430 + }, + { + "epoch": 74.35, + "grad_norm": 6.60819149017334, + "learning_rate": 2.5654135338345865e-06, + "loss": 0.1853, + "step": 49440 + }, + { + "epoch": 74.36, + "grad_norm": 3.9946815967559814, + "learning_rate": 2.56390977443609e-06, + "loss": 0.2012, + "step": 49450 + }, + { + "epoch": 74.38, + "grad_norm": 9.012832641601562, + "learning_rate": 2.562406015037594e-06, + "loss": 0.2052, + "step": 49460 + }, + { + "epoch": 74.39, + "grad_norm": 4.631398677825928, + "learning_rate": 2.560902255639098e-06, + "loss": 0.1995, + "step": 49470 + }, + { + "epoch": 74.41, + "grad_norm": 3.723080635070801, + "learning_rate": 2.5593984962406015e-06, + "loss": 0.1542, + "step": 49480 + }, + { + "epoch": 74.42, + "grad_norm": 5.1829633712768555, + "learning_rate": 2.5578947368421054e-06, + "loss": 0.2244, + "step": 49490 + }, + { + "epoch": 74.44, + "grad_norm": 6.623849391937256, + "learning_rate": 2.556390977443609e-06, + "loss": 0.2161, + "step": 49500 + }, + { + "epoch": 74.45, + "grad_norm": 7.301265239715576, + "learning_rate": 2.554887218045113e-06, + "loss": 0.1301, + "step": 49510 + }, + { + "epoch": 74.47, + "grad_norm": 3.6955883502960205, + "learning_rate": 2.553383458646617e-06, + "loss": 0.1758, + "step": 49520 + }, + { + "epoch": 74.48, + "grad_norm": 9.507878303527832, + "learning_rate": 2.5518796992481203e-06, + "loss": 0.1833, + "step": 49530 + }, + { + "epoch": 74.5, + "grad_norm": 3.5736141204833984, + "learning_rate": 2.5503759398496243e-06, + "loss": 0.1582, + "step": 49540 + }, + { + "epoch": 74.51, + "grad_norm": 4.5012006759643555, + "learning_rate": 2.548872180451128e-06, + "loss": 0.1849, + "step": 49550 + }, + { + "epoch": 74.53, + "grad_norm": 1.9890389442443848, + "learning_rate": 2.5473684210526317e-06, + "loss": 0.2109, + "step": 49560 + }, + { + "epoch": 74.54, + "grad_norm": 6.087845325469971, + "learning_rate": 2.5458646616541357e-06, + "loss": 0.2116, + "step": 49570 + }, + { + "epoch": 74.56, + "grad_norm": 4.9256157875061035, + "learning_rate": 2.544360902255639e-06, + "loss": 0.1751, + "step": 49580 + }, + { + "epoch": 74.57, + "grad_norm": 7.381353855133057, + "learning_rate": 2.542857142857143e-06, + "loss": 0.1588, + "step": 49590 + }, + { + "epoch": 74.59, + "grad_norm": 2.9443299770355225, + "learning_rate": 2.5413533834586467e-06, + "loss": 0.1763, + "step": 49600 + }, + { + "epoch": 74.6, + "grad_norm": 4.252871513366699, + "learning_rate": 2.5398496240601506e-06, + "loss": 0.1819, + "step": 49610 + }, + { + "epoch": 74.62, + "grad_norm": 4.36507511138916, + "learning_rate": 2.5383458646616545e-06, + "loss": 0.1788, + "step": 49620 + }, + { + "epoch": 74.63, + "grad_norm": 5.069661617279053, + "learning_rate": 2.536842105263158e-06, + "loss": 0.2279, + "step": 49630 + }, + { + "epoch": 74.65, + "grad_norm": 1.5738500356674194, + "learning_rate": 2.535338345864662e-06, + "loss": 0.1562, + "step": 49640 + }, + { + "epoch": 74.66, + "grad_norm": 3.425536632537842, + "learning_rate": 2.5338345864661655e-06, + "loss": 0.1367, + "step": 49650 + }, + { + "epoch": 74.68, + "grad_norm": 7.227283954620361, + "learning_rate": 2.5323308270676695e-06, + "loss": 0.1524, + "step": 49660 + }, + { + "epoch": 74.69, + "grad_norm": 4.1548075675964355, + "learning_rate": 2.530827067669173e-06, + "loss": 0.1361, + "step": 49670 + }, + { + "epoch": 74.71, + "grad_norm": 5.1794610023498535, + "learning_rate": 2.529323308270677e-06, + "loss": 0.1373, + "step": 49680 + }, + { + "epoch": 74.72, + "grad_norm": 7.747637748718262, + "learning_rate": 2.527819548872181e-06, + "loss": 0.1758, + "step": 49690 + }, + { + "epoch": 74.74, + "grad_norm": 3.845520496368408, + "learning_rate": 2.5263157894736844e-06, + "loss": 0.2025, + "step": 49700 + }, + { + "epoch": 74.75, + "grad_norm": 8.356101989746094, + "learning_rate": 2.5248120300751883e-06, + "loss": 0.1597, + "step": 49710 + }, + { + "epoch": 74.77, + "grad_norm": 3.1833062171936035, + "learning_rate": 2.523308270676692e-06, + "loss": 0.1411, + "step": 49720 + }, + { + "epoch": 74.78, + "grad_norm": 6.692790508270264, + "learning_rate": 2.521804511278196e-06, + "loss": 0.1809, + "step": 49730 + }, + { + "epoch": 74.8, + "grad_norm": 6.7796454429626465, + "learning_rate": 2.5203007518796997e-06, + "loss": 0.1418, + "step": 49740 + }, + { + "epoch": 74.81, + "grad_norm": 4.885359287261963, + "learning_rate": 2.5187969924812033e-06, + "loss": 0.2639, + "step": 49750 + }, + { + "epoch": 74.83, + "grad_norm": 6.379741668701172, + "learning_rate": 2.517293233082707e-06, + "loss": 0.2007, + "step": 49760 + }, + { + "epoch": 74.84, + "grad_norm": 7.056103706359863, + "learning_rate": 2.5157894736842107e-06, + "loss": 0.2262, + "step": 49770 + }, + { + "epoch": 74.86, + "grad_norm": 7.777561664581299, + "learning_rate": 2.5142857142857147e-06, + "loss": 0.2332, + "step": 49780 + }, + { + "epoch": 74.87, + "grad_norm": 5.086285591125488, + "learning_rate": 2.5127819548872186e-06, + "loss": 0.2878, + "step": 49790 + }, + { + "epoch": 74.89, + "grad_norm": 2.857342481613159, + "learning_rate": 2.511278195488722e-06, + "loss": 0.1854, + "step": 49800 + }, + { + "epoch": 74.9, + "grad_norm": 2.6614773273468018, + "learning_rate": 2.509774436090226e-06, + "loss": 0.1806, + "step": 49810 + }, + { + "epoch": 74.92, + "grad_norm": 6.267194747924805, + "learning_rate": 2.5082706766917296e-06, + "loss": 0.1506, + "step": 49820 + }, + { + "epoch": 74.93, + "grad_norm": 7.167734622955322, + "learning_rate": 2.5067669172932335e-06, + "loss": 0.1308, + "step": 49830 + }, + { + "epoch": 74.95, + "grad_norm": 3.8216676712036133, + "learning_rate": 2.5052631578947375e-06, + "loss": 0.1914, + "step": 49840 + }, + { + "epoch": 74.96, + "grad_norm": 4.203033447265625, + "learning_rate": 2.503759398496241e-06, + "loss": 0.1303, + "step": 49850 + }, + { + "epoch": 74.98, + "grad_norm": 2.4965991973876953, + "learning_rate": 2.502255639097745e-06, + "loss": 0.1785, + "step": 49860 + }, + { + "epoch": 74.99, + "grad_norm": 1.2828130722045898, + "learning_rate": 2.5007518796992484e-06, + "loss": 0.1423, + "step": 49870 + }, + { + "epoch": 75.0, + "eval_accuracy": 0.9299, + "eval_loss": 0.3285817503929138, + "eval_runtime": 84.9307, + "eval_samples_per_second": 117.743, + "eval_steps_per_second": 0.471, + "step": 49875 + }, + { + "epoch": 75.01, + "grad_norm": 3.812678337097168, + "learning_rate": 2.499248120300752e-06, + "loss": 0.1716, + "step": 49880 + }, + { + "epoch": 75.02, + "grad_norm": 1.7645814418792725, + "learning_rate": 2.497744360902256e-06, + "loss": 0.1242, + "step": 49890 + }, + { + "epoch": 75.04, + "grad_norm": 3.069406509399414, + "learning_rate": 2.4962406015037594e-06, + "loss": 0.1708, + "step": 49900 + }, + { + "epoch": 75.05, + "grad_norm": 4.357547283172607, + "learning_rate": 2.4947368421052634e-06, + "loss": 0.1933, + "step": 49910 + }, + { + "epoch": 75.07, + "grad_norm": 6.222829818725586, + "learning_rate": 2.4932330827067673e-06, + "loss": 0.1914, + "step": 49920 + }, + { + "epoch": 75.08, + "grad_norm": 5.128503322601318, + "learning_rate": 2.491729323308271e-06, + "loss": 0.2163, + "step": 49930 + }, + { + "epoch": 75.1, + "grad_norm": 4.528236389160156, + "learning_rate": 2.4902255639097748e-06, + "loss": 0.1979, + "step": 49940 + }, + { + "epoch": 75.11, + "grad_norm": 3.4620494842529297, + "learning_rate": 2.4887218045112783e-06, + "loss": 0.1922, + "step": 49950 + }, + { + "epoch": 75.13, + "grad_norm": 7.054966449737549, + "learning_rate": 2.4872180451127822e-06, + "loss": 0.242, + "step": 49960 + }, + { + "epoch": 75.14, + "grad_norm": 5.699782371520996, + "learning_rate": 2.485714285714286e-06, + "loss": 0.1489, + "step": 49970 + }, + { + "epoch": 75.16, + "grad_norm": 6.1800408363342285, + "learning_rate": 2.4842105263157897e-06, + "loss": 0.1506, + "step": 49980 + }, + { + "epoch": 75.17, + "grad_norm": 2.10766863822937, + "learning_rate": 2.4827067669172936e-06, + "loss": 0.1478, + "step": 49990 + }, + { + "epoch": 75.19, + "grad_norm": 3.423696279525757, + "learning_rate": 2.481203007518797e-06, + "loss": 0.1526, + "step": 50000 + }, + { + "epoch": 75.2, + "grad_norm": 3.7721095085144043, + "learning_rate": 2.4796992481203007e-06, + "loss": 0.2094, + "step": 50010 + }, + { + "epoch": 75.22, + "grad_norm": 5.331075668334961, + "learning_rate": 2.4781954887218046e-06, + "loss": 0.1597, + "step": 50020 + }, + { + "epoch": 75.23, + "grad_norm": 6.622517108917236, + "learning_rate": 2.4766917293233086e-06, + "loss": 0.2322, + "step": 50030 + }, + { + "epoch": 75.25, + "grad_norm": 6.970418930053711, + "learning_rate": 2.475187969924812e-06, + "loss": 0.1812, + "step": 50040 + }, + { + "epoch": 75.26, + "grad_norm": 7.753775119781494, + "learning_rate": 2.473684210526316e-06, + "loss": 0.1515, + "step": 50050 + }, + { + "epoch": 75.28, + "grad_norm": 13.116275787353516, + "learning_rate": 2.4721804511278195e-06, + "loss": 0.1887, + "step": 50060 + }, + { + "epoch": 75.29, + "grad_norm": 4.448867321014404, + "learning_rate": 2.4706766917293235e-06, + "loss": 0.1433, + "step": 50070 + }, + { + "epoch": 75.31, + "grad_norm": 3.7794928550720215, + "learning_rate": 2.4691729323308274e-06, + "loss": 0.1493, + "step": 50080 + }, + { + "epoch": 75.32, + "grad_norm": 5.0668416023254395, + "learning_rate": 2.467669172932331e-06, + "loss": 0.1536, + "step": 50090 + }, + { + "epoch": 75.34, + "grad_norm": 3.405148983001709, + "learning_rate": 2.466165413533835e-06, + "loss": 0.222, + "step": 50100 + }, + { + "epoch": 75.35, + "grad_norm": 5.900206565856934, + "learning_rate": 2.4646616541353384e-06, + "loss": 0.1388, + "step": 50110 + }, + { + "epoch": 75.37, + "grad_norm": 4.416190147399902, + "learning_rate": 2.4631578947368424e-06, + "loss": 0.1744, + "step": 50120 + }, + { + "epoch": 75.38, + "grad_norm": 7.959252834320068, + "learning_rate": 2.461654135338346e-06, + "loss": 0.2386, + "step": 50130 + }, + { + "epoch": 75.4, + "grad_norm": 4.957620143890381, + "learning_rate": 2.46015037593985e-06, + "loss": 0.1717, + "step": 50140 + }, + { + "epoch": 75.41, + "grad_norm": 6.785633087158203, + "learning_rate": 2.4586466165413538e-06, + "loss": 0.1783, + "step": 50150 + }, + { + "epoch": 75.43, + "grad_norm": 8.179920196533203, + "learning_rate": 2.4571428571428573e-06, + "loss": 0.2003, + "step": 50160 + }, + { + "epoch": 75.44, + "grad_norm": 2.4973435401916504, + "learning_rate": 2.4556390977443612e-06, + "loss": 0.1877, + "step": 50170 + }, + { + "epoch": 75.46, + "grad_norm": 4.815390586853027, + "learning_rate": 2.4541353383458647e-06, + "loss": 0.26, + "step": 50180 + }, + { + "epoch": 75.47, + "grad_norm": 6.23560905456543, + "learning_rate": 2.4526315789473687e-06, + "loss": 0.1683, + "step": 50190 + }, + { + "epoch": 75.49, + "grad_norm": 0.35644736886024475, + "learning_rate": 2.4511278195488726e-06, + "loss": 0.1489, + "step": 50200 + }, + { + "epoch": 75.5, + "grad_norm": 5.562798500061035, + "learning_rate": 2.449624060150376e-06, + "loss": 0.2343, + "step": 50210 + }, + { + "epoch": 75.52, + "grad_norm": 6.464638710021973, + "learning_rate": 2.44812030075188e-06, + "loss": 0.2042, + "step": 50220 + }, + { + "epoch": 75.53, + "grad_norm": 3.009085178375244, + "learning_rate": 2.4466165413533836e-06, + "loss": 0.1565, + "step": 50230 + }, + { + "epoch": 75.55, + "grad_norm": 3.3092687129974365, + "learning_rate": 2.4451127819548875e-06, + "loss": 0.1583, + "step": 50240 + }, + { + "epoch": 75.56, + "grad_norm": 1.3774584531784058, + "learning_rate": 2.443609022556391e-06, + "loss": 0.1763, + "step": 50250 + }, + { + "epoch": 75.58, + "grad_norm": 4.398240089416504, + "learning_rate": 2.442105263157895e-06, + "loss": 0.2157, + "step": 50260 + }, + { + "epoch": 75.59, + "grad_norm": 2.5172154903411865, + "learning_rate": 2.4406015037593985e-06, + "loss": 0.2076, + "step": 50270 + }, + { + "epoch": 75.61, + "grad_norm": 5.356011867523193, + "learning_rate": 2.4390977443609025e-06, + "loss": 0.1441, + "step": 50280 + }, + { + "epoch": 75.62, + "grad_norm": 5.028241157531738, + "learning_rate": 2.437593984962406e-06, + "loss": 0.1921, + "step": 50290 + }, + { + "epoch": 75.64, + "grad_norm": 5.889922142028809, + "learning_rate": 2.43609022556391e-06, + "loss": 0.1315, + "step": 50300 + }, + { + "epoch": 75.65, + "grad_norm": 6.243462562561035, + "learning_rate": 2.434586466165414e-06, + "loss": 0.2106, + "step": 50310 + }, + { + "epoch": 75.67, + "grad_norm": 6.075836181640625, + "learning_rate": 2.4330827067669174e-06, + "loss": 0.2055, + "step": 50320 + }, + { + "epoch": 75.68, + "grad_norm": 5.1835150718688965, + "learning_rate": 2.4315789473684213e-06, + "loss": 0.2298, + "step": 50330 + }, + { + "epoch": 75.7, + "grad_norm": 4.660686016082764, + "learning_rate": 2.430075187969925e-06, + "loss": 0.173, + "step": 50340 + }, + { + "epoch": 75.71, + "grad_norm": 3.6795620918273926, + "learning_rate": 2.428571428571429e-06, + "loss": 0.1852, + "step": 50350 + }, + { + "epoch": 75.73, + "grad_norm": 4.784815788269043, + "learning_rate": 2.4270676691729323e-06, + "loss": 0.1739, + "step": 50360 + }, + { + "epoch": 75.74, + "grad_norm": 6.113933563232422, + "learning_rate": 2.4255639097744363e-06, + "loss": 0.2325, + "step": 50370 + }, + { + "epoch": 75.76, + "grad_norm": 3.9142940044403076, + "learning_rate": 2.42406015037594e-06, + "loss": 0.1929, + "step": 50380 + }, + { + "epoch": 75.77, + "grad_norm": 3.883265256881714, + "learning_rate": 2.4225563909774437e-06, + "loss": 0.2146, + "step": 50390 + }, + { + "epoch": 75.79, + "grad_norm": 10.637557029724121, + "learning_rate": 2.4210526315789477e-06, + "loss": 0.2488, + "step": 50400 + }, + { + "epoch": 75.8, + "grad_norm": 3.309053659439087, + "learning_rate": 2.419548872180451e-06, + "loss": 0.2057, + "step": 50410 + }, + { + "epoch": 75.82, + "grad_norm": 7.711801052093506, + "learning_rate": 2.418045112781955e-06, + "loss": 0.2042, + "step": 50420 + }, + { + "epoch": 75.83, + "grad_norm": 3.687312602996826, + "learning_rate": 2.416541353383459e-06, + "loss": 0.1693, + "step": 50430 + }, + { + "epoch": 75.85, + "grad_norm": 5.525257587432861, + "learning_rate": 2.4150375939849626e-06, + "loss": 0.2097, + "step": 50440 + }, + { + "epoch": 75.86, + "grad_norm": 4.86057186126709, + "learning_rate": 2.4135338345864665e-06, + "loss": 0.1844, + "step": 50450 + }, + { + "epoch": 75.88, + "grad_norm": 2.9280784130096436, + "learning_rate": 2.41203007518797e-06, + "loss": 0.13, + "step": 50460 + }, + { + "epoch": 75.89, + "grad_norm": 2.978746175765991, + "learning_rate": 2.410526315789474e-06, + "loss": 0.1435, + "step": 50470 + }, + { + "epoch": 75.91, + "grad_norm": 7.612166404724121, + "learning_rate": 2.409022556390978e-06, + "loss": 0.1784, + "step": 50480 + }, + { + "epoch": 75.92, + "grad_norm": 5.406614303588867, + "learning_rate": 2.4075187969924814e-06, + "loss": 0.2408, + "step": 50490 + }, + { + "epoch": 75.94, + "grad_norm": 7.032416820526123, + "learning_rate": 2.406015037593985e-06, + "loss": 0.1511, + "step": 50500 + }, + { + "epoch": 75.95, + "grad_norm": 6.901172161102295, + "learning_rate": 2.404511278195489e-06, + "loss": 0.2211, + "step": 50510 + }, + { + "epoch": 75.97, + "grad_norm": 5.923864841461182, + "learning_rate": 2.4030075187969924e-06, + "loss": 0.18, + "step": 50520 + }, + { + "epoch": 75.98, + "grad_norm": 7.222829341888428, + "learning_rate": 2.4015037593984964e-06, + "loss": 0.2284, + "step": 50530 + }, + { + "epoch": 76.0, + "grad_norm": 0.01305259671062231, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.1953, + "step": 50540 + }, + { + "epoch": 76.0, + "eval_accuracy": 0.9307, + "eval_loss": 0.3337118625640869, + "eval_runtime": 84.2661, + "eval_samples_per_second": 118.672, + "eval_steps_per_second": 0.475, + "step": 50540 + }, + { + "epoch": 76.02, + "grad_norm": 5.751012802124023, + "learning_rate": 2.398496240601504e-06, + "loss": 0.1967, + "step": 50550 + }, + { + "epoch": 76.03, + "grad_norm": 4.3412089347839355, + "learning_rate": 2.3969924812030078e-06, + "loss": 0.1634, + "step": 50560 + }, + { + "epoch": 76.05, + "grad_norm": 5.5192155838012695, + "learning_rate": 2.3954887218045113e-06, + "loss": 0.1627, + "step": 50570 + }, + { + "epoch": 76.06, + "grad_norm": 12.006901741027832, + "learning_rate": 2.3939849624060152e-06, + "loss": 0.2048, + "step": 50580 + }, + { + "epoch": 76.08, + "grad_norm": 3.821943521499634, + "learning_rate": 2.3924812030075188e-06, + "loss": 0.1681, + "step": 50590 + }, + { + "epoch": 76.09, + "grad_norm": 7.498505115509033, + "learning_rate": 2.3909774436090227e-06, + "loss": 0.2294, + "step": 50600 + }, + { + "epoch": 76.11, + "grad_norm": 1.650476336479187, + "learning_rate": 2.3894736842105266e-06, + "loss": 0.1929, + "step": 50610 + }, + { + "epoch": 76.12, + "grad_norm": 3.873107671737671, + "learning_rate": 2.38796992481203e-06, + "loss": 0.1953, + "step": 50620 + }, + { + "epoch": 76.14, + "grad_norm": 5.848465442657471, + "learning_rate": 2.386466165413534e-06, + "loss": 0.2391, + "step": 50630 + }, + { + "epoch": 76.15, + "grad_norm": 10.972421646118164, + "learning_rate": 2.3849624060150376e-06, + "loss": 0.1747, + "step": 50640 + }, + { + "epoch": 76.17, + "grad_norm": 3.4140889644622803, + "learning_rate": 2.3834586466165416e-06, + "loss": 0.1541, + "step": 50650 + }, + { + "epoch": 76.18, + "grad_norm": 4.576592922210693, + "learning_rate": 2.3819548872180455e-06, + "loss": 0.2146, + "step": 50660 + }, + { + "epoch": 76.2, + "grad_norm": 7.381382465362549, + "learning_rate": 2.380451127819549e-06, + "loss": 0.1912, + "step": 50670 + }, + { + "epoch": 76.21, + "grad_norm": 7.457504749298096, + "learning_rate": 2.378947368421053e-06, + "loss": 0.1681, + "step": 50680 + }, + { + "epoch": 76.23, + "grad_norm": 10.005772590637207, + "learning_rate": 2.3774436090225565e-06, + "loss": 0.2002, + "step": 50690 + }, + { + "epoch": 76.24, + "grad_norm": 8.001717567443848, + "learning_rate": 2.3759398496240604e-06, + "loss": 0.2091, + "step": 50700 + }, + { + "epoch": 76.26, + "grad_norm": 4.301360130310059, + "learning_rate": 2.3744360902255644e-06, + "loss": 0.1585, + "step": 50710 + }, + { + "epoch": 76.27, + "grad_norm": 2.692290782928467, + "learning_rate": 2.372932330827068e-06, + "loss": 0.1992, + "step": 50720 + }, + { + "epoch": 76.29, + "grad_norm": 7.287367820739746, + "learning_rate": 2.371428571428572e-06, + "loss": 0.2296, + "step": 50730 + }, + { + "epoch": 76.3, + "grad_norm": 5.782092094421387, + "learning_rate": 2.3699248120300754e-06, + "loss": 0.1614, + "step": 50740 + }, + { + "epoch": 76.32, + "grad_norm": 1.4290227890014648, + "learning_rate": 2.368421052631579e-06, + "loss": 0.1392, + "step": 50750 + }, + { + "epoch": 76.33, + "grad_norm": 5.241323471069336, + "learning_rate": 2.366917293233083e-06, + "loss": 0.119, + "step": 50760 + }, + { + "epoch": 76.35, + "grad_norm": 5.544551849365234, + "learning_rate": 2.3654135338345868e-06, + "loss": 0.1617, + "step": 50770 + }, + { + "epoch": 76.36, + "grad_norm": 4.271751880645752, + "learning_rate": 2.3639097744360903e-06, + "loss": 0.1435, + "step": 50780 + }, + { + "epoch": 76.38, + "grad_norm": 3.508246660232544, + "learning_rate": 2.3624060150375942e-06, + "loss": 0.1811, + "step": 50790 + }, + { + "epoch": 76.39, + "grad_norm": 6.839121341705322, + "learning_rate": 2.3609022556390977e-06, + "loss": 0.1943, + "step": 50800 + }, + { + "epoch": 76.41, + "grad_norm": 6.446866512298584, + "learning_rate": 2.3593984962406017e-06, + "loss": 0.2103, + "step": 50810 + }, + { + "epoch": 76.42, + "grad_norm": 3.7222402095794678, + "learning_rate": 2.357894736842105e-06, + "loss": 0.1546, + "step": 50820 + }, + { + "epoch": 76.44, + "grad_norm": 9.640228271484375, + "learning_rate": 2.356390977443609e-06, + "loss": 0.2003, + "step": 50830 + }, + { + "epoch": 76.45, + "grad_norm": 5.100734710693359, + "learning_rate": 2.354887218045113e-06, + "loss": 0.1579, + "step": 50840 + }, + { + "epoch": 76.47, + "grad_norm": 2.1371684074401855, + "learning_rate": 2.3533834586466166e-06, + "loss": 0.1618, + "step": 50850 + }, + { + "epoch": 76.48, + "grad_norm": 7.091713905334473, + "learning_rate": 2.3518796992481205e-06, + "loss": 0.1943, + "step": 50860 + }, + { + "epoch": 76.5, + "grad_norm": 5.84473180770874, + "learning_rate": 2.350375939849624e-06, + "loss": 0.15, + "step": 50870 + }, + { + "epoch": 76.51, + "grad_norm": 4.024866104125977, + "learning_rate": 2.348872180451128e-06, + "loss": 0.166, + "step": 50880 + }, + { + "epoch": 76.53, + "grad_norm": 5.06535530090332, + "learning_rate": 2.347368421052632e-06, + "loss": 0.1813, + "step": 50890 + }, + { + "epoch": 76.54, + "grad_norm": 5.515821933746338, + "learning_rate": 2.3458646616541355e-06, + "loss": 0.1672, + "step": 50900 + }, + { + "epoch": 76.56, + "grad_norm": 5.713620185852051, + "learning_rate": 2.3443609022556394e-06, + "loss": 0.2043, + "step": 50910 + }, + { + "epoch": 76.57, + "grad_norm": 4.078904151916504, + "learning_rate": 2.342857142857143e-06, + "loss": 0.1606, + "step": 50920 + }, + { + "epoch": 76.59, + "grad_norm": 5.5044660568237305, + "learning_rate": 2.341353383458647e-06, + "loss": 0.1721, + "step": 50930 + }, + { + "epoch": 76.6, + "grad_norm": 6.557012557983398, + "learning_rate": 2.339849624060151e-06, + "loss": 0.1614, + "step": 50940 + }, + { + "epoch": 76.62, + "grad_norm": 2.52939772605896, + "learning_rate": 2.3383458646616543e-06, + "loss": 0.1872, + "step": 50950 + }, + { + "epoch": 76.63, + "grad_norm": 1.881983757019043, + "learning_rate": 2.3368421052631583e-06, + "loss": 0.1311, + "step": 50960 + }, + { + "epoch": 76.65, + "grad_norm": 3.7978851795196533, + "learning_rate": 2.335338345864662e-06, + "loss": 0.213, + "step": 50970 + }, + { + "epoch": 76.66, + "grad_norm": 6.723447799682617, + "learning_rate": 2.3338345864661657e-06, + "loss": 0.1595, + "step": 50980 + }, + { + "epoch": 76.68, + "grad_norm": 4.094229698181152, + "learning_rate": 2.3323308270676697e-06, + "loss": 0.1888, + "step": 50990 + }, + { + "epoch": 76.69, + "grad_norm": 5.497591495513916, + "learning_rate": 2.330827067669173e-06, + "loss": 0.1733, + "step": 51000 + }, + { + "epoch": 76.71, + "grad_norm": 6.945542812347412, + "learning_rate": 2.3293233082706767e-06, + "loss": 0.225, + "step": 51010 + }, + { + "epoch": 76.72, + "grad_norm": 7.769118309020996, + "learning_rate": 2.3278195488721807e-06, + "loss": 0.2114, + "step": 51020 + }, + { + "epoch": 76.74, + "grad_norm": 5.106688976287842, + "learning_rate": 2.326315789473684e-06, + "loss": 0.1166, + "step": 51030 + }, + { + "epoch": 76.75, + "grad_norm": 7.118248462677002, + "learning_rate": 2.324812030075188e-06, + "loss": 0.1807, + "step": 51040 + }, + { + "epoch": 76.77, + "grad_norm": 4.859402656555176, + "learning_rate": 2.3233082706766916e-06, + "loss": 0.217, + "step": 51050 + }, + { + "epoch": 76.78, + "grad_norm": 4.8244829177856445, + "learning_rate": 2.3218045112781956e-06, + "loss": 0.1698, + "step": 51060 + }, + { + "epoch": 76.8, + "grad_norm": 4.1632914543151855, + "learning_rate": 2.3203007518796995e-06, + "loss": 0.1531, + "step": 51070 + }, + { + "epoch": 76.81, + "grad_norm": 5.575042247772217, + "learning_rate": 2.318796992481203e-06, + "loss": 0.1539, + "step": 51080 + }, + { + "epoch": 76.83, + "grad_norm": 4.370471000671387, + "learning_rate": 2.317293233082707e-06, + "loss": 0.1313, + "step": 51090 + }, + { + "epoch": 76.84, + "grad_norm": 4.721175193786621, + "learning_rate": 2.3157894736842105e-06, + "loss": 0.1698, + "step": 51100 + }, + { + "epoch": 76.86, + "grad_norm": 6.6235857009887695, + "learning_rate": 2.3142857142857145e-06, + "loss": 0.2313, + "step": 51110 + }, + { + "epoch": 76.87, + "grad_norm": 15.119837760925293, + "learning_rate": 2.3127819548872184e-06, + "loss": 0.163, + "step": 51120 + }, + { + "epoch": 76.89, + "grad_norm": 2.5521862506866455, + "learning_rate": 2.311278195488722e-06, + "loss": 0.2031, + "step": 51130 + }, + { + "epoch": 76.9, + "grad_norm": 8.849630355834961, + "learning_rate": 2.309774436090226e-06, + "loss": 0.182, + "step": 51140 + }, + { + "epoch": 76.92, + "grad_norm": 6.023241996765137, + "learning_rate": 2.3082706766917294e-06, + "loss": 0.1807, + "step": 51150 + }, + { + "epoch": 76.93, + "grad_norm": 6.313910007476807, + "learning_rate": 2.3067669172932333e-06, + "loss": 0.1622, + "step": 51160 + }, + { + "epoch": 76.95, + "grad_norm": 7.0026116371154785, + "learning_rate": 2.3052631578947373e-06, + "loss": 0.1677, + "step": 51170 + }, + { + "epoch": 76.96, + "grad_norm": 5.539087772369385, + "learning_rate": 2.3037593984962408e-06, + "loss": 0.1771, + "step": 51180 + }, + { + "epoch": 76.98, + "grad_norm": 2.8735461235046387, + "learning_rate": 2.3022556390977447e-06, + "loss": 0.1641, + "step": 51190 + }, + { + "epoch": 76.99, + "grad_norm": 4.759913921356201, + "learning_rate": 2.3007518796992482e-06, + "loss": 0.1599, + "step": 51200 + }, + { + "epoch": 77.0, + "eval_accuracy": 0.9313, + "eval_loss": 0.3295079469680786, + "eval_runtime": 84.3369, + "eval_samples_per_second": 118.572, + "eval_steps_per_second": 0.474, + "step": 51205 + }, + { + "epoch": 77.01, + "grad_norm": 3.568769693374634, + "learning_rate": 2.299248120300752e-06, + "loss": 0.1626, + "step": 51210 + }, + { + "epoch": 77.02, + "grad_norm": 6.137712478637695, + "learning_rate": 2.297744360902256e-06, + "loss": 0.1651, + "step": 51220 + }, + { + "epoch": 77.04, + "grad_norm": 2.2381041049957275, + "learning_rate": 2.2962406015037596e-06, + "loss": 0.1933, + "step": 51230 + }, + { + "epoch": 77.05, + "grad_norm": 6.399120330810547, + "learning_rate": 2.294736842105263e-06, + "loss": 0.1864, + "step": 51240 + }, + { + "epoch": 77.07, + "grad_norm": 5.5201334953308105, + "learning_rate": 2.293233082706767e-06, + "loss": 0.2318, + "step": 51250 + }, + { + "epoch": 77.08, + "grad_norm": 6.623318195343018, + "learning_rate": 2.2917293233082706e-06, + "loss": 0.2449, + "step": 51260 + }, + { + "epoch": 77.1, + "grad_norm": 5.644393444061279, + "learning_rate": 2.2902255639097746e-06, + "loss": 0.2073, + "step": 51270 + }, + { + "epoch": 77.11, + "grad_norm": 5.483371257781982, + "learning_rate": 2.288721804511278e-06, + "loss": 0.185, + "step": 51280 + }, + { + "epoch": 77.13, + "grad_norm": 5.581244468688965, + "learning_rate": 2.287218045112782e-06, + "loss": 0.2171, + "step": 51290 + }, + { + "epoch": 77.14, + "grad_norm": 7.645112991333008, + "learning_rate": 2.285714285714286e-06, + "loss": 0.2034, + "step": 51300 + }, + { + "epoch": 77.16, + "grad_norm": 5.600865364074707, + "learning_rate": 2.2842105263157895e-06, + "loss": 0.2298, + "step": 51310 + }, + { + "epoch": 77.17, + "grad_norm": 8.792081832885742, + "learning_rate": 2.2827067669172934e-06, + "loss": 0.1311, + "step": 51320 + }, + { + "epoch": 77.19, + "grad_norm": 4.581193447113037, + "learning_rate": 2.281203007518797e-06, + "loss": 0.1835, + "step": 51330 + }, + { + "epoch": 77.2, + "grad_norm": 3.9883158206939697, + "learning_rate": 2.279699248120301e-06, + "loss": 0.1439, + "step": 51340 + }, + { + "epoch": 77.22, + "grad_norm": 3.542494535446167, + "learning_rate": 2.278195488721805e-06, + "loss": 0.1538, + "step": 51350 + }, + { + "epoch": 77.23, + "grad_norm": 3.4795663356781006, + "learning_rate": 2.2766917293233084e-06, + "loss": 0.1669, + "step": 51360 + }, + { + "epoch": 77.25, + "grad_norm": 5.9982194900512695, + "learning_rate": 2.2751879699248123e-06, + "loss": 0.1479, + "step": 51370 + }, + { + "epoch": 77.26, + "grad_norm": 4.513314247131348, + "learning_rate": 2.273684210526316e-06, + "loss": 0.2073, + "step": 51380 + }, + { + "epoch": 77.28, + "grad_norm": 3.709369421005249, + "learning_rate": 2.2721804511278198e-06, + "loss": 0.1449, + "step": 51390 + }, + { + "epoch": 77.29, + "grad_norm": 4.331890106201172, + "learning_rate": 2.2706766917293237e-06, + "loss": 0.1677, + "step": 51400 + }, + { + "epoch": 77.31, + "grad_norm": 3.835942506790161, + "learning_rate": 2.2691729323308272e-06, + "loss": 0.1495, + "step": 51410 + }, + { + "epoch": 77.32, + "grad_norm": 4.771125316619873, + "learning_rate": 2.267669172932331e-06, + "loss": 0.1771, + "step": 51420 + }, + { + "epoch": 77.34, + "grad_norm": 2.807570695877075, + "learning_rate": 2.2661654135338347e-06, + "loss": 0.1593, + "step": 51430 + }, + { + "epoch": 77.35, + "grad_norm": 2.986487627029419, + "learning_rate": 2.2646616541353386e-06, + "loss": 0.1418, + "step": 51440 + }, + { + "epoch": 77.37, + "grad_norm": 3.4816768169403076, + "learning_rate": 2.2631578947368426e-06, + "loss": 0.1787, + "step": 51450 + }, + { + "epoch": 77.38, + "grad_norm": 4.014631271362305, + "learning_rate": 2.261654135338346e-06, + "loss": 0.1266, + "step": 51460 + }, + { + "epoch": 77.4, + "grad_norm": 6.388309955596924, + "learning_rate": 2.26015037593985e-06, + "loss": 0.2472, + "step": 51470 + }, + { + "epoch": 77.41, + "grad_norm": 3.9746556282043457, + "learning_rate": 2.2586466165413536e-06, + "loss": 0.2174, + "step": 51480 + }, + { + "epoch": 77.43, + "grad_norm": 5.2674455642700195, + "learning_rate": 2.257142857142857e-06, + "loss": 0.1419, + "step": 51490 + }, + { + "epoch": 77.44, + "grad_norm": 4.513287544250488, + "learning_rate": 2.255639097744361e-06, + "loss": 0.1721, + "step": 51500 + }, + { + "epoch": 77.46, + "grad_norm": 7.286464691162109, + "learning_rate": 2.2541353383458645e-06, + "loss": 0.1816, + "step": 51510 + }, + { + "epoch": 77.47, + "grad_norm": 3.867978572845459, + "learning_rate": 2.2526315789473685e-06, + "loss": 0.1384, + "step": 51520 + }, + { + "epoch": 77.49, + "grad_norm": 0.8837634921073914, + "learning_rate": 2.2511278195488724e-06, + "loss": 0.1681, + "step": 51530 + }, + { + "epoch": 77.5, + "grad_norm": 8.97939395904541, + "learning_rate": 2.249624060150376e-06, + "loss": 0.1419, + "step": 51540 + }, + { + "epoch": 77.52, + "grad_norm": 3.708096742630005, + "learning_rate": 2.24812030075188e-06, + "loss": 0.1476, + "step": 51550 + }, + { + "epoch": 77.53, + "grad_norm": 3.5969924926757812, + "learning_rate": 2.2466165413533834e-06, + "loss": 0.188, + "step": 51560 + }, + { + "epoch": 77.55, + "grad_norm": 2.7595226764678955, + "learning_rate": 2.2451127819548873e-06, + "loss": 0.2098, + "step": 51570 + }, + { + "epoch": 77.56, + "grad_norm": 5.377525329589844, + "learning_rate": 2.2436090225563913e-06, + "loss": 0.2238, + "step": 51580 + }, + { + "epoch": 77.58, + "grad_norm": 8.016778945922852, + "learning_rate": 2.242105263157895e-06, + "loss": 0.1233, + "step": 51590 + }, + { + "epoch": 77.59, + "grad_norm": 4.480692386627197, + "learning_rate": 2.2406015037593987e-06, + "loss": 0.2026, + "step": 51600 + }, + { + "epoch": 77.61, + "grad_norm": 6.609382629394531, + "learning_rate": 2.2390977443609023e-06, + "loss": 0.1878, + "step": 51610 + }, + { + "epoch": 77.62, + "grad_norm": 6.064496994018555, + "learning_rate": 2.237593984962406e-06, + "loss": 0.2227, + "step": 51620 + }, + { + "epoch": 77.64, + "grad_norm": 2.538266181945801, + "learning_rate": 2.23609022556391e-06, + "loss": 0.1967, + "step": 51630 + }, + { + "epoch": 77.65, + "grad_norm": 5.404106140136719, + "learning_rate": 2.2345864661654137e-06, + "loss": 0.254, + "step": 51640 + }, + { + "epoch": 77.67, + "grad_norm": 5.271445274353027, + "learning_rate": 2.2330827067669176e-06, + "loss": 0.132, + "step": 51650 + }, + { + "epoch": 77.68, + "grad_norm": 4.448204040527344, + "learning_rate": 2.231578947368421e-06, + "loss": 0.1674, + "step": 51660 + }, + { + "epoch": 77.7, + "grad_norm": 8.482146263122559, + "learning_rate": 2.230075187969925e-06, + "loss": 0.2233, + "step": 51670 + }, + { + "epoch": 77.71, + "grad_norm": 3.992495536804199, + "learning_rate": 2.228571428571429e-06, + "loss": 0.141, + "step": 51680 + }, + { + "epoch": 77.73, + "grad_norm": 9.992817878723145, + "learning_rate": 2.2270676691729325e-06, + "loss": 0.2356, + "step": 51690 + }, + { + "epoch": 77.74, + "grad_norm": 3.759432554244995, + "learning_rate": 2.2255639097744365e-06, + "loss": 0.1199, + "step": 51700 + }, + { + "epoch": 77.76, + "grad_norm": 4.334221363067627, + "learning_rate": 2.22406015037594e-06, + "loss": 0.1971, + "step": 51710 + }, + { + "epoch": 77.77, + "grad_norm": 1.510976791381836, + "learning_rate": 2.222556390977444e-06, + "loss": 0.1708, + "step": 51720 + }, + { + "epoch": 77.79, + "grad_norm": 5.777114391326904, + "learning_rate": 2.221052631578948e-06, + "loss": 0.2107, + "step": 51730 + }, + { + "epoch": 77.8, + "grad_norm": 2.896449327468872, + "learning_rate": 2.219548872180451e-06, + "loss": 0.1942, + "step": 51740 + }, + { + "epoch": 77.82, + "grad_norm": 17.983856201171875, + "learning_rate": 2.218045112781955e-06, + "loss": 0.1846, + "step": 51750 + }, + { + "epoch": 77.83, + "grad_norm": 7.573366165161133, + "learning_rate": 2.216541353383459e-06, + "loss": 0.1972, + "step": 51760 + }, + { + "epoch": 77.85, + "grad_norm": 6.8880109786987305, + "learning_rate": 2.2150375939849624e-06, + "loss": 0.18, + "step": 51770 + }, + { + "epoch": 77.86, + "grad_norm": 2.3339192867279053, + "learning_rate": 2.2135338345864663e-06, + "loss": 0.1842, + "step": 51780 + }, + { + "epoch": 77.88, + "grad_norm": 3.973484754562378, + "learning_rate": 2.21203007518797e-06, + "loss": 0.2218, + "step": 51790 + }, + { + "epoch": 77.89, + "grad_norm": 3.132448673248291, + "learning_rate": 2.2105263157894738e-06, + "loss": 0.2052, + "step": 51800 + }, + { + "epoch": 77.91, + "grad_norm": 5.713258743286133, + "learning_rate": 2.2090225563909777e-06, + "loss": 0.1334, + "step": 51810 + }, + { + "epoch": 77.92, + "grad_norm": 4.339270114898682, + "learning_rate": 2.2075187969924812e-06, + "loss": 0.2167, + "step": 51820 + }, + { + "epoch": 77.94, + "grad_norm": 1.3507264852523804, + "learning_rate": 2.206015037593985e-06, + "loss": 0.2518, + "step": 51830 + }, + { + "epoch": 77.95, + "grad_norm": 7.650753974914551, + "learning_rate": 2.2045112781954887e-06, + "loss": 0.2293, + "step": 51840 + }, + { + "epoch": 77.97, + "grad_norm": 5.412219524383545, + "learning_rate": 2.2030075187969927e-06, + "loss": 0.1966, + "step": 51850 + }, + { + "epoch": 77.98, + "grad_norm": 7.676565647125244, + "learning_rate": 2.2015037593984966e-06, + "loss": 0.2069, + "step": 51860 + }, + { + "epoch": 78.0, + "grad_norm": 0.12734095752239227, + "learning_rate": 2.2e-06, + "loss": 0.2077, + "step": 51870 + }, + { + "epoch": 78.0, + "eval_accuracy": 0.9312, + "eval_loss": 0.32849153876304626, + "eval_runtime": 85.6428, + "eval_samples_per_second": 116.764, + "eval_steps_per_second": 0.467, + "step": 51870 + }, + { + "epoch": 78.02, + "grad_norm": 3.1705241203308105, + "learning_rate": 2.198496240601504e-06, + "loss": 0.1415, + "step": 51880 + }, + { + "epoch": 78.03, + "grad_norm": 4.243330955505371, + "learning_rate": 2.1969924812030076e-06, + "loss": 0.1765, + "step": 51890 + }, + { + "epoch": 78.05, + "grad_norm": 7.999730110168457, + "learning_rate": 2.1954887218045115e-06, + "loss": 0.2041, + "step": 51900 + }, + { + "epoch": 78.06, + "grad_norm": 4.726391792297363, + "learning_rate": 2.1939849624060155e-06, + "loss": 0.2337, + "step": 51910 + }, + { + "epoch": 78.08, + "grad_norm": 4.67405891418457, + "learning_rate": 2.192481203007519e-06, + "loss": 0.1231, + "step": 51920 + }, + { + "epoch": 78.09, + "grad_norm": 6.686806678771973, + "learning_rate": 2.190977443609023e-06, + "loss": 0.1047, + "step": 51930 + }, + { + "epoch": 78.11, + "grad_norm": 2.4821736812591553, + "learning_rate": 2.1894736842105264e-06, + "loss": 0.1818, + "step": 51940 + }, + { + "epoch": 78.12, + "grad_norm": 5.056117057800293, + "learning_rate": 2.1879699248120304e-06, + "loss": 0.1644, + "step": 51950 + }, + { + "epoch": 78.14, + "grad_norm": 5.994955062866211, + "learning_rate": 2.1864661654135343e-06, + "loss": 0.1509, + "step": 51960 + }, + { + "epoch": 78.15, + "grad_norm": 4.456724643707275, + "learning_rate": 2.184962406015038e-06, + "loss": 0.1468, + "step": 51970 + }, + { + "epoch": 78.17, + "grad_norm": 3.952169895172119, + "learning_rate": 2.1834586466165418e-06, + "loss": 0.1785, + "step": 51980 + }, + { + "epoch": 78.18, + "grad_norm": 3.9694976806640625, + "learning_rate": 2.1819548872180453e-06, + "loss": 0.2603, + "step": 51990 + }, + { + "epoch": 78.2, + "grad_norm": 4.302563190460205, + "learning_rate": 2.180451127819549e-06, + "loss": 0.1793, + "step": 52000 + }, + { + "epoch": 78.21, + "grad_norm": 1.5013707876205444, + "learning_rate": 2.1789473684210528e-06, + "loss": 0.2053, + "step": 52010 + }, + { + "epoch": 78.23, + "grad_norm": 4.439993858337402, + "learning_rate": 2.1774436090225563e-06, + "loss": 0.1417, + "step": 52020 + }, + { + "epoch": 78.24, + "grad_norm": 4.53973388671875, + "learning_rate": 2.1759398496240602e-06, + "loss": 0.1573, + "step": 52030 + }, + { + "epoch": 78.26, + "grad_norm": 6.344988822937012, + "learning_rate": 2.174436090225564e-06, + "loss": 0.1905, + "step": 52040 + }, + { + "epoch": 78.27, + "grad_norm": 3.4215738773345947, + "learning_rate": 2.1729323308270677e-06, + "loss": 0.1587, + "step": 52050 + }, + { + "epoch": 78.29, + "grad_norm": 5.529082298278809, + "learning_rate": 2.1714285714285716e-06, + "loss": 0.1935, + "step": 52060 + }, + { + "epoch": 78.3, + "grad_norm": 2.7385475635528564, + "learning_rate": 2.169924812030075e-06, + "loss": 0.1677, + "step": 52070 + }, + { + "epoch": 78.32, + "grad_norm": 2.511803150177002, + "learning_rate": 2.168421052631579e-06, + "loss": 0.2112, + "step": 52080 + }, + { + "epoch": 78.33, + "grad_norm": 4.103121757507324, + "learning_rate": 2.166917293233083e-06, + "loss": 0.1629, + "step": 52090 + }, + { + "epoch": 78.35, + "grad_norm": 4.167917728424072, + "learning_rate": 2.1654135338345866e-06, + "loss": 0.1722, + "step": 52100 + }, + { + "epoch": 78.36, + "grad_norm": 3.2801246643066406, + "learning_rate": 2.1639097744360905e-06, + "loss": 0.2039, + "step": 52110 + }, + { + "epoch": 78.38, + "grad_norm": 4.19163703918457, + "learning_rate": 2.162406015037594e-06, + "loss": 0.181, + "step": 52120 + }, + { + "epoch": 78.39, + "grad_norm": 3.487081527709961, + "learning_rate": 2.160902255639098e-06, + "loss": 0.1752, + "step": 52130 + }, + { + "epoch": 78.41, + "grad_norm": 2.9936952590942383, + "learning_rate": 2.159398496240602e-06, + "loss": 0.2173, + "step": 52140 + }, + { + "epoch": 78.42, + "grad_norm": 4.499261379241943, + "learning_rate": 2.1578947368421054e-06, + "loss": 0.1907, + "step": 52150 + }, + { + "epoch": 78.44, + "grad_norm": 5.857420921325684, + "learning_rate": 2.1563909774436094e-06, + "loss": 0.1689, + "step": 52160 + }, + { + "epoch": 78.45, + "grad_norm": 4.8537468910217285, + "learning_rate": 2.154887218045113e-06, + "loss": 0.1216, + "step": 52170 + }, + { + "epoch": 78.47, + "grad_norm": 6.32722806930542, + "learning_rate": 2.153383458646617e-06, + "loss": 0.2262, + "step": 52180 + }, + { + "epoch": 78.48, + "grad_norm": 11.071094512939453, + "learning_rate": 2.1518796992481208e-06, + "loss": 0.1113, + "step": 52190 + }, + { + "epoch": 78.5, + "grad_norm": 5.603166580200195, + "learning_rate": 2.1503759398496243e-06, + "loss": 0.167, + "step": 52200 + }, + { + "epoch": 78.51, + "grad_norm": 5.128636837005615, + "learning_rate": 2.1488721804511282e-06, + "loss": 0.1974, + "step": 52210 + }, + { + "epoch": 78.53, + "grad_norm": 2.067934036254883, + "learning_rate": 2.1473684210526317e-06, + "loss": 0.1654, + "step": 52220 + }, + { + "epoch": 78.54, + "grad_norm": 4.146607398986816, + "learning_rate": 2.1458646616541357e-06, + "loss": 0.1807, + "step": 52230 + }, + { + "epoch": 78.56, + "grad_norm": 2.6241300106048584, + "learning_rate": 2.144360902255639e-06, + "loss": 0.1445, + "step": 52240 + }, + { + "epoch": 78.57, + "grad_norm": 4.428233623504639, + "learning_rate": 2.1428571428571427e-06, + "loss": 0.1427, + "step": 52250 + }, + { + "epoch": 78.59, + "grad_norm": 4.425225734710693, + "learning_rate": 2.1413533834586467e-06, + "loss": 0.1506, + "step": 52260 + }, + { + "epoch": 78.6, + "grad_norm": 2.6560704708099365, + "learning_rate": 2.1398496240601506e-06, + "loss": 0.1788, + "step": 52270 + }, + { + "epoch": 78.62, + "grad_norm": 3.350560188293457, + "learning_rate": 2.138345864661654e-06, + "loss": 0.1513, + "step": 52280 + }, + { + "epoch": 78.63, + "grad_norm": 3.5324301719665527, + "learning_rate": 2.136842105263158e-06, + "loss": 0.1701, + "step": 52290 + }, + { + "epoch": 78.65, + "grad_norm": 4.827585697174072, + "learning_rate": 2.1353383458646616e-06, + "loss": 0.2025, + "step": 52300 + }, + { + "epoch": 78.66, + "grad_norm": 0.8045023679733276, + "learning_rate": 2.1338345864661655e-06, + "loss": 0.1426, + "step": 52310 + }, + { + "epoch": 78.68, + "grad_norm": 4.572910308837891, + "learning_rate": 2.1323308270676695e-06, + "loss": 0.1788, + "step": 52320 + }, + { + "epoch": 78.69, + "grad_norm": 5.141964435577393, + "learning_rate": 2.130827067669173e-06, + "loss": 0.191, + "step": 52330 + }, + { + "epoch": 78.71, + "grad_norm": 16.9417667388916, + "learning_rate": 2.129323308270677e-06, + "loss": 0.1658, + "step": 52340 + }, + { + "epoch": 78.72, + "grad_norm": 7.1351399421691895, + "learning_rate": 2.1278195488721805e-06, + "loss": 0.1802, + "step": 52350 + }, + { + "epoch": 78.74, + "grad_norm": 5.015010833740234, + "learning_rate": 2.1263157894736844e-06, + "loss": 0.227, + "step": 52360 + }, + { + "epoch": 78.75, + "grad_norm": 4.296870231628418, + "learning_rate": 2.1248120300751883e-06, + "loss": 0.1983, + "step": 52370 + }, + { + "epoch": 78.77, + "grad_norm": 6.891214370727539, + "learning_rate": 2.123308270676692e-06, + "loss": 0.2017, + "step": 52380 + }, + { + "epoch": 78.78, + "grad_norm": 4.369607925415039, + "learning_rate": 2.121804511278196e-06, + "loss": 0.1861, + "step": 52390 + }, + { + "epoch": 78.8, + "grad_norm": 7.993550777435303, + "learning_rate": 2.1203007518796993e-06, + "loss": 0.1986, + "step": 52400 + }, + { + "epoch": 78.81, + "grad_norm": 5.03591251373291, + "learning_rate": 2.1187969924812033e-06, + "loss": 0.1679, + "step": 52410 + }, + { + "epoch": 78.83, + "grad_norm": 3.6212151050567627, + "learning_rate": 2.1172932330827072e-06, + "loss": 0.2262, + "step": 52420 + }, + { + "epoch": 78.84, + "grad_norm": 6.3097310066223145, + "learning_rate": 2.1157894736842107e-06, + "loss": 0.2434, + "step": 52430 + }, + { + "epoch": 78.86, + "grad_norm": 8.02677059173584, + "learning_rate": 2.1142857142857147e-06, + "loss": 0.1698, + "step": 52440 + }, + { + "epoch": 78.87, + "grad_norm": 6.799673080444336, + "learning_rate": 2.112781954887218e-06, + "loss": 0.2062, + "step": 52450 + }, + { + "epoch": 78.89, + "grad_norm": 3.5356316566467285, + "learning_rate": 2.111278195488722e-06, + "loss": 0.1969, + "step": 52460 + }, + { + "epoch": 78.9, + "grad_norm": 4.078927993774414, + "learning_rate": 2.1097744360902257e-06, + "loss": 0.1694, + "step": 52470 + }, + { + "epoch": 78.92, + "grad_norm": 4.125448226928711, + "learning_rate": 2.1082706766917296e-06, + "loss": 0.1752, + "step": 52480 + }, + { + "epoch": 78.93, + "grad_norm": 5.614774703979492, + "learning_rate": 2.106766917293233e-06, + "loss": 0.1573, + "step": 52490 + }, + { + "epoch": 78.95, + "grad_norm": 3.6688623428344727, + "learning_rate": 2.105263157894737e-06, + "loss": 0.1792, + "step": 52500 + }, + { + "epoch": 78.96, + "grad_norm": 3.9358396530151367, + "learning_rate": 2.1037593984962406e-06, + "loss": 0.1619, + "step": 52510 + }, + { + "epoch": 78.98, + "grad_norm": 3.6908819675445557, + "learning_rate": 2.1022556390977445e-06, + "loss": 0.1755, + "step": 52520 + }, + { + "epoch": 78.99, + "grad_norm": 6.504554271697998, + "learning_rate": 2.100751879699248e-06, + "loss": 0.2053, + "step": 52530 + }, + { + "epoch": 79.0, + "eval_accuracy": 0.9309, + "eval_loss": 0.32775041460990906, + "eval_runtime": 84.345, + "eval_samples_per_second": 118.561, + "eval_steps_per_second": 0.474, + "step": 52535 + }, + { + "epoch": 79.01, + "grad_norm": 4.1648664474487305, + "learning_rate": 2.099248120300752e-06, + "loss": 0.2267, + "step": 52540 + }, + { + "epoch": 79.02, + "grad_norm": 7.5626020431518555, + "learning_rate": 2.097744360902256e-06, + "loss": 0.2043, + "step": 52550 + }, + { + "epoch": 79.04, + "grad_norm": 4.533354759216309, + "learning_rate": 2.0962406015037594e-06, + "loss": 0.1998, + "step": 52560 + }, + { + "epoch": 79.05, + "grad_norm": 2.196251630783081, + "learning_rate": 2.0947368421052634e-06, + "loss": 0.1536, + "step": 52570 + }, + { + "epoch": 79.07, + "grad_norm": 8.702381134033203, + "learning_rate": 2.093233082706767e-06, + "loss": 0.1829, + "step": 52580 + }, + { + "epoch": 79.08, + "grad_norm": 6.189172267913818, + "learning_rate": 2.091729323308271e-06, + "loss": 0.2646, + "step": 52590 + }, + { + "epoch": 79.1, + "grad_norm": 14.350298881530762, + "learning_rate": 2.090225563909775e-06, + "loss": 0.1486, + "step": 52600 + }, + { + "epoch": 79.11, + "grad_norm": 5.265256404876709, + "learning_rate": 2.0887218045112783e-06, + "loss": 0.1852, + "step": 52610 + }, + { + "epoch": 79.13, + "grad_norm": 5.7867350578308105, + "learning_rate": 2.0872180451127823e-06, + "loss": 0.1778, + "step": 52620 + }, + { + "epoch": 79.14, + "grad_norm": 3.8867268562316895, + "learning_rate": 2.0857142857142858e-06, + "loss": 0.1716, + "step": 52630 + }, + { + "epoch": 79.16, + "grad_norm": 10.908556938171387, + "learning_rate": 2.0842105263157897e-06, + "loss": 0.2087, + "step": 52640 + }, + { + "epoch": 79.17, + "grad_norm": 8.96177864074707, + "learning_rate": 2.0827067669172937e-06, + "loss": 0.1587, + "step": 52650 + }, + { + "epoch": 79.19, + "grad_norm": 4.244720935821533, + "learning_rate": 2.081203007518797e-06, + "loss": 0.1488, + "step": 52660 + }, + { + "epoch": 79.2, + "grad_norm": 3.8978004455566406, + "learning_rate": 2.079699248120301e-06, + "loss": 0.1361, + "step": 52670 + }, + { + "epoch": 79.22, + "grad_norm": 1.49534010887146, + "learning_rate": 2.0781954887218046e-06, + "loss": 0.1506, + "step": 52680 + }, + { + "epoch": 79.23, + "grad_norm": 6.862305164337158, + "learning_rate": 2.0766917293233086e-06, + "loss": 0.2312, + "step": 52690 + }, + { + "epoch": 79.25, + "grad_norm": 5.542629718780518, + "learning_rate": 2.075187969924812e-06, + "loss": 0.1923, + "step": 52700 + }, + { + "epoch": 79.26, + "grad_norm": 5.134133338928223, + "learning_rate": 2.073684210526316e-06, + "loss": 0.1765, + "step": 52710 + }, + { + "epoch": 79.28, + "grad_norm": 4.834011554718018, + "learning_rate": 2.07218045112782e-06, + "loss": 0.1291, + "step": 52720 + }, + { + "epoch": 79.29, + "grad_norm": 9.411290168762207, + "learning_rate": 2.0706766917293235e-06, + "loss": 0.1661, + "step": 52730 + }, + { + "epoch": 79.31, + "grad_norm": 4.970304489135742, + "learning_rate": 2.069172932330827e-06, + "loss": 0.1876, + "step": 52740 + }, + { + "epoch": 79.32, + "grad_norm": 4.562291622161865, + "learning_rate": 2.067669172932331e-06, + "loss": 0.1942, + "step": 52750 + }, + { + "epoch": 79.34, + "grad_norm": 5.195777416229248, + "learning_rate": 2.0661654135338345e-06, + "loss": 0.1821, + "step": 52760 + }, + { + "epoch": 79.35, + "grad_norm": 6.937760829925537, + "learning_rate": 2.0646616541353384e-06, + "loss": 0.2023, + "step": 52770 + }, + { + "epoch": 79.37, + "grad_norm": 6.619980812072754, + "learning_rate": 2.0631578947368424e-06, + "loss": 0.1777, + "step": 52780 + }, + { + "epoch": 79.38, + "grad_norm": 2.9072442054748535, + "learning_rate": 2.061654135338346e-06, + "loss": 0.1435, + "step": 52790 + }, + { + "epoch": 79.4, + "grad_norm": 1.8150124549865723, + "learning_rate": 2.06015037593985e-06, + "loss": 0.1548, + "step": 52800 + }, + { + "epoch": 79.41, + "grad_norm": 4.043828010559082, + "learning_rate": 2.0586466165413533e-06, + "loss": 0.1577, + "step": 52810 + }, + { + "epoch": 79.43, + "grad_norm": 4.577513694763184, + "learning_rate": 2.0571428571428573e-06, + "loss": 0.2405, + "step": 52820 + }, + { + "epoch": 79.44, + "grad_norm": 4.094361782073975, + "learning_rate": 2.0556390977443612e-06, + "loss": 0.1409, + "step": 52830 + }, + { + "epoch": 79.46, + "grad_norm": 3.072739362716675, + "learning_rate": 2.0541353383458648e-06, + "loss": 0.1992, + "step": 52840 + }, + { + "epoch": 79.47, + "grad_norm": 5.828579425811768, + "learning_rate": 2.0526315789473687e-06, + "loss": 0.1278, + "step": 52850 + }, + { + "epoch": 79.49, + "grad_norm": 5.253977298736572, + "learning_rate": 2.0511278195488722e-06, + "loss": 0.1984, + "step": 52860 + }, + { + "epoch": 79.5, + "grad_norm": 3.553118944168091, + "learning_rate": 2.049624060150376e-06, + "loss": 0.1384, + "step": 52870 + }, + { + "epoch": 79.52, + "grad_norm": 7.722125053405762, + "learning_rate": 2.04812030075188e-06, + "loss": 0.2084, + "step": 52880 + }, + { + "epoch": 79.53, + "grad_norm": 3.237384557723999, + "learning_rate": 2.0466165413533836e-06, + "loss": 0.163, + "step": 52890 + }, + { + "epoch": 79.55, + "grad_norm": 11.586670875549316, + "learning_rate": 2.0451127819548876e-06, + "loss": 0.2027, + "step": 52900 + }, + { + "epoch": 79.56, + "grad_norm": 5.43867301940918, + "learning_rate": 2.043609022556391e-06, + "loss": 0.1672, + "step": 52910 + }, + { + "epoch": 79.58, + "grad_norm": 6.87939977645874, + "learning_rate": 2.042105263157895e-06, + "loss": 0.1675, + "step": 52920 + }, + { + "epoch": 79.59, + "grad_norm": 7.075170993804932, + "learning_rate": 2.0406015037593985e-06, + "loss": 0.2382, + "step": 52930 + }, + { + "epoch": 79.61, + "grad_norm": 4.605086326599121, + "learning_rate": 2.0390977443609025e-06, + "loss": 0.1238, + "step": 52940 + }, + { + "epoch": 79.62, + "grad_norm": 4.68049955368042, + "learning_rate": 2.0375939849624064e-06, + "loss": 0.1679, + "step": 52950 + }, + { + "epoch": 79.64, + "grad_norm": 4.511661529541016, + "learning_rate": 2.03609022556391e-06, + "loss": 0.1604, + "step": 52960 + }, + { + "epoch": 79.65, + "grad_norm": 5.428410530090332, + "learning_rate": 2.034586466165414e-06, + "loss": 0.1244, + "step": 52970 + }, + { + "epoch": 79.67, + "grad_norm": 4.728567600250244, + "learning_rate": 2.0330827067669174e-06, + "loss": 0.1375, + "step": 52980 + }, + { + "epoch": 79.68, + "grad_norm": 4.785933017730713, + "learning_rate": 2.031578947368421e-06, + "loss": 0.1517, + "step": 52990 + }, + { + "epoch": 79.7, + "grad_norm": 6.504793167114258, + "learning_rate": 2.030075187969925e-06, + "loss": 0.1692, + "step": 53000 + }, + { + "epoch": 79.71, + "grad_norm": 4.675958156585693, + "learning_rate": 2.028571428571429e-06, + "loss": 0.1915, + "step": 53010 + }, + { + "epoch": 79.73, + "grad_norm": 5.776334285736084, + "learning_rate": 2.0270676691729323e-06, + "loss": 0.1526, + "step": 53020 + }, + { + "epoch": 79.74, + "grad_norm": 11.183079719543457, + "learning_rate": 2.0255639097744363e-06, + "loss": 0.1586, + "step": 53030 + }, + { + "epoch": 79.76, + "grad_norm": 8.511357307434082, + "learning_rate": 2.02406015037594e-06, + "loss": 0.1941, + "step": 53040 + }, + { + "epoch": 79.77, + "grad_norm": 22.351648330688477, + "learning_rate": 2.0225563909774437e-06, + "loss": 0.1433, + "step": 53050 + }, + { + "epoch": 79.79, + "grad_norm": 7.769514083862305, + "learning_rate": 2.0210526315789477e-06, + "loss": 0.1788, + "step": 53060 + }, + { + "epoch": 79.8, + "grad_norm": 5.4393415451049805, + "learning_rate": 2.019548872180451e-06, + "loss": 0.2059, + "step": 53070 + }, + { + "epoch": 79.82, + "grad_norm": 5.8692145347595215, + "learning_rate": 2.018045112781955e-06, + "loss": 0.2085, + "step": 53080 + }, + { + "epoch": 79.83, + "grad_norm": 7.843011379241943, + "learning_rate": 2.0165413533834587e-06, + "loss": 0.2123, + "step": 53090 + }, + { + "epoch": 79.85, + "grad_norm": 3.590719223022461, + "learning_rate": 2.0150375939849626e-06, + "loss": 0.1455, + "step": 53100 + }, + { + "epoch": 79.86, + "grad_norm": 2.503929853439331, + "learning_rate": 2.0135338345864665e-06, + "loss": 0.1364, + "step": 53110 + }, + { + "epoch": 79.88, + "grad_norm": 4.913735866546631, + "learning_rate": 2.01203007518797e-06, + "loss": 0.159, + "step": 53120 + }, + { + "epoch": 79.89, + "grad_norm": 7.748683929443359, + "learning_rate": 2.010526315789474e-06, + "loss": 0.1912, + "step": 53130 + }, + { + "epoch": 79.91, + "grad_norm": 5.099629878997803, + "learning_rate": 2.0090225563909775e-06, + "loss": 0.2093, + "step": 53140 + }, + { + "epoch": 79.92, + "grad_norm": 8.263333320617676, + "learning_rate": 2.0075187969924815e-06, + "loss": 0.1407, + "step": 53150 + }, + { + "epoch": 79.94, + "grad_norm": 9.621796607971191, + "learning_rate": 2.006015037593985e-06, + "loss": 0.1398, + "step": 53160 + }, + { + "epoch": 79.95, + "grad_norm": 4.470506191253662, + "learning_rate": 2.004511278195489e-06, + "loss": 0.1811, + "step": 53170 + }, + { + "epoch": 79.97, + "grad_norm": 1.4590644836425781, + "learning_rate": 2.003007518796993e-06, + "loss": 0.2257, + "step": 53180 + }, + { + "epoch": 79.98, + "grad_norm": 5.552732944488525, + "learning_rate": 2.0015037593984964e-06, + "loss": 0.2092, + "step": 53190 + }, + { + "epoch": 80.0, + "grad_norm": 0.11614461988210678, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.1846, + "step": 53200 + }, + { + "epoch": 80.0, + "eval_accuracy": 0.9307, + "eval_loss": 0.3290669918060303, + "eval_runtime": 85.0743, + "eval_samples_per_second": 117.544, + "eval_steps_per_second": 0.47, + "step": 53200 + }, + { + "epoch": 80.02, + "grad_norm": 7.221068859100342, + "learning_rate": 1.998496240601504e-06, + "loss": 0.2097, + "step": 53210 + }, + { + "epoch": 80.03, + "grad_norm": 8.729702949523926, + "learning_rate": 1.996992481203008e-06, + "loss": 0.1546, + "step": 53220 + }, + { + "epoch": 80.05, + "grad_norm": 6.764802932739258, + "learning_rate": 1.9954887218045113e-06, + "loss": 0.1736, + "step": 53230 + }, + { + "epoch": 80.06, + "grad_norm": 6.79409646987915, + "learning_rate": 1.9939849624060153e-06, + "loss": 0.2107, + "step": 53240 + }, + { + "epoch": 80.08, + "grad_norm": 6.066366672515869, + "learning_rate": 1.9924812030075188e-06, + "loss": 0.2023, + "step": 53250 + }, + { + "epoch": 80.09, + "grad_norm": 6.539670944213867, + "learning_rate": 1.9909774436090227e-06, + "loss": 0.1947, + "step": 53260 + }, + { + "epoch": 80.11, + "grad_norm": 4.000879287719727, + "learning_rate": 1.9894736842105262e-06, + "loss": 0.1613, + "step": 53270 + }, + { + "epoch": 80.12, + "grad_norm": 6.228635787963867, + "learning_rate": 1.98796992481203e-06, + "loss": 0.1893, + "step": 53280 + }, + { + "epoch": 80.14, + "grad_norm": 3.983610153198242, + "learning_rate": 1.986466165413534e-06, + "loss": 0.176, + "step": 53290 + }, + { + "epoch": 80.15, + "grad_norm": 4.762825965881348, + "learning_rate": 1.9849624060150376e-06, + "loss": 0.2192, + "step": 53300 + }, + { + "epoch": 80.17, + "grad_norm": 5.903609275817871, + "learning_rate": 1.9834586466165416e-06, + "loss": 0.169, + "step": 53310 + }, + { + "epoch": 80.18, + "grad_norm": 5.4333176612854, + "learning_rate": 1.981954887218045e-06, + "loss": 0.1943, + "step": 53320 + }, + { + "epoch": 80.2, + "grad_norm": 6.44717264175415, + "learning_rate": 1.980451127819549e-06, + "loss": 0.2043, + "step": 53330 + }, + { + "epoch": 80.21, + "grad_norm": 8.601734161376953, + "learning_rate": 1.978947368421053e-06, + "loss": 0.1577, + "step": 53340 + }, + { + "epoch": 80.23, + "grad_norm": 3.8428754806518555, + "learning_rate": 1.9774436090225565e-06, + "loss": 0.2007, + "step": 53350 + }, + { + "epoch": 80.24, + "grad_norm": 6.386511325836182, + "learning_rate": 1.9759398496240604e-06, + "loss": 0.2227, + "step": 53360 + }, + { + "epoch": 80.26, + "grad_norm": 4.776179790496826, + "learning_rate": 1.974436090225564e-06, + "loss": 0.1553, + "step": 53370 + }, + { + "epoch": 80.27, + "grad_norm": 4.775459289550781, + "learning_rate": 1.972932330827068e-06, + "loss": 0.1948, + "step": 53380 + }, + { + "epoch": 80.29, + "grad_norm": 7.2186760902404785, + "learning_rate": 1.9714285714285714e-06, + "loss": 0.164, + "step": 53390 + }, + { + "epoch": 80.3, + "grad_norm": 12.228165626525879, + "learning_rate": 1.9699248120300754e-06, + "loss": 0.17, + "step": 53400 + }, + { + "epoch": 80.32, + "grad_norm": 3.7471702098846436, + "learning_rate": 1.9684210526315793e-06, + "loss": 0.1536, + "step": 53410 + }, + { + "epoch": 80.33, + "grad_norm": 4.057961940765381, + "learning_rate": 1.966917293233083e-06, + "loss": 0.1399, + "step": 53420 + }, + { + "epoch": 80.35, + "grad_norm": 5.822246074676514, + "learning_rate": 1.9654135338345868e-06, + "loss": 0.1627, + "step": 53430 + }, + { + "epoch": 80.36, + "grad_norm": 2.5543007850646973, + "learning_rate": 1.9639097744360903e-06, + "loss": 0.1273, + "step": 53440 + }, + { + "epoch": 80.38, + "grad_norm": 5.922774314880371, + "learning_rate": 1.9624060150375942e-06, + "loss": 0.212, + "step": 53450 + }, + { + "epoch": 80.39, + "grad_norm": 4.007785320281982, + "learning_rate": 1.960902255639098e-06, + "loss": 0.1515, + "step": 53460 + }, + { + "epoch": 80.41, + "grad_norm": 2.3833940029144287, + "learning_rate": 1.9593984962406017e-06, + "loss": 0.1828, + "step": 53470 + }, + { + "epoch": 80.42, + "grad_norm": 7.1434431076049805, + "learning_rate": 1.9578947368421052e-06, + "loss": 0.2086, + "step": 53480 + }, + { + "epoch": 80.44, + "grad_norm": 5.60752534866333, + "learning_rate": 1.956390977443609e-06, + "loss": 0.1768, + "step": 53490 + }, + { + "epoch": 80.45, + "grad_norm": 9.981124877929688, + "learning_rate": 1.9548872180451127e-06, + "loss": 0.1785, + "step": 53500 + }, + { + "epoch": 80.47, + "grad_norm": 2.1084699630737305, + "learning_rate": 1.9533834586466166e-06, + "loss": 0.2514, + "step": 53510 + }, + { + "epoch": 80.48, + "grad_norm": 3.638641834259033, + "learning_rate": 1.9518796992481206e-06, + "loss": 0.1162, + "step": 53520 + }, + { + "epoch": 80.5, + "grad_norm": 4.928158760070801, + "learning_rate": 1.950375939849624e-06, + "loss": 0.1602, + "step": 53530 + }, + { + "epoch": 80.51, + "grad_norm": 4.468033790588379, + "learning_rate": 1.948872180451128e-06, + "loss": 0.0994, + "step": 53540 + }, + { + "epoch": 80.53, + "grad_norm": 6.67031717300415, + "learning_rate": 1.9473684210526315e-06, + "loss": 0.1815, + "step": 53550 + }, + { + "epoch": 80.54, + "grad_norm": 4.985138893127441, + "learning_rate": 1.9458646616541355e-06, + "loss": 0.2014, + "step": 53560 + }, + { + "epoch": 80.56, + "grad_norm": 4.438155174255371, + "learning_rate": 1.9443609022556394e-06, + "loss": 0.1987, + "step": 53570 + }, + { + "epoch": 80.57, + "grad_norm": 6.645019054412842, + "learning_rate": 1.942857142857143e-06, + "loss": 0.1801, + "step": 53580 + }, + { + "epoch": 80.59, + "grad_norm": 2.9220592975616455, + "learning_rate": 1.941353383458647e-06, + "loss": 0.15, + "step": 53590 + }, + { + "epoch": 80.6, + "grad_norm": 3.196962356567383, + "learning_rate": 1.9398496240601504e-06, + "loss": 0.1512, + "step": 53600 + }, + { + "epoch": 80.62, + "grad_norm": 6.56658935546875, + "learning_rate": 1.9383458646616544e-06, + "loss": 0.1686, + "step": 53610 + }, + { + "epoch": 80.63, + "grad_norm": 8.974480628967285, + "learning_rate": 1.936842105263158e-06, + "loss": 0.1591, + "step": 53620 + }, + { + "epoch": 80.65, + "grad_norm": 9.335490226745605, + "learning_rate": 1.935338345864662e-06, + "loss": 0.1731, + "step": 53630 + }, + { + "epoch": 80.66, + "grad_norm": 8.802009582519531, + "learning_rate": 1.9338345864661658e-06, + "loss": 0.1393, + "step": 53640 + }, + { + "epoch": 80.68, + "grad_norm": 2.8996877670288086, + "learning_rate": 1.9323308270676693e-06, + "loss": 0.1955, + "step": 53650 + }, + { + "epoch": 80.69, + "grad_norm": 8.079301834106445, + "learning_rate": 1.9308270676691732e-06, + "loss": 0.2178, + "step": 53660 + }, + { + "epoch": 80.71, + "grad_norm": 4.58353853225708, + "learning_rate": 1.9293233082706767e-06, + "loss": 0.252, + "step": 53670 + }, + { + "epoch": 80.72, + "grad_norm": 1.9493281841278076, + "learning_rate": 1.9278195488721807e-06, + "loss": 0.15, + "step": 53680 + }, + { + "epoch": 80.74, + "grad_norm": 2.8279833793640137, + "learning_rate": 1.9263157894736846e-06, + "loss": 0.1748, + "step": 53690 + }, + { + "epoch": 80.75, + "grad_norm": 6.2720818519592285, + "learning_rate": 1.924812030075188e-06, + "loss": 0.1943, + "step": 53700 + }, + { + "epoch": 80.77, + "grad_norm": 5.162914752960205, + "learning_rate": 1.923308270676692e-06, + "loss": 0.1387, + "step": 53710 + }, + { + "epoch": 80.78, + "grad_norm": 5.782294273376465, + "learning_rate": 1.9218045112781956e-06, + "loss": 0.1838, + "step": 53720 + }, + { + "epoch": 80.8, + "grad_norm": 5.94707727432251, + "learning_rate": 1.920300751879699e-06, + "loss": 0.1411, + "step": 53730 + }, + { + "epoch": 80.81, + "grad_norm": 7.4608073234558105, + "learning_rate": 1.918796992481203e-06, + "loss": 0.1491, + "step": 53740 + }, + { + "epoch": 80.83, + "grad_norm": 3.1741888523101807, + "learning_rate": 1.917293233082707e-06, + "loss": 0.1506, + "step": 53750 + }, + { + "epoch": 80.84, + "grad_norm": 5.37153959274292, + "learning_rate": 1.9157894736842105e-06, + "loss": 0.2074, + "step": 53760 + }, + { + "epoch": 80.86, + "grad_norm": 2.308962821960449, + "learning_rate": 1.9142857142857145e-06, + "loss": 0.17, + "step": 53770 + }, + { + "epoch": 80.87, + "grad_norm": 2.749995470046997, + "learning_rate": 1.912781954887218e-06, + "loss": 0.1656, + "step": 53780 + }, + { + "epoch": 80.89, + "grad_norm": 4.785193920135498, + "learning_rate": 1.911278195488722e-06, + "loss": 0.1801, + "step": 53790 + }, + { + "epoch": 80.9, + "grad_norm": 5.815794467926025, + "learning_rate": 1.909774436090226e-06, + "loss": 0.1982, + "step": 53800 + }, + { + "epoch": 80.92, + "grad_norm": 4.500646591186523, + "learning_rate": 1.9082706766917294e-06, + "loss": 0.1509, + "step": 53810 + }, + { + "epoch": 80.93, + "grad_norm": 3.7033073902130127, + "learning_rate": 1.9067669172932331e-06, + "loss": 0.1613, + "step": 53820 + }, + { + "epoch": 80.95, + "grad_norm": 5.228376865386963, + "learning_rate": 1.905263157894737e-06, + "loss": 0.1766, + "step": 53830 + }, + { + "epoch": 80.96, + "grad_norm": 5.924206256866455, + "learning_rate": 1.9037593984962408e-06, + "loss": 0.1858, + "step": 53840 + }, + { + "epoch": 80.98, + "grad_norm": 2.352541923522949, + "learning_rate": 1.9022556390977445e-06, + "loss": 0.1322, + "step": 53850 + }, + { + "epoch": 80.99, + "grad_norm": 5.281574249267578, + "learning_rate": 1.9007518796992483e-06, + "loss": 0.1909, + "step": 53860 + }, + { + "epoch": 81.0, + "eval_accuracy": 0.9291, + "eval_loss": 0.34166744351387024, + "eval_runtime": 84.595, + "eval_samples_per_second": 118.21, + "eval_steps_per_second": 0.473, + "step": 53865 + }, + { + "epoch": 81.01, + "grad_norm": 6.562623977661133, + "learning_rate": 1.899248120300752e-06, + "loss": 0.1835, + "step": 53870 + }, + { + "epoch": 81.02, + "grad_norm": 6.715619087219238, + "learning_rate": 1.897744360902256e-06, + "loss": 0.2188, + "step": 53880 + }, + { + "epoch": 81.04, + "grad_norm": 3.6184804439544678, + "learning_rate": 1.8962406015037597e-06, + "loss": 0.1587, + "step": 53890 + }, + { + "epoch": 81.05, + "grad_norm": 7.747524261474609, + "learning_rate": 1.8947368421052634e-06, + "loss": 0.1861, + "step": 53900 + }, + { + "epoch": 81.07, + "grad_norm": 4.222412586212158, + "learning_rate": 1.8932330827067671e-06, + "loss": 0.1427, + "step": 53910 + }, + { + "epoch": 81.08, + "grad_norm": 7.423925399780273, + "learning_rate": 1.8917293233082709e-06, + "loss": 0.2178, + "step": 53920 + }, + { + "epoch": 81.1, + "grad_norm": 6.593136310577393, + "learning_rate": 1.8902255639097746e-06, + "loss": 0.2083, + "step": 53930 + }, + { + "epoch": 81.11, + "grad_norm": 5.739638805389404, + "learning_rate": 1.8887218045112785e-06, + "loss": 0.2182, + "step": 53940 + }, + { + "epoch": 81.13, + "grad_norm": 4.201329708099365, + "learning_rate": 1.8872180451127823e-06, + "loss": 0.1661, + "step": 53950 + }, + { + "epoch": 81.14, + "grad_norm": 4.125972270965576, + "learning_rate": 1.885714285714286e-06, + "loss": 0.177, + "step": 53960 + }, + { + "epoch": 81.16, + "grad_norm": 9.847689628601074, + "learning_rate": 1.8842105263157895e-06, + "loss": 0.1991, + "step": 53970 + }, + { + "epoch": 81.17, + "grad_norm": 2.6463875770568848, + "learning_rate": 1.8827067669172932e-06, + "loss": 0.1789, + "step": 53980 + }, + { + "epoch": 81.19, + "grad_norm": 5.124504089355469, + "learning_rate": 1.881203007518797e-06, + "loss": 0.1799, + "step": 53990 + }, + { + "epoch": 81.2, + "grad_norm": 6.498737335205078, + "learning_rate": 1.8796992481203007e-06, + "loss": 0.1637, + "step": 54000 + }, + { + "epoch": 81.22, + "grad_norm": 5.662612438201904, + "learning_rate": 1.8781954887218046e-06, + "loss": 0.1571, + "step": 54010 + }, + { + "epoch": 81.23, + "grad_norm": 4.6632232666015625, + "learning_rate": 1.8766917293233084e-06, + "loss": 0.1476, + "step": 54020 + }, + { + "epoch": 81.25, + "grad_norm": 3.081489324569702, + "learning_rate": 1.8751879699248121e-06, + "loss": 0.1544, + "step": 54030 + }, + { + "epoch": 81.26, + "grad_norm": 2.8134820461273193, + "learning_rate": 1.8736842105263158e-06, + "loss": 0.173, + "step": 54040 + }, + { + "epoch": 81.28, + "grad_norm": 6.176023483276367, + "learning_rate": 1.8721804511278196e-06, + "loss": 0.1662, + "step": 54050 + }, + { + "epoch": 81.29, + "grad_norm": 5.997969150543213, + "learning_rate": 1.8706766917293235e-06, + "loss": 0.1763, + "step": 54060 + }, + { + "epoch": 81.31, + "grad_norm": 7.557049751281738, + "learning_rate": 1.8691729323308272e-06, + "loss": 0.2356, + "step": 54070 + }, + { + "epoch": 81.32, + "grad_norm": 1.760933518409729, + "learning_rate": 1.867669172932331e-06, + "loss": 0.1678, + "step": 54080 + }, + { + "epoch": 81.34, + "grad_norm": 5.9524993896484375, + "learning_rate": 1.8661654135338347e-06, + "loss": 0.1923, + "step": 54090 + }, + { + "epoch": 81.35, + "grad_norm": 3.9762470722198486, + "learning_rate": 1.8646616541353384e-06, + "loss": 0.1556, + "step": 54100 + }, + { + "epoch": 81.37, + "grad_norm": 3.218543767929077, + "learning_rate": 1.8631578947368424e-06, + "loss": 0.1857, + "step": 54110 + }, + { + "epoch": 81.38, + "grad_norm": 6.120358467102051, + "learning_rate": 1.8616541353383461e-06, + "loss": 0.2167, + "step": 54120 + }, + { + "epoch": 81.4, + "grad_norm": 4.884064674377441, + "learning_rate": 1.8601503759398498e-06, + "loss": 0.1552, + "step": 54130 + }, + { + "epoch": 81.41, + "grad_norm": 5.395893573760986, + "learning_rate": 1.8586466165413536e-06, + "loss": 0.1528, + "step": 54140 + }, + { + "epoch": 81.43, + "grad_norm": 2.572039842605591, + "learning_rate": 1.8571428571428573e-06, + "loss": 0.1606, + "step": 54150 + }, + { + "epoch": 81.44, + "grad_norm": 2.791781425476074, + "learning_rate": 1.855639097744361e-06, + "loss": 0.1705, + "step": 54160 + }, + { + "epoch": 81.46, + "grad_norm": 3.3535261154174805, + "learning_rate": 1.854135338345865e-06, + "loss": 0.2432, + "step": 54170 + }, + { + "epoch": 81.47, + "grad_norm": 4.468529224395752, + "learning_rate": 1.8526315789473687e-06, + "loss": 0.1785, + "step": 54180 + }, + { + "epoch": 81.49, + "grad_norm": 3.004258871078491, + "learning_rate": 1.8511278195488724e-06, + "loss": 0.2144, + "step": 54190 + }, + { + "epoch": 81.5, + "grad_norm": 6.375463485717773, + "learning_rate": 1.8496240601503762e-06, + "loss": 0.1407, + "step": 54200 + }, + { + "epoch": 81.52, + "grad_norm": 7.171412467956543, + "learning_rate": 1.84812030075188e-06, + "loss": 0.1834, + "step": 54210 + }, + { + "epoch": 81.53, + "grad_norm": 4.493435859680176, + "learning_rate": 1.8466165413533834e-06, + "loss": 0.1702, + "step": 54220 + }, + { + "epoch": 81.55, + "grad_norm": 3.168029308319092, + "learning_rate": 1.8451127819548871e-06, + "loss": 0.1886, + "step": 54230 + }, + { + "epoch": 81.56, + "grad_norm": 6.606454372406006, + "learning_rate": 1.843609022556391e-06, + "loss": 0.1678, + "step": 54240 + }, + { + "epoch": 81.58, + "grad_norm": 7.317569255828857, + "learning_rate": 1.8421052631578948e-06, + "loss": 0.2086, + "step": 54250 + }, + { + "epoch": 81.59, + "grad_norm": 5.867735385894775, + "learning_rate": 1.8406015037593986e-06, + "loss": 0.2158, + "step": 54260 + }, + { + "epoch": 81.61, + "grad_norm": 6.016121864318848, + "learning_rate": 1.8390977443609023e-06, + "loss": 0.1706, + "step": 54270 + }, + { + "epoch": 81.62, + "grad_norm": 2.902163028717041, + "learning_rate": 1.837593984962406e-06, + "loss": 0.1605, + "step": 54280 + }, + { + "epoch": 81.64, + "grad_norm": 4.931081771850586, + "learning_rate": 1.83609022556391e-06, + "loss": 0.1675, + "step": 54290 + }, + { + "epoch": 81.65, + "grad_norm": 6.6242241859436035, + "learning_rate": 1.8345864661654137e-06, + "loss": 0.218, + "step": 54300 + }, + { + "epoch": 81.67, + "grad_norm": 4.538776397705078, + "learning_rate": 1.8330827067669174e-06, + "loss": 0.1534, + "step": 54310 + }, + { + "epoch": 81.68, + "grad_norm": 5.3194708824157715, + "learning_rate": 1.8315789473684211e-06, + "loss": 0.1789, + "step": 54320 + }, + { + "epoch": 81.7, + "grad_norm": 6.646446704864502, + "learning_rate": 1.8300751879699249e-06, + "loss": 0.1519, + "step": 54330 + }, + { + "epoch": 81.71, + "grad_norm": 7.2585129737854, + "learning_rate": 1.8285714285714288e-06, + "loss": 0.1952, + "step": 54340 + }, + { + "epoch": 81.73, + "grad_norm": 6.1801323890686035, + "learning_rate": 1.8270676691729326e-06, + "loss": 0.1726, + "step": 54350 + }, + { + "epoch": 81.74, + "grad_norm": 5.834466457366943, + "learning_rate": 1.8255639097744363e-06, + "loss": 0.1767, + "step": 54360 + }, + { + "epoch": 81.76, + "grad_norm": 5.508495807647705, + "learning_rate": 1.82406015037594e-06, + "loss": 0.1731, + "step": 54370 + }, + { + "epoch": 81.77, + "grad_norm": 1.7921135425567627, + "learning_rate": 1.8225563909774437e-06, + "loss": 0.2335, + "step": 54380 + }, + { + "epoch": 81.79, + "grad_norm": 5.268810749053955, + "learning_rate": 1.8210526315789475e-06, + "loss": 0.2098, + "step": 54390 + }, + { + "epoch": 81.8, + "grad_norm": 5.814802169799805, + "learning_rate": 1.8195488721804514e-06, + "loss": 0.1643, + "step": 54400 + }, + { + "epoch": 81.82, + "grad_norm": 4.461940765380859, + "learning_rate": 1.8180451127819551e-06, + "loss": 0.1374, + "step": 54410 + }, + { + "epoch": 81.83, + "grad_norm": 3.247183322906494, + "learning_rate": 1.8165413533834589e-06, + "loss": 0.1355, + "step": 54420 + }, + { + "epoch": 81.85, + "grad_norm": 1.9381792545318604, + "learning_rate": 1.8150375939849626e-06, + "loss": 0.1723, + "step": 54430 + }, + { + "epoch": 81.86, + "grad_norm": 7.478023529052734, + "learning_rate": 1.8135338345864663e-06, + "loss": 0.1846, + "step": 54440 + }, + { + "epoch": 81.88, + "grad_norm": 5.697751522064209, + "learning_rate": 1.8120300751879703e-06, + "loss": 0.1653, + "step": 54450 + }, + { + "epoch": 81.89, + "grad_norm": 5.588354587554932, + "learning_rate": 1.810526315789474e-06, + "loss": 0.1838, + "step": 54460 + }, + { + "epoch": 81.91, + "grad_norm": 6.156862258911133, + "learning_rate": 1.8090225563909775e-06, + "loss": 0.211, + "step": 54470 + }, + { + "epoch": 81.92, + "grad_norm": 7.52768087387085, + "learning_rate": 1.8075187969924813e-06, + "loss": 0.2083, + "step": 54480 + }, + { + "epoch": 81.94, + "grad_norm": 3.558300018310547, + "learning_rate": 1.806015037593985e-06, + "loss": 0.1508, + "step": 54490 + }, + { + "epoch": 81.95, + "grad_norm": 5.417634010314941, + "learning_rate": 1.8045112781954887e-06, + "loss": 0.1776, + "step": 54500 + }, + { + "epoch": 81.97, + "grad_norm": 5.08277702331543, + "learning_rate": 1.8030075187969925e-06, + "loss": 0.1513, + "step": 54510 + }, + { + "epoch": 81.98, + "grad_norm": 5.362541675567627, + "learning_rate": 1.8015037593984964e-06, + "loss": 0.1559, + "step": 54520 + }, + { + "epoch": 82.0, + "grad_norm": 1.1385769844055176, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.1971, + "step": 54530 + }, + { + "epoch": 82.0, + "eval_accuracy": 0.9289, + "eval_loss": 0.33228030800819397, + "eval_runtime": 85.2251, + "eval_samples_per_second": 117.336, + "eval_steps_per_second": 0.469, + "step": 54530 + }, + { + "epoch": 82.02, + "grad_norm": 2.4561429023742676, + "learning_rate": 1.7984962406015039e-06, + "loss": 0.1383, + "step": 54540 + }, + { + "epoch": 82.03, + "grad_norm": 4.4165425300598145, + "learning_rate": 1.7969924812030076e-06, + "loss": 0.1265, + "step": 54550 + }, + { + "epoch": 82.05, + "grad_norm": 8.235807418823242, + "learning_rate": 1.7954887218045113e-06, + "loss": 0.1736, + "step": 54560 + }, + { + "epoch": 82.06, + "grad_norm": 5.088337421417236, + "learning_rate": 1.7939849624060153e-06, + "loss": 0.1613, + "step": 54570 + }, + { + "epoch": 82.08, + "grad_norm": 4.968101501464844, + "learning_rate": 1.792481203007519e-06, + "loss": 0.1572, + "step": 54580 + }, + { + "epoch": 82.09, + "grad_norm": 3.9068446159362793, + "learning_rate": 1.7909774436090227e-06, + "loss": 0.2085, + "step": 54590 + }, + { + "epoch": 82.11, + "grad_norm": 4.162015914916992, + "learning_rate": 1.7894736842105265e-06, + "loss": 0.1924, + "step": 54600 + }, + { + "epoch": 82.12, + "grad_norm": 5.8883256912231445, + "learning_rate": 1.7879699248120302e-06, + "loss": 0.1482, + "step": 54610 + }, + { + "epoch": 82.14, + "grad_norm": 3.305724859237671, + "learning_rate": 1.786466165413534e-06, + "loss": 0.134, + "step": 54620 + }, + { + "epoch": 82.15, + "grad_norm": 3.4198994636535645, + "learning_rate": 1.7849624060150379e-06, + "loss": 0.122, + "step": 54630 + }, + { + "epoch": 82.17, + "grad_norm": 4.999546051025391, + "learning_rate": 1.7834586466165416e-06, + "loss": 0.2511, + "step": 54640 + }, + { + "epoch": 82.18, + "grad_norm": 3.644953489303589, + "learning_rate": 1.7819548872180453e-06, + "loss": 0.1224, + "step": 54650 + }, + { + "epoch": 82.2, + "grad_norm": 2.552194833755493, + "learning_rate": 1.780451127819549e-06, + "loss": 0.1872, + "step": 54660 + }, + { + "epoch": 82.21, + "grad_norm": 6.016075134277344, + "learning_rate": 1.7789473684210528e-06, + "loss": 0.1731, + "step": 54670 + }, + { + "epoch": 82.23, + "grad_norm": 3.725740671157837, + "learning_rate": 1.7774436090225567e-06, + "loss": 0.1697, + "step": 54680 + }, + { + "epoch": 82.24, + "grad_norm": 4.1656999588012695, + "learning_rate": 1.7759398496240605e-06, + "loss": 0.1599, + "step": 54690 + }, + { + "epoch": 82.26, + "grad_norm": 5.3262481689453125, + "learning_rate": 1.7744360902255642e-06, + "loss": 0.2012, + "step": 54700 + }, + { + "epoch": 82.27, + "grad_norm": 4.335447788238525, + "learning_rate": 1.772932330827068e-06, + "loss": 0.1789, + "step": 54710 + }, + { + "epoch": 82.29, + "grad_norm": 5.814763069152832, + "learning_rate": 1.7714285714285714e-06, + "loss": 0.1574, + "step": 54720 + }, + { + "epoch": 82.3, + "grad_norm": 8.759305000305176, + "learning_rate": 1.7699248120300752e-06, + "loss": 0.1821, + "step": 54730 + }, + { + "epoch": 82.32, + "grad_norm": 4.221161842346191, + "learning_rate": 1.768421052631579e-06, + "loss": 0.1759, + "step": 54740 + }, + { + "epoch": 82.33, + "grad_norm": 4.3325910568237305, + "learning_rate": 1.7669172932330828e-06, + "loss": 0.258, + "step": 54750 + }, + { + "epoch": 82.35, + "grad_norm": 4.424845218658447, + "learning_rate": 1.7654135338345866e-06, + "loss": 0.2111, + "step": 54760 + }, + { + "epoch": 82.36, + "grad_norm": 6.010334014892578, + "learning_rate": 1.7639097744360903e-06, + "loss": 0.2227, + "step": 54770 + }, + { + "epoch": 82.38, + "grad_norm": 4.39691162109375, + "learning_rate": 1.762406015037594e-06, + "loss": 0.1556, + "step": 54780 + }, + { + "epoch": 82.39, + "grad_norm": 4.894425868988037, + "learning_rate": 1.7609022556390978e-06, + "loss": 0.1579, + "step": 54790 + }, + { + "epoch": 82.41, + "grad_norm": 5.324883937835693, + "learning_rate": 1.7593984962406017e-06, + "loss": 0.2109, + "step": 54800 + }, + { + "epoch": 82.42, + "grad_norm": 6.122479438781738, + "learning_rate": 1.7578947368421054e-06, + "loss": 0.2117, + "step": 54810 + }, + { + "epoch": 82.44, + "grad_norm": 5.467765808105469, + "learning_rate": 1.7563909774436092e-06, + "loss": 0.1589, + "step": 54820 + }, + { + "epoch": 82.45, + "grad_norm": 8.767003059387207, + "learning_rate": 1.754887218045113e-06, + "loss": 0.1147, + "step": 54830 + }, + { + "epoch": 82.47, + "grad_norm": 9.435832977294922, + "learning_rate": 1.7533834586466166e-06, + "loss": 0.1292, + "step": 54840 + }, + { + "epoch": 82.48, + "grad_norm": 5.217706203460693, + "learning_rate": 1.7518796992481204e-06, + "loss": 0.148, + "step": 54850 + }, + { + "epoch": 82.5, + "grad_norm": 2.389409303665161, + "learning_rate": 1.7503759398496243e-06, + "loss": 0.1228, + "step": 54860 + }, + { + "epoch": 82.51, + "grad_norm": 1.7925937175750732, + "learning_rate": 1.748872180451128e-06, + "loss": 0.1766, + "step": 54870 + }, + { + "epoch": 82.53, + "grad_norm": 2.3438923358917236, + "learning_rate": 1.7473684210526318e-06, + "loss": 0.1621, + "step": 54880 + }, + { + "epoch": 82.54, + "grad_norm": 5.975131511688232, + "learning_rate": 1.7458646616541355e-06, + "loss": 0.1248, + "step": 54890 + }, + { + "epoch": 82.56, + "grad_norm": 12.676349639892578, + "learning_rate": 1.7443609022556392e-06, + "loss": 0.1825, + "step": 54900 + }, + { + "epoch": 82.57, + "grad_norm": 9.043052673339844, + "learning_rate": 1.7428571428571432e-06, + "loss": 0.1963, + "step": 54910 + }, + { + "epoch": 82.59, + "grad_norm": 2.8596065044403076, + "learning_rate": 1.741353383458647e-06, + "loss": 0.1685, + "step": 54920 + }, + { + "epoch": 82.6, + "grad_norm": 4.849795341491699, + "learning_rate": 1.7398496240601506e-06, + "loss": 0.1839, + "step": 54930 + }, + { + "epoch": 82.62, + "grad_norm": 7.803761959075928, + "learning_rate": 1.7383458646616544e-06, + "loss": 0.1732, + "step": 54940 + }, + { + "epoch": 82.63, + "grad_norm": 6.521337985992432, + "learning_rate": 1.736842105263158e-06, + "loss": 0.1562, + "step": 54950 + }, + { + "epoch": 82.65, + "grad_norm": 12.948254585266113, + "learning_rate": 1.735338345864662e-06, + "loss": 0.1772, + "step": 54960 + }, + { + "epoch": 82.66, + "grad_norm": 3.319334030151367, + "learning_rate": 1.7338345864661653e-06, + "loss": 0.2134, + "step": 54970 + }, + { + "epoch": 82.68, + "grad_norm": 2.6258463859558105, + "learning_rate": 1.7323308270676693e-06, + "loss": 0.1354, + "step": 54980 + }, + { + "epoch": 82.69, + "grad_norm": 5.683926105499268, + "learning_rate": 1.730827067669173e-06, + "loss": 0.1092, + "step": 54990 + }, + { + "epoch": 82.71, + "grad_norm": 6.205657958984375, + "learning_rate": 1.7293233082706767e-06, + "loss": 0.1635, + "step": 55000 + }, + { + "epoch": 82.72, + "grad_norm": 2.16133713722229, + "learning_rate": 1.7278195488721805e-06, + "loss": 0.1746, + "step": 55010 + }, + { + "epoch": 82.74, + "grad_norm": 5.119611740112305, + "learning_rate": 1.7263157894736842e-06, + "loss": 0.1608, + "step": 55020 + }, + { + "epoch": 82.75, + "grad_norm": 5.32130241394043, + "learning_rate": 1.7248120300751882e-06, + "loss": 0.1386, + "step": 55030 + }, + { + "epoch": 82.77, + "grad_norm": 5.220561981201172, + "learning_rate": 1.7233082706766919e-06, + "loss": 0.192, + "step": 55040 + }, + { + "epoch": 82.78, + "grad_norm": 5.081111431121826, + "learning_rate": 1.7218045112781956e-06, + "loss": 0.1744, + "step": 55050 + }, + { + "epoch": 82.8, + "grad_norm": 7.443409442901611, + "learning_rate": 1.7203007518796993e-06, + "loss": 0.2045, + "step": 55060 + }, + { + "epoch": 82.81, + "grad_norm": 5.770323276519775, + "learning_rate": 1.718796992481203e-06, + "loss": 0.1719, + "step": 55070 + }, + { + "epoch": 82.83, + "grad_norm": 5.356820106506348, + "learning_rate": 1.7172932330827068e-06, + "loss": 0.2003, + "step": 55080 + }, + { + "epoch": 82.84, + "grad_norm": 4.85361385345459, + "learning_rate": 1.7157894736842107e-06, + "loss": 0.1951, + "step": 55090 + }, + { + "epoch": 82.86, + "grad_norm": 4.393242359161377, + "learning_rate": 1.7142857142857145e-06, + "loss": 0.198, + "step": 55100 + }, + { + "epoch": 82.87, + "grad_norm": 2.715772867202759, + "learning_rate": 1.7127819548872182e-06, + "loss": 0.131, + "step": 55110 + }, + { + "epoch": 82.89, + "grad_norm": 5.954652786254883, + "learning_rate": 1.711278195488722e-06, + "loss": 0.2029, + "step": 55120 + }, + { + "epoch": 82.9, + "grad_norm": 3.8711795806884766, + "learning_rate": 1.7097744360902257e-06, + "loss": 0.1509, + "step": 55130 + }, + { + "epoch": 82.92, + "grad_norm": 4.712859630584717, + "learning_rate": 1.7082706766917296e-06, + "loss": 0.1485, + "step": 55140 + }, + { + "epoch": 82.93, + "grad_norm": 6.353842735290527, + "learning_rate": 1.7067669172932333e-06, + "loss": 0.1959, + "step": 55150 + }, + { + "epoch": 82.95, + "grad_norm": 4.206510066986084, + "learning_rate": 1.705263157894737e-06, + "loss": 0.1318, + "step": 55160 + }, + { + "epoch": 82.96, + "grad_norm": 4.608467102050781, + "learning_rate": 1.7037593984962408e-06, + "loss": 0.2017, + "step": 55170 + }, + { + "epoch": 82.98, + "grad_norm": 5.684893608093262, + "learning_rate": 1.7022556390977445e-06, + "loss": 0.1917, + "step": 55180 + }, + { + "epoch": 82.99, + "grad_norm": 4.197262763977051, + "learning_rate": 1.7007518796992485e-06, + "loss": 0.1739, + "step": 55190 + }, + { + "epoch": 83.0, + "eval_accuracy": 0.9323, + "eval_loss": 0.3265763223171234, + "eval_runtime": 84.4148, + "eval_samples_per_second": 118.463, + "eval_steps_per_second": 0.474, + "step": 55195 + }, + { + "epoch": 83.01, + "grad_norm": 4.629222869873047, + "learning_rate": 1.6992481203007522e-06, + "loss": 0.1463, + "step": 55200 + }, + { + "epoch": 83.02, + "grad_norm": 5.791532516479492, + "learning_rate": 1.6977443609022557e-06, + "loss": 0.1871, + "step": 55210 + }, + { + "epoch": 83.04, + "grad_norm": 6.979246616363525, + "learning_rate": 1.6962406015037595e-06, + "loss": 0.1502, + "step": 55220 + }, + { + "epoch": 83.05, + "grad_norm": 1.3585783243179321, + "learning_rate": 1.6947368421052632e-06, + "loss": 0.1733, + "step": 55230 + }, + { + "epoch": 83.07, + "grad_norm": 1.0612996816635132, + "learning_rate": 1.693233082706767e-06, + "loss": 0.1586, + "step": 55240 + }, + { + "epoch": 83.08, + "grad_norm": 3.546823740005493, + "learning_rate": 1.6917293233082707e-06, + "loss": 0.1586, + "step": 55250 + }, + { + "epoch": 83.1, + "grad_norm": 4.408979892730713, + "learning_rate": 1.6902255639097746e-06, + "loss": 0.2008, + "step": 55260 + }, + { + "epoch": 83.11, + "grad_norm": 4.838244438171387, + "learning_rate": 1.6887218045112783e-06, + "loss": 0.1753, + "step": 55270 + }, + { + "epoch": 83.13, + "grad_norm": 2.8008768558502197, + "learning_rate": 1.687218045112782e-06, + "loss": 0.1515, + "step": 55280 + }, + { + "epoch": 83.14, + "grad_norm": 1.7678091526031494, + "learning_rate": 1.6857142857142858e-06, + "loss": 0.1716, + "step": 55290 + }, + { + "epoch": 83.16, + "grad_norm": 4.2498087882995605, + "learning_rate": 1.6842105263157895e-06, + "loss": 0.2125, + "step": 55300 + }, + { + "epoch": 83.17, + "grad_norm": 4.752126216888428, + "learning_rate": 1.6827067669172933e-06, + "loss": 0.195, + "step": 55310 + }, + { + "epoch": 83.19, + "grad_norm": 2.648486375808716, + "learning_rate": 1.6812030075187972e-06, + "loss": 0.1554, + "step": 55320 + }, + { + "epoch": 83.2, + "grad_norm": 5.214283466339111, + "learning_rate": 1.679699248120301e-06, + "loss": 0.2508, + "step": 55330 + }, + { + "epoch": 83.22, + "grad_norm": 7.676024913787842, + "learning_rate": 1.6781954887218047e-06, + "loss": 0.1883, + "step": 55340 + }, + { + "epoch": 83.23, + "grad_norm": 5.258351802825928, + "learning_rate": 1.6766917293233084e-06, + "loss": 0.1941, + "step": 55350 + }, + { + "epoch": 83.25, + "grad_norm": 5.472609043121338, + "learning_rate": 1.6751879699248121e-06, + "loss": 0.1522, + "step": 55360 + }, + { + "epoch": 83.26, + "grad_norm": 3.57065749168396, + "learning_rate": 1.673684210526316e-06, + "loss": 0.1801, + "step": 55370 + }, + { + "epoch": 83.28, + "grad_norm": 4.2197651863098145, + "learning_rate": 1.6721804511278198e-06, + "loss": 0.1963, + "step": 55380 + }, + { + "epoch": 83.29, + "grad_norm": 7.963762283325195, + "learning_rate": 1.6706766917293235e-06, + "loss": 0.2155, + "step": 55390 + }, + { + "epoch": 83.31, + "grad_norm": 4.304222106933594, + "learning_rate": 1.6691729323308273e-06, + "loss": 0.225, + "step": 55400 + }, + { + "epoch": 83.32, + "grad_norm": 4.590184211730957, + "learning_rate": 1.667669172932331e-06, + "loss": 0.174, + "step": 55410 + }, + { + "epoch": 83.34, + "grad_norm": 2.6624538898468018, + "learning_rate": 1.6661654135338347e-06, + "loss": 0.1555, + "step": 55420 + }, + { + "epoch": 83.35, + "grad_norm": 6.655846118927002, + "learning_rate": 1.6646616541353387e-06, + "loss": 0.1236, + "step": 55430 + }, + { + "epoch": 83.37, + "grad_norm": 5.1804633140563965, + "learning_rate": 1.6631578947368424e-06, + "loss": 0.2144, + "step": 55440 + }, + { + "epoch": 83.38, + "grad_norm": 6.537972450256348, + "learning_rate": 1.6616541353383461e-06, + "loss": 0.1807, + "step": 55450 + }, + { + "epoch": 83.4, + "grad_norm": 3.086345911026001, + "learning_rate": 1.6601503759398496e-06, + "loss": 0.1505, + "step": 55460 + }, + { + "epoch": 83.41, + "grad_norm": 6.289426326751709, + "learning_rate": 1.6586466165413534e-06, + "loss": 0.1967, + "step": 55470 + }, + { + "epoch": 83.43, + "grad_norm": 4.445835113525391, + "learning_rate": 1.657142857142857e-06, + "loss": 0.153, + "step": 55480 + }, + { + "epoch": 83.44, + "grad_norm": 5.393485069274902, + "learning_rate": 1.6556390977443608e-06, + "loss": 0.1946, + "step": 55490 + }, + { + "epoch": 83.46, + "grad_norm": 4.735557556152344, + "learning_rate": 1.6541353383458648e-06, + "loss": 0.1649, + "step": 55500 + }, + { + "epoch": 83.47, + "grad_norm": 2.2363061904907227, + "learning_rate": 1.6526315789473685e-06, + "loss": 0.2182, + "step": 55510 + }, + { + "epoch": 83.49, + "grad_norm": 1.6198318004608154, + "learning_rate": 1.6511278195488722e-06, + "loss": 0.2171, + "step": 55520 + }, + { + "epoch": 83.5, + "grad_norm": 3.38696551322937, + "learning_rate": 1.649624060150376e-06, + "loss": 0.1405, + "step": 55530 + }, + { + "epoch": 83.52, + "grad_norm": 1.2411551475524902, + "learning_rate": 1.6481203007518797e-06, + "loss": 0.1632, + "step": 55540 + }, + { + "epoch": 83.53, + "grad_norm": 3.3203301429748535, + "learning_rate": 1.6466165413533836e-06, + "loss": 0.1592, + "step": 55550 + }, + { + "epoch": 83.55, + "grad_norm": 13.928276062011719, + "learning_rate": 1.6451127819548874e-06, + "loss": 0.1434, + "step": 55560 + }, + { + "epoch": 83.56, + "grad_norm": 3.75093150138855, + "learning_rate": 1.643609022556391e-06, + "loss": 0.2471, + "step": 55570 + }, + { + "epoch": 83.58, + "grad_norm": 6.298830032348633, + "learning_rate": 1.6421052631578948e-06, + "loss": 0.1624, + "step": 55580 + }, + { + "epoch": 83.59, + "grad_norm": 4.140471935272217, + "learning_rate": 1.6406015037593986e-06, + "loss": 0.1668, + "step": 55590 + }, + { + "epoch": 83.61, + "grad_norm": 7.234468936920166, + "learning_rate": 1.6390977443609025e-06, + "loss": 0.1933, + "step": 55600 + }, + { + "epoch": 83.62, + "grad_norm": 5.53890323638916, + "learning_rate": 1.6375939849624062e-06, + "loss": 0.1686, + "step": 55610 + }, + { + "epoch": 83.64, + "grad_norm": 7.111166000366211, + "learning_rate": 1.63609022556391e-06, + "loss": 0.1447, + "step": 55620 + }, + { + "epoch": 83.65, + "grad_norm": 4.891413688659668, + "learning_rate": 1.6345864661654137e-06, + "loss": 0.1894, + "step": 55630 + }, + { + "epoch": 83.67, + "grad_norm": 5.022510051727295, + "learning_rate": 1.6330827067669174e-06, + "loss": 0.1572, + "step": 55640 + }, + { + "epoch": 83.68, + "grad_norm": 6.799849033355713, + "learning_rate": 1.6315789473684212e-06, + "loss": 0.2438, + "step": 55650 + }, + { + "epoch": 83.7, + "grad_norm": 6.336165904998779, + "learning_rate": 1.630075187969925e-06, + "loss": 0.1738, + "step": 55660 + }, + { + "epoch": 83.71, + "grad_norm": 4.65312385559082, + "learning_rate": 1.6285714285714288e-06, + "loss": 0.1406, + "step": 55670 + }, + { + "epoch": 83.73, + "grad_norm": 6.864182472229004, + "learning_rate": 1.6270676691729326e-06, + "loss": 0.1773, + "step": 55680 + }, + { + "epoch": 83.74, + "grad_norm": 3.7218823432922363, + "learning_rate": 1.6255639097744363e-06, + "loss": 0.1292, + "step": 55690 + }, + { + "epoch": 83.76, + "grad_norm": 6.442845821380615, + "learning_rate": 1.62406015037594e-06, + "loss": 0.1512, + "step": 55700 + }, + { + "epoch": 83.77, + "grad_norm": 4.894304275512695, + "learning_rate": 1.6225563909774435e-06, + "loss": 0.162, + "step": 55710 + }, + { + "epoch": 83.79, + "grad_norm": 5.067921161651611, + "learning_rate": 1.6210526315789473e-06, + "loss": 0.1662, + "step": 55720 + }, + { + "epoch": 83.8, + "grad_norm": 10.18139362335205, + "learning_rate": 1.6195488721804512e-06, + "loss": 0.2123, + "step": 55730 + }, + { + "epoch": 83.82, + "grad_norm": 6.891566753387451, + "learning_rate": 1.618045112781955e-06, + "loss": 0.1701, + "step": 55740 + }, + { + "epoch": 83.83, + "grad_norm": 3.438103437423706, + "learning_rate": 1.6165413533834587e-06, + "loss": 0.1598, + "step": 55750 + }, + { + "epoch": 83.85, + "grad_norm": 3.0639941692352295, + "learning_rate": 1.6150375939849624e-06, + "loss": 0.2099, + "step": 55760 + }, + { + "epoch": 83.86, + "grad_norm": 5.10312557220459, + "learning_rate": 1.6135338345864661e-06, + "loss": 0.1678, + "step": 55770 + }, + { + "epoch": 83.88, + "grad_norm": 3.48979115486145, + "learning_rate": 1.61203007518797e-06, + "loss": 0.1578, + "step": 55780 + }, + { + "epoch": 83.89, + "grad_norm": 7.103468418121338, + "learning_rate": 1.6105263157894738e-06, + "loss": 0.1769, + "step": 55790 + }, + { + "epoch": 83.91, + "grad_norm": 4.037766456604004, + "learning_rate": 1.6090225563909775e-06, + "loss": 0.2101, + "step": 55800 + }, + { + "epoch": 83.92, + "grad_norm": 3.874589204788208, + "learning_rate": 1.6075187969924813e-06, + "loss": 0.2364, + "step": 55810 + }, + { + "epoch": 83.94, + "grad_norm": 4.208032608032227, + "learning_rate": 1.606015037593985e-06, + "loss": 0.1712, + "step": 55820 + }, + { + "epoch": 83.95, + "grad_norm": 4.456460475921631, + "learning_rate": 1.604511278195489e-06, + "loss": 0.2004, + "step": 55830 + }, + { + "epoch": 83.97, + "grad_norm": 6.270118236541748, + "learning_rate": 1.6030075187969927e-06, + "loss": 0.156, + "step": 55840 + }, + { + "epoch": 83.98, + "grad_norm": 5.951742172241211, + "learning_rate": 1.6015037593984964e-06, + "loss": 0.2439, + "step": 55850 + }, + { + "epoch": 84.0, + "grad_norm": 0.09190316498279572, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.1537, + "step": 55860 + }, + { + "epoch": 84.0, + "eval_accuracy": 0.9294, + "eval_loss": 0.33126822113990784, + "eval_runtime": 84.9554, + "eval_samples_per_second": 117.709, + "eval_steps_per_second": 0.471, + "step": 55860 + }, + { + "epoch": 84.02, + "grad_norm": 4.058501243591309, + "learning_rate": 1.5984962406015039e-06, + "loss": 0.238, + "step": 55870 + }, + { + "epoch": 84.03, + "grad_norm": 3.7976138591766357, + "learning_rate": 1.5969924812030076e-06, + "loss": 0.1423, + "step": 55880 + }, + { + "epoch": 84.05, + "grad_norm": 4.4287614822387695, + "learning_rate": 1.5954887218045115e-06, + "loss": 0.1356, + "step": 55890 + }, + { + "epoch": 84.06, + "grad_norm": 5.282792568206787, + "learning_rate": 1.5939849624060153e-06, + "loss": 0.1902, + "step": 55900 + }, + { + "epoch": 84.08, + "grad_norm": 5.178953170776367, + "learning_rate": 1.592481203007519e-06, + "loss": 0.1712, + "step": 55910 + }, + { + "epoch": 84.09, + "grad_norm": 0.7578862905502319, + "learning_rate": 1.5909774436090227e-06, + "loss": 0.1615, + "step": 55920 + }, + { + "epoch": 84.11, + "grad_norm": 6.100287437438965, + "learning_rate": 1.5894736842105265e-06, + "loss": 0.144, + "step": 55930 + }, + { + "epoch": 84.12, + "grad_norm": 10.830288887023926, + "learning_rate": 1.5879699248120304e-06, + "loss": 0.1449, + "step": 55940 + }, + { + "epoch": 84.14, + "grad_norm": 6.316339015960693, + "learning_rate": 1.5864661654135341e-06, + "loss": 0.202, + "step": 55950 + }, + { + "epoch": 84.15, + "grad_norm": 4.204655647277832, + "learning_rate": 1.5849624060150377e-06, + "loss": 0.1957, + "step": 55960 + }, + { + "epoch": 84.17, + "grad_norm": 5.749233722686768, + "learning_rate": 1.5834586466165414e-06, + "loss": 0.2423, + "step": 55970 + }, + { + "epoch": 84.18, + "grad_norm": 2.607123851776123, + "learning_rate": 1.5819548872180451e-06, + "loss": 0.1752, + "step": 55980 + }, + { + "epoch": 84.2, + "grad_norm": 3.5867726802825928, + "learning_rate": 1.5804511278195489e-06, + "loss": 0.2333, + "step": 55990 + }, + { + "epoch": 84.21, + "grad_norm": 3.1847939491271973, + "learning_rate": 1.5789473684210526e-06, + "loss": 0.1552, + "step": 56000 + }, + { + "epoch": 84.23, + "grad_norm": 5.968934535980225, + "learning_rate": 1.5774436090225565e-06, + "loss": 0.155, + "step": 56010 + }, + { + "epoch": 84.24, + "grad_norm": 3.6189496517181396, + "learning_rate": 1.5759398496240603e-06, + "loss": 0.1612, + "step": 56020 + }, + { + "epoch": 84.26, + "grad_norm": 1.3396971225738525, + "learning_rate": 1.574436090225564e-06, + "loss": 0.1614, + "step": 56030 + }, + { + "epoch": 84.27, + "grad_norm": 2.855109930038452, + "learning_rate": 1.5729323308270677e-06, + "loss": 0.1615, + "step": 56040 + }, + { + "epoch": 84.29, + "grad_norm": 4.582769393920898, + "learning_rate": 1.5714285714285714e-06, + "loss": 0.2042, + "step": 56050 + }, + { + "epoch": 84.3, + "grad_norm": 5.339018821716309, + "learning_rate": 1.5699248120300754e-06, + "loss": 0.1094, + "step": 56060 + }, + { + "epoch": 84.32, + "grad_norm": 7.760763168334961, + "learning_rate": 1.5684210526315791e-06, + "loss": 0.2093, + "step": 56070 + }, + { + "epoch": 84.33, + "grad_norm": 7.610210418701172, + "learning_rate": 1.5669172932330829e-06, + "loss": 0.1975, + "step": 56080 + }, + { + "epoch": 84.35, + "grad_norm": 3.7673017978668213, + "learning_rate": 1.5654135338345866e-06, + "loss": 0.1645, + "step": 56090 + }, + { + "epoch": 84.36, + "grad_norm": 2.311070203781128, + "learning_rate": 1.5639097744360903e-06, + "loss": 0.1574, + "step": 56100 + }, + { + "epoch": 84.38, + "grad_norm": 5.664913177490234, + "learning_rate": 1.562406015037594e-06, + "loss": 0.1414, + "step": 56110 + }, + { + "epoch": 84.39, + "grad_norm": 6.023291110992432, + "learning_rate": 1.560902255639098e-06, + "loss": 0.1766, + "step": 56120 + }, + { + "epoch": 84.41, + "grad_norm": 4.557356357574463, + "learning_rate": 1.5593984962406017e-06, + "loss": 0.1912, + "step": 56130 + }, + { + "epoch": 84.42, + "grad_norm": 3.221834421157837, + "learning_rate": 1.5578947368421054e-06, + "loss": 0.1643, + "step": 56140 + }, + { + "epoch": 84.44, + "grad_norm": 7.018657684326172, + "learning_rate": 1.5563909774436092e-06, + "loss": 0.2211, + "step": 56150 + }, + { + "epoch": 84.45, + "grad_norm": 5.987710475921631, + "learning_rate": 1.554887218045113e-06, + "loss": 0.1389, + "step": 56160 + }, + { + "epoch": 84.47, + "grad_norm": 3.3997559547424316, + "learning_rate": 1.5533834586466169e-06, + "loss": 0.1328, + "step": 56170 + }, + { + "epoch": 84.48, + "grad_norm": 3.822932243347168, + "learning_rate": 1.5518796992481206e-06, + "loss": 0.2355, + "step": 56180 + }, + { + "epoch": 84.5, + "grad_norm": 4.0138349533081055, + "learning_rate": 1.5503759398496243e-06, + "loss": 0.1783, + "step": 56190 + }, + { + "epoch": 84.51, + "grad_norm": 8.904444694519043, + "learning_rate": 1.548872180451128e-06, + "loss": 0.2122, + "step": 56200 + }, + { + "epoch": 84.53, + "grad_norm": 7.065569877624512, + "learning_rate": 1.5473684210526316e-06, + "loss": 0.1961, + "step": 56210 + }, + { + "epoch": 84.54, + "grad_norm": 8.500096321105957, + "learning_rate": 1.5458646616541353e-06, + "loss": 0.1663, + "step": 56220 + }, + { + "epoch": 84.56, + "grad_norm": 4.012298583984375, + "learning_rate": 1.544360902255639e-06, + "loss": 0.2012, + "step": 56230 + }, + { + "epoch": 84.57, + "grad_norm": 7.489271640777588, + "learning_rate": 1.542857142857143e-06, + "loss": 0.1539, + "step": 56240 + }, + { + "epoch": 84.59, + "grad_norm": 3.202110767364502, + "learning_rate": 1.5413533834586467e-06, + "loss": 0.2089, + "step": 56250 + }, + { + "epoch": 84.6, + "grad_norm": 1.4287196397781372, + "learning_rate": 1.5398496240601504e-06, + "loss": 0.1811, + "step": 56260 + }, + { + "epoch": 84.62, + "grad_norm": 5.019970417022705, + "learning_rate": 1.5383458646616542e-06, + "loss": 0.184, + "step": 56270 + }, + { + "epoch": 84.63, + "grad_norm": 2.368131637573242, + "learning_rate": 1.5368421052631579e-06, + "loss": 0.1327, + "step": 56280 + }, + { + "epoch": 84.65, + "grad_norm": 3.924480676651001, + "learning_rate": 1.5353383458646618e-06, + "loss": 0.1848, + "step": 56290 + }, + { + "epoch": 84.66, + "grad_norm": 8.339082717895508, + "learning_rate": 1.5338345864661656e-06, + "loss": 0.2257, + "step": 56300 + }, + { + "epoch": 84.68, + "grad_norm": 4.868645668029785, + "learning_rate": 1.5323308270676693e-06, + "loss": 0.1838, + "step": 56310 + }, + { + "epoch": 84.69, + "grad_norm": 5.948423862457275, + "learning_rate": 1.530827067669173e-06, + "loss": 0.1669, + "step": 56320 + }, + { + "epoch": 84.71, + "grad_norm": 3.8026175498962402, + "learning_rate": 1.5293233082706768e-06, + "loss": 0.1501, + "step": 56330 + }, + { + "epoch": 84.72, + "grad_norm": 6.754022598266602, + "learning_rate": 1.5278195488721805e-06, + "loss": 0.1706, + "step": 56340 + }, + { + "epoch": 84.74, + "grad_norm": 4.1264495849609375, + "learning_rate": 1.5263157894736844e-06, + "loss": 0.2263, + "step": 56350 + }, + { + "epoch": 84.75, + "grad_norm": 7.627476692199707, + "learning_rate": 1.5248120300751882e-06, + "loss": 0.1672, + "step": 56360 + }, + { + "epoch": 84.77, + "grad_norm": 3.8254830837249756, + "learning_rate": 1.5233082706766919e-06, + "loss": 0.2137, + "step": 56370 + }, + { + "epoch": 84.78, + "grad_norm": 4.202638149261475, + "learning_rate": 1.5218045112781956e-06, + "loss": 0.1923, + "step": 56380 + }, + { + "epoch": 84.8, + "grad_norm": 4.201712608337402, + "learning_rate": 1.5203007518796994e-06, + "loss": 0.1436, + "step": 56390 + }, + { + "epoch": 84.81, + "grad_norm": 4.552309989929199, + "learning_rate": 1.5187969924812033e-06, + "loss": 0.1477, + "step": 56400 + }, + { + "epoch": 84.83, + "grad_norm": 5.196654319763184, + "learning_rate": 1.517293233082707e-06, + "loss": 0.2116, + "step": 56410 + }, + { + "epoch": 84.84, + "grad_norm": 6.609741687774658, + "learning_rate": 1.5157894736842108e-06, + "loss": 0.2068, + "step": 56420 + }, + { + "epoch": 84.86, + "grad_norm": 3.7197117805480957, + "learning_rate": 1.5142857142857145e-06, + "loss": 0.2105, + "step": 56430 + }, + { + "epoch": 84.87, + "grad_norm": 1.3698334693908691, + "learning_rate": 1.5127819548872182e-06, + "loss": 0.2096, + "step": 56440 + }, + { + "epoch": 84.89, + "grad_norm": 5.72020959854126, + "learning_rate": 1.5112781954887222e-06, + "loss": 0.1488, + "step": 56450 + }, + { + "epoch": 84.9, + "grad_norm": 2.5910472869873047, + "learning_rate": 1.5097744360902255e-06, + "loss": 0.1275, + "step": 56460 + }, + { + "epoch": 84.92, + "grad_norm": 6.082350254058838, + "learning_rate": 1.5082706766917294e-06, + "loss": 0.1829, + "step": 56470 + }, + { + "epoch": 84.93, + "grad_norm": 3.298006296157837, + "learning_rate": 1.5067669172932331e-06, + "loss": 0.1375, + "step": 56480 + }, + { + "epoch": 84.95, + "grad_norm": 5.607242107391357, + "learning_rate": 1.5052631578947369e-06, + "loss": 0.155, + "step": 56490 + }, + { + "epoch": 84.96, + "grad_norm": 3.7076168060302734, + "learning_rate": 1.5037593984962406e-06, + "loss": 0.1593, + "step": 56500 + }, + { + "epoch": 84.98, + "grad_norm": 2.4990506172180176, + "learning_rate": 1.5022556390977443e-06, + "loss": 0.1322, + "step": 56510 + }, + { + "epoch": 84.99, + "grad_norm": 2.6433887481689453, + "learning_rate": 1.5007518796992483e-06, + "loss": 0.1706, + "step": 56520 + }, + { + "epoch": 85.0, + "eval_accuracy": 0.928, + "eval_loss": 0.3395210802555084, + "eval_runtime": 84.3204, + "eval_samples_per_second": 118.595, + "eval_steps_per_second": 0.474, + "step": 56525 + }, + { + "epoch": 85.01, + "grad_norm": 4.445404529571533, + "learning_rate": 1.499248120300752e-06, + "loss": 0.162, + "step": 56530 + }, + { + "epoch": 85.02, + "grad_norm": 9.234296798706055, + "learning_rate": 1.4977443609022557e-06, + "loss": 0.1999, + "step": 56540 + }, + { + "epoch": 85.04, + "grad_norm": 4.423348426818848, + "learning_rate": 1.4962406015037595e-06, + "loss": 0.1403, + "step": 56550 + }, + { + "epoch": 85.05, + "grad_norm": 2.3429605960845947, + "learning_rate": 1.4947368421052632e-06, + "loss": 0.199, + "step": 56560 + }, + { + "epoch": 85.07, + "grad_norm": 5.699217319488525, + "learning_rate": 1.493233082706767e-06, + "loss": 0.2032, + "step": 56570 + }, + { + "epoch": 85.08, + "grad_norm": 3.4555978775024414, + "learning_rate": 1.4917293233082709e-06, + "loss": 0.1305, + "step": 56580 + }, + { + "epoch": 85.1, + "grad_norm": 4.182602882385254, + "learning_rate": 1.4902255639097746e-06, + "loss": 0.2042, + "step": 56590 + }, + { + "epoch": 85.11, + "grad_norm": 7.7609686851501465, + "learning_rate": 1.4887218045112783e-06, + "loss": 0.2182, + "step": 56600 + }, + { + "epoch": 85.13, + "grad_norm": 4.991521835327148, + "learning_rate": 1.487218045112782e-06, + "loss": 0.1919, + "step": 56610 + }, + { + "epoch": 85.14, + "grad_norm": 5.988854885101318, + "learning_rate": 1.4857142857142858e-06, + "loss": 0.2383, + "step": 56620 + }, + { + "epoch": 85.16, + "grad_norm": 3.479478597640991, + "learning_rate": 1.4842105263157897e-06, + "loss": 0.2143, + "step": 56630 + }, + { + "epoch": 85.17, + "grad_norm": 4.183614253997803, + "learning_rate": 1.4827067669172935e-06, + "loss": 0.2017, + "step": 56640 + }, + { + "epoch": 85.19, + "grad_norm": 4.1281418800354, + "learning_rate": 1.4812030075187972e-06, + "loss": 0.1597, + "step": 56650 + }, + { + "epoch": 85.2, + "grad_norm": 4.900537014007568, + "learning_rate": 1.479699248120301e-06, + "loss": 0.1677, + "step": 56660 + }, + { + "epoch": 85.22, + "grad_norm": 2.788635492324829, + "learning_rate": 1.4781954887218047e-06, + "loss": 0.1888, + "step": 56670 + }, + { + "epoch": 85.23, + "grad_norm": 8.006430625915527, + "learning_rate": 1.4766917293233086e-06, + "loss": 0.1544, + "step": 56680 + }, + { + "epoch": 85.25, + "grad_norm": 4.344766616821289, + "learning_rate": 1.4751879699248123e-06, + "loss": 0.147, + "step": 56690 + }, + { + "epoch": 85.26, + "grad_norm": 5.809935092926025, + "learning_rate": 1.4736842105263159e-06, + "loss": 0.1912, + "step": 56700 + }, + { + "epoch": 85.28, + "grad_norm": 6.294594764709473, + "learning_rate": 1.4721804511278196e-06, + "loss": 0.1366, + "step": 56710 + }, + { + "epoch": 85.29, + "grad_norm": 5.718160152435303, + "learning_rate": 1.4706766917293233e-06, + "loss": 0.1554, + "step": 56720 + }, + { + "epoch": 85.31, + "grad_norm": 4.895095348358154, + "learning_rate": 1.469172932330827e-06, + "loss": 0.2095, + "step": 56730 + }, + { + "epoch": 85.32, + "grad_norm": 4.996368885040283, + "learning_rate": 1.4676691729323308e-06, + "loss": 0.2099, + "step": 56740 + }, + { + "epoch": 85.34, + "grad_norm": 4.071951866149902, + "learning_rate": 1.4661654135338347e-06, + "loss": 0.1789, + "step": 56750 + }, + { + "epoch": 85.35, + "grad_norm": 4.790773391723633, + "learning_rate": 1.4646616541353385e-06, + "loss": 0.2203, + "step": 56760 + }, + { + "epoch": 85.37, + "grad_norm": 3.539854049682617, + "learning_rate": 1.4631578947368422e-06, + "loss": 0.1297, + "step": 56770 + }, + { + "epoch": 85.38, + "grad_norm": 8.937888145446777, + "learning_rate": 1.461654135338346e-06, + "loss": 0.2044, + "step": 56780 + }, + { + "epoch": 85.4, + "grad_norm": 5.165222644805908, + "learning_rate": 1.4601503759398496e-06, + "loss": 0.1447, + "step": 56790 + }, + { + "epoch": 85.41, + "grad_norm": 7.947958946228027, + "learning_rate": 1.4586466165413534e-06, + "loss": 0.2182, + "step": 56800 + }, + { + "epoch": 85.43, + "grad_norm": 3.317690849304199, + "learning_rate": 1.4571428571428573e-06, + "loss": 0.2299, + "step": 56810 + }, + { + "epoch": 85.44, + "grad_norm": 3.5233917236328125, + "learning_rate": 1.455639097744361e-06, + "loss": 0.2098, + "step": 56820 + }, + { + "epoch": 85.46, + "grad_norm": 3.6351675987243652, + "learning_rate": 1.4541353383458648e-06, + "loss": 0.1576, + "step": 56830 + }, + { + "epoch": 85.47, + "grad_norm": 5.808753490447998, + "learning_rate": 1.4526315789473685e-06, + "loss": 0.1761, + "step": 56840 + }, + { + "epoch": 85.49, + "grad_norm": 4.585028171539307, + "learning_rate": 1.4511278195488722e-06, + "loss": 0.1982, + "step": 56850 + }, + { + "epoch": 85.5, + "grad_norm": 5.568889617919922, + "learning_rate": 1.4496240601503762e-06, + "loss": 0.1516, + "step": 56860 + }, + { + "epoch": 85.52, + "grad_norm": 9.540451049804688, + "learning_rate": 1.44812030075188e-06, + "loss": 0.1922, + "step": 56870 + }, + { + "epoch": 85.53, + "grad_norm": 3.1969432830810547, + "learning_rate": 1.4466165413533836e-06, + "loss": 0.1595, + "step": 56880 + }, + { + "epoch": 85.55, + "grad_norm": 5.823395252227783, + "learning_rate": 1.4451127819548874e-06, + "loss": 0.2047, + "step": 56890 + }, + { + "epoch": 85.56, + "grad_norm": 5.089601039886475, + "learning_rate": 1.4436090225563911e-06, + "loss": 0.2095, + "step": 56900 + }, + { + "epoch": 85.58, + "grad_norm": 6.277038097381592, + "learning_rate": 1.442105263157895e-06, + "loss": 0.154, + "step": 56910 + }, + { + "epoch": 85.59, + "grad_norm": 4.296693801879883, + "learning_rate": 1.4406015037593988e-06, + "loss": 0.1405, + "step": 56920 + }, + { + "epoch": 85.61, + "grad_norm": 4.177289962768555, + "learning_rate": 1.4390977443609025e-06, + "loss": 0.1869, + "step": 56930 + }, + { + "epoch": 85.62, + "grad_norm": 5.647763729095459, + "learning_rate": 1.4375939849624062e-06, + "loss": 0.2275, + "step": 56940 + }, + { + "epoch": 85.64, + "grad_norm": 6.22650146484375, + "learning_rate": 1.4360902255639098e-06, + "loss": 0.1854, + "step": 56950 + }, + { + "epoch": 85.65, + "grad_norm": 1.4636141061782837, + "learning_rate": 1.4345864661654135e-06, + "loss": 0.1481, + "step": 56960 + }, + { + "epoch": 85.67, + "grad_norm": 5.929630279541016, + "learning_rate": 1.4330827067669172e-06, + "loss": 0.1671, + "step": 56970 + }, + { + "epoch": 85.68, + "grad_norm": 6.411832809448242, + "learning_rate": 1.4315789473684212e-06, + "loss": 0.1598, + "step": 56980 + }, + { + "epoch": 85.7, + "grad_norm": 4.0760602951049805, + "learning_rate": 1.430075187969925e-06, + "loss": 0.1493, + "step": 56990 + }, + { + "epoch": 85.71, + "grad_norm": 5.515470027923584, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.133, + "step": 57000 + }, + { + "epoch": 85.73, + "grad_norm": 4.49883508682251, + "learning_rate": 1.4270676691729324e-06, + "loss": 0.2057, + "step": 57010 + }, + { + "epoch": 85.74, + "grad_norm": 6.55890417098999, + "learning_rate": 1.425563909774436e-06, + "loss": 0.1498, + "step": 57020 + }, + { + "epoch": 85.76, + "grad_norm": 6.194241523742676, + "learning_rate": 1.4240601503759398e-06, + "loss": 0.1963, + "step": 57030 + }, + { + "epoch": 85.77, + "grad_norm": 5.173956394195557, + "learning_rate": 1.4225563909774438e-06, + "loss": 0.1443, + "step": 57040 + }, + { + "epoch": 85.79, + "grad_norm": 4.395569801330566, + "learning_rate": 1.4210526315789475e-06, + "loss": 0.1895, + "step": 57050 + }, + { + "epoch": 85.8, + "grad_norm": 6.59354305267334, + "learning_rate": 1.4195488721804512e-06, + "loss": 0.1671, + "step": 57060 + }, + { + "epoch": 85.82, + "grad_norm": 4.2721028327941895, + "learning_rate": 1.418045112781955e-06, + "loss": 0.1004, + "step": 57070 + }, + { + "epoch": 85.83, + "grad_norm": 6.9995436668396, + "learning_rate": 1.4165413533834587e-06, + "loss": 0.1782, + "step": 57080 + }, + { + "epoch": 85.85, + "grad_norm": 4.45127010345459, + "learning_rate": 1.4150375939849626e-06, + "loss": 0.2011, + "step": 57090 + }, + { + "epoch": 85.86, + "grad_norm": 7.801600456237793, + "learning_rate": 1.4135338345864664e-06, + "loss": 0.1099, + "step": 57100 + }, + { + "epoch": 85.88, + "grad_norm": 5.012579917907715, + "learning_rate": 1.41203007518797e-06, + "loss": 0.1656, + "step": 57110 + }, + { + "epoch": 85.89, + "grad_norm": 6.178292751312256, + "learning_rate": 1.4105263157894738e-06, + "loss": 0.1934, + "step": 57120 + }, + { + "epoch": 85.91, + "grad_norm": 3.175626039505005, + "learning_rate": 1.4090225563909776e-06, + "loss": 0.1558, + "step": 57130 + }, + { + "epoch": 85.92, + "grad_norm": 7.903815746307373, + "learning_rate": 1.4075187969924815e-06, + "loss": 0.1989, + "step": 57140 + }, + { + "epoch": 85.94, + "grad_norm": 5.2723774909973145, + "learning_rate": 1.4060150375939852e-06, + "loss": 0.1617, + "step": 57150 + }, + { + "epoch": 85.95, + "grad_norm": 7.170572280883789, + "learning_rate": 1.404511278195489e-06, + "loss": 0.1336, + "step": 57160 + }, + { + "epoch": 85.97, + "grad_norm": 8.153802871704102, + "learning_rate": 1.4030075187969927e-06, + "loss": 0.1318, + "step": 57170 + }, + { + "epoch": 85.98, + "grad_norm": 1.0930061340332031, + "learning_rate": 1.4015037593984964e-06, + "loss": 0.1619, + "step": 57180 + }, + { + "epoch": 86.0, + "grad_norm": 1.082709789276123, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.199, + "step": 57190 + }, + { + "epoch": 86.0, + "eval_accuracy": 0.9303, + "eval_loss": 0.33443817496299744, + "eval_runtime": 84.4787, + "eval_samples_per_second": 118.373, + "eval_steps_per_second": 0.473, + "step": 57190 + }, + { + "epoch": 86.02, + "grad_norm": 4.792495250701904, + "learning_rate": 1.3984962406015037e-06, + "loss": 0.1635, + "step": 57200 + }, + { + "epoch": 86.03, + "grad_norm": 6.919180870056152, + "learning_rate": 1.3969924812030076e-06, + "loss": 0.1994, + "step": 57210 + }, + { + "epoch": 86.05, + "grad_norm": 3.3340089321136475, + "learning_rate": 1.3954887218045113e-06, + "loss": 0.1505, + "step": 57220 + }, + { + "epoch": 86.06, + "grad_norm": 4.591991424560547, + "learning_rate": 1.393984962406015e-06, + "loss": 0.1473, + "step": 57230 + }, + { + "epoch": 86.08, + "grad_norm": 5.2993550300598145, + "learning_rate": 1.3924812030075188e-06, + "loss": 0.1736, + "step": 57240 + }, + { + "epoch": 86.09, + "grad_norm": 3.2715325355529785, + "learning_rate": 1.3909774436090225e-06, + "loss": 0.1788, + "step": 57250 + }, + { + "epoch": 86.11, + "grad_norm": 5.598659515380859, + "learning_rate": 1.3894736842105263e-06, + "loss": 0.1715, + "step": 57260 + }, + { + "epoch": 86.12, + "grad_norm": 5.6875, + "learning_rate": 1.3879699248120302e-06, + "loss": 0.1439, + "step": 57270 + }, + { + "epoch": 86.14, + "grad_norm": 3.4048891067504883, + "learning_rate": 1.386466165413534e-06, + "loss": 0.1514, + "step": 57280 + }, + { + "epoch": 86.15, + "grad_norm": 6.038202285766602, + "learning_rate": 1.3849624060150377e-06, + "loss": 0.1718, + "step": 57290 + }, + { + "epoch": 86.17, + "grad_norm": 4.095704078674316, + "learning_rate": 1.3834586466165414e-06, + "loss": 0.159, + "step": 57300 + }, + { + "epoch": 86.18, + "grad_norm": 6.841707706451416, + "learning_rate": 1.3819548872180451e-06, + "loss": 0.1779, + "step": 57310 + }, + { + "epoch": 86.2, + "grad_norm": 2.8215293884277344, + "learning_rate": 1.380451127819549e-06, + "loss": 0.1868, + "step": 57320 + }, + { + "epoch": 86.21, + "grad_norm": 6.776797294616699, + "learning_rate": 1.3789473684210528e-06, + "loss": 0.183, + "step": 57330 + }, + { + "epoch": 86.23, + "grad_norm": 9.345000267028809, + "learning_rate": 1.3774436090225565e-06, + "loss": 0.1724, + "step": 57340 + }, + { + "epoch": 86.24, + "grad_norm": 5.53676700592041, + "learning_rate": 1.3759398496240603e-06, + "loss": 0.186, + "step": 57350 + }, + { + "epoch": 86.26, + "grad_norm": 6.88644552230835, + "learning_rate": 1.374436090225564e-06, + "loss": 0.1801, + "step": 57360 + }, + { + "epoch": 86.27, + "grad_norm": 10.067435264587402, + "learning_rate": 1.372932330827068e-06, + "loss": 0.1977, + "step": 57370 + }, + { + "epoch": 86.29, + "grad_norm": 3.2209675312042236, + "learning_rate": 1.3714285714285717e-06, + "loss": 0.1715, + "step": 57380 + }, + { + "epoch": 86.3, + "grad_norm": 5.24599027633667, + "learning_rate": 1.3699248120300754e-06, + "loss": 0.1733, + "step": 57390 + }, + { + "epoch": 86.32, + "grad_norm": 4.074387550354004, + "learning_rate": 1.3684210526315791e-06, + "loss": 0.1588, + "step": 57400 + }, + { + "epoch": 86.33, + "grad_norm": 3.2896995544433594, + "learning_rate": 1.3669172932330829e-06, + "loss": 0.1384, + "step": 57410 + }, + { + "epoch": 86.35, + "grad_norm": 6.851515293121338, + "learning_rate": 1.3654135338345866e-06, + "loss": 0.2287, + "step": 57420 + }, + { + "epoch": 86.36, + "grad_norm": 3.76686954498291, + "learning_rate": 1.3639097744360905e-06, + "loss": 0.1623, + "step": 57430 + }, + { + "epoch": 86.38, + "grad_norm": 7.297066688537598, + "learning_rate": 1.3624060150375943e-06, + "loss": 0.1177, + "step": 57440 + }, + { + "epoch": 86.39, + "grad_norm": 4.633073329925537, + "learning_rate": 1.3609022556390978e-06, + "loss": 0.1634, + "step": 57450 + }, + { + "epoch": 86.41, + "grad_norm": 5.141488552093506, + "learning_rate": 1.3593984962406015e-06, + "loss": 0.2001, + "step": 57460 + }, + { + "epoch": 86.42, + "grad_norm": 4.2999467849731445, + "learning_rate": 1.3578947368421052e-06, + "loss": 0.1434, + "step": 57470 + }, + { + "epoch": 86.44, + "grad_norm": 1.721150517463684, + "learning_rate": 1.356390977443609e-06, + "loss": 0.2166, + "step": 57480 + }, + { + "epoch": 86.45, + "grad_norm": 6.083622455596924, + "learning_rate": 1.3548872180451127e-06, + "loss": 0.1603, + "step": 57490 + }, + { + "epoch": 86.47, + "grad_norm": 8.409628868103027, + "learning_rate": 1.3533834586466167e-06, + "loss": 0.1633, + "step": 57500 + }, + { + "epoch": 86.48, + "grad_norm": 5.203959941864014, + "learning_rate": 1.3518796992481204e-06, + "loss": 0.191, + "step": 57510 + }, + { + "epoch": 86.5, + "grad_norm": 3.071166515350342, + "learning_rate": 1.3503759398496241e-06, + "loss": 0.1879, + "step": 57520 + }, + { + "epoch": 86.51, + "grad_norm": 3.929776191711426, + "learning_rate": 1.3488721804511278e-06, + "loss": 0.1791, + "step": 57530 + }, + { + "epoch": 86.53, + "grad_norm": 4.9922709465026855, + "learning_rate": 1.3473684210526316e-06, + "loss": 0.1235, + "step": 57540 + }, + { + "epoch": 86.54, + "grad_norm": 3.8462400436401367, + "learning_rate": 1.3458646616541355e-06, + "loss": 0.1654, + "step": 57550 + }, + { + "epoch": 86.56, + "grad_norm": 3.1140220165252686, + "learning_rate": 1.3443609022556392e-06, + "loss": 0.2052, + "step": 57560 + }, + { + "epoch": 86.57, + "grad_norm": 8.724045753479004, + "learning_rate": 1.342857142857143e-06, + "loss": 0.1703, + "step": 57570 + }, + { + "epoch": 86.59, + "grad_norm": 4.744037628173828, + "learning_rate": 1.3413533834586467e-06, + "loss": 0.2456, + "step": 57580 + }, + { + "epoch": 86.6, + "grad_norm": 2.585832118988037, + "learning_rate": 1.3398496240601504e-06, + "loss": 0.1464, + "step": 57590 + }, + { + "epoch": 86.62, + "grad_norm": 5.518332004547119, + "learning_rate": 1.3383458646616544e-06, + "loss": 0.1899, + "step": 57600 + }, + { + "epoch": 86.63, + "grad_norm": 12.576923370361328, + "learning_rate": 1.3368421052631581e-06, + "loss": 0.1964, + "step": 57610 + }, + { + "epoch": 86.65, + "grad_norm": 2.010282516479492, + "learning_rate": 1.3353383458646618e-06, + "loss": 0.1554, + "step": 57620 + }, + { + "epoch": 86.66, + "grad_norm": 2.587585687637329, + "learning_rate": 1.3338345864661656e-06, + "loss": 0.1413, + "step": 57630 + }, + { + "epoch": 86.68, + "grad_norm": 4.673181533813477, + "learning_rate": 1.3323308270676693e-06, + "loss": 0.209, + "step": 57640 + }, + { + "epoch": 86.69, + "grad_norm": 3.9027647972106934, + "learning_rate": 1.330827067669173e-06, + "loss": 0.176, + "step": 57650 + }, + { + "epoch": 86.71, + "grad_norm": 7.965885639190674, + "learning_rate": 1.329323308270677e-06, + "loss": 0.1014, + "step": 57660 + }, + { + "epoch": 86.72, + "grad_norm": 4.3658881187438965, + "learning_rate": 1.3278195488721807e-06, + "loss": 0.1876, + "step": 57670 + }, + { + "epoch": 86.74, + "grad_norm": 8.340523719787598, + "learning_rate": 1.3263157894736844e-06, + "loss": 0.1785, + "step": 57680 + }, + { + "epoch": 86.75, + "grad_norm": 4.654315948486328, + "learning_rate": 1.3248120300751882e-06, + "loss": 0.1522, + "step": 57690 + }, + { + "epoch": 86.77, + "grad_norm": 7.557748317718506, + "learning_rate": 1.3233082706766917e-06, + "loss": 0.2559, + "step": 57700 + }, + { + "epoch": 86.78, + "grad_norm": 5.0106329917907715, + "learning_rate": 1.3218045112781954e-06, + "loss": 0.1944, + "step": 57710 + }, + { + "epoch": 86.8, + "grad_norm": 5.76795768737793, + "learning_rate": 1.3203007518796992e-06, + "loss": 0.1632, + "step": 57720 + }, + { + "epoch": 86.81, + "grad_norm": 4.331972599029541, + "learning_rate": 1.318796992481203e-06, + "loss": 0.1461, + "step": 57730 + }, + { + "epoch": 86.83, + "grad_norm": 5.625306606292725, + "learning_rate": 1.3172932330827068e-06, + "loss": 0.0988, + "step": 57740 + }, + { + "epoch": 86.84, + "grad_norm": 3.771822452545166, + "learning_rate": 1.3157894736842106e-06, + "loss": 0.139, + "step": 57750 + }, + { + "epoch": 86.86, + "grad_norm": 2.988506555557251, + "learning_rate": 1.3142857142857143e-06, + "loss": 0.1482, + "step": 57760 + }, + { + "epoch": 86.87, + "grad_norm": 6.377354145050049, + "learning_rate": 1.312781954887218e-06, + "loss": 0.1966, + "step": 57770 + }, + { + "epoch": 86.89, + "grad_norm": 26.265287399291992, + "learning_rate": 1.311278195488722e-06, + "loss": 0.2471, + "step": 57780 + }, + { + "epoch": 86.9, + "grad_norm": 7.3494062423706055, + "learning_rate": 1.3097744360902257e-06, + "loss": 0.1744, + "step": 57790 + }, + { + "epoch": 86.92, + "grad_norm": 6.284552574157715, + "learning_rate": 1.3082706766917294e-06, + "loss": 0.1936, + "step": 57800 + }, + { + "epoch": 86.93, + "grad_norm": 2.87434720993042, + "learning_rate": 1.3067669172932332e-06, + "loss": 0.1445, + "step": 57810 + }, + { + "epoch": 86.95, + "grad_norm": 3.9225971698760986, + "learning_rate": 1.3052631578947369e-06, + "loss": 0.1282, + "step": 57820 + }, + { + "epoch": 86.96, + "grad_norm": 6.310343265533447, + "learning_rate": 1.3037593984962408e-06, + "loss": 0.1504, + "step": 57830 + }, + { + "epoch": 86.98, + "grad_norm": 2.322134494781494, + "learning_rate": 1.3022556390977446e-06, + "loss": 0.1774, + "step": 57840 + }, + { + "epoch": 86.99, + "grad_norm": 6.937019348144531, + "learning_rate": 1.3007518796992483e-06, + "loss": 0.2013, + "step": 57850 + }, + { + "epoch": 87.0, + "eval_accuracy": 0.9294, + "eval_loss": 0.33600765466690063, + "eval_runtime": 84.8466, + "eval_samples_per_second": 117.86, + "eval_steps_per_second": 0.471, + "step": 57855 + }, + { + "epoch": 87.01, + "grad_norm": 6.285330772399902, + "learning_rate": 1.299248120300752e-06, + "loss": 0.1726, + "step": 57860 + }, + { + "epoch": 87.02, + "grad_norm": 5.90061616897583, + "learning_rate": 1.2977443609022557e-06, + "loss": 0.1748, + "step": 57870 + }, + { + "epoch": 87.04, + "grad_norm": 4.586902141571045, + "learning_rate": 1.2962406015037595e-06, + "loss": 0.1611, + "step": 57880 + }, + { + "epoch": 87.05, + "grad_norm": 6.66023063659668, + "learning_rate": 1.2947368421052634e-06, + "loss": 0.1986, + "step": 57890 + }, + { + "epoch": 87.07, + "grad_norm": 5.385473728179932, + "learning_rate": 1.2932330827067672e-06, + "loss": 0.1067, + "step": 57900 + }, + { + "epoch": 87.08, + "grad_norm": 6.819510459899902, + "learning_rate": 1.2917293233082709e-06, + "loss": 0.2073, + "step": 57910 + }, + { + "epoch": 87.1, + "grad_norm": 6.095653533935547, + "learning_rate": 1.2902255639097746e-06, + "loss": 0.182, + "step": 57920 + }, + { + "epoch": 87.11, + "grad_norm": 5.213226318359375, + "learning_rate": 1.2887218045112783e-06, + "loss": 0.1669, + "step": 57930 + }, + { + "epoch": 87.13, + "grad_norm": 6.771183013916016, + "learning_rate": 1.2872180451127819e-06, + "loss": 0.1516, + "step": 57940 + }, + { + "epoch": 87.14, + "grad_norm": 4.856118202209473, + "learning_rate": 1.2857142857142856e-06, + "loss": 0.1819, + "step": 57950 + }, + { + "epoch": 87.16, + "grad_norm": 4.535681247711182, + "learning_rate": 1.2842105263157895e-06, + "loss": 0.14, + "step": 57960 + }, + { + "epoch": 87.17, + "grad_norm": 4.036566734313965, + "learning_rate": 1.2827067669172933e-06, + "loss": 0.136, + "step": 57970 + }, + { + "epoch": 87.19, + "grad_norm": 4.947079181671143, + "learning_rate": 1.281203007518797e-06, + "loss": 0.1617, + "step": 57980 + }, + { + "epoch": 87.2, + "grad_norm": 2.912419557571411, + "learning_rate": 1.2796992481203007e-06, + "loss": 0.1863, + "step": 57990 + }, + { + "epoch": 87.22, + "grad_norm": 5.682669639587402, + "learning_rate": 1.2781954887218045e-06, + "loss": 0.1317, + "step": 58000 + }, + { + "epoch": 87.23, + "grad_norm": 5.527998447418213, + "learning_rate": 1.2766917293233084e-06, + "loss": 0.1618, + "step": 58010 + }, + { + "epoch": 87.25, + "grad_norm": 6.747208595275879, + "learning_rate": 1.2751879699248121e-06, + "loss": 0.1686, + "step": 58020 + }, + { + "epoch": 87.26, + "grad_norm": 5.503549098968506, + "learning_rate": 1.2736842105263159e-06, + "loss": 0.1881, + "step": 58030 + }, + { + "epoch": 87.28, + "grad_norm": 5.4713029861450195, + "learning_rate": 1.2721804511278196e-06, + "loss": 0.184, + "step": 58040 + }, + { + "epoch": 87.29, + "grad_norm": 2.895097494125366, + "learning_rate": 1.2706766917293233e-06, + "loss": 0.1819, + "step": 58050 + }, + { + "epoch": 87.31, + "grad_norm": 6.448250770568848, + "learning_rate": 1.2691729323308273e-06, + "loss": 0.1763, + "step": 58060 + }, + { + "epoch": 87.32, + "grad_norm": 3.6069977283477783, + "learning_rate": 1.267669172932331e-06, + "loss": 0.1478, + "step": 58070 + }, + { + "epoch": 87.34, + "grad_norm": 13.077876091003418, + "learning_rate": 1.2661654135338347e-06, + "loss": 0.1803, + "step": 58080 + }, + { + "epoch": 87.35, + "grad_norm": 2.1468687057495117, + "learning_rate": 1.2646616541353385e-06, + "loss": 0.1337, + "step": 58090 + }, + { + "epoch": 87.37, + "grad_norm": 4.480571269989014, + "learning_rate": 1.2631578947368422e-06, + "loss": 0.1444, + "step": 58100 + }, + { + "epoch": 87.38, + "grad_norm": 8.891602516174316, + "learning_rate": 1.261654135338346e-06, + "loss": 0.1664, + "step": 58110 + }, + { + "epoch": 87.4, + "grad_norm": 8.419784545898438, + "learning_rate": 1.2601503759398499e-06, + "loss": 0.1594, + "step": 58120 + }, + { + "epoch": 87.41, + "grad_norm": 3.2222964763641357, + "learning_rate": 1.2586466165413536e-06, + "loss": 0.1197, + "step": 58130 + }, + { + "epoch": 87.43, + "grad_norm": 6.81358528137207, + "learning_rate": 1.2571428571428573e-06, + "loss": 0.1934, + "step": 58140 + }, + { + "epoch": 87.44, + "grad_norm": 8.876537322998047, + "learning_rate": 1.255639097744361e-06, + "loss": 0.1814, + "step": 58150 + }, + { + "epoch": 87.46, + "grad_norm": 7.15813684463501, + "learning_rate": 1.2541353383458648e-06, + "loss": 0.1791, + "step": 58160 + }, + { + "epoch": 87.47, + "grad_norm": 7.933776378631592, + "learning_rate": 1.2526315789473687e-06, + "loss": 0.1833, + "step": 58170 + }, + { + "epoch": 87.49, + "grad_norm": 4.0624284744262695, + "learning_rate": 1.2511278195488725e-06, + "loss": 0.1686, + "step": 58180 + }, + { + "epoch": 87.5, + "grad_norm": 6.974576950073242, + "learning_rate": 1.249624060150376e-06, + "loss": 0.151, + "step": 58190 + }, + { + "epoch": 87.52, + "grad_norm": 5.813340663909912, + "learning_rate": 1.2481203007518797e-06, + "loss": 0.158, + "step": 58200 + }, + { + "epoch": 87.53, + "grad_norm": 4.3520073890686035, + "learning_rate": 1.2466165413533837e-06, + "loss": 0.1725, + "step": 58210 + }, + { + "epoch": 87.55, + "grad_norm": 5.577434539794922, + "learning_rate": 1.2451127819548874e-06, + "loss": 0.178, + "step": 58220 + }, + { + "epoch": 87.56, + "grad_norm": 1.6522976160049438, + "learning_rate": 1.2436090225563911e-06, + "loss": 0.17, + "step": 58230 + }, + { + "epoch": 87.58, + "grad_norm": 8.239200592041016, + "learning_rate": 1.2421052631578948e-06, + "loss": 0.1651, + "step": 58240 + }, + { + "epoch": 87.59, + "grad_norm": 3.1411798000335693, + "learning_rate": 1.2406015037593986e-06, + "loss": 0.1839, + "step": 58250 + }, + { + "epoch": 87.61, + "grad_norm": 8.265312194824219, + "learning_rate": 1.2390977443609023e-06, + "loss": 0.1692, + "step": 58260 + }, + { + "epoch": 87.62, + "grad_norm": 7.520175457000732, + "learning_rate": 1.237593984962406e-06, + "loss": 0.1909, + "step": 58270 + }, + { + "epoch": 87.64, + "grad_norm": 9.53011703491211, + "learning_rate": 1.2360902255639098e-06, + "loss": 0.2348, + "step": 58280 + }, + { + "epoch": 87.65, + "grad_norm": 5.072795391082764, + "learning_rate": 1.2345864661654137e-06, + "loss": 0.1431, + "step": 58290 + }, + { + "epoch": 87.67, + "grad_norm": 4.908288955688477, + "learning_rate": 1.2330827067669174e-06, + "loss": 0.2498, + "step": 58300 + }, + { + "epoch": 87.68, + "grad_norm": 5.3725762367248535, + "learning_rate": 1.2315789473684212e-06, + "loss": 0.1617, + "step": 58310 + }, + { + "epoch": 87.7, + "grad_norm": 5.9502034187316895, + "learning_rate": 1.230075187969925e-06, + "loss": 0.0983, + "step": 58320 + }, + { + "epoch": 87.71, + "grad_norm": 7.810244083404541, + "learning_rate": 1.2285714285714286e-06, + "loss": 0.1523, + "step": 58330 + }, + { + "epoch": 87.73, + "grad_norm": 3.383789539337158, + "learning_rate": 1.2270676691729324e-06, + "loss": 0.1778, + "step": 58340 + }, + { + "epoch": 87.74, + "grad_norm": 3.804659366607666, + "learning_rate": 1.2255639097744363e-06, + "loss": 0.1638, + "step": 58350 + }, + { + "epoch": 87.76, + "grad_norm": 6.533144950866699, + "learning_rate": 1.22406015037594e-06, + "loss": 0.1497, + "step": 58360 + }, + { + "epoch": 87.77, + "grad_norm": 5.189228057861328, + "learning_rate": 1.2225563909774438e-06, + "loss": 0.15, + "step": 58370 + }, + { + "epoch": 87.79, + "grad_norm": 7.200681209564209, + "learning_rate": 1.2210526315789475e-06, + "loss": 0.151, + "step": 58380 + }, + { + "epoch": 87.8, + "grad_norm": 5.005568027496338, + "learning_rate": 1.2195488721804512e-06, + "loss": 0.1967, + "step": 58390 + }, + { + "epoch": 87.82, + "grad_norm": 7.443274974822998, + "learning_rate": 1.218045112781955e-06, + "loss": 0.2047, + "step": 58400 + }, + { + "epoch": 87.83, + "grad_norm": 8.02253532409668, + "learning_rate": 1.2165413533834587e-06, + "loss": 0.1725, + "step": 58410 + }, + { + "epoch": 87.85, + "grad_norm": 4.530572414398193, + "learning_rate": 1.2150375939849624e-06, + "loss": 0.2269, + "step": 58420 + }, + { + "epoch": 87.86, + "grad_norm": 6.738208293914795, + "learning_rate": 1.2135338345864662e-06, + "loss": 0.1799, + "step": 58430 + }, + { + "epoch": 87.88, + "grad_norm": 4.552396297454834, + "learning_rate": 1.21203007518797e-06, + "loss": 0.1505, + "step": 58440 + }, + { + "epoch": 87.89, + "grad_norm": 3.540454387664795, + "learning_rate": 1.2105263157894738e-06, + "loss": 0.1816, + "step": 58450 + }, + { + "epoch": 87.91, + "grad_norm": 4.805243015289307, + "learning_rate": 1.2090225563909776e-06, + "loss": 0.1457, + "step": 58460 + }, + { + "epoch": 87.92, + "grad_norm": 6.110500812530518, + "learning_rate": 1.2075187969924813e-06, + "loss": 0.1308, + "step": 58470 + }, + { + "epoch": 87.94, + "grad_norm": 2.2697036266326904, + "learning_rate": 1.206015037593985e-06, + "loss": 0.1508, + "step": 58480 + }, + { + "epoch": 87.95, + "grad_norm": 1.7307939529418945, + "learning_rate": 1.204511278195489e-06, + "loss": 0.1436, + "step": 58490 + }, + { + "epoch": 87.97, + "grad_norm": 4.120843410491943, + "learning_rate": 1.2030075187969925e-06, + "loss": 0.1766, + "step": 58500 + }, + { + "epoch": 87.98, + "grad_norm": 5.4482927322387695, + "learning_rate": 1.2015037593984962e-06, + "loss": 0.2227, + "step": 58510 + }, + { + "epoch": 88.0, + "grad_norm": 0.018793661147356033, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.1495, + "step": 58520 + }, + { + "epoch": 88.0, + "eval_accuracy": 0.9307, + "eval_loss": 0.33708590269088745, + "eval_runtime": 84.8059, + "eval_samples_per_second": 117.916, + "eval_steps_per_second": 0.472, + "step": 58520 + }, + { + "epoch": 88.02, + "grad_norm": 4.496594429016113, + "learning_rate": 1.1984962406015039e-06, + "loss": 0.2237, + "step": 58530 + }, + { + "epoch": 88.03, + "grad_norm": 4.44071102142334, + "learning_rate": 1.1969924812030076e-06, + "loss": 0.2014, + "step": 58540 + }, + { + "epoch": 88.05, + "grad_norm": 4.390169620513916, + "learning_rate": 1.1954887218045113e-06, + "loss": 0.1108, + "step": 58550 + }, + { + "epoch": 88.06, + "grad_norm": 5.913539409637451, + "learning_rate": 1.193984962406015e-06, + "loss": 0.1861, + "step": 58560 + }, + { + "epoch": 88.08, + "grad_norm": 4.491770267486572, + "learning_rate": 1.1924812030075188e-06, + "loss": 0.1935, + "step": 58570 + }, + { + "epoch": 88.09, + "grad_norm": 5.476345539093018, + "learning_rate": 1.1909774436090228e-06, + "loss": 0.1692, + "step": 58580 + }, + { + "epoch": 88.11, + "grad_norm": 5.3462347984313965, + "learning_rate": 1.1894736842105265e-06, + "loss": 0.1797, + "step": 58590 + }, + { + "epoch": 88.12, + "grad_norm": 2.1195619106292725, + "learning_rate": 1.1879699248120302e-06, + "loss": 0.1823, + "step": 58600 + }, + { + "epoch": 88.14, + "grad_norm": 8.027178764343262, + "learning_rate": 1.186466165413534e-06, + "loss": 0.2147, + "step": 58610 + }, + { + "epoch": 88.15, + "grad_norm": 6.265287399291992, + "learning_rate": 1.1849624060150377e-06, + "loss": 0.1477, + "step": 58620 + }, + { + "epoch": 88.17, + "grad_norm": 4.6563191413879395, + "learning_rate": 1.1834586466165414e-06, + "loss": 0.1371, + "step": 58630 + }, + { + "epoch": 88.18, + "grad_norm": 3.4797229766845703, + "learning_rate": 1.1819548872180451e-06, + "loss": 0.1688, + "step": 58640 + }, + { + "epoch": 88.2, + "grad_norm": 12.818514823913574, + "learning_rate": 1.1804511278195489e-06, + "loss": 0.1646, + "step": 58650 + }, + { + "epoch": 88.21, + "grad_norm": 6.959251880645752, + "learning_rate": 1.1789473684210526e-06, + "loss": 0.1892, + "step": 58660 + }, + { + "epoch": 88.23, + "grad_norm": 5.956885814666748, + "learning_rate": 1.1774436090225565e-06, + "loss": 0.191, + "step": 58670 + }, + { + "epoch": 88.24, + "grad_norm": 3.1932976245880127, + "learning_rate": 1.1759398496240603e-06, + "loss": 0.187, + "step": 58680 + }, + { + "epoch": 88.26, + "grad_norm": 2.2420973777770996, + "learning_rate": 1.174436090225564e-06, + "loss": 0.1762, + "step": 58690 + }, + { + "epoch": 88.27, + "grad_norm": 3.847198486328125, + "learning_rate": 1.1729323308270677e-06, + "loss": 0.1542, + "step": 58700 + }, + { + "epoch": 88.29, + "grad_norm": 7.925451755523682, + "learning_rate": 1.1714285714285715e-06, + "loss": 0.1671, + "step": 58710 + }, + { + "epoch": 88.3, + "grad_norm": 4.539743423461914, + "learning_rate": 1.1699248120300754e-06, + "loss": 0.1702, + "step": 58720 + }, + { + "epoch": 88.32, + "grad_norm": 0.4386990964412689, + "learning_rate": 1.1684210526315791e-06, + "loss": 0.1528, + "step": 58730 + }, + { + "epoch": 88.33, + "grad_norm": 0.1927802413702011, + "learning_rate": 1.1669172932330829e-06, + "loss": 0.1165, + "step": 58740 + }, + { + "epoch": 88.35, + "grad_norm": 4.060121059417725, + "learning_rate": 1.1654135338345866e-06, + "loss": 0.1751, + "step": 58750 + }, + { + "epoch": 88.36, + "grad_norm": 6.014774799346924, + "learning_rate": 1.1639097744360903e-06, + "loss": 0.1487, + "step": 58760 + }, + { + "epoch": 88.38, + "grad_norm": 2.9679887294769287, + "learning_rate": 1.162406015037594e-06, + "loss": 0.1832, + "step": 58770 + }, + { + "epoch": 88.39, + "grad_norm": 3.4474730491638184, + "learning_rate": 1.1609022556390978e-06, + "loss": 0.181, + "step": 58780 + }, + { + "epoch": 88.41, + "grad_norm": 5.39216947555542, + "learning_rate": 1.1593984962406015e-06, + "loss": 0.1964, + "step": 58790 + }, + { + "epoch": 88.42, + "grad_norm": 3.7178587913513184, + "learning_rate": 1.1578947368421053e-06, + "loss": 0.1406, + "step": 58800 + }, + { + "epoch": 88.44, + "grad_norm": 2.4193215370178223, + "learning_rate": 1.1563909774436092e-06, + "loss": 0.1928, + "step": 58810 + }, + { + "epoch": 88.45, + "grad_norm": 5.395383834838867, + "learning_rate": 1.154887218045113e-06, + "loss": 0.2289, + "step": 58820 + }, + { + "epoch": 88.47, + "grad_norm": 3.297912120819092, + "learning_rate": 1.1533834586466167e-06, + "loss": 0.1687, + "step": 58830 + }, + { + "epoch": 88.48, + "grad_norm": 6.842408180236816, + "learning_rate": 1.1518796992481204e-06, + "loss": 0.1959, + "step": 58840 + }, + { + "epoch": 88.5, + "grad_norm": 2.9778201580047607, + "learning_rate": 1.1503759398496241e-06, + "loss": 0.1847, + "step": 58850 + }, + { + "epoch": 88.51, + "grad_norm": 4.785979270935059, + "learning_rate": 1.148872180451128e-06, + "loss": 0.1789, + "step": 58860 + }, + { + "epoch": 88.53, + "grad_norm": 2.7606077194213867, + "learning_rate": 1.1473684210526316e-06, + "loss": 0.1995, + "step": 58870 + }, + { + "epoch": 88.54, + "grad_norm": 6.0019097328186035, + "learning_rate": 1.1458646616541353e-06, + "loss": 0.1979, + "step": 58880 + }, + { + "epoch": 88.56, + "grad_norm": 3.31396746635437, + "learning_rate": 1.144360902255639e-06, + "loss": 0.1191, + "step": 58890 + }, + { + "epoch": 88.57, + "grad_norm": 5.867440223693848, + "learning_rate": 1.142857142857143e-06, + "loss": 0.1948, + "step": 58900 + }, + { + "epoch": 88.59, + "grad_norm": 6.043757438659668, + "learning_rate": 1.1413533834586467e-06, + "loss": 0.145, + "step": 58910 + }, + { + "epoch": 88.6, + "grad_norm": 5.402288913726807, + "learning_rate": 1.1398496240601504e-06, + "loss": 0.166, + "step": 58920 + }, + { + "epoch": 88.62, + "grad_norm": 6.221014976501465, + "learning_rate": 1.1383458646616542e-06, + "loss": 0.2292, + "step": 58930 + }, + { + "epoch": 88.63, + "grad_norm": 6.4407572746276855, + "learning_rate": 1.136842105263158e-06, + "loss": 0.1624, + "step": 58940 + }, + { + "epoch": 88.65, + "grad_norm": 2.440732955932617, + "learning_rate": 1.1353383458646619e-06, + "loss": 0.1674, + "step": 58950 + }, + { + "epoch": 88.66, + "grad_norm": 4.425859451293945, + "learning_rate": 1.1338345864661656e-06, + "loss": 0.212, + "step": 58960 + }, + { + "epoch": 88.68, + "grad_norm": 5.0957818031311035, + "learning_rate": 1.1323308270676693e-06, + "loss": 0.2014, + "step": 58970 + }, + { + "epoch": 88.69, + "grad_norm": 3.4791855812072754, + "learning_rate": 1.130827067669173e-06, + "loss": 0.2379, + "step": 58980 + }, + { + "epoch": 88.71, + "grad_norm": 8.51900577545166, + "learning_rate": 1.1293233082706768e-06, + "loss": 0.1812, + "step": 58990 + }, + { + "epoch": 88.72, + "grad_norm": 3.879889965057373, + "learning_rate": 1.1278195488721805e-06, + "loss": 0.1396, + "step": 59000 + }, + { + "epoch": 88.74, + "grad_norm": 4.356947422027588, + "learning_rate": 1.1263157894736842e-06, + "loss": 0.1634, + "step": 59010 + }, + { + "epoch": 88.75, + "grad_norm": 3.2641735076904297, + "learning_rate": 1.124812030075188e-06, + "loss": 0.1805, + "step": 59020 + }, + { + "epoch": 88.77, + "grad_norm": 7.944269180297852, + "learning_rate": 1.1233082706766917e-06, + "loss": 0.1875, + "step": 59030 + }, + { + "epoch": 88.78, + "grad_norm": 3.0705432891845703, + "learning_rate": 1.1218045112781956e-06, + "loss": 0.1884, + "step": 59040 + }, + { + "epoch": 88.8, + "grad_norm": 5.9474196434021, + "learning_rate": 1.1203007518796994e-06, + "loss": 0.1614, + "step": 59050 + }, + { + "epoch": 88.81, + "grad_norm": 8.783775329589844, + "learning_rate": 1.118796992481203e-06, + "loss": 0.2516, + "step": 59060 + }, + { + "epoch": 88.83, + "grad_norm": 4.90156888961792, + "learning_rate": 1.1172932330827068e-06, + "loss": 0.1655, + "step": 59070 + }, + { + "epoch": 88.84, + "grad_norm": 4.247372627258301, + "learning_rate": 1.1157894736842106e-06, + "loss": 0.2167, + "step": 59080 + }, + { + "epoch": 88.86, + "grad_norm": 4.955306053161621, + "learning_rate": 1.1142857142857145e-06, + "loss": 0.159, + "step": 59090 + }, + { + "epoch": 88.87, + "grad_norm": 4.56100606918335, + "learning_rate": 1.1127819548872182e-06, + "loss": 0.1346, + "step": 59100 + }, + { + "epoch": 88.89, + "grad_norm": 4.857382297515869, + "learning_rate": 1.111278195488722e-06, + "loss": 0.1887, + "step": 59110 + }, + { + "epoch": 88.9, + "grad_norm": 4.279732704162598, + "learning_rate": 1.1097744360902255e-06, + "loss": 0.1426, + "step": 59120 + }, + { + "epoch": 88.92, + "grad_norm": 4.930658340454102, + "learning_rate": 1.1082706766917294e-06, + "loss": 0.2294, + "step": 59130 + }, + { + "epoch": 88.93, + "grad_norm": 5.308464050292969, + "learning_rate": 1.1067669172932332e-06, + "loss": 0.1437, + "step": 59140 + }, + { + "epoch": 88.95, + "grad_norm": 5.982918739318848, + "learning_rate": 1.1052631578947369e-06, + "loss": 0.2073, + "step": 59150 + }, + { + "epoch": 88.96, + "grad_norm": 6.234976291656494, + "learning_rate": 1.1037593984962406e-06, + "loss": 0.1346, + "step": 59160 + }, + { + "epoch": 88.98, + "grad_norm": 12.91275691986084, + "learning_rate": 1.1022556390977444e-06, + "loss": 0.2041, + "step": 59170 + }, + { + "epoch": 88.99, + "grad_norm": 2.407877206802368, + "learning_rate": 1.1007518796992483e-06, + "loss": 0.1042, + "step": 59180 + }, + { + "epoch": 89.0, + "eval_accuracy": 0.9316, + "eval_loss": 0.33024507761001587, + "eval_runtime": 84.4728, + "eval_samples_per_second": 118.381, + "eval_steps_per_second": 0.474, + "step": 59185 + }, + { + "epoch": 89.01, + "grad_norm": 9.109488487243652, + "learning_rate": 1.099248120300752e-06, + "loss": 0.187, + "step": 59190 + }, + { + "epoch": 89.02, + "grad_norm": 2.8222405910491943, + "learning_rate": 1.0977443609022558e-06, + "loss": 0.1887, + "step": 59200 + }, + { + "epoch": 89.04, + "grad_norm": 3.990464448928833, + "learning_rate": 1.0962406015037595e-06, + "loss": 0.1464, + "step": 59210 + }, + { + "epoch": 89.05, + "grad_norm": 3.481008529663086, + "learning_rate": 1.0947368421052632e-06, + "loss": 0.1941, + "step": 59220 + }, + { + "epoch": 89.07, + "grad_norm": 4.007563591003418, + "learning_rate": 1.0932330827067672e-06, + "loss": 0.2157, + "step": 59230 + }, + { + "epoch": 89.08, + "grad_norm": 4.087463855743408, + "learning_rate": 1.0917293233082709e-06, + "loss": 0.1788, + "step": 59240 + }, + { + "epoch": 89.1, + "grad_norm": 7.297598838806152, + "learning_rate": 1.0902255639097744e-06, + "loss": 0.1653, + "step": 59250 + }, + { + "epoch": 89.11, + "grad_norm": 4.800395488739014, + "learning_rate": 1.0887218045112781e-06, + "loss": 0.2075, + "step": 59260 + }, + { + "epoch": 89.13, + "grad_norm": 3.8315725326538086, + "learning_rate": 1.087218045112782e-06, + "loss": 0.1383, + "step": 59270 + }, + { + "epoch": 89.14, + "grad_norm": 6.9601898193359375, + "learning_rate": 1.0857142857142858e-06, + "loss": 0.1993, + "step": 59280 + }, + { + "epoch": 89.16, + "grad_norm": 6.42357063293457, + "learning_rate": 1.0842105263157895e-06, + "loss": 0.1716, + "step": 59290 + }, + { + "epoch": 89.17, + "grad_norm": 4.080452919006348, + "learning_rate": 1.0827067669172933e-06, + "loss": 0.198, + "step": 59300 + }, + { + "epoch": 89.19, + "grad_norm": 4.082363605499268, + "learning_rate": 1.081203007518797e-06, + "loss": 0.1456, + "step": 59310 + }, + { + "epoch": 89.2, + "grad_norm": 2.618720531463623, + "learning_rate": 1.079699248120301e-06, + "loss": 0.2309, + "step": 59320 + }, + { + "epoch": 89.22, + "grad_norm": 6.8665080070495605, + "learning_rate": 1.0781954887218047e-06, + "loss": 0.1487, + "step": 59330 + }, + { + "epoch": 89.23, + "grad_norm": 6.956995487213135, + "learning_rate": 1.0766917293233084e-06, + "loss": 0.2283, + "step": 59340 + }, + { + "epoch": 89.25, + "grad_norm": 4.275158405303955, + "learning_rate": 1.0751879699248121e-06, + "loss": 0.1732, + "step": 59350 + }, + { + "epoch": 89.26, + "grad_norm": 6.3027024269104, + "learning_rate": 1.0736842105263159e-06, + "loss": 0.1888, + "step": 59360 + }, + { + "epoch": 89.28, + "grad_norm": 6.413710594177246, + "learning_rate": 1.0721804511278196e-06, + "loss": 0.2427, + "step": 59370 + }, + { + "epoch": 89.29, + "grad_norm": 5.2090888023376465, + "learning_rate": 1.0706766917293233e-06, + "loss": 0.1906, + "step": 59380 + }, + { + "epoch": 89.31, + "grad_norm": 5.576053142547607, + "learning_rate": 1.069172932330827e-06, + "loss": 0.1398, + "step": 59390 + }, + { + "epoch": 89.32, + "grad_norm": 3.6044418811798096, + "learning_rate": 1.0676691729323308e-06, + "loss": 0.1232, + "step": 59400 + }, + { + "epoch": 89.34, + "grad_norm": 3.0662240982055664, + "learning_rate": 1.0661654135338347e-06, + "loss": 0.189, + "step": 59410 + }, + { + "epoch": 89.35, + "grad_norm": 3.142246961593628, + "learning_rate": 1.0646616541353385e-06, + "loss": 0.1447, + "step": 59420 + }, + { + "epoch": 89.37, + "grad_norm": 3.3175511360168457, + "learning_rate": 1.0631578947368422e-06, + "loss": 0.127, + "step": 59430 + }, + { + "epoch": 89.38, + "grad_norm": 4.238184928894043, + "learning_rate": 1.061654135338346e-06, + "loss": 0.1321, + "step": 59440 + }, + { + "epoch": 89.4, + "grad_norm": 7.08275842666626, + "learning_rate": 1.0601503759398497e-06, + "loss": 0.1795, + "step": 59450 + }, + { + "epoch": 89.41, + "grad_norm": 7.184491157531738, + "learning_rate": 1.0586466165413536e-06, + "loss": 0.2363, + "step": 59460 + }, + { + "epoch": 89.43, + "grad_norm": 4.729690074920654, + "learning_rate": 1.0571428571428573e-06, + "loss": 0.1752, + "step": 59470 + }, + { + "epoch": 89.44, + "grad_norm": 5.500245094299316, + "learning_rate": 1.055639097744361e-06, + "loss": 0.1708, + "step": 59480 + }, + { + "epoch": 89.46, + "grad_norm": 4.5759406089782715, + "learning_rate": 1.0541353383458648e-06, + "loss": 0.1884, + "step": 59490 + }, + { + "epoch": 89.47, + "grad_norm": 2.6944706439971924, + "learning_rate": 1.0526315789473685e-06, + "loss": 0.1609, + "step": 59500 + }, + { + "epoch": 89.49, + "grad_norm": 3.369946002960205, + "learning_rate": 1.0511278195488723e-06, + "loss": 0.1926, + "step": 59510 + }, + { + "epoch": 89.5, + "grad_norm": 4.653051376342773, + "learning_rate": 1.049624060150376e-06, + "loss": 0.1455, + "step": 59520 + }, + { + "epoch": 89.52, + "grad_norm": 0.8550413846969604, + "learning_rate": 1.0481203007518797e-06, + "loss": 0.1589, + "step": 59530 + }, + { + "epoch": 89.53, + "grad_norm": 4.918313026428223, + "learning_rate": 1.0466165413533835e-06, + "loss": 0.0954, + "step": 59540 + }, + { + "epoch": 89.55, + "grad_norm": 6.69185209274292, + "learning_rate": 1.0451127819548874e-06, + "loss": 0.1798, + "step": 59550 + }, + { + "epoch": 89.56, + "grad_norm": 6.381489276885986, + "learning_rate": 1.0436090225563911e-06, + "loss": 0.1493, + "step": 59560 + }, + { + "epoch": 89.58, + "grad_norm": 2.553143262863159, + "learning_rate": 1.0421052631578949e-06, + "loss": 0.1313, + "step": 59570 + }, + { + "epoch": 89.59, + "grad_norm": 2.599313974380493, + "learning_rate": 1.0406015037593986e-06, + "loss": 0.171, + "step": 59580 + }, + { + "epoch": 89.61, + "grad_norm": 6.928783893585205, + "learning_rate": 1.0390977443609023e-06, + "loss": 0.2035, + "step": 59590 + }, + { + "epoch": 89.62, + "grad_norm": 4.362387657165527, + "learning_rate": 1.037593984962406e-06, + "loss": 0.1783, + "step": 59600 + }, + { + "epoch": 89.64, + "grad_norm": 2.9104678630828857, + "learning_rate": 1.03609022556391e-06, + "loss": 0.1895, + "step": 59610 + }, + { + "epoch": 89.65, + "grad_norm": 6.155645370483398, + "learning_rate": 1.0345864661654135e-06, + "loss": 0.1856, + "step": 59620 + }, + { + "epoch": 89.67, + "grad_norm": 5.490917205810547, + "learning_rate": 1.0330827067669172e-06, + "loss": 0.151, + "step": 59630 + }, + { + "epoch": 89.68, + "grad_norm": 0.3577495515346527, + "learning_rate": 1.0315789473684212e-06, + "loss": 0.1647, + "step": 59640 + }, + { + "epoch": 89.7, + "grad_norm": 3.6894378662109375, + "learning_rate": 1.030075187969925e-06, + "loss": 0.1578, + "step": 59650 + }, + { + "epoch": 89.71, + "grad_norm": 7.005927562713623, + "learning_rate": 1.0285714285714286e-06, + "loss": 0.1644, + "step": 59660 + }, + { + "epoch": 89.73, + "grad_norm": 2.355557680130005, + "learning_rate": 1.0270676691729324e-06, + "loss": 0.1616, + "step": 59670 + }, + { + "epoch": 89.74, + "grad_norm": 5.249048233032227, + "learning_rate": 1.0255639097744361e-06, + "loss": 0.1285, + "step": 59680 + }, + { + "epoch": 89.76, + "grad_norm": 9.33356761932373, + "learning_rate": 1.02406015037594e-06, + "loss": 0.1326, + "step": 59690 + }, + { + "epoch": 89.77, + "grad_norm": 3.254373550415039, + "learning_rate": 1.0225563909774438e-06, + "loss": 0.1191, + "step": 59700 + }, + { + "epoch": 89.79, + "grad_norm": 4.756227493286133, + "learning_rate": 1.0210526315789475e-06, + "loss": 0.1221, + "step": 59710 + }, + { + "epoch": 89.8, + "grad_norm": 4.242854118347168, + "learning_rate": 1.0195488721804512e-06, + "loss": 0.1932, + "step": 59720 + }, + { + "epoch": 89.82, + "grad_norm": 6.5166425704956055, + "learning_rate": 1.018045112781955e-06, + "loss": 0.1373, + "step": 59730 + }, + { + "epoch": 89.83, + "grad_norm": 2.136077880859375, + "learning_rate": 1.0165413533834587e-06, + "loss": 0.1699, + "step": 59740 + }, + { + "epoch": 89.85, + "grad_norm": 2.9859583377838135, + "learning_rate": 1.0150375939849624e-06, + "loss": 0.1677, + "step": 59750 + }, + { + "epoch": 89.86, + "grad_norm": 5.3651933670043945, + "learning_rate": 1.0135338345864662e-06, + "loss": 0.176, + "step": 59760 + }, + { + "epoch": 89.88, + "grad_norm": 4.6981520652771, + "learning_rate": 1.01203007518797e-06, + "loss": 0.1921, + "step": 59770 + }, + { + "epoch": 89.89, + "grad_norm": 4.584779262542725, + "learning_rate": 1.0105263157894738e-06, + "loss": 0.1803, + "step": 59780 + }, + { + "epoch": 89.91, + "grad_norm": 7.521881580352783, + "learning_rate": 1.0090225563909776e-06, + "loss": 0.1587, + "step": 59790 + }, + { + "epoch": 89.92, + "grad_norm": 7.652347564697266, + "learning_rate": 1.0075187969924813e-06, + "loss": 0.1852, + "step": 59800 + }, + { + "epoch": 89.94, + "grad_norm": 1.188881516456604, + "learning_rate": 1.006015037593985e-06, + "loss": 0.1649, + "step": 59810 + }, + { + "epoch": 89.95, + "grad_norm": 3.557762622833252, + "learning_rate": 1.0045112781954888e-06, + "loss": 0.1391, + "step": 59820 + }, + { + "epoch": 89.97, + "grad_norm": 1.8400624990463257, + "learning_rate": 1.0030075187969925e-06, + "loss": 0.2029, + "step": 59830 + }, + { + "epoch": 89.98, + "grad_norm": 5.2794575691223145, + "learning_rate": 1.0015037593984964e-06, + "loss": 0.1536, + "step": 59840 + }, + { + "epoch": 90.0, + "grad_norm": 43.79523849487305, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.1681, + "step": 59850 + }, + { + "epoch": 90.0, + "eval_accuracy": 0.9295, + "eval_loss": 0.3303840756416321, + "eval_runtime": 84.4298, + "eval_samples_per_second": 118.442, + "eval_steps_per_second": 0.474, + "step": 59850 + }, + { + "epoch": 90.02, + "grad_norm": 4.819916725158691, + "learning_rate": 9.98496240601504e-07, + "loss": 0.1482, + "step": 59860 + }, + { + "epoch": 90.03, + "grad_norm": 8.389203071594238, + "learning_rate": 9.969924812030076e-07, + "loss": 0.204, + "step": 59870 + }, + { + "epoch": 90.05, + "grad_norm": 4.255865097045898, + "learning_rate": 9.954887218045114e-07, + "loss": 0.2107, + "step": 59880 + }, + { + "epoch": 90.06, + "grad_norm": 2.4185428619384766, + "learning_rate": 9.93984962406015e-07, + "loss": 0.2002, + "step": 59890 + }, + { + "epoch": 90.08, + "grad_norm": 6.136438846588135, + "learning_rate": 9.924812030075188e-07, + "loss": 0.1432, + "step": 59900 + }, + { + "epoch": 90.09, + "grad_norm": 4.596153259277344, + "learning_rate": 9.909774436090226e-07, + "loss": 0.2041, + "step": 59910 + }, + { + "epoch": 90.11, + "grad_norm": 6.638514518737793, + "learning_rate": 9.894736842105265e-07, + "loss": 0.1594, + "step": 59920 + }, + { + "epoch": 90.12, + "grad_norm": 5.299813270568848, + "learning_rate": 9.879699248120302e-07, + "loss": 0.1713, + "step": 59930 + }, + { + "epoch": 90.14, + "grad_norm": 6.116607666015625, + "learning_rate": 9.86466165413534e-07, + "loss": 0.1566, + "step": 59940 + }, + { + "epoch": 90.15, + "grad_norm": 6.558013916015625, + "learning_rate": 9.849624060150377e-07, + "loss": 0.203, + "step": 59950 + }, + { + "epoch": 90.17, + "grad_norm": 2.691136598587036, + "learning_rate": 9.834586466165414e-07, + "loss": 0.1761, + "step": 59960 + }, + { + "epoch": 90.18, + "grad_norm": 6.008741855621338, + "learning_rate": 9.819548872180451e-07, + "loss": 0.2017, + "step": 59970 + }, + { + "epoch": 90.2, + "grad_norm": 7.76501989364624, + "learning_rate": 9.80451127819549e-07, + "loss": 0.188, + "step": 59980 + }, + { + "epoch": 90.21, + "grad_norm": 3.065690517425537, + "learning_rate": 9.789473684210526e-07, + "loss": 0.1711, + "step": 59990 + }, + { + "epoch": 90.23, + "grad_norm": 6.988502502441406, + "learning_rate": 9.774436090225563e-07, + "loss": 0.1323, + "step": 60000 + }, + { + "epoch": 90.24, + "grad_norm": 4.135454177856445, + "learning_rate": 9.759398496240603e-07, + "loss": 0.1522, + "step": 60010 + }, + { + "epoch": 90.26, + "grad_norm": 5.1034016609191895, + "learning_rate": 9.74436090225564e-07, + "loss": 0.1998, + "step": 60020 + }, + { + "epoch": 90.27, + "grad_norm": 2.8112096786499023, + "learning_rate": 9.729323308270677e-07, + "loss": 0.1588, + "step": 60030 + }, + { + "epoch": 90.29, + "grad_norm": 2.276792049407959, + "learning_rate": 9.714285714285715e-07, + "loss": 0.152, + "step": 60040 + }, + { + "epoch": 90.3, + "grad_norm": 16.462791442871094, + "learning_rate": 9.699248120300752e-07, + "loss": 0.1913, + "step": 60050 + }, + { + "epoch": 90.32, + "grad_norm": 7.889447212219238, + "learning_rate": 9.68421052631579e-07, + "loss": 0.1824, + "step": 60060 + }, + { + "epoch": 90.33, + "grad_norm": 5.8780837059021, + "learning_rate": 9.669172932330829e-07, + "loss": 0.1683, + "step": 60070 + }, + { + "epoch": 90.35, + "grad_norm": 1.407557487487793, + "learning_rate": 9.654135338345866e-07, + "loss": 0.1696, + "step": 60080 + }, + { + "epoch": 90.36, + "grad_norm": 2.4125938415527344, + "learning_rate": 9.639097744360903e-07, + "loss": 0.1517, + "step": 60090 + }, + { + "epoch": 90.38, + "grad_norm": 6.825364112854004, + "learning_rate": 9.62406015037594e-07, + "loss": 0.217, + "step": 60100 + }, + { + "epoch": 90.39, + "grad_norm": 5.1629743576049805, + "learning_rate": 9.609022556390978e-07, + "loss": 0.1541, + "step": 60110 + }, + { + "epoch": 90.41, + "grad_norm": 6.873205184936523, + "learning_rate": 9.593984962406015e-07, + "loss": 0.1416, + "step": 60120 + }, + { + "epoch": 90.42, + "grad_norm": 5.59041690826416, + "learning_rate": 9.578947368421053e-07, + "loss": 0.153, + "step": 60130 + }, + { + "epoch": 90.44, + "grad_norm": 3.045102834701538, + "learning_rate": 9.56390977443609e-07, + "loss": 0.1723, + "step": 60140 + }, + { + "epoch": 90.45, + "grad_norm": 3.393899917602539, + "learning_rate": 9.54887218045113e-07, + "loss": 0.1852, + "step": 60150 + }, + { + "epoch": 90.47, + "grad_norm": 5.689571380615234, + "learning_rate": 9.533834586466166e-07, + "loss": 0.2115, + "step": 60160 + }, + { + "epoch": 90.48, + "grad_norm": 5.110867023468018, + "learning_rate": 9.518796992481204e-07, + "loss": 0.1877, + "step": 60170 + }, + { + "epoch": 90.5, + "grad_norm": 2.7676916122436523, + "learning_rate": 9.503759398496241e-07, + "loss": 0.1966, + "step": 60180 + }, + { + "epoch": 90.51, + "grad_norm": 0.4056377410888672, + "learning_rate": 9.48872180451128e-07, + "loss": 0.1819, + "step": 60190 + }, + { + "epoch": 90.53, + "grad_norm": 15.291031837463379, + "learning_rate": 9.473684210526317e-07, + "loss": 0.1564, + "step": 60200 + }, + { + "epoch": 90.54, + "grad_norm": 4.443658828735352, + "learning_rate": 9.458646616541354e-07, + "loss": 0.1336, + "step": 60210 + }, + { + "epoch": 90.56, + "grad_norm": 8.213922500610352, + "learning_rate": 9.443609022556393e-07, + "loss": 0.1578, + "step": 60220 + }, + { + "epoch": 90.57, + "grad_norm": 5.976234436035156, + "learning_rate": 9.42857142857143e-07, + "loss": 0.1815, + "step": 60230 + }, + { + "epoch": 90.59, + "grad_norm": 3.436178684234619, + "learning_rate": 9.413533834586466e-07, + "loss": 0.1836, + "step": 60240 + }, + { + "epoch": 90.6, + "grad_norm": 7.1853203773498535, + "learning_rate": 9.398496240601504e-07, + "loss": 0.1733, + "step": 60250 + }, + { + "epoch": 90.62, + "grad_norm": 3.7736470699310303, + "learning_rate": 9.383458646616542e-07, + "loss": 0.0901, + "step": 60260 + }, + { + "epoch": 90.63, + "grad_norm": 6.868650436401367, + "learning_rate": 9.368421052631579e-07, + "loss": 0.1495, + "step": 60270 + }, + { + "epoch": 90.65, + "grad_norm": 6.130927085876465, + "learning_rate": 9.353383458646618e-07, + "loss": 0.175, + "step": 60280 + }, + { + "epoch": 90.66, + "grad_norm": 2.877918004989624, + "learning_rate": 9.338345864661655e-07, + "loss": 0.1292, + "step": 60290 + }, + { + "epoch": 90.68, + "grad_norm": 6.6926493644714355, + "learning_rate": 9.323308270676692e-07, + "loss": 0.1629, + "step": 60300 + }, + { + "epoch": 90.69, + "grad_norm": 4.85024356842041, + "learning_rate": 9.308270676691731e-07, + "loss": 0.1462, + "step": 60310 + }, + { + "epoch": 90.71, + "grad_norm": 4.398621559143066, + "learning_rate": 9.293233082706768e-07, + "loss": 0.1805, + "step": 60320 + }, + { + "epoch": 90.72, + "grad_norm": 4.464656829833984, + "learning_rate": 9.278195488721805e-07, + "loss": 0.1624, + "step": 60330 + }, + { + "epoch": 90.74, + "grad_norm": 2.839735984802246, + "learning_rate": 9.263157894736844e-07, + "loss": 0.1813, + "step": 60340 + }, + { + "epoch": 90.75, + "grad_norm": 3.697030544281006, + "learning_rate": 9.248120300751881e-07, + "loss": 0.1472, + "step": 60350 + }, + { + "epoch": 90.77, + "grad_norm": 4.30585241317749, + "learning_rate": 9.233082706766917e-07, + "loss": 0.1624, + "step": 60360 + }, + { + "epoch": 90.78, + "grad_norm": 5.344664096832275, + "learning_rate": 9.218045112781955e-07, + "loss": 0.1486, + "step": 60370 + }, + { + "epoch": 90.8, + "grad_norm": 3.9240634441375732, + "learning_rate": 9.203007518796993e-07, + "loss": 0.1843, + "step": 60380 + }, + { + "epoch": 90.81, + "grad_norm": 2.0143322944641113, + "learning_rate": 9.18796992481203e-07, + "loss": 0.1147, + "step": 60390 + }, + { + "epoch": 90.83, + "grad_norm": 6.274852275848389, + "learning_rate": 9.172932330827068e-07, + "loss": 0.169, + "step": 60400 + }, + { + "epoch": 90.84, + "grad_norm": 5.181715488433838, + "learning_rate": 9.157894736842106e-07, + "loss": 0.1845, + "step": 60410 + }, + { + "epoch": 90.86, + "grad_norm": 0.9931633472442627, + "learning_rate": 9.142857142857144e-07, + "loss": 0.1365, + "step": 60420 + }, + { + "epoch": 90.87, + "grad_norm": 5.815258502960205, + "learning_rate": 9.127819548872181e-07, + "loss": 0.1571, + "step": 60430 + }, + { + "epoch": 90.89, + "grad_norm": 4.237338542938232, + "learning_rate": 9.112781954887219e-07, + "loss": 0.1783, + "step": 60440 + }, + { + "epoch": 90.9, + "grad_norm": 4.938007354736328, + "learning_rate": 9.097744360902257e-07, + "loss": 0.1615, + "step": 60450 + }, + { + "epoch": 90.92, + "grad_norm": 7.971113204956055, + "learning_rate": 9.082706766917294e-07, + "loss": 0.1811, + "step": 60460 + }, + { + "epoch": 90.93, + "grad_norm": 6.53290319442749, + "learning_rate": 9.067669172932332e-07, + "loss": 0.1816, + "step": 60470 + }, + { + "epoch": 90.95, + "grad_norm": 5.972143173217773, + "learning_rate": 9.05263157894737e-07, + "loss": 0.2215, + "step": 60480 + }, + { + "epoch": 90.96, + "grad_norm": 6.27506685256958, + "learning_rate": 9.037593984962406e-07, + "loss": 0.1527, + "step": 60490 + }, + { + "epoch": 90.98, + "grad_norm": 3.595836877822876, + "learning_rate": 9.022556390977444e-07, + "loss": 0.1536, + "step": 60500 + }, + { + "epoch": 90.99, + "grad_norm": 4.775961399078369, + "learning_rate": 9.007518796992482e-07, + "loss": 0.1802, + "step": 60510 + }, + { + "epoch": 91.0, + "eval_accuracy": 0.9298, + "eval_loss": 0.3351175785064697, + "eval_runtime": 84.5299, + "eval_samples_per_second": 118.301, + "eval_steps_per_second": 0.473, + "step": 60515 + }, + { + "epoch": 91.01, + "grad_norm": 7.370502948760986, + "learning_rate": 8.992481203007519e-07, + "loss": 0.2365, + "step": 60520 + }, + { + "epoch": 91.02, + "grad_norm": 8.751474380493164, + "learning_rate": 8.977443609022557e-07, + "loss": 0.2011, + "step": 60530 + }, + { + "epoch": 91.04, + "grad_norm": 3.2370705604553223, + "learning_rate": 8.962406015037595e-07, + "loss": 0.1858, + "step": 60540 + }, + { + "epoch": 91.05, + "grad_norm": 5.268539905548096, + "learning_rate": 8.947368421052632e-07, + "loss": 0.2265, + "step": 60550 + }, + { + "epoch": 91.07, + "grad_norm": 5.824682235717773, + "learning_rate": 8.93233082706767e-07, + "loss": 0.17, + "step": 60560 + }, + { + "epoch": 91.08, + "grad_norm": 4.85299015045166, + "learning_rate": 8.917293233082708e-07, + "loss": 0.1715, + "step": 60570 + }, + { + "epoch": 91.1, + "grad_norm": 3.4943690299987793, + "learning_rate": 8.902255639097745e-07, + "loss": 0.1739, + "step": 60580 + }, + { + "epoch": 91.11, + "grad_norm": 5.409356594085693, + "learning_rate": 8.887218045112784e-07, + "loss": 0.1693, + "step": 60590 + }, + { + "epoch": 91.13, + "grad_norm": 6.857398986816406, + "learning_rate": 8.872180451127821e-07, + "loss": 0.1883, + "step": 60600 + }, + { + "epoch": 91.14, + "grad_norm": 4.355463027954102, + "learning_rate": 8.857142857142857e-07, + "loss": 0.1444, + "step": 60610 + }, + { + "epoch": 91.16, + "grad_norm": 5.453840255737305, + "learning_rate": 8.842105263157895e-07, + "loss": 0.1129, + "step": 60620 + }, + { + "epoch": 91.17, + "grad_norm": 2.509838581085205, + "learning_rate": 8.827067669172933e-07, + "loss": 0.1524, + "step": 60630 + }, + { + "epoch": 91.19, + "grad_norm": 4.419145107269287, + "learning_rate": 8.81203007518797e-07, + "loss": 0.1007, + "step": 60640 + }, + { + "epoch": 91.2, + "grad_norm": 2.9368622303009033, + "learning_rate": 8.796992481203009e-07, + "loss": 0.1373, + "step": 60650 + }, + { + "epoch": 91.22, + "grad_norm": 5.6778082847595215, + "learning_rate": 8.781954887218046e-07, + "loss": 0.1914, + "step": 60660 + }, + { + "epoch": 91.23, + "grad_norm": 4.565642833709717, + "learning_rate": 8.766917293233083e-07, + "loss": 0.1885, + "step": 60670 + }, + { + "epoch": 91.25, + "grad_norm": 12.5923433303833, + "learning_rate": 8.751879699248122e-07, + "loss": 0.1841, + "step": 60680 + }, + { + "epoch": 91.26, + "grad_norm": 3.8963727951049805, + "learning_rate": 8.736842105263159e-07, + "loss": 0.2082, + "step": 60690 + }, + { + "epoch": 91.28, + "grad_norm": 7.847858428955078, + "learning_rate": 8.721804511278196e-07, + "loss": 0.2347, + "step": 60700 + }, + { + "epoch": 91.29, + "grad_norm": 6.093050003051758, + "learning_rate": 8.706766917293235e-07, + "loss": 0.2005, + "step": 60710 + }, + { + "epoch": 91.31, + "grad_norm": 6.70009708404541, + "learning_rate": 8.691729323308272e-07, + "loss": 0.1581, + "step": 60720 + }, + { + "epoch": 91.32, + "grad_norm": 5.4743852615356445, + "learning_rate": 8.67669172932331e-07, + "loss": 0.1938, + "step": 60730 + }, + { + "epoch": 91.34, + "grad_norm": 6.605056285858154, + "learning_rate": 8.661654135338346e-07, + "loss": 0.2517, + "step": 60740 + }, + { + "epoch": 91.35, + "grad_norm": 5.257135391235352, + "learning_rate": 8.646616541353384e-07, + "loss": 0.1354, + "step": 60750 + }, + { + "epoch": 91.37, + "grad_norm": 4.14031982421875, + "learning_rate": 8.631578947368421e-07, + "loss": 0.2008, + "step": 60760 + }, + { + "epoch": 91.38, + "grad_norm": 4.664227485656738, + "learning_rate": 8.616541353383459e-07, + "loss": 0.166, + "step": 60770 + }, + { + "epoch": 91.4, + "grad_norm": 7.249124526977539, + "learning_rate": 8.601503759398497e-07, + "loss": 0.1729, + "step": 60780 + }, + { + "epoch": 91.41, + "grad_norm": 1.5819720029830933, + "learning_rate": 8.586466165413534e-07, + "loss": 0.1673, + "step": 60790 + }, + { + "epoch": 91.43, + "grad_norm": 8.89696216583252, + "learning_rate": 8.571428571428572e-07, + "loss": 0.1627, + "step": 60800 + }, + { + "epoch": 91.44, + "grad_norm": 6.409267425537109, + "learning_rate": 8.55639097744361e-07, + "loss": 0.1917, + "step": 60810 + }, + { + "epoch": 91.46, + "grad_norm": 13.142133712768555, + "learning_rate": 8.541353383458648e-07, + "loss": 0.2156, + "step": 60820 + }, + { + "epoch": 91.47, + "grad_norm": 3.5235838890075684, + "learning_rate": 8.526315789473685e-07, + "loss": 0.1644, + "step": 60830 + }, + { + "epoch": 91.49, + "grad_norm": 5.23291015625, + "learning_rate": 8.511278195488723e-07, + "loss": 0.2202, + "step": 60840 + }, + { + "epoch": 91.5, + "grad_norm": 8.659149169921875, + "learning_rate": 8.496240601503761e-07, + "loss": 0.1695, + "step": 60850 + }, + { + "epoch": 91.52, + "grad_norm": 5.686830043792725, + "learning_rate": 8.481203007518797e-07, + "loss": 0.1524, + "step": 60860 + }, + { + "epoch": 91.53, + "grad_norm": 5.107367992401123, + "learning_rate": 8.466165413533835e-07, + "loss": 0.1736, + "step": 60870 + }, + { + "epoch": 91.55, + "grad_norm": 8.266801834106445, + "learning_rate": 8.451127819548873e-07, + "loss": 0.1509, + "step": 60880 + }, + { + "epoch": 91.56, + "grad_norm": 8.239102363586426, + "learning_rate": 8.43609022556391e-07, + "loss": 0.1974, + "step": 60890 + }, + { + "epoch": 91.58, + "grad_norm": 4.153801918029785, + "learning_rate": 8.421052631578948e-07, + "loss": 0.2202, + "step": 60900 + }, + { + "epoch": 91.59, + "grad_norm": 4.520059585571289, + "learning_rate": 8.406015037593986e-07, + "loss": 0.1404, + "step": 60910 + }, + { + "epoch": 91.61, + "grad_norm": 3.584502935409546, + "learning_rate": 8.390977443609023e-07, + "loss": 0.1914, + "step": 60920 + }, + { + "epoch": 91.62, + "grad_norm": 5.777580261230469, + "learning_rate": 8.375939849624061e-07, + "loss": 0.1338, + "step": 60930 + }, + { + "epoch": 91.64, + "grad_norm": 4.816416263580322, + "learning_rate": 8.360902255639099e-07, + "loss": 0.1983, + "step": 60940 + }, + { + "epoch": 91.65, + "grad_norm": 1.8106648921966553, + "learning_rate": 8.345864661654136e-07, + "loss": 0.1728, + "step": 60950 + }, + { + "epoch": 91.67, + "grad_norm": 6.586467266082764, + "learning_rate": 8.330827067669174e-07, + "loss": 0.1635, + "step": 60960 + }, + { + "epoch": 91.68, + "grad_norm": 4.0947136878967285, + "learning_rate": 8.315789473684212e-07, + "loss": 0.1938, + "step": 60970 + }, + { + "epoch": 91.7, + "grad_norm": 6.031853199005127, + "learning_rate": 8.300751879699248e-07, + "loss": 0.18, + "step": 60980 + }, + { + "epoch": 91.71, + "grad_norm": 5.872425079345703, + "learning_rate": 8.285714285714285e-07, + "loss": 0.1693, + "step": 60990 + }, + { + "epoch": 91.73, + "grad_norm": 9.427947998046875, + "learning_rate": 8.270676691729324e-07, + "loss": 0.187, + "step": 61000 + }, + { + "epoch": 91.74, + "grad_norm": 5.179823398590088, + "learning_rate": 8.255639097744361e-07, + "loss": 0.1372, + "step": 61010 + }, + { + "epoch": 91.76, + "grad_norm": 5.486731052398682, + "learning_rate": 8.240601503759398e-07, + "loss": 0.1654, + "step": 61020 + }, + { + "epoch": 91.77, + "grad_norm": 3.1775834560394287, + "learning_rate": 8.225563909774437e-07, + "loss": 0.176, + "step": 61030 + }, + { + "epoch": 91.79, + "grad_norm": 4.589211463928223, + "learning_rate": 8.210526315789474e-07, + "loss": 0.1452, + "step": 61040 + }, + { + "epoch": 91.8, + "grad_norm": 3.6685729026794434, + "learning_rate": 8.195488721804513e-07, + "loss": 0.1883, + "step": 61050 + }, + { + "epoch": 91.82, + "grad_norm": 4.512982368469238, + "learning_rate": 8.18045112781955e-07, + "loss": 0.1589, + "step": 61060 + }, + { + "epoch": 91.83, + "grad_norm": 9.20313549041748, + "learning_rate": 8.165413533834587e-07, + "loss": 0.1399, + "step": 61070 + }, + { + "epoch": 91.85, + "grad_norm": 5.7398905754089355, + "learning_rate": 8.150375939849625e-07, + "loss": 0.1689, + "step": 61080 + }, + { + "epoch": 91.86, + "grad_norm": 4.950761795043945, + "learning_rate": 8.135338345864663e-07, + "loss": 0.1259, + "step": 61090 + }, + { + "epoch": 91.88, + "grad_norm": 7.066133499145508, + "learning_rate": 8.1203007518797e-07, + "loss": 0.1673, + "step": 61100 + }, + { + "epoch": 91.89, + "grad_norm": 4.5306596755981445, + "learning_rate": 8.105263157894736e-07, + "loss": 0.173, + "step": 61110 + }, + { + "epoch": 91.91, + "grad_norm": 6.547530651092529, + "learning_rate": 8.090225563909775e-07, + "loss": 0.1469, + "step": 61120 + }, + { + "epoch": 91.92, + "grad_norm": 6.574265003204346, + "learning_rate": 8.075187969924812e-07, + "loss": 0.1464, + "step": 61130 + }, + { + "epoch": 91.94, + "grad_norm": 2.8120083808898926, + "learning_rate": 8.06015037593985e-07, + "loss": 0.1422, + "step": 61140 + }, + { + "epoch": 91.95, + "grad_norm": 3.274015188217163, + "learning_rate": 8.045112781954888e-07, + "loss": 0.187, + "step": 61150 + }, + { + "epoch": 91.97, + "grad_norm": 16.76445198059082, + "learning_rate": 8.030075187969925e-07, + "loss": 0.1365, + "step": 61160 + }, + { + "epoch": 91.98, + "grad_norm": 3.863375663757324, + "learning_rate": 8.015037593984963e-07, + "loss": 0.1375, + "step": 61170 + }, + { + "epoch": 92.0, + "grad_norm": 23.6773738861084, + "learning_rate": 8.000000000000001e-07, + "loss": 0.268, + "step": 61180 + }, + { + "epoch": 92.0, + "eval_accuracy": 0.9305, + "eval_loss": 0.33316439390182495, + "eval_runtime": 84.8569, + "eval_samples_per_second": 117.845, + "eval_steps_per_second": 0.471, + "step": 61180 + }, + { + "epoch": 92.02, + "grad_norm": 6.592239856719971, + "learning_rate": 7.984962406015038e-07, + "loss": 0.1531, + "step": 61190 + }, + { + "epoch": 92.03, + "grad_norm": 3.465878963470459, + "learning_rate": 7.969924812030076e-07, + "loss": 0.1796, + "step": 61200 + }, + { + "epoch": 92.05, + "grad_norm": 5.356743812561035, + "learning_rate": 7.954887218045114e-07, + "loss": 0.1227, + "step": 61210 + }, + { + "epoch": 92.06, + "grad_norm": 9.6128511428833, + "learning_rate": 7.939849624060152e-07, + "loss": 0.1512, + "step": 61220 + }, + { + "epoch": 92.08, + "grad_norm": 5.72312068939209, + "learning_rate": 7.924812030075188e-07, + "loss": 0.1641, + "step": 61230 + }, + { + "epoch": 92.09, + "grad_norm": 2.734785556793213, + "learning_rate": 7.909774436090226e-07, + "loss": 0.2045, + "step": 61240 + }, + { + "epoch": 92.11, + "grad_norm": 5.764986991882324, + "learning_rate": 7.894736842105263e-07, + "loss": 0.1482, + "step": 61250 + }, + { + "epoch": 92.12, + "grad_norm": 6.9798173904418945, + "learning_rate": 7.879699248120301e-07, + "loss": 0.2035, + "step": 61260 + }, + { + "epoch": 92.14, + "grad_norm": 6.008689880371094, + "learning_rate": 7.864661654135339e-07, + "loss": 0.1314, + "step": 61270 + }, + { + "epoch": 92.15, + "grad_norm": 4.28978157043457, + "learning_rate": 7.849624060150377e-07, + "loss": 0.1305, + "step": 61280 + }, + { + "epoch": 92.17, + "grad_norm": 2.4776744842529297, + "learning_rate": 7.834586466165414e-07, + "loss": 0.1741, + "step": 61290 + }, + { + "epoch": 92.18, + "grad_norm": 6.749001502990723, + "learning_rate": 7.819548872180452e-07, + "loss": 0.1177, + "step": 61300 + }, + { + "epoch": 92.2, + "grad_norm": 5.488044738769531, + "learning_rate": 7.80451127819549e-07, + "loss": 0.2004, + "step": 61310 + }, + { + "epoch": 92.21, + "grad_norm": 6.313375949859619, + "learning_rate": 7.789473684210527e-07, + "loss": 0.2079, + "step": 61320 + }, + { + "epoch": 92.23, + "grad_norm": 4.431061267852783, + "learning_rate": 7.774436090225565e-07, + "loss": 0.135, + "step": 61330 + }, + { + "epoch": 92.24, + "grad_norm": 4.987079620361328, + "learning_rate": 7.759398496240603e-07, + "loss": 0.1677, + "step": 61340 + }, + { + "epoch": 92.26, + "grad_norm": 6.565682411193848, + "learning_rate": 7.74436090225564e-07, + "loss": 0.1398, + "step": 61350 + }, + { + "epoch": 92.27, + "grad_norm": 8.182050704956055, + "learning_rate": 7.729323308270676e-07, + "loss": 0.1806, + "step": 61360 + }, + { + "epoch": 92.29, + "grad_norm": 1.415570855140686, + "learning_rate": 7.714285714285715e-07, + "loss": 0.1538, + "step": 61370 + }, + { + "epoch": 92.3, + "grad_norm": 7.348145961761475, + "learning_rate": 7.699248120300752e-07, + "loss": 0.152, + "step": 61380 + }, + { + "epoch": 92.32, + "grad_norm": 2.7620201110839844, + "learning_rate": 7.684210526315789e-07, + "loss": 0.2018, + "step": 61390 + }, + { + "epoch": 92.33, + "grad_norm": 6.486240863800049, + "learning_rate": 7.669172932330828e-07, + "loss": 0.257, + "step": 61400 + }, + { + "epoch": 92.35, + "grad_norm": 4.345780372619629, + "learning_rate": 7.654135338345865e-07, + "loss": 0.2216, + "step": 61410 + }, + { + "epoch": 92.36, + "grad_norm": 3.1555426120758057, + "learning_rate": 7.639097744360902e-07, + "loss": 0.1257, + "step": 61420 + }, + { + "epoch": 92.38, + "grad_norm": 4.688722610473633, + "learning_rate": 7.624060150375941e-07, + "loss": 0.1059, + "step": 61430 + }, + { + "epoch": 92.39, + "grad_norm": 4.647052764892578, + "learning_rate": 7.609022556390978e-07, + "loss": 0.1449, + "step": 61440 + }, + { + "epoch": 92.41, + "grad_norm": 7.654082775115967, + "learning_rate": 7.593984962406016e-07, + "loss": 0.2042, + "step": 61450 + }, + { + "epoch": 92.42, + "grad_norm": 12.962769508361816, + "learning_rate": 7.578947368421054e-07, + "loss": 0.1656, + "step": 61460 + }, + { + "epoch": 92.44, + "grad_norm": 4.620476245880127, + "learning_rate": 7.563909774436091e-07, + "loss": 0.1846, + "step": 61470 + }, + { + "epoch": 92.45, + "grad_norm": 6.064635753631592, + "learning_rate": 7.548872180451127e-07, + "loss": 0.1734, + "step": 61480 + }, + { + "epoch": 92.47, + "grad_norm": 4.810222625732422, + "learning_rate": 7.533834586466166e-07, + "loss": 0.1596, + "step": 61490 + }, + { + "epoch": 92.48, + "grad_norm": 6.1294755935668945, + "learning_rate": 7.518796992481203e-07, + "loss": 0.1405, + "step": 61500 + }, + { + "epoch": 92.5, + "grad_norm": 6.559119701385498, + "learning_rate": 7.503759398496241e-07, + "loss": 0.1389, + "step": 61510 + }, + { + "epoch": 92.51, + "grad_norm": 4.981884479522705, + "learning_rate": 7.488721804511279e-07, + "loss": 0.1999, + "step": 61520 + }, + { + "epoch": 92.53, + "grad_norm": 6.0175604820251465, + "learning_rate": 7.473684210526316e-07, + "loss": 0.194, + "step": 61530 + }, + { + "epoch": 92.54, + "grad_norm": 6.699028968811035, + "learning_rate": 7.458646616541354e-07, + "loss": 0.2085, + "step": 61540 + }, + { + "epoch": 92.56, + "grad_norm": 9.893536567687988, + "learning_rate": 7.443609022556392e-07, + "loss": 0.1569, + "step": 61550 + }, + { + "epoch": 92.57, + "grad_norm": 3.370506525039673, + "learning_rate": 7.428571428571429e-07, + "loss": 0.1606, + "step": 61560 + }, + { + "epoch": 92.59, + "grad_norm": 4.382241725921631, + "learning_rate": 7.413533834586467e-07, + "loss": 0.1473, + "step": 61570 + }, + { + "epoch": 92.6, + "grad_norm": 1.784295916557312, + "learning_rate": 7.398496240601505e-07, + "loss": 0.1692, + "step": 61580 + }, + { + "epoch": 92.62, + "grad_norm": 2.770347833633423, + "learning_rate": 7.383458646616543e-07, + "loss": 0.1704, + "step": 61590 + }, + { + "epoch": 92.63, + "grad_norm": 3.2620766162872314, + "learning_rate": 7.368421052631579e-07, + "loss": 0.1742, + "step": 61600 + }, + { + "epoch": 92.65, + "grad_norm": 8.29354476928711, + "learning_rate": 7.353383458646617e-07, + "loss": 0.1271, + "step": 61610 + }, + { + "epoch": 92.66, + "grad_norm": 3.8892619609832764, + "learning_rate": 7.338345864661654e-07, + "loss": 0.2461, + "step": 61620 + }, + { + "epoch": 92.68, + "grad_norm": 7.872462272644043, + "learning_rate": 7.323308270676692e-07, + "loss": 0.1877, + "step": 61630 + }, + { + "epoch": 92.69, + "grad_norm": 4.30030632019043, + "learning_rate": 7.30827067669173e-07, + "loss": 0.167, + "step": 61640 + }, + { + "epoch": 92.71, + "grad_norm": 3.3043644428253174, + "learning_rate": 7.293233082706767e-07, + "loss": 0.2256, + "step": 61650 + }, + { + "epoch": 92.72, + "grad_norm": 6.955836296081543, + "learning_rate": 7.278195488721805e-07, + "loss": 0.2138, + "step": 61660 + }, + { + "epoch": 92.74, + "grad_norm": 2.1110410690307617, + "learning_rate": 7.263157894736843e-07, + "loss": 0.1773, + "step": 61670 + }, + { + "epoch": 92.75, + "grad_norm": 6.610522747039795, + "learning_rate": 7.248120300751881e-07, + "loss": 0.1779, + "step": 61680 + }, + { + "epoch": 92.77, + "grad_norm": 4.267979145050049, + "learning_rate": 7.233082706766918e-07, + "loss": 0.1249, + "step": 61690 + }, + { + "epoch": 92.78, + "grad_norm": 5.974915027618408, + "learning_rate": 7.218045112781956e-07, + "loss": 0.1679, + "step": 61700 + }, + { + "epoch": 92.8, + "grad_norm": 2.948686122894287, + "learning_rate": 7.203007518796994e-07, + "loss": 0.1572, + "step": 61710 + }, + { + "epoch": 92.81, + "grad_norm": 4.3910040855407715, + "learning_rate": 7.187969924812031e-07, + "loss": 0.1469, + "step": 61720 + }, + { + "epoch": 92.83, + "grad_norm": 5.496255874633789, + "learning_rate": 7.172932330827067e-07, + "loss": 0.1574, + "step": 61730 + }, + { + "epoch": 92.84, + "grad_norm": 6.861441135406494, + "learning_rate": 7.157894736842106e-07, + "loss": 0.161, + "step": 61740 + }, + { + "epoch": 92.86, + "grad_norm": 6.682887554168701, + "learning_rate": 7.142857142857143e-07, + "loss": 0.1741, + "step": 61750 + }, + { + "epoch": 92.87, + "grad_norm": 5.851301670074463, + "learning_rate": 7.12781954887218e-07, + "loss": 0.2264, + "step": 61760 + }, + { + "epoch": 92.89, + "grad_norm": 6.062599182128906, + "learning_rate": 7.112781954887219e-07, + "loss": 0.1853, + "step": 61770 + }, + { + "epoch": 92.9, + "grad_norm": 3.691725969314575, + "learning_rate": 7.097744360902256e-07, + "loss": 0.1196, + "step": 61780 + }, + { + "epoch": 92.92, + "grad_norm": 6.096661567687988, + "learning_rate": 7.082706766917293e-07, + "loss": 0.1816, + "step": 61790 + }, + { + "epoch": 92.93, + "grad_norm": 0.75389564037323, + "learning_rate": 7.067669172932332e-07, + "loss": 0.1673, + "step": 61800 + }, + { + "epoch": 92.95, + "grad_norm": 8.029284477233887, + "learning_rate": 7.052631578947369e-07, + "loss": 0.1728, + "step": 61810 + }, + { + "epoch": 92.96, + "grad_norm": 9.124761581420898, + "learning_rate": 7.037593984962407e-07, + "loss": 0.2143, + "step": 61820 + }, + { + "epoch": 92.98, + "grad_norm": 6.918426036834717, + "learning_rate": 7.022556390977445e-07, + "loss": 0.1845, + "step": 61830 + }, + { + "epoch": 92.99, + "grad_norm": 6.886169910430908, + "learning_rate": 7.007518796992482e-07, + "loss": 0.1807, + "step": 61840 + }, + { + "epoch": 93.0, + "eval_accuracy": 0.9307, + "eval_loss": 0.3299960494041443, + "eval_runtime": 84.6466, + "eval_samples_per_second": 118.138, + "eval_steps_per_second": 0.473, + "step": 61845 + }, + { + "epoch": 93.01, + "grad_norm": 5.7528886795043945, + "learning_rate": 6.992481203007518e-07, + "loss": 0.1765, + "step": 61850 + }, + { + "epoch": 93.02, + "grad_norm": 6.218891620635986, + "learning_rate": 6.977443609022557e-07, + "loss": 0.1465, + "step": 61860 + }, + { + "epoch": 93.04, + "grad_norm": 3.1827406883239746, + "learning_rate": 6.962406015037594e-07, + "loss": 0.1436, + "step": 61870 + }, + { + "epoch": 93.05, + "grad_norm": 2.038198232650757, + "learning_rate": 6.947368421052631e-07, + "loss": 0.1519, + "step": 61880 + }, + { + "epoch": 93.07, + "grad_norm": 4.038834095001221, + "learning_rate": 6.93233082706767e-07, + "loss": 0.1381, + "step": 61890 + }, + { + "epoch": 93.08, + "grad_norm": 8.468280792236328, + "learning_rate": 6.917293233082707e-07, + "loss": 0.1598, + "step": 61900 + }, + { + "epoch": 93.1, + "grad_norm": 5.541633129119873, + "learning_rate": 6.902255639097745e-07, + "loss": 0.1888, + "step": 61910 + }, + { + "epoch": 93.11, + "grad_norm": 6.266165733337402, + "learning_rate": 6.887218045112783e-07, + "loss": 0.1338, + "step": 61920 + }, + { + "epoch": 93.13, + "grad_norm": 2.7548935413360596, + "learning_rate": 6.87218045112782e-07, + "loss": 0.1435, + "step": 61930 + }, + { + "epoch": 93.14, + "grad_norm": 6.2727131843566895, + "learning_rate": 6.857142857142858e-07, + "loss": 0.1858, + "step": 61940 + }, + { + "epoch": 93.16, + "grad_norm": 8.00541877746582, + "learning_rate": 6.842105263157896e-07, + "loss": 0.1809, + "step": 61950 + }, + { + "epoch": 93.17, + "grad_norm": 4.776533126831055, + "learning_rate": 6.827067669172933e-07, + "loss": 0.1105, + "step": 61960 + }, + { + "epoch": 93.19, + "grad_norm": 0.5957467555999756, + "learning_rate": 6.812030075187971e-07, + "loss": 0.1535, + "step": 61970 + }, + { + "epoch": 93.2, + "grad_norm": 2.1154375076293945, + "learning_rate": 6.796992481203008e-07, + "loss": 0.1391, + "step": 61980 + }, + { + "epoch": 93.22, + "grad_norm": 3.501743793487549, + "learning_rate": 6.781954887218045e-07, + "loss": 0.1809, + "step": 61990 + }, + { + "epoch": 93.23, + "grad_norm": 19.21070098876953, + "learning_rate": 6.766917293233083e-07, + "loss": 0.2236, + "step": 62000 + }, + { + "epoch": 93.25, + "grad_norm": 8.024877548217773, + "learning_rate": 6.751879699248121e-07, + "loss": 0.1522, + "step": 62010 + }, + { + "epoch": 93.26, + "grad_norm": 3.7269623279571533, + "learning_rate": 6.736842105263158e-07, + "loss": 0.1898, + "step": 62020 + }, + { + "epoch": 93.28, + "grad_norm": 3.2718605995178223, + "learning_rate": 6.721804511278196e-07, + "loss": 0.1798, + "step": 62030 + }, + { + "epoch": 93.29, + "grad_norm": 5.502279281616211, + "learning_rate": 6.706766917293234e-07, + "loss": 0.1604, + "step": 62040 + }, + { + "epoch": 93.31, + "grad_norm": 5.675206661224365, + "learning_rate": 6.691729323308272e-07, + "loss": 0.1656, + "step": 62050 + }, + { + "epoch": 93.32, + "grad_norm": 5.183054447174072, + "learning_rate": 6.676691729323309e-07, + "loss": 0.2103, + "step": 62060 + }, + { + "epoch": 93.34, + "grad_norm": 1.9447113275527954, + "learning_rate": 6.661654135338347e-07, + "loss": 0.1621, + "step": 62070 + }, + { + "epoch": 93.35, + "grad_norm": 3.6219539642333984, + "learning_rate": 6.646616541353385e-07, + "loss": 0.126, + "step": 62080 + }, + { + "epoch": 93.37, + "grad_norm": 4.979419708251953, + "learning_rate": 6.631578947368422e-07, + "loss": 0.1749, + "step": 62090 + }, + { + "epoch": 93.38, + "grad_norm": 7.001568794250488, + "learning_rate": 6.616541353383458e-07, + "loss": 0.1297, + "step": 62100 + }, + { + "epoch": 93.4, + "grad_norm": 7.3389387130737305, + "learning_rate": 6.601503759398496e-07, + "loss": 0.1794, + "step": 62110 + }, + { + "epoch": 93.41, + "grad_norm": 4.73595666885376, + "learning_rate": 6.586466165413534e-07, + "loss": 0.1852, + "step": 62120 + }, + { + "epoch": 93.43, + "grad_norm": 5.426881790161133, + "learning_rate": 6.571428571428571e-07, + "loss": 0.2275, + "step": 62130 + }, + { + "epoch": 93.44, + "grad_norm": 7.103865146636963, + "learning_rate": 6.55639097744361e-07, + "loss": 0.1765, + "step": 62140 + }, + { + "epoch": 93.46, + "grad_norm": 5.244080066680908, + "learning_rate": 6.541353383458647e-07, + "loss": 0.1559, + "step": 62150 + }, + { + "epoch": 93.47, + "grad_norm": 11.155570030212402, + "learning_rate": 6.526315789473684e-07, + "loss": 0.191, + "step": 62160 + }, + { + "epoch": 93.49, + "grad_norm": 3.5410802364349365, + "learning_rate": 6.511278195488723e-07, + "loss": 0.1308, + "step": 62170 + }, + { + "epoch": 93.5, + "grad_norm": 5.664346218109131, + "learning_rate": 6.49624060150376e-07, + "loss": 0.1734, + "step": 62180 + }, + { + "epoch": 93.52, + "grad_norm": 3.049161434173584, + "learning_rate": 6.481203007518797e-07, + "loss": 0.1595, + "step": 62190 + }, + { + "epoch": 93.53, + "grad_norm": 3.4412500858306885, + "learning_rate": 6.466165413533836e-07, + "loss": 0.1986, + "step": 62200 + }, + { + "epoch": 93.55, + "grad_norm": 5.76616096496582, + "learning_rate": 6.451127819548873e-07, + "loss": 0.2203, + "step": 62210 + }, + { + "epoch": 93.56, + "grad_norm": 3.9202961921691895, + "learning_rate": 6.436090225563909e-07, + "loss": 0.1739, + "step": 62220 + }, + { + "epoch": 93.58, + "grad_norm": 5.559597969055176, + "learning_rate": 6.421052631578948e-07, + "loss": 0.1472, + "step": 62230 + }, + { + "epoch": 93.59, + "grad_norm": 6.698195934295654, + "learning_rate": 6.406015037593985e-07, + "loss": 0.1822, + "step": 62240 + }, + { + "epoch": 93.61, + "grad_norm": 7.657590866088867, + "learning_rate": 6.390977443609022e-07, + "loss": 0.1469, + "step": 62250 + }, + { + "epoch": 93.62, + "grad_norm": 9.91557788848877, + "learning_rate": 6.375939849624061e-07, + "loss": 0.1382, + "step": 62260 + }, + { + "epoch": 93.64, + "grad_norm": 4.059289932250977, + "learning_rate": 6.360902255639098e-07, + "loss": 0.1408, + "step": 62270 + }, + { + "epoch": 93.65, + "grad_norm": 3.5155844688415527, + "learning_rate": 6.345864661654136e-07, + "loss": 0.153, + "step": 62280 + }, + { + "epoch": 93.67, + "grad_norm": 2.9289627075195312, + "learning_rate": 6.330827067669174e-07, + "loss": 0.133, + "step": 62290 + }, + { + "epoch": 93.68, + "grad_norm": 1.8064905405044556, + "learning_rate": 6.315789473684211e-07, + "loss": 0.1543, + "step": 62300 + }, + { + "epoch": 93.7, + "grad_norm": 5.917304039001465, + "learning_rate": 6.300751879699249e-07, + "loss": 0.1764, + "step": 62310 + }, + { + "epoch": 93.71, + "grad_norm": 9.10999584197998, + "learning_rate": 6.285714285714287e-07, + "loss": 0.1601, + "step": 62320 + }, + { + "epoch": 93.73, + "grad_norm": 9.873708724975586, + "learning_rate": 6.270676691729324e-07, + "loss": 0.2098, + "step": 62330 + }, + { + "epoch": 93.74, + "grad_norm": 4.124228000640869, + "learning_rate": 6.255639097744362e-07, + "loss": 0.1395, + "step": 62340 + }, + { + "epoch": 93.76, + "grad_norm": 5.858664035797119, + "learning_rate": 6.240601503759399e-07, + "loss": 0.2124, + "step": 62350 + }, + { + "epoch": 93.77, + "grad_norm": 5.991408348083496, + "learning_rate": 6.225563909774437e-07, + "loss": 0.1532, + "step": 62360 + }, + { + "epoch": 93.79, + "grad_norm": 3.827173948287964, + "learning_rate": 6.210526315789474e-07, + "loss": 0.1139, + "step": 62370 + }, + { + "epoch": 93.8, + "grad_norm": 3.3806679248809814, + "learning_rate": 6.195488721804512e-07, + "loss": 0.1869, + "step": 62380 + }, + { + "epoch": 93.82, + "grad_norm": 5.624974250793457, + "learning_rate": 6.180451127819549e-07, + "loss": 0.2374, + "step": 62390 + }, + { + "epoch": 93.83, + "grad_norm": 3.611284017562866, + "learning_rate": 6.165413533834587e-07, + "loss": 0.2001, + "step": 62400 + }, + { + "epoch": 93.85, + "grad_norm": 3.4336488246917725, + "learning_rate": 6.150375939849625e-07, + "loss": 0.2229, + "step": 62410 + }, + { + "epoch": 93.86, + "grad_norm": 5.959348201751709, + "learning_rate": 6.135338345864662e-07, + "loss": 0.1408, + "step": 62420 + }, + { + "epoch": 93.88, + "grad_norm": 4.848119258880615, + "learning_rate": 6.1203007518797e-07, + "loss": 0.1566, + "step": 62430 + }, + { + "epoch": 93.89, + "grad_norm": 4.178713798522949, + "learning_rate": 6.105263157894738e-07, + "loss": 0.1452, + "step": 62440 + }, + { + "epoch": 93.91, + "grad_norm": 7.14044713973999, + "learning_rate": 6.090225563909775e-07, + "loss": 0.15, + "step": 62450 + }, + { + "epoch": 93.92, + "grad_norm": 6.114139556884766, + "learning_rate": 6.075187969924812e-07, + "loss": 0.1249, + "step": 62460 + }, + { + "epoch": 93.94, + "grad_norm": 2.516326904296875, + "learning_rate": 6.06015037593985e-07, + "loss": 0.112, + "step": 62470 + }, + { + "epoch": 93.95, + "grad_norm": 6.2728705406188965, + "learning_rate": 6.045112781954888e-07, + "loss": 0.1425, + "step": 62480 + }, + { + "epoch": 93.97, + "grad_norm": 5.063921928405762, + "learning_rate": 6.030075187969925e-07, + "loss": 0.1956, + "step": 62490 + }, + { + "epoch": 93.98, + "grad_norm": 1.3252296447753906, + "learning_rate": 6.015037593984962e-07, + "loss": 0.1332, + "step": 62500 + }, + { + "epoch": 94.0, + "grad_norm": 0.03227702155709267, + "learning_rate": 6.000000000000001e-07, + "loss": 0.1855, + "step": 62510 + }, + { + "epoch": 94.0, + "eval_accuracy": 0.9303, + "eval_loss": 0.33146244287490845, + "eval_runtime": 85.2753, + "eval_samples_per_second": 117.267, + "eval_steps_per_second": 0.469, + "step": 62510 + }, + { + "epoch": 94.02, + "grad_norm": 4.525468826293945, + "learning_rate": 5.984962406015038e-07, + "loss": 0.1478, + "step": 62520 + }, + { + "epoch": 94.03, + "grad_norm": 3.999218702316284, + "learning_rate": 5.969924812030075e-07, + "loss": 0.1595, + "step": 62530 + }, + { + "epoch": 94.05, + "grad_norm": 5.01107120513916, + "learning_rate": 5.954887218045114e-07, + "loss": 0.1798, + "step": 62540 + }, + { + "epoch": 94.06, + "grad_norm": 3.2505600452423096, + "learning_rate": 5.939849624060151e-07, + "loss": 0.1772, + "step": 62550 + }, + { + "epoch": 94.08, + "grad_norm": 5.565808296203613, + "learning_rate": 5.924812030075188e-07, + "loss": 0.163, + "step": 62560 + }, + { + "epoch": 94.09, + "grad_norm": 5.079768180847168, + "learning_rate": 5.909774436090226e-07, + "loss": 0.1596, + "step": 62570 + }, + { + "epoch": 94.11, + "grad_norm": 6.390072345733643, + "learning_rate": 5.894736842105263e-07, + "loss": 0.1867, + "step": 62580 + }, + { + "epoch": 94.12, + "grad_norm": 3.9409000873565674, + "learning_rate": 5.879699248120301e-07, + "loss": 0.1428, + "step": 62590 + }, + { + "epoch": 94.14, + "grad_norm": 7.87678337097168, + "learning_rate": 5.864661654135339e-07, + "loss": 0.139, + "step": 62600 + }, + { + "epoch": 94.15, + "grad_norm": 3.9159417152404785, + "learning_rate": 5.849624060150377e-07, + "loss": 0.1449, + "step": 62610 + }, + { + "epoch": 94.17, + "grad_norm": 5.255906581878662, + "learning_rate": 5.834586466165414e-07, + "loss": 0.1411, + "step": 62620 + }, + { + "epoch": 94.18, + "grad_norm": 4.5999436378479, + "learning_rate": 5.819548872180452e-07, + "loss": 0.1697, + "step": 62630 + }, + { + "epoch": 94.2, + "grad_norm": 10.575860977172852, + "learning_rate": 5.804511278195489e-07, + "loss": 0.1787, + "step": 62640 + }, + { + "epoch": 94.21, + "grad_norm": 8.932710647583008, + "learning_rate": 5.789473684210526e-07, + "loss": 0.1589, + "step": 62650 + }, + { + "epoch": 94.23, + "grad_norm": 8.139424324035645, + "learning_rate": 5.774436090225565e-07, + "loss": 0.1776, + "step": 62660 + }, + { + "epoch": 94.24, + "grad_norm": 1.6099450588226318, + "learning_rate": 5.759398496240602e-07, + "loss": 0.1939, + "step": 62670 + }, + { + "epoch": 94.26, + "grad_norm": 2.6869306564331055, + "learning_rate": 5.74436090225564e-07, + "loss": 0.1988, + "step": 62680 + }, + { + "epoch": 94.27, + "grad_norm": 5.661433219909668, + "learning_rate": 5.729323308270677e-07, + "loss": 0.2013, + "step": 62690 + }, + { + "epoch": 94.29, + "grad_norm": 6.6180315017700195, + "learning_rate": 5.714285714285715e-07, + "loss": 0.188, + "step": 62700 + }, + { + "epoch": 94.3, + "grad_norm": 4.5534563064575195, + "learning_rate": 5.699248120300752e-07, + "loss": 0.1996, + "step": 62710 + }, + { + "epoch": 94.32, + "grad_norm": 5.5353193283081055, + "learning_rate": 5.68421052631579e-07, + "loss": 0.1214, + "step": 62720 + }, + { + "epoch": 94.33, + "grad_norm": 3.558548927307129, + "learning_rate": 5.669172932330828e-07, + "loss": 0.1451, + "step": 62730 + }, + { + "epoch": 94.35, + "grad_norm": 4.777967929840088, + "learning_rate": 5.654135338345865e-07, + "loss": 0.1993, + "step": 62740 + }, + { + "epoch": 94.36, + "grad_norm": 5.252827167510986, + "learning_rate": 5.639097744360903e-07, + "loss": 0.1442, + "step": 62750 + }, + { + "epoch": 94.38, + "grad_norm": 6.862250804901123, + "learning_rate": 5.62406015037594e-07, + "loss": 0.191, + "step": 62760 + }, + { + "epoch": 94.39, + "grad_norm": 7.284124851226807, + "learning_rate": 5.609022556390978e-07, + "loss": 0.1813, + "step": 62770 + }, + { + "epoch": 94.41, + "grad_norm": 4.488624572753906, + "learning_rate": 5.593984962406016e-07, + "loss": 0.1546, + "step": 62780 + }, + { + "epoch": 94.42, + "grad_norm": 4.856323719024658, + "learning_rate": 5.578947368421053e-07, + "loss": 0.1553, + "step": 62790 + }, + { + "epoch": 94.44, + "grad_norm": 5.90994119644165, + "learning_rate": 5.563909774436091e-07, + "loss": 0.168, + "step": 62800 + }, + { + "epoch": 94.45, + "grad_norm": 4.328577041625977, + "learning_rate": 5.548872180451127e-07, + "loss": 0.1539, + "step": 62810 + }, + { + "epoch": 94.47, + "grad_norm": 2.8183932304382324, + "learning_rate": 5.533834586466166e-07, + "loss": 0.1973, + "step": 62820 + }, + { + "epoch": 94.48, + "grad_norm": 5.438328266143799, + "learning_rate": 5.518796992481203e-07, + "loss": 0.17, + "step": 62830 + }, + { + "epoch": 94.5, + "grad_norm": 4.676559925079346, + "learning_rate": 5.503759398496241e-07, + "loss": 0.131, + "step": 62840 + }, + { + "epoch": 94.51, + "grad_norm": 8.041285514831543, + "learning_rate": 5.488721804511279e-07, + "loss": 0.1931, + "step": 62850 + }, + { + "epoch": 94.53, + "grad_norm": 6.983100414276123, + "learning_rate": 5.473684210526316e-07, + "loss": 0.1621, + "step": 62860 + }, + { + "epoch": 94.54, + "grad_norm": 8.173839569091797, + "learning_rate": 5.458646616541354e-07, + "loss": 0.1366, + "step": 62870 + }, + { + "epoch": 94.56, + "grad_norm": 4.067495346069336, + "learning_rate": 5.443609022556391e-07, + "loss": 0.137, + "step": 62880 + }, + { + "epoch": 94.57, + "grad_norm": 1.778714656829834, + "learning_rate": 5.428571428571429e-07, + "loss": 0.1684, + "step": 62890 + }, + { + "epoch": 94.59, + "grad_norm": 4.966789722442627, + "learning_rate": 5.413533834586466e-07, + "loss": 0.1576, + "step": 62900 + }, + { + "epoch": 94.6, + "grad_norm": 6.58605432510376, + "learning_rate": 5.398496240601505e-07, + "loss": 0.1515, + "step": 62910 + }, + { + "epoch": 94.62, + "grad_norm": 5.37683629989624, + "learning_rate": 5.383458646616542e-07, + "loss": 0.2039, + "step": 62920 + }, + { + "epoch": 94.63, + "grad_norm": 4.260867595672607, + "learning_rate": 5.368421052631579e-07, + "loss": 0.169, + "step": 62930 + }, + { + "epoch": 94.65, + "grad_norm": 8.92151927947998, + "learning_rate": 5.353383458646617e-07, + "loss": 0.1674, + "step": 62940 + }, + { + "epoch": 94.66, + "grad_norm": 5.447167873382568, + "learning_rate": 5.338345864661654e-07, + "loss": 0.1623, + "step": 62950 + }, + { + "epoch": 94.68, + "grad_norm": 5.827322483062744, + "learning_rate": 5.323308270676692e-07, + "loss": 0.1627, + "step": 62960 + }, + { + "epoch": 94.69, + "grad_norm": 3.478543281555176, + "learning_rate": 5.30827067669173e-07, + "loss": 0.1619, + "step": 62970 + }, + { + "epoch": 94.71, + "grad_norm": 2.6951584815979004, + "learning_rate": 5.293233082706768e-07, + "loss": 0.1155, + "step": 62980 + }, + { + "epoch": 94.72, + "grad_norm": 4.290323734283447, + "learning_rate": 5.278195488721805e-07, + "loss": 0.2299, + "step": 62990 + }, + { + "epoch": 94.74, + "grad_norm": 2.3207318782806396, + "learning_rate": 5.263157894736843e-07, + "loss": 0.1377, + "step": 63000 + }, + { + "epoch": 94.75, + "grad_norm": 4.391146659851074, + "learning_rate": 5.24812030075188e-07, + "loss": 0.1383, + "step": 63010 + }, + { + "epoch": 94.77, + "grad_norm": 8.625882148742676, + "learning_rate": 5.233082706766917e-07, + "loss": 0.1567, + "step": 63020 + }, + { + "epoch": 94.78, + "grad_norm": 5.556321620941162, + "learning_rate": 5.218045112781956e-07, + "loss": 0.1483, + "step": 63030 + }, + { + "epoch": 94.8, + "grad_norm": 3.1565639972686768, + "learning_rate": 5.203007518796993e-07, + "loss": 0.1173, + "step": 63040 + }, + { + "epoch": 94.81, + "grad_norm": 1.9764903783798218, + "learning_rate": 5.18796992481203e-07, + "loss": 0.1378, + "step": 63050 + }, + { + "epoch": 94.83, + "grad_norm": 6.802098274230957, + "learning_rate": 5.172932330827068e-07, + "loss": 0.1888, + "step": 63060 + }, + { + "epoch": 94.84, + "grad_norm": 10.17620849609375, + "learning_rate": 5.157894736842106e-07, + "loss": 0.1397, + "step": 63070 + }, + { + "epoch": 94.86, + "grad_norm": 2.024324893951416, + "learning_rate": 5.142857142857143e-07, + "loss": 0.1553, + "step": 63080 + }, + { + "epoch": 94.87, + "grad_norm": 4.072267532348633, + "learning_rate": 5.127819548872181e-07, + "loss": 0.1659, + "step": 63090 + }, + { + "epoch": 94.89, + "grad_norm": 5.023469924926758, + "learning_rate": 5.112781954887219e-07, + "loss": 0.1639, + "step": 63100 + }, + { + "epoch": 94.9, + "grad_norm": 8.388982772827148, + "learning_rate": 5.097744360902256e-07, + "loss": 0.1692, + "step": 63110 + }, + { + "epoch": 94.92, + "grad_norm": 3.7290244102478027, + "learning_rate": 5.082706766917294e-07, + "loss": 0.1179, + "step": 63120 + }, + { + "epoch": 94.93, + "grad_norm": 7.133794784545898, + "learning_rate": 5.067669172932331e-07, + "loss": 0.1777, + "step": 63130 + }, + { + "epoch": 94.95, + "grad_norm": 1.3478121757507324, + "learning_rate": 5.052631578947369e-07, + "loss": 0.1594, + "step": 63140 + }, + { + "epoch": 94.96, + "grad_norm": 2.427905321121216, + "learning_rate": 5.037593984962407e-07, + "loss": 0.1275, + "step": 63150 + }, + { + "epoch": 94.98, + "grad_norm": 4.035090446472168, + "learning_rate": 5.022556390977444e-07, + "loss": 0.1687, + "step": 63160 + }, + { + "epoch": 94.99, + "grad_norm": 3.4016542434692383, + "learning_rate": 5.007518796992482e-07, + "loss": 0.1747, + "step": 63170 + }, + { + "epoch": 95.0, + "eval_accuracy": 0.9295, + "eval_loss": 0.33236411213874817, + "eval_runtime": 84.9502, + "eval_samples_per_second": 117.716, + "eval_steps_per_second": 0.471, + "step": 63175 + }, + { + "epoch": 95.01, + "grad_norm": 6.205942630767822, + "learning_rate": 4.99248120300752e-07, + "loss": 0.1474, + "step": 63180 + }, + { + "epoch": 95.02, + "grad_norm": 6.6230788230896, + "learning_rate": 4.977443609022557e-07, + "loss": 0.1549, + "step": 63190 + }, + { + "epoch": 95.04, + "grad_norm": 8.845964431762695, + "learning_rate": 4.962406015037594e-07, + "loss": 0.1962, + "step": 63200 + }, + { + "epoch": 95.05, + "grad_norm": 3.8196167945861816, + "learning_rate": 4.947368421052632e-07, + "loss": 0.1145, + "step": 63210 + }, + { + "epoch": 95.07, + "grad_norm": 4.0402607917785645, + "learning_rate": 4.93233082706767e-07, + "loss": 0.1538, + "step": 63220 + }, + { + "epoch": 95.08, + "grad_norm": 0.3398207724094391, + "learning_rate": 4.917293233082707e-07, + "loss": 0.1827, + "step": 63230 + }, + { + "epoch": 95.1, + "grad_norm": 5.126389503479004, + "learning_rate": 4.902255639097745e-07, + "loss": 0.1691, + "step": 63240 + }, + { + "epoch": 95.11, + "grad_norm": 5.045697212219238, + "learning_rate": 4.887218045112782e-07, + "loss": 0.1115, + "step": 63250 + }, + { + "epoch": 95.13, + "grad_norm": 6.75160551071167, + "learning_rate": 4.87218045112782e-07, + "loss": 0.1862, + "step": 63260 + }, + { + "epoch": 95.14, + "grad_norm": 3.8719749450683594, + "learning_rate": 4.857142857142857e-07, + "loss": 0.1878, + "step": 63270 + }, + { + "epoch": 95.16, + "grad_norm": 6.269413471221924, + "learning_rate": 4.842105263157895e-07, + "loss": 0.1563, + "step": 63280 + }, + { + "epoch": 95.17, + "grad_norm": 4.618571758270264, + "learning_rate": 4.827067669172933e-07, + "loss": 0.1733, + "step": 63290 + }, + { + "epoch": 95.19, + "grad_norm": 4.135458469390869, + "learning_rate": 4.81203007518797e-07, + "loss": 0.2016, + "step": 63300 + }, + { + "epoch": 95.2, + "grad_norm": 7.51487398147583, + "learning_rate": 4.796992481203008e-07, + "loss": 0.1643, + "step": 63310 + }, + { + "epoch": 95.22, + "grad_norm": 4.156686305999756, + "learning_rate": 4.781954887218045e-07, + "loss": 0.1999, + "step": 63320 + }, + { + "epoch": 95.23, + "grad_norm": 8.415448188781738, + "learning_rate": 4.766917293233083e-07, + "loss": 0.2051, + "step": 63330 + }, + { + "epoch": 95.25, + "grad_norm": 3.9293386936187744, + "learning_rate": 4.7518796992481207e-07, + "loss": 0.1811, + "step": 63340 + }, + { + "epoch": 95.26, + "grad_norm": 5.730102062225342, + "learning_rate": 4.7368421052631585e-07, + "loss": 0.1573, + "step": 63350 + }, + { + "epoch": 95.28, + "grad_norm": 5.62534236907959, + "learning_rate": 4.7218045112781963e-07, + "loss": 0.1802, + "step": 63360 + }, + { + "epoch": 95.29, + "grad_norm": 5.367505073547363, + "learning_rate": 4.706766917293233e-07, + "loss": 0.1676, + "step": 63370 + }, + { + "epoch": 95.31, + "grad_norm": 5.48174524307251, + "learning_rate": 4.691729323308271e-07, + "loss": 0.1394, + "step": 63380 + }, + { + "epoch": 95.32, + "grad_norm": 5.894782543182373, + "learning_rate": 4.676691729323309e-07, + "loss": 0.1703, + "step": 63390 + }, + { + "epoch": 95.34, + "grad_norm": 3.4429852962493896, + "learning_rate": 4.661654135338346e-07, + "loss": 0.1782, + "step": 63400 + }, + { + "epoch": 95.35, + "grad_norm": 5.609094142913818, + "learning_rate": 4.646616541353384e-07, + "loss": 0.2212, + "step": 63410 + }, + { + "epoch": 95.37, + "grad_norm": 6.826117992401123, + "learning_rate": 4.631578947368422e-07, + "loss": 0.1549, + "step": 63420 + }, + { + "epoch": 95.38, + "grad_norm": 6.199655532836914, + "learning_rate": 4.6165413533834585e-07, + "loss": 0.2035, + "step": 63430 + }, + { + "epoch": 95.4, + "grad_norm": 3.159210443496704, + "learning_rate": 4.6015037593984964e-07, + "loss": 0.168, + "step": 63440 + }, + { + "epoch": 95.41, + "grad_norm": 4.340883255004883, + "learning_rate": 4.586466165413534e-07, + "loss": 0.1506, + "step": 63450 + }, + { + "epoch": 95.43, + "grad_norm": 6.630001068115234, + "learning_rate": 4.571428571428572e-07, + "loss": 0.1342, + "step": 63460 + }, + { + "epoch": 95.44, + "grad_norm": 6.111904144287109, + "learning_rate": 4.5563909774436094e-07, + "loss": 0.1454, + "step": 63470 + }, + { + "epoch": 95.46, + "grad_norm": 8.334033012390137, + "learning_rate": 4.541353383458647e-07, + "loss": 0.1583, + "step": 63480 + }, + { + "epoch": 95.47, + "grad_norm": 3.537055730819702, + "learning_rate": 4.526315789473685e-07, + "loss": 0.1506, + "step": 63490 + }, + { + "epoch": 95.49, + "grad_norm": 4.5531792640686035, + "learning_rate": 4.511278195488722e-07, + "loss": 0.1923, + "step": 63500 + }, + { + "epoch": 95.5, + "grad_norm": 4.920889854431152, + "learning_rate": 4.4962406015037597e-07, + "loss": 0.1556, + "step": 63510 + }, + { + "epoch": 95.52, + "grad_norm": 4.547194004058838, + "learning_rate": 4.4812030075187975e-07, + "loss": 0.1974, + "step": 63520 + }, + { + "epoch": 95.53, + "grad_norm": 3.0299079418182373, + "learning_rate": 4.466165413533835e-07, + "loss": 0.1801, + "step": 63530 + }, + { + "epoch": 95.55, + "grad_norm": 5.605795383453369, + "learning_rate": 4.4511278195488726e-07, + "loss": 0.1776, + "step": 63540 + }, + { + "epoch": 95.56, + "grad_norm": 8.868003845214844, + "learning_rate": 4.4360902255639105e-07, + "loss": 0.1175, + "step": 63550 + }, + { + "epoch": 95.58, + "grad_norm": 5.188925266265869, + "learning_rate": 4.421052631578947e-07, + "loss": 0.1546, + "step": 63560 + }, + { + "epoch": 95.59, + "grad_norm": 3.829498529434204, + "learning_rate": 4.406015037593985e-07, + "loss": 0.2129, + "step": 63570 + }, + { + "epoch": 95.61, + "grad_norm": 7.33263635635376, + "learning_rate": 4.390977443609023e-07, + "loss": 0.2169, + "step": 63580 + }, + { + "epoch": 95.62, + "grad_norm": 6.37952995300293, + "learning_rate": 4.375939849624061e-07, + "loss": 0.1823, + "step": 63590 + }, + { + "epoch": 95.64, + "grad_norm": 4.5436201095581055, + "learning_rate": 4.360902255639098e-07, + "loss": 0.1293, + "step": 63600 + }, + { + "epoch": 95.65, + "grad_norm": 4.836911201477051, + "learning_rate": 4.345864661654136e-07, + "loss": 0.1519, + "step": 63610 + }, + { + "epoch": 95.67, + "grad_norm": 3.8880579471588135, + "learning_rate": 4.330827067669173e-07, + "loss": 0.1459, + "step": 63620 + }, + { + "epoch": 95.68, + "grad_norm": 5.021939277648926, + "learning_rate": 4.3157894736842105e-07, + "loss": 0.2607, + "step": 63630 + }, + { + "epoch": 95.7, + "grad_norm": 5.220824241638184, + "learning_rate": 4.3007518796992484e-07, + "loss": 0.2041, + "step": 63640 + }, + { + "epoch": 95.71, + "grad_norm": 6.0544514656066895, + "learning_rate": 4.285714285714286e-07, + "loss": 0.1999, + "step": 63650 + }, + { + "epoch": 95.73, + "grad_norm": 3.8796777725219727, + "learning_rate": 4.270676691729324e-07, + "loss": 0.1769, + "step": 63660 + }, + { + "epoch": 95.74, + "grad_norm": 3.2138679027557373, + "learning_rate": 4.2556390977443613e-07, + "loss": 0.1565, + "step": 63670 + }, + { + "epoch": 95.76, + "grad_norm": 6.524970054626465, + "learning_rate": 4.2406015037593987e-07, + "loss": 0.1141, + "step": 63680 + }, + { + "epoch": 95.77, + "grad_norm": 5.798688888549805, + "learning_rate": 4.2255639097744365e-07, + "loss": 0.182, + "step": 63690 + }, + { + "epoch": 95.79, + "grad_norm": 4.808920383453369, + "learning_rate": 4.210526315789474e-07, + "loss": 0.1496, + "step": 63700 + }, + { + "epoch": 95.8, + "grad_norm": 2.4658915996551514, + "learning_rate": 4.1954887218045116e-07, + "loss": 0.1157, + "step": 63710 + }, + { + "epoch": 95.82, + "grad_norm": 3.5132291316986084, + "learning_rate": 4.1804511278195495e-07, + "loss": 0.1528, + "step": 63720 + }, + { + "epoch": 95.83, + "grad_norm": 8.57204818725586, + "learning_rate": 4.165413533834587e-07, + "loss": 0.2021, + "step": 63730 + }, + { + "epoch": 95.85, + "grad_norm": 6.247355937957764, + "learning_rate": 4.150375939849624e-07, + "loss": 0.1266, + "step": 63740 + }, + { + "epoch": 95.86, + "grad_norm": 6.308277606964111, + "learning_rate": 4.135338345864662e-07, + "loss": 0.1494, + "step": 63750 + }, + { + "epoch": 95.88, + "grad_norm": 4.962404251098633, + "learning_rate": 4.120300751879699e-07, + "loss": 0.1847, + "step": 63760 + }, + { + "epoch": 95.89, + "grad_norm": 3.808089017868042, + "learning_rate": 4.105263157894737e-07, + "loss": 0.1727, + "step": 63770 + }, + { + "epoch": 95.91, + "grad_norm": 4.107529640197754, + "learning_rate": 4.090225563909775e-07, + "loss": 0.1562, + "step": 63780 + }, + { + "epoch": 95.92, + "grad_norm": 6.463099956512451, + "learning_rate": 4.075187969924813e-07, + "loss": 0.145, + "step": 63790 + }, + { + "epoch": 95.94, + "grad_norm": 9.739261627197266, + "learning_rate": 4.06015037593985e-07, + "loss": 0.1749, + "step": 63800 + }, + { + "epoch": 95.95, + "grad_norm": 4.032473087310791, + "learning_rate": 4.0451127819548874e-07, + "loss": 0.1542, + "step": 63810 + }, + { + "epoch": 95.97, + "grad_norm": 4.938335418701172, + "learning_rate": 4.030075187969925e-07, + "loss": 0.1901, + "step": 63820 + }, + { + "epoch": 95.98, + "grad_norm": 5.375936985015869, + "learning_rate": 4.0150375939849625e-07, + "loss": 0.2043, + "step": 63830 + }, + { + "epoch": 96.0, + "grad_norm": 15.463907241821289, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.1783, + "step": 63840 + }, + { + "epoch": 96.0, + "eval_accuracy": 0.9315, + "eval_loss": 0.3313089907169342, + "eval_runtime": 85.2264, + "eval_samples_per_second": 117.334, + "eval_steps_per_second": 0.469, + "step": 63840 + }, + { + "epoch": 96.02, + "grad_norm": 3.8098232746124268, + "learning_rate": 3.984962406015038e-07, + "loss": 0.177, + "step": 63850 + }, + { + "epoch": 96.03, + "grad_norm": 2.248171806335449, + "learning_rate": 3.969924812030076e-07, + "loss": 0.1043, + "step": 63860 + }, + { + "epoch": 96.05, + "grad_norm": 5.304133415222168, + "learning_rate": 3.954887218045113e-07, + "loss": 0.1221, + "step": 63870 + }, + { + "epoch": 96.06, + "grad_norm": 9.222192764282227, + "learning_rate": 3.9398496240601506e-07, + "loss": 0.1829, + "step": 63880 + }, + { + "epoch": 96.08, + "grad_norm": 5.404617786407471, + "learning_rate": 3.9248120300751885e-07, + "loss": 0.1773, + "step": 63890 + }, + { + "epoch": 96.09, + "grad_norm": 4.069046974182129, + "learning_rate": 3.909774436090226e-07, + "loss": 0.1894, + "step": 63900 + }, + { + "epoch": 96.11, + "grad_norm": 5.911056995391846, + "learning_rate": 3.8947368421052636e-07, + "loss": 0.1733, + "step": 63910 + }, + { + "epoch": 96.12, + "grad_norm": 5.389181137084961, + "learning_rate": 3.8796992481203015e-07, + "loss": 0.1594, + "step": 63920 + }, + { + "epoch": 96.14, + "grad_norm": 5.494384288787842, + "learning_rate": 3.864661654135338e-07, + "loss": 0.1526, + "step": 63930 + }, + { + "epoch": 96.15, + "grad_norm": 3.113043785095215, + "learning_rate": 3.849624060150376e-07, + "loss": 0.1355, + "step": 63940 + }, + { + "epoch": 96.17, + "grad_norm": 6.085422039031982, + "learning_rate": 3.834586466165414e-07, + "loss": 0.1163, + "step": 63950 + }, + { + "epoch": 96.18, + "grad_norm": 4.339461326599121, + "learning_rate": 3.819548872180451e-07, + "loss": 0.1856, + "step": 63960 + }, + { + "epoch": 96.2, + "grad_norm": 5.448220729827881, + "learning_rate": 3.804511278195489e-07, + "loss": 0.1677, + "step": 63970 + }, + { + "epoch": 96.21, + "grad_norm": 8.586206436157227, + "learning_rate": 3.789473684210527e-07, + "loss": 0.1541, + "step": 63980 + }, + { + "epoch": 96.23, + "grad_norm": 7.591394424438477, + "learning_rate": 3.7744360902255637e-07, + "loss": 0.1415, + "step": 63990 + }, + { + "epoch": 96.24, + "grad_norm": 6.6830153465271, + "learning_rate": 3.7593984962406015e-07, + "loss": 0.1787, + "step": 64000 + }, + { + "epoch": 96.26, + "grad_norm": 4.420645713806152, + "learning_rate": 3.7443609022556394e-07, + "loss": 0.1118, + "step": 64010 + }, + { + "epoch": 96.27, + "grad_norm": 6.308743000030518, + "learning_rate": 3.729323308270677e-07, + "loss": 0.1742, + "step": 64020 + }, + { + "epoch": 96.29, + "grad_norm": 5.491370677947998, + "learning_rate": 3.7142857142857145e-07, + "loss": 0.1964, + "step": 64030 + }, + { + "epoch": 96.3, + "grad_norm": 3.9227075576782227, + "learning_rate": 3.6992481203007523e-07, + "loss": 0.1793, + "step": 64040 + }, + { + "epoch": 96.32, + "grad_norm": 2.4654548168182373, + "learning_rate": 3.6842105263157896e-07, + "loss": 0.1948, + "step": 64050 + }, + { + "epoch": 96.33, + "grad_norm": 4.7875590324401855, + "learning_rate": 3.669172932330827e-07, + "loss": 0.1703, + "step": 64060 + }, + { + "epoch": 96.35, + "grad_norm": 5.792588233947754, + "learning_rate": 3.654135338345865e-07, + "loss": 0.2115, + "step": 64070 + }, + { + "epoch": 96.36, + "grad_norm": 3.1025025844573975, + "learning_rate": 3.6390977443609026e-07, + "loss": 0.1557, + "step": 64080 + }, + { + "epoch": 96.38, + "grad_norm": 7.800917625427246, + "learning_rate": 3.6240601503759405e-07, + "loss": 0.1808, + "step": 64090 + }, + { + "epoch": 96.39, + "grad_norm": 5.239587306976318, + "learning_rate": 3.609022556390978e-07, + "loss": 0.1319, + "step": 64100 + }, + { + "epoch": 96.41, + "grad_norm": 4.564650058746338, + "learning_rate": 3.5939849624060156e-07, + "loss": 0.1893, + "step": 64110 + }, + { + "epoch": 96.42, + "grad_norm": 5.047513484954834, + "learning_rate": 3.578947368421053e-07, + "loss": 0.1427, + "step": 64120 + }, + { + "epoch": 96.44, + "grad_norm": 5.4176249504089355, + "learning_rate": 3.56390977443609e-07, + "loss": 0.1932, + "step": 64130 + }, + { + "epoch": 96.45, + "grad_norm": 5.930634021759033, + "learning_rate": 3.548872180451128e-07, + "loss": 0.1444, + "step": 64140 + }, + { + "epoch": 96.47, + "grad_norm": 4.005468368530273, + "learning_rate": 3.533834586466166e-07, + "loss": 0.163, + "step": 64150 + }, + { + "epoch": 96.48, + "grad_norm": 6.028830528259277, + "learning_rate": 3.518796992481204e-07, + "loss": 0.1771, + "step": 64160 + }, + { + "epoch": 96.5, + "grad_norm": 2.6578989028930664, + "learning_rate": 3.503759398496241e-07, + "loss": 0.1031, + "step": 64170 + }, + { + "epoch": 96.51, + "grad_norm": 5.434506416320801, + "learning_rate": 3.4887218045112784e-07, + "loss": 0.1739, + "step": 64180 + }, + { + "epoch": 96.53, + "grad_norm": 5.756526470184326, + "learning_rate": 3.4736842105263157e-07, + "loss": 0.1191, + "step": 64190 + }, + { + "epoch": 96.54, + "grad_norm": 4.299066543579102, + "learning_rate": 3.4586466165413535e-07, + "loss": 0.1522, + "step": 64200 + }, + { + "epoch": 96.56, + "grad_norm": 4.130573272705078, + "learning_rate": 3.4436090225563913e-07, + "loss": 0.1381, + "step": 64210 + }, + { + "epoch": 96.57, + "grad_norm": 5.61849308013916, + "learning_rate": 3.428571428571429e-07, + "loss": 0.1467, + "step": 64220 + }, + { + "epoch": 96.59, + "grad_norm": 4.367659568786621, + "learning_rate": 3.4135338345864665e-07, + "loss": 0.1557, + "step": 64230 + }, + { + "epoch": 96.6, + "grad_norm": 2.8992245197296143, + "learning_rate": 3.398496240601504e-07, + "loss": 0.1537, + "step": 64240 + }, + { + "epoch": 96.62, + "grad_norm": 3.538975715637207, + "learning_rate": 3.3834586466165416e-07, + "loss": 0.1396, + "step": 64250 + }, + { + "epoch": 96.63, + "grad_norm": 8.194727897644043, + "learning_rate": 3.368421052631579e-07, + "loss": 0.1698, + "step": 64260 + }, + { + "epoch": 96.65, + "grad_norm": 10.290156364440918, + "learning_rate": 3.353383458646617e-07, + "loss": 0.1561, + "step": 64270 + }, + { + "epoch": 96.66, + "grad_norm": 5.7417073249816895, + "learning_rate": 3.3383458646616546e-07, + "loss": 0.1215, + "step": 64280 + }, + { + "epoch": 96.68, + "grad_norm": 6.722829818725586, + "learning_rate": 3.3233082706766924e-07, + "loss": 0.2179, + "step": 64290 + }, + { + "epoch": 96.69, + "grad_norm": 8.941481590270996, + "learning_rate": 3.308270676691729e-07, + "loss": 0.1763, + "step": 64300 + }, + { + "epoch": 96.71, + "grad_norm": 4.601995468139648, + "learning_rate": 3.293233082706767e-07, + "loss": 0.1611, + "step": 64310 + }, + { + "epoch": 96.72, + "grad_norm": 7.5421671867370605, + "learning_rate": 3.278195488721805e-07, + "loss": 0.1459, + "step": 64320 + }, + { + "epoch": 96.74, + "grad_norm": 2.4349803924560547, + "learning_rate": 3.263157894736842e-07, + "loss": 0.1892, + "step": 64330 + }, + { + "epoch": 96.75, + "grad_norm": 1.8686342239379883, + "learning_rate": 3.24812030075188e-07, + "loss": 0.1407, + "step": 64340 + }, + { + "epoch": 96.77, + "grad_norm": 5.259500503540039, + "learning_rate": 3.233082706766918e-07, + "loss": 0.1057, + "step": 64350 + }, + { + "epoch": 96.78, + "grad_norm": 4.295063018798828, + "learning_rate": 3.2180451127819547e-07, + "loss": 0.1865, + "step": 64360 + }, + { + "epoch": 96.8, + "grad_norm": 4.434532642364502, + "learning_rate": 3.2030075187969925e-07, + "loss": 0.1993, + "step": 64370 + }, + { + "epoch": 96.81, + "grad_norm": 5.843010902404785, + "learning_rate": 3.1879699248120303e-07, + "loss": 0.166, + "step": 64380 + }, + { + "epoch": 96.83, + "grad_norm": 4.545680046081543, + "learning_rate": 3.172932330827068e-07, + "loss": 0.135, + "step": 64390 + }, + { + "epoch": 96.84, + "grad_norm": 6.0127739906311035, + "learning_rate": 3.1578947368421055e-07, + "loss": 0.201, + "step": 64400 + }, + { + "epoch": 96.86, + "grad_norm": 6.95708703994751, + "learning_rate": 3.1428571428571433e-07, + "loss": 0.1584, + "step": 64410 + }, + { + "epoch": 96.87, + "grad_norm": 0.3541220724582672, + "learning_rate": 3.127819548872181e-07, + "loss": 0.143, + "step": 64420 + }, + { + "epoch": 96.89, + "grad_norm": 4.712253570556641, + "learning_rate": 3.1127819548872185e-07, + "loss": 0.1417, + "step": 64430 + }, + { + "epoch": 96.9, + "grad_norm": 5.915809154510498, + "learning_rate": 3.097744360902256e-07, + "loss": 0.135, + "step": 64440 + }, + { + "epoch": 96.92, + "grad_norm": 7.92987060546875, + "learning_rate": 3.0827067669172936e-07, + "loss": 0.1586, + "step": 64450 + }, + { + "epoch": 96.93, + "grad_norm": 3.605088472366333, + "learning_rate": 3.067669172932331e-07, + "loss": 0.1384, + "step": 64460 + }, + { + "epoch": 96.95, + "grad_norm": 5.079803943634033, + "learning_rate": 3.052631578947369e-07, + "loss": 0.177, + "step": 64470 + }, + { + "epoch": 96.96, + "grad_norm": 5.349300384521484, + "learning_rate": 3.037593984962406e-07, + "loss": 0.2294, + "step": 64480 + }, + { + "epoch": 96.98, + "grad_norm": 9.588523864746094, + "learning_rate": 3.022556390977444e-07, + "loss": 0.1532, + "step": 64490 + }, + { + "epoch": 96.99, + "grad_norm": 3.1783509254455566, + "learning_rate": 3.007518796992481e-07, + "loss": 0.1256, + "step": 64500 + }, + { + "epoch": 97.0, + "eval_accuracy": 0.9308, + "eval_loss": 0.332674503326416, + "eval_runtime": 84.9569, + "eval_samples_per_second": 117.707, + "eval_steps_per_second": 0.471, + "step": 64505 + }, + { + "epoch": 97.01, + "grad_norm": 7.344843864440918, + "learning_rate": 2.992481203007519e-07, + "loss": 0.1059, + "step": 64510 + }, + { + "epoch": 97.02, + "grad_norm": 10.291905403137207, + "learning_rate": 2.977443609022557e-07, + "loss": 0.1662, + "step": 64520 + }, + { + "epoch": 97.04, + "grad_norm": 8.140653610229492, + "learning_rate": 2.962406015037594e-07, + "loss": 0.2008, + "step": 64530 + }, + { + "epoch": 97.05, + "grad_norm": 3.3860104084014893, + "learning_rate": 2.9473684210526315e-07, + "loss": 0.1893, + "step": 64540 + }, + { + "epoch": 97.07, + "grad_norm": 7.287869930267334, + "learning_rate": 2.9323308270676693e-07, + "loss": 0.1382, + "step": 64550 + }, + { + "epoch": 97.08, + "grad_norm": 8.065459251403809, + "learning_rate": 2.917293233082707e-07, + "loss": 0.1349, + "step": 64560 + }, + { + "epoch": 97.1, + "grad_norm": 5.940702438354492, + "learning_rate": 2.9022556390977445e-07, + "loss": 0.1896, + "step": 64570 + }, + { + "epoch": 97.11, + "grad_norm": 8.597441673278809, + "learning_rate": 2.8872180451127823e-07, + "loss": 0.1467, + "step": 64580 + }, + { + "epoch": 97.13, + "grad_norm": 4.228404998779297, + "learning_rate": 2.87218045112782e-07, + "loss": 0.1584, + "step": 64590 + }, + { + "epoch": 97.14, + "grad_norm": 8.518570899963379, + "learning_rate": 2.8571428571428575e-07, + "loss": 0.1133, + "step": 64600 + }, + { + "epoch": 97.16, + "grad_norm": 5.979735374450684, + "learning_rate": 2.842105263157895e-07, + "loss": 0.1639, + "step": 64610 + }, + { + "epoch": 97.17, + "grad_norm": 4.625314235687256, + "learning_rate": 2.8270676691729326e-07, + "loss": 0.1755, + "step": 64620 + }, + { + "epoch": 97.19, + "grad_norm": 2.0218687057495117, + "learning_rate": 2.81203007518797e-07, + "loss": 0.1521, + "step": 64630 + }, + { + "epoch": 97.2, + "grad_norm": 6.187188148498535, + "learning_rate": 2.796992481203008e-07, + "loss": 0.162, + "step": 64640 + }, + { + "epoch": 97.22, + "grad_norm": 6.775570869445801, + "learning_rate": 2.7819548872180456e-07, + "loss": 0.1703, + "step": 64650 + }, + { + "epoch": 97.23, + "grad_norm": 1.6887646913528442, + "learning_rate": 2.766917293233083e-07, + "loss": 0.2072, + "step": 64660 + }, + { + "epoch": 97.25, + "grad_norm": 5.309747695922852, + "learning_rate": 2.751879699248121e-07, + "loss": 0.1646, + "step": 64670 + }, + { + "epoch": 97.26, + "grad_norm": 5.663767337799072, + "learning_rate": 2.736842105263158e-07, + "loss": 0.166, + "step": 64680 + }, + { + "epoch": 97.28, + "grad_norm": 5.132472991943359, + "learning_rate": 2.7218045112781954e-07, + "loss": 0.1857, + "step": 64690 + }, + { + "epoch": 97.29, + "grad_norm": 5.090039253234863, + "learning_rate": 2.706766917293233e-07, + "loss": 0.195, + "step": 64700 + }, + { + "epoch": 97.31, + "grad_norm": 9.945984840393066, + "learning_rate": 2.691729323308271e-07, + "loss": 0.1869, + "step": 64710 + }, + { + "epoch": 97.32, + "grad_norm": 9.467114448547363, + "learning_rate": 2.6766917293233083e-07, + "loss": 0.1387, + "step": 64720 + }, + { + "epoch": 97.34, + "grad_norm": 2.083440065383911, + "learning_rate": 2.661654135338346e-07, + "loss": 0.1623, + "step": 64730 + }, + { + "epoch": 97.35, + "grad_norm": 3.3765485286712646, + "learning_rate": 2.646616541353384e-07, + "loss": 0.1059, + "step": 64740 + }, + { + "epoch": 97.37, + "grad_norm": 2.4343514442443848, + "learning_rate": 2.6315789473684213e-07, + "loss": 0.193, + "step": 64750 + }, + { + "epoch": 97.38, + "grad_norm": 7.881185531616211, + "learning_rate": 2.6165413533834586e-07, + "loss": 0.162, + "step": 64760 + }, + { + "epoch": 97.4, + "grad_norm": 3.8608689308166504, + "learning_rate": 2.6015037593984965e-07, + "loss": 0.1648, + "step": 64770 + }, + { + "epoch": 97.41, + "grad_norm": 3.745732069015503, + "learning_rate": 2.586466165413534e-07, + "loss": 0.1706, + "step": 64780 + }, + { + "epoch": 97.43, + "grad_norm": 1.355181336402893, + "learning_rate": 2.5714285714285716e-07, + "loss": 0.1558, + "step": 64790 + }, + { + "epoch": 97.44, + "grad_norm": 3.218841075897217, + "learning_rate": 2.5563909774436095e-07, + "loss": 0.1518, + "step": 64800 + }, + { + "epoch": 97.46, + "grad_norm": 1.6268922090530396, + "learning_rate": 2.541353383458647e-07, + "loss": 0.1573, + "step": 64810 + }, + { + "epoch": 97.47, + "grad_norm": 2.534693479537964, + "learning_rate": 2.5263157894736846e-07, + "loss": 0.1684, + "step": 64820 + }, + { + "epoch": 97.49, + "grad_norm": 5.223088264465332, + "learning_rate": 2.511278195488722e-07, + "loss": 0.1612, + "step": 64830 + }, + { + "epoch": 97.5, + "grad_norm": 4.601364612579346, + "learning_rate": 2.49624060150376e-07, + "loss": 0.1334, + "step": 64840 + }, + { + "epoch": 97.52, + "grad_norm": 3.1391961574554443, + "learning_rate": 2.481203007518797e-07, + "loss": 0.1509, + "step": 64850 + }, + { + "epoch": 97.53, + "grad_norm": 5.453991889953613, + "learning_rate": 2.466165413533835e-07, + "loss": 0.1266, + "step": 64860 + }, + { + "epoch": 97.55, + "grad_norm": 4.62083625793457, + "learning_rate": 2.4511278195488727e-07, + "loss": 0.158, + "step": 64870 + }, + { + "epoch": 97.56, + "grad_norm": 1.933967113494873, + "learning_rate": 2.43609022556391e-07, + "loss": 0.0955, + "step": 64880 + }, + { + "epoch": 97.58, + "grad_norm": 5.738483428955078, + "learning_rate": 2.4210526315789473e-07, + "loss": 0.1555, + "step": 64890 + }, + { + "epoch": 97.59, + "grad_norm": 4.406610012054443, + "learning_rate": 2.406015037593985e-07, + "loss": 0.1667, + "step": 64900 + }, + { + "epoch": 97.61, + "grad_norm": 4.557981491088867, + "learning_rate": 2.3909774436090225e-07, + "loss": 0.1796, + "step": 64910 + }, + { + "epoch": 97.62, + "grad_norm": 4.654937744140625, + "learning_rate": 2.3759398496240603e-07, + "loss": 0.1713, + "step": 64920 + }, + { + "epoch": 97.64, + "grad_norm": 5.671332359313965, + "learning_rate": 2.3609022556390982e-07, + "loss": 0.1292, + "step": 64930 + }, + { + "epoch": 97.65, + "grad_norm": 5.643190860748291, + "learning_rate": 2.3458646616541355e-07, + "loss": 0.1565, + "step": 64940 + }, + { + "epoch": 97.67, + "grad_norm": 2.340914487838745, + "learning_rate": 2.330827067669173e-07, + "loss": 0.128, + "step": 64950 + }, + { + "epoch": 97.68, + "grad_norm": 7.151415824890137, + "learning_rate": 2.315789473684211e-07, + "loss": 0.1618, + "step": 64960 + }, + { + "epoch": 97.7, + "grad_norm": 7.448176860809326, + "learning_rate": 2.3007518796992482e-07, + "loss": 0.1396, + "step": 64970 + }, + { + "epoch": 97.71, + "grad_norm": 5.021811008453369, + "learning_rate": 2.285714285714286e-07, + "loss": 0.1772, + "step": 64980 + }, + { + "epoch": 97.73, + "grad_norm": 0.7833675146102905, + "learning_rate": 2.2706766917293236e-07, + "loss": 0.1125, + "step": 64990 + }, + { + "epoch": 97.74, + "grad_norm": 7.163773536682129, + "learning_rate": 2.255639097744361e-07, + "loss": 0.2541, + "step": 65000 + }, + { + "epoch": 97.76, + "grad_norm": 7.621840000152588, + "learning_rate": 2.2406015037593987e-07, + "loss": 0.1367, + "step": 65010 + }, + { + "epoch": 97.77, + "grad_norm": 3.7870657444000244, + "learning_rate": 2.2255639097744363e-07, + "loss": 0.1336, + "step": 65020 + }, + { + "epoch": 97.79, + "grad_norm": 4.785586357116699, + "learning_rate": 2.2105263157894736e-07, + "loss": 0.1779, + "step": 65030 + }, + { + "epoch": 97.8, + "grad_norm": 2.2367138862609863, + "learning_rate": 2.1954887218045115e-07, + "loss": 0.1886, + "step": 65040 + }, + { + "epoch": 97.82, + "grad_norm": 4.493763446807861, + "learning_rate": 2.180451127819549e-07, + "loss": 0.1943, + "step": 65050 + }, + { + "epoch": 97.83, + "grad_norm": 6.870443344116211, + "learning_rate": 2.1654135338345866e-07, + "loss": 0.0937, + "step": 65060 + }, + { + "epoch": 97.85, + "grad_norm": 6.161388874053955, + "learning_rate": 2.1503759398496242e-07, + "loss": 0.192, + "step": 65070 + }, + { + "epoch": 97.86, + "grad_norm": 5.1376729011535645, + "learning_rate": 2.135338345864662e-07, + "loss": 0.1587, + "step": 65080 + }, + { + "epoch": 97.88, + "grad_norm": 3.9065380096435547, + "learning_rate": 2.1203007518796993e-07, + "loss": 0.1104, + "step": 65090 + }, + { + "epoch": 97.89, + "grad_norm": 5.1056227684021, + "learning_rate": 2.105263157894737e-07, + "loss": 0.157, + "step": 65100 + }, + { + "epoch": 97.91, + "grad_norm": 3.8293356895446777, + "learning_rate": 2.0902255639097747e-07, + "loss": 0.17, + "step": 65110 + }, + { + "epoch": 97.92, + "grad_norm": 4.883228302001953, + "learning_rate": 2.075187969924812e-07, + "loss": 0.1989, + "step": 65120 + }, + { + "epoch": 97.94, + "grad_norm": 6.1433210372924805, + "learning_rate": 2.0601503759398496e-07, + "loss": 0.2161, + "step": 65130 + }, + { + "epoch": 97.95, + "grad_norm": 3.8383498191833496, + "learning_rate": 2.0451127819548875e-07, + "loss": 0.2358, + "step": 65140 + }, + { + "epoch": 97.97, + "grad_norm": 3.27778959274292, + "learning_rate": 2.030075187969925e-07, + "loss": 0.1498, + "step": 65150 + }, + { + "epoch": 97.98, + "grad_norm": 5.0947265625, + "learning_rate": 2.0150375939849626e-07, + "loss": 0.1577, + "step": 65160 + }, + { + "epoch": 98.0, + "grad_norm": 0.12277551740407944, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.0984, + "step": 65170 + }, + { + "epoch": 98.0, + "eval_accuracy": 0.9317, + "eval_loss": 0.32914501428604126, + "eval_runtime": 84.4819, + "eval_samples_per_second": 118.369, + "eval_steps_per_second": 0.473, + "step": 65170 + }, + { + "epoch": 98.02, + "grad_norm": 8.998836517333984, + "learning_rate": 1.984962406015038e-07, + "loss": 0.1479, + "step": 65180 + }, + { + "epoch": 98.03, + "grad_norm": 4.802471160888672, + "learning_rate": 1.9699248120300753e-07, + "loss": 0.1792, + "step": 65190 + }, + { + "epoch": 98.05, + "grad_norm": 6.974554061889648, + "learning_rate": 1.954887218045113e-07, + "loss": 0.1599, + "step": 65200 + }, + { + "epoch": 98.06, + "grad_norm": 5.7014641761779785, + "learning_rate": 1.9398496240601507e-07, + "loss": 0.1527, + "step": 65210 + }, + { + "epoch": 98.08, + "grad_norm": 7.29013729095459, + "learning_rate": 1.924812030075188e-07, + "loss": 0.1443, + "step": 65220 + }, + { + "epoch": 98.09, + "grad_norm": 3.837073802947998, + "learning_rate": 1.9097744360902256e-07, + "loss": 0.1044, + "step": 65230 + }, + { + "epoch": 98.11, + "grad_norm": 9.046390533447266, + "learning_rate": 1.8947368421052634e-07, + "loss": 0.1598, + "step": 65240 + }, + { + "epoch": 98.12, + "grad_norm": 7.997859954833984, + "learning_rate": 1.8796992481203008e-07, + "loss": 0.2044, + "step": 65250 + }, + { + "epoch": 98.14, + "grad_norm": 5.434107780456543, + "learning_rate": 1.8646616541353386e-07, + "loss": 0.1655, + "step": 65260 + }, + { + "epoch": 98.15, + "grad_norm": 4.262781620025635, + "learning_rate": 1.8496240601503762e-07, + "loss": 0.1393, + "step": 65270 + }, + { + "epoch": 98.17, + "grad_norm": 6.696924209594727, + "learning_rate": 1.8345864661654135e-07, + "loss": 0.1341, + "step": 65280 + }, + { + "epoch": 98.18, + "grad_norm": 6.295987129211426, + "learning_rate": 1.8195488721804513e-07, + "loss": 0.1655, + "step": 65290 + }, + { + "epoch": 98.2, + "grad_norm": 2.8167364597320557, + "learning_rate": 1.804511278195489e-07, + "loss": 0.1448, + "step": 65300 + }, + { + "epoch": 98.21, + "grad_norm": 2.0722157955169678, + "learning_rate": 1.7894736842105265e-07, + "loss": 0.2011, + "step": 65310 + }, + { + "epoch": 98.23, + "grad_norm": 4.755069255828857, + "learning_rate": 1.774436090225564e-07, + "loss": 0.1573, + "step": 65320 + }, + { + "epoch": 98.24, + "grad_norm": 11.782632827758789, + "learning_rate": 1.759398496240602e-07, + "loss": 0.1736, + "step": 65330 + }, + { + "epoch": 98.26, + "grad_norm": 4.000039577484131, + "learning_rate": 1.7443609022556392e-07, + "loss": 0.1365, + "step": 65340 + }, + { + "epoch": 98.27, + "grad_norm": 4.4005656242370605, + "learning_rate": 1.7293233082706767e-07, + "loss": 0.1795, + "step": 65350 + }, + { + "epoch": 98.29, + "grad_norm": 7.827691555023193, + "learning_rate": 1.7142857142857146e-07, + "loss": 0.1507, + "step": 65360 + }, + { + "epoch": 98.3, + "grad_norm": 3.5540237426757812, + "learning_rate": 1.699248120300752e-07, + "loss": 0.1653, + "step": 65370 + }, + { + "epoch": 98.32, + "grad_norm": 4.963322639465332, + "learning_rate": 1.6842105263157895e-07, + "loss": 0.1742, + "step": 65380 + }, + { + "epoch": 98.33, + "grad_norm": 4.910261154174805, + "learning_rate": 1.6691729323308273e-07, + "loss": 0.1477, + "step": 65390 + }, + { + "epoch": 98.35, + "grad_norm": 7.770264148712158, + "learning_rate": 1.6541353383458646e-07, + "loss": 0.1713, + "step": 65400 + }, + { + "epoch": 98.36, + "grad_norm": 2.305351495742798, + "learning_rate": 1.6390977443609025e-07, + "loss": 0.1471, + "step": 65410 + }, + { + "epoch": 98.38, + "grad_norm": 5.237599849700928, + "learning_rate": 1.62406015037594e-07, + "loss": 0.2242, + "step": 65420 + }, + { + "epoch": 98.39, + "grad_norm": 3.8604493141174316, + "learning_rate": 1.6090225563909773e-07, + "loss": 0.1789, + "step": 65430 + }, + { + "epoch": 98.41, + "grad_norm": 4.205555438995361, + "learning_rate": 1.5939849624060152e-07, + "loss": 0.1286, + "step": 65440 + }, + { + "epoch": 98.42, + "grad_norm": 6.994670867919922, + "learning_rate": 1.5789473684210527e-07, + "loss": 0.204, + "step": 65450 + }, + { + "epoch": 98.44, + "grad_norm": 6.597128391265869, + "learning_rate": 1.5639097744360906e-07, + "loss": 0.1172, + "step": 65460 + }, + { + "epoch": 98.45, + "grad_norm": 9.189870834350586, + "learning_rate": 1.548872180451128e-07, + "loss": 0.1601, + "step": 65470 + }, + { + "epoch": 98.47, + "grad_norm": 5.771359920501709, + "learning_rate": 1.5338345864661655e-07, + "loss": 0.1457, + "step": 65480 + }, + { + "epoch": 98.48, + "grad_norm": 6.4306840896606445, + "learning_rate": 1.518796992481203e-07, + "loss": 0.1385, + "step": 65490 + }, + { + "epoch": 98.5, + "grad_norm": 3.8584532737731934, + "learning_rate": 1.5037593984962406e-07, + "loss": 0.1687, + "step": 65500 + }, + { + "epoch": 98.51, + "grad_norm": 4.318583965301514, + "learning_rate": 1.4887218045112784e-07, + "loss": 0.1136, + "step": 65510 + }, + { + "epoch": 98.53, + "grad_norm": 4.790411472320557, + "learning_rate": 1.4736842105263158e-07, + "loss": 0.176, + "step": 65520 + }, + { + "epoch": 98.54, + "grad_norm": 7.689479351043701, + "learning_rate": 1.4586466165413536e-07, + "loss": 0.17, + "step": 65530 + }, + { + "epoch": 98.56, + "grad_norm": 6.151116371154785, + "learning_rate": 1.4436090225563912e-07, + "loss": 0.1942, + "step": 65540 + }, + { + "epoch": 98.57, + "grad_norm": 5.650688648223877, + "learning_rate": 1.4285714285714287e-07, + "loss": 0.2166, + "step": 65550 + }, + { + "epoch": 98.59, + "grad_norm": 4.956643581390381, + "learning_rate": 1.4135338345864663e-07, + "loss": 0.1672, + "step": 65560 + }, + { + "epoch": 98.6, + "grad_norm": 1.9977216720581055, + "learning_rate": 1.398496240601504e-07, + "loss": 0.153, + "step": 65570 + }, + { + "epoch": 98.62, + "grad_norm": 6.504947185516357, + "learning_rate": 1.3834586466165415e-07, + "loss": 0.2146, + "step": 65580 + }, + { + "epoch": 98.63, + "grad_norm": 3.9041435718536377, + "learning_rate": 1.368421052631579e-07, + "loss": 0.2097, + "step": 65590 + }, + { + "epoch": 98.65, + "grad_norm": 6.294933795928955, + "learning_rate": 1.3533834586466166e-07, + "loss": 0.1532, + "step": 65600 + }, + { + "epoch": 98.66, + "grad_norm": 3.49302339553833, + "learning_rate": 1.3383458646616542e-07, + "loss": 0.1407, + "step": 65610 + }, + { + "epoch": 98.68, + "grad_norm": 6.846986293792725, + "learning_rate": 1.323308270676692e-07, + "loss": 0.1436, + "step": 65620 + }, + { + "epoch": 98.69, + "grad_norm": 8.050239562988281, + "learning_rate": 1.3082706766917293e-07, + "loss": 0.1993, + "step": 65630 + }, + { + "epoch": 98.71, + "grad_norm": 2.393629789352417, + "learning_rate": 1.293233082706767e-07, + "loss": 0.1998, + "step": 65640 + }, + { + "epoch": 98.72, + "grad_norm": 3.4583864212036133, + "learning_rate": 1.2781954887218047e-07, + "loss": 0.2058, + "step": 65650 + }, + { + "epoch": 98.74, + "grad_norm": 1.8351730108261108, + "learning_rate": 1.2631578947368423e-07, + "loss": 0.0967, + "step": 65660 + }, + { + "epoch": 98.75, + "grad_norm": 4.4519195556640625, + "learning_rate": 1.24812030075188e-07, + "loss": 0.1455, + "step": 65670 + }, + { + "epoch": 98.77, + "grad_norm": 3.725358009338379, + "learning_rate": 1.2330827067669174e-07, + "loss": 0.1497, + "step": 65680 + }, + { + "epoch": 98.78, + "grad_norm": 5.022524356842041, + "learning_rate": 1.218045112781955e-07, + "loss": 0.2046, + "step": 65690 + }, + { + "epoch": 98.8, + "grad_norm": 0.5539684295654297, + "learning_rate": 1.2030075187969926e-07, + "loss": 0.1661, + "step": 65700 + }, + { + "epoch": 98.81, + "grad_norm": 5.730692386627197, + "learning_rate": 1.1879699248120302e-07, + "loss": 0.1907, + "step": 65710 + }, + { + "epoch": 98.83, + "grad_norm": 4.800581455230713, + "learning_rate": 1.1729323308270677e-07, + "loss": 0.1923, + "step": 65720 + }, + { + "epoch": 98.84, + "grad_norm": 7.648699760437012, + "learning_rate": 1.1578947368421054e-07, + "loss": 0.1876, + "step": 65730 + }, + { + "epoch": 98.86, + "grad_norm": 6.9826860427856445, + "learning_rate": 1.142857142857143e-07, + "loss": 0.1962, + "step": 65740 + }, + { + "epoch": 98.87, + "grad_norm": 4.280595779418945, + "learning_rate": 1.1278195488721805e-07, + "loss": 0.1995, + "step": 65750 + }, + { + "epoch": 98.89, + "grad_norm": 9.39132308959961, + "learning_rate": 1.1127819548872182e-07, + "loss": 0.1467, + "step": 65760 + }, + { + "epoch": 98.9, + "grad_norm": 8.573371887207031, + "learning_rate": 1.0977443609022557e-07, + "loss": 0.156, + "step": 65770 + }, + { + "epoch": 98.92, + "grad_norm": 3.3703553676605225, + "learning_rate": 1.0827067669172933e-07, + "loss": 0.1639, + "step": 65780 + }, + { + "epoch": 98.93, + "grad_norm": 5.628261089324951, + "learning_rate": 1.067669172932331e-07, + "loss": 0.1439, + "step": 65790 + }, + { + "epoch": 98.95, + "grad_norm": 4.770748615264893, + "learning_rate": 1.0526315789473685e-07, + "loss": 0.1691, + "step": 65800 + }, + { + "epoch": 98.96, + "grad_norm": 3.2969655990600586, + "learning_rate": 1.037593984962406e-07, + "loss": 0.129, + "step": 65810 + }, + { + "epoch": 98.98, + "grad_norm": 5.129558563232422, + "learning_rate": 1.0225563909774437e-07, + "loss": 0.1381, + "step": 65820 + }, + { + "epoch": 98.99, + "grad_norm": 7.410007953643799, + "learning_rate": 1.0075187969924813e-07, + "loss": 0.1525, + "step": 65830 + }, + { + "epoch": 99.0, + "eval_accuracy": 0.9311, + "eval_loss": 0.33068838715553284, + "eval_runtime": 84.5618, + "eval_samples_per_second": 118.257, + "eval_steps_per_second": 0.473, + "step": 65835 + }, + { + "epoch": 99.01, + "grad_norm": 4.0409674644470215, + "learning_rate": 9.92481203007519e-08, + "loss": 0.1796, + "step": 65840 + }, + { + "epoch": 99.02, + "grad_norm": 6.996447563171387, + "learning_rate": 9.774436090225564e-08, + "loss": 0.1921, + "step": 65850 + }, + { + "epoch": 99.04, + "grad_norm": 9.530542373657227, + "learning_rate": 9.62406015037594e-08, + "loss": 0.1522, + "step": 65860 + }, + { + "epoch": 99.05, + "grad_norm": 1.6565377712249756, + "learning_rate": 9.473684210526317e-08, + "loss": 0.1283, + "step": 65870 + }, + { + "epoch": 99.07, + "grad_norm": 4.351380348205566, + "learning_rate": 9.323308270676693e-08, + "loss": 0.1222, + "step": 65880 + }, + { + "epoch": 99.08, + "grad_norm": 5.406339645385742, + "learning_rate": 9.172932330827067e-08, + "loss": 0.1445, + "step": 65890 + }, + { + "epoch": 99.1, + "grad_norm": 3.567111015319824, + "learning_rate": 9.022556390977444e-08, + "loss": 0.0886, + "step": 65900 + }, + { + "epoch": 99.11, + "grad_norm": 4.423712730407715, + "learning_rate": 8.87218045112782e-08, + "loss": 0.158, + "step": 65910 + }, + { + "epoch": 99.13, + "grad_norm": 1.147392988204956, + "learning_rate": 8.721804511278196e-08, + "loss": 0.1439, + "step": 65920 + }, + { + "epoch": 99.14, + "grad_norm": 4.6901936531066895, + "learning_rate": 8.571428571428573e-08, + "loss": 0.1749, + "step": 65930 + }, + { + "epoch": 99.16, + "grad_norm": 9.2613525390625, + "learning_rate": 8.421052631578947e-08, + "loss": 0.1984, + "step": 65940 + }, + { + "epoch": 99.17, + "grad_norm": 3.2010562419891357, + "learning_rate": 8.270676691729323e-08, + "loss": 0.115, + "step": 65950 + }, + { + "epoch": 99.19, + "grad_norm": 2.4863340854644775, + "learning_rate": 8.1203007518797e-08, + "loss": 0.1331, + "step": 65960 + }, + { + "epoch": 99.2, + "grad_norm": 2.9829752445220947, + "learning_rate": 7.969924812030076e-08, + "loss": 0.1423, + "step": 65970 + }, + { + "epoch": 99.22, + "grad_norm": 4.100430011749268, + "learning_rate": 7.819548872180453e-08, + "loss": 0.1391, + "step": 65980 + }, + { + "epoch": 99.23, + "grad_norm": 4.504250526428223, + "learning_rate": 7.669172932330827e-08, + "loss": 0.1529, + "step": 65990 + }, + { + "epoch": 99.25, + "grad_norm": 3.5403831005096436, + "learning_rate": 7.518796992481203e-08, + "loss": 0.1882, + "step": 66000 + }, + { + "epoch": 99.26, + "grad_norm": 4.3315253257751465, + "learning_rate": 7.368421052631579e-08, + "loss": 0.1514, + "step": 66010 + }, + { + "epoch": 99.28, + "grad_norm": 5.822681903839111, + "learning_rate": 7.218045112781956e-08, + "loss": 0.1829, + "step": 66020 + }, + { + "epoch": 99.29, + "grad_norm": 5.7567596435546875, + "learning_rate": 7.067669172932332e-08, + "loss": 0.1862, + "step": 66030 + }, + { + "epoch": 99.31, + "grad_norm": 7.93897819519043, + "learning_rate": 6.917293233082707e-08, + "loss": 0.1586, + "step": 66040 + }, + { + "epoch": 99.32, + "grad_norm": 2.92777156829834, + "learning_rate": 6.766917293233083e-08, + "loss": 0.1325, + "step": 66050 + }, + { + "epoch": 99.34, + "grad_norm": 5.535176753997803, + "learning_rate": 6.61654135338346e-08, + "loss": 0.192, + "step": 66060 + }, + { + "epoch": 99.35, + "grad_norm": 6.798808574676514, + "learning_rate": 6.466165413533834e-08, + "loss": 0.171, + "step": 66070 + }, + { + "epoch": 99.37, + "grad_norm": 7.453383922576904, + "learning_rate": 6.315789473684211e-08, + "loss": 0.2143, + "step": 66080 + }, + { + "epoch": 99.38, + "grad_norm": 5.353199005126953, + "learning_rate": 6.165413533834587e-08, + "loss": 0.165, + "step": 66090 + }, + { + "epoch": 99.4, + "grad_norm": 3.9845528602600098, + "learning_rate": 6.015037593984963e-08, + "loss": 0.1643, + "step": 66100 + }, + { + "epoch": 99.41, + "grad_norm": 10.434816360473633, + "learning_rate": 5.864661654135339e-08, + "loss": 0.2099, + "step": 66110 + }, + { + "epoch": 99.43, + "grad_norm": 3.1787760257720947, + "learning_rate": 5.714285714285715e-08, + "loss": 0.1539, + "step": 66120 + }, + { + "epoch": 99.44, + "grad_norm": 6.48075532913208, + "learning_rate": 5.563909774436091e-08, + "loss": 0.1741, + "step": 66130 + }, + { + "epoch": 99.46, + "grad_norm": 3.269533634185791, + "learning_rate": 5.4135338345864665e-08, + "loss": 0.1438, + "step": 66140 + }, + { + "epoch": 99.47, + "grad_norm": 8.6683988571167, + "learning_rate": 5.263157894736842e-08, + "loss": 0.1949, + "step": 66150 + }, + { + "epoch": 99.49, + "grad_norm": 4.051426887512207, + "learning_rate": 5.1127819548872186e-08, + "loss": 0.1634, + "step": 66160 + }, + { + "epoch": 99.5, + "grad_norm": 6.472201347351074, + "learning_rate": 4.962406015037595e-08, + "loss": 0.1474, + "step": 66170 + }, + { + "epoch": 99.52, + "grad_norm": 4.984071731567383, + "learning_rate": 4.81203007518797e-08, + "loss": 0.1707, + "step": 66180 + }, + { + "epoch": 99.53, + "grad_norm": 5.94837760925293, + "learning_rate": 4.6616541353383465e-08, + "loss": 0.2083, + "step": 66190 + }, + { + "epoch": 99.55, + "grad_norm": 5.263061046600342, + "learning_rate": 4.511278195488722e-08, + "loss": 0.1573, + "step": 66200 + }, + { + "epoch": 99.56, + "grad_norm": 5.428894996643066, + "learning_rate": 4.360902255639098e-08, + "loss": 0.1753, + "step": 66210 + }, + { + "epoch": 99.58, + "grad_norm": 6.092723846435547, + "learning_rate": 4.2105263157894737e-08, + "loss": 0.1658, + "step": 66220 + }, + { + "epoch": 99.59, + "grad_norm": 5.707245826721191, + "learning_rate": 4.06015037593985e-08, + "loss": 0.1583, + "step": 66230 + }, + { + "epoch": 99.61, + "grad_norm": 5.840432167053223, + "learning_rate": 3.9097744360902264e-08, + "loss": 0.2175, + "step": 66240 + }, + { + "epoch": 99.62, + "grad_norm": 5.600442409515381, + "learning_rate": 3.7593984962406015e-08, + "loss": 0.2012, + "step": 66250 + }, + { + "epoch": 99.64, + "grad_norm": 8.73216724395752, + "learning_rate": 3.609022556390978e-08, + "loss": 0.1126, + "step": 66260 + }, + { + "epoch": 99.65, + "grad_norm": 4.184023857116699, + "learning_rate": 3.4586466165413536e-08, + "loss": 0.1326, + "step": 66270 + }, + { + "epoch": 99.67, + "grad_norm": 2.4098730087280273, + "learning_rate": 3.30827067669173e-08, + "loss": 0.1219, + "step": 66280 + }, + { + "epoch": 99.68, + "grad_norm": 6.737592697143555, + "learning_rate": 3.157894736842106e-08, + "loss": 0.1788, + "step": 66290 + }, + { + "epoch": 99.7, + "grad_norm": 3.4307661056518555, + "learning_rate": 3.0075187969924815e-08, + "loss": 0.1677, + "step": 66300 + }, + { + "epoch": 99.71, + "grad_norm": 6.838919162750244, + "learning_rate": 2.8571428571428575e-08, + "loss": 0.156, + "step": 66310 + }, + { + "epoch": 99.73, + "grad_norm": 3.750256061553955, + "learning_rate": 2.7067669172932333e-08, + "loss": 0.1118, + "step": 66320 + }, + { + "epoch": 99.74, + "grad_norm": 3.541330099105835, + "learning_rate": 2.5563909774436093e-08, + "loss": 0.1467, + "step": 66330 + }, + { + "epoch": 99.76, + "grad_norm": 6.930870532989502, + "learning_rate": 2.406015037593985e-08, + "loss": 0.1469, + "step": 66340 + }, + { + "epoch": 99.77, + "grad_norm": 4.462535858154297, + "learning_rate": 2.255639097744361e-08, + "loss": 0.1693, + "step": 66350 + }, + { + "epoch": 99.79, + "grad_norm": 16.731098175048828, + "learning_rate": 2.1052631578947368e-08, + "loss": 0.1813, + "step": 66360 + }, + { + "epoch": 99.8, + "grad_norm": 4.853457450866699, + "learning_rate": 1.9548872180451132e-08, + "loss": 0.1693, + "step": 66370 + }, + { + "epoch": 99.82, + "grad_norm": 8.614503860473633, + "learning_rate": 1.804511278195489e-08, + "loss": 0.1212, + "step": 66380 + }, + { + "epoch": 99.83, + "grad_norm": 1.8390692472457886, + "learning_rate": 1.654135338345865e-08, + "loss": 0.1156, + "step": 66390 + }, + { + "epoch": 99.85, + "grad_norm": 3.2085022926330566, + "learning_rate": 1.5037593984962407e-08, + "loss": 0.1469, + "step": 66400 + }, + { + "epoch": 99.86, + "grad_norm": 4.798015594482422, + "learning_rate": 1.3533834586466166e-08, + "loss": 0.1651, + "step": 66410 + }, + { + "epoch": 99.88, + "grad_norm": 5.5648322105407715, + "learning_rate": 1.2030075187969925e-08, + "loss": 0.1682, + "step": 66420 + }, + { + "epoch": 99.89, + "grad_norm": 4.767200469970703, + "learning_rate": 1.0526315789473684e-08, + "loss": 0.2008, + "step": 66430 + }, + { + "epoch": 99.91, + "grad_norm": 2.938972234725952, + "learning_rate": 9.022556390977445e-09, + "loss": 0.1684, + "step": 66440 + }, + { + "epoch": 99.92, + "grad_norm": 7.364297389984131, + "learning_rate": 7.518796992481204e-09, + "loss": 0.1327, + "step": 66450 + }, + { + "epoch": 99.94, + "grad_norm": 4.032517433166504, + "learning_rate": 6.015037593984963e-09, + "loss": 0.1577, + "step": 66460 + }, + { + "epoch": 99.95, + "grad_norm": 5.099705219268799, + "learning_rate": 4.511278195488722e-09, + "loss": 0.1541, + "step": 66470 + }, + { + "epoch": 99.97, + "grad_norm": 5.294932842254639, + "learning_rate": 3.0075187969924813e-09, + "loss": 0.147, + "step": 66480 + }, + { + "epoch": 99.98, + "grad_norm": 5.263985633850098, + "learning_rate": 1.5037593984962407e-09, + "loss": 0.0833, + "step": 66490 + }, + { + "epoch": 100.0, + "grad_norm": 0.18724946677684784, + "learning_rate": 0.0, + "loss": 0.1471, + "step": 66500 + }, + { + "epoch": 100.0, + "eval_accuracy": 0.9309, + "eval_loss": 0.33010387420654297, + "eval_runtime": 84.7295, + "eval_samples_per_second": 118.023, + "eval_steps_per_second": 0.472, + "step": 66500 + }, + { + "epoch": 100.0, + "step": 66500, + "total_flos": 1.1646058381332455e+21, + "train_loss": 0.2947179788530321, + "train_runtime": 117326.6726, + "train_samples_per_second": 36.224, + "train_steps_per_second": 0.567 + } + ], + "logging_steps": 10, + "max_steps": 66500, + "num_input_tokens_seen": 0, + "num_train_epochs": 100, + "save_steps": 500, + "total_flos": 1.1646058381332455e+21, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}