{ "best_metric": 0.20614954829216003, "best_model_checkpoint": "./cocoa_outputs_vit/checkpoint-980", "epoch": 100.0, "eval_steps": 500, "global_step": 19600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05102040816326531, "grad_norm": 9.997809410095215, "learning_rate": 1.998979591836735e-05, "loss": 1.243, "step": 10 }, { "epoch": 0.10204081632653061, "grad_norm": 6.510932445526123, "learning_rate": 1.9979591836734697e-05, "loss": 0.6487, "step": 20 }, { "epoch": 0.15306122448979592, "grad_norm": 4.67606258392334, "learning_rate": 1.9969387755102042e-05, "loss": 0.5975, "step": 30 }, { "epoch": 0.20408163265306123, "grad_norm": 1.5653560161590576, "learning_rate": 1.9959183673469388e-05, "loss": 0.6574, "step": 40 }, { "epoch": 0.25510204081632654, "grad_norm": 4.846164226531982, "learning_rate": 1.9948979591836737e-05, "loss": 0.498, "step": 50 }, { "epoch": 0.30612244897959184, "grad_norm": 6.9164042472839355, "learning_rate": 1.9938775510204083e-05, "loss": 0.2128, "step": 60 }, { "epoch": 0.35714285714285715, "grad_norm": 8.036698341369629, "learning_rate": 1.992857142857143e-05, "loss": 0.5379, "step": 70 }, { "epoch": 0.40816326530612246, "grad_norm": 3.314554214477539, "learning_rate": 1.9918367346938775e-05, "loss": 0.4304, "step": 80 }, { "epoch": 0.45918367346938777, "grad_norm": 8.956796646118164, "learning_rate": 1.9908163265306124e-05, "loss": 0.8448, "step": 90 }, { "epoch": 0.5102040816326531, "grad_norm": 6.246781826019287, "learning_rate": 1.9897959183673473e-05, "loss": 0.3728, "step": 100 }, { "epoch": 0.5612244897959183, "grad_norm": 6.085058212280273, "learning_rate": 1.988775510204082e-05, "loss": 0.3607, "step": 110 }, { "epoch": 0.6122448979591837, "grad_norm": 0.6527583003044128, "learning_rate": 1.9877551020408165e-05, "loss": 0.4637, "step": 120 }, { "epoch": 0.6632653061224489, "grad_norm": 7.142705917358398, "learning_rate": 1.986734693877551e-05, "loss": 0.3851, "step": 130 }, { "epoch": 0.7142857142857143, "grad_norm": 5.793333053588867, "learning_rate": 1.985714285714286e-05, "loss": 0.2978, "step": 140 }, { "epoch": 0.7653061224489796, "grad_norm": 8.58443832397461, "learning_rate": 1.9846938775510205e-05, "loss": 0.3838, "step": 150 }, { "epoch": 0.8163265306122449, "grad_norm": 7.941507339477539, "learning_rate": 1.983673469387755e-05, "loss": 0.5102, "step": 160 }, { "epoch": 0.8673469387755102, "grad_norm": 12.863133430480957, "learning_rate": 1.9826530612244897e-05, "loss": 0.2993, "step": 170 }, { "epoch": 0.9183673469387755, "grad_norm": 2.576077461242676, "learning_rate": 1.9816326530612246e-05, "loss": 0.4559, "step": 180 }, { "epoch": 0.9693877551020408, "grad_norm": 9.176087379455566, "learning_rate": 1.9806122448979595e-05, "loss": 0.3733, "step": 190 }, { "epoch": 1.0, "eval_accuracy": 0.9025270758122743, "eval_loss": 0.35576319694519043, "eval_runtime": 1.0851, "eval_samples_per_second": 255.287, "eval_steps_per_second": 32.256, "step": 196 }, { "epoch": 1.0204081632653061, "grad_norm": 11.31506061553955, "learning_rate": 1.979591836734694e-05, "loss": 0.4317, "step": 200 }, { "epoch": 1.0714285714285714, "grad_norm": 9.852068901062012, "learning_rate": 1.9785714285714287e-05, "loss": 0.3314, "step": 210 }, { "epoch": 1.1224489795918366, "grad_norm": 14.15665340423584, "learning_rate": 1.9775510204081633e-05, "loss": 0.2945, "step": 220 }, { "epoch": 1.1734693877551021, "grad_norm": 13.346512794494629, "learning_rate": 1.9765306122448982e-05, "loss": 0.3732, "step": 230 }, { "epoch": 1.2244897959183674, "grad_norm": 5.178635597229004, "learning_rate": 1.9755102040816328e-05, "loss": 0.2018, "step": 240 }, { "epoch": 1.2755102040816326, "grad_norm": 6.916537761688232, "learning_rate": 1.9744897959183677e-05, "loss": 0.5151, "step": 250 }, { "epoch": 1.3265306122448979, "grad_norm": 2.007774829864502, "learning_rate": 1.9734693877551023e-05, "loss": 0.3405, "step": 260 }, { "epoch": 1.3775510204081631, "grad_norm": 0.4539733827114105, "learning_rate": 1.972448979591837e-05, "loss": 0.4038, "step": 270 }, { "epoch": 1.4285714285714286, "grad_norm": 0.35147103667259216, "learning_rate": 1.9714285714285718e-05, "loss": 0.1768, "step": 280 }, { "epoch": 1.4795918367346939, "grad_norm": 13.382933616638184, "learning_rate": 1.9704081632653063e-05, "loss": 0.4585, "step": 290 }, { "epoch": 1.5306122448979593, "grad_norm": 9.720928192138672, "learning_rate": 1.969387755102041e-05, "loss": 0.5692, "step": 300 }, { "epoch": 1.5816326530612246, "grad_norm": 4.945394039154053, "learning_rate": 1.9683673469387755e-05, "loss": 0.3939, "step": 310 }, { "epoch": 1.6326530612244898, "grad_norm": 6.790468692779541, "learning_rate": 1.9673469387755104e-05, "loss": 0.3216, "step": 320 }, { "epoch": 1.683673469387755, "grad_norm": 8.602747917175293, "learning_rate": 1.966326530612245e-05, "loss": 0.57, "step": 330 }, { "epoch": 1.7346938775510203, "grad_norm": 6.8074188232421875, "learning_rate": 1.96530612244898e-05, "loss": 0.286, "step": 340 }, { "epoch": 1.7857142857142856, "grad_norm": 8.234601020812988, "learning_rate": 1.9642857142857145e-05, "loss": 0.2392, "step": 350 }, { "epoch": 1.836734693877551, "grad_norm": 1.2509891986846924, "learning_rate": 1.963265306122449e-05, "loss": 0.3685, "step": 360 }, { "epoch": 1.8877551020408163, "grad_norm": 9.954570770263672, "learning_rate": 1.962244897959184e-05, "loss": 0.2905, "step": 370 }, { "epoch": 1.9387755102040818, "grad_norm": 3.916882038116455, "learning_rate": 1.9612244897959186e-05, "loss": 0.3471, "step": 380 }, { "epoch": 1.989795918367347, "grad_norm": 0.4415203332901001, "learning_rate": 1.960204081632653e-05, "loss": 0.3727, "step": 390 }, { "epoch": 2.0, "eval_accuracy": 0.8989169675090253, "eval_loss": 0.4097772538661957, "eval_runtime": 1.1341, "eval_samples_per_second": 244.255, "eval_steps_per_second": 30.863, "step": 392 }, { "epoch": 2.0408163265306123, "grad_norm": 6.868724822998047, "learning_rate": 1.9591836734693877e-05, "loss": 0.2288, "step": 400 }, { "epoch": 2.0918367346938775, "grad_norm": 5.795254230499268, "learning_rate": 1.9581632653061227e-05, "loss": 0.3767, "step": 410 }, { "epoch": 2.142857142857143, "grad_norm": 19.87067413330078, "learning_rate": 1.9571428571428572e-05, "loss": 0.2423, "step": 420 }, { "epoch": 2.193877551020408, "grad_norm": 8.45821762084961, "learning_rate": 1.956122448979592e-05, "loss": 0.2894, "step": 430 }, { "epoch": 2.2448979591836733, "grad_norm": 5.071498394012451, "learning_rate": 1.9551020408163267e-05, "loss": 0.3454, "step": 440 }, { "epoch": 2.295918367346939, "grad_norm": 8.527978897094727, "learning_rate": 1.9540816326530613e-05, "loss": 0.4041, "step": 450 }, { "epoch": 2.3469387755102042, "grad_norm": 5.713216781616211, "learning_rate": 1.9530612244897962e-05, "loss": 0.2135, "step": 460 }, { "epoch": 2.3979591836734695, "grad_norm": 3.3041906356811523, "learning_rate": 1.9520408163265308e-05, "loss": 0.448, "step": 470 }, { "epoch": 2.4489795918367347, "grad_norm": 5.204842567443848, "learning_rate": 1.9510204081632654e-05, "loss": 0.3678, "step": 480 }, { "epoch": 2.5, "grad_norm": 0.16867049038410187, "learning_rate": 1.95e-05, "loss": 0.2899, "step": 490 }, { "epoch": 2.5510204081632653, "grad_norm": 2.28920578956604, "learning_rate": 1.948979591836735e-05, "loss": 0.4641, "step": 500 }, { "epoch": 2.6020408163265305, "grad_norm": 3.266674757003784, "learning_rate": 1.9479591836734695e-05, "loss": 0.2603, "step": 510 }, { "epoch": 2.6530612244897958, "grad_norm": 0.5857279896736145, "learning_rate": 1.9469387755102044e-05, "loss": 0.2862, "step": 520 }, { "epoch": 2.704081632653061, "grad_norm": 11.884221076965332, "learning_rate": 1.945918367346939e-05, "loss": 0.3568, "step": 530 }, { "epoch": 2.7551020408163263, "grad_norm": 6.461760520935059, "learning_rate": 1.9448979591836735e-05, "loss": 0.6281, "step": 540 }, { "epoch": 2.806122448979592, "grad_norm": 3.488375186920166, "learning_rate": 1.9438775510204085e-05, "loss": 0.1697, "step": 550 }, { "epoch": 2.857142857142857, "grad_norm": 2.6364121437072754, "learning_rate": 1.942857142857143e-05, "loss": 0.2035, "step": 560 }, { "epoch": 2.9081632653061225, "grad_norm": 1.679729700088501, "learning_rate": 1.941836734693878e-05, "loss": 0.2044, "step": 570 }, { "epoch": 2.9591836734693877, "grad_norm": 0.911145806312561, "learning_rate": 1.9408163265306122e-05, "loss": 0.3901, "step": 580 }, { "epoch": 3.0, "eval_accuracy": 0.8989169675090253, "eval_loss": 0.2668152451515198, "eval_runtime": 1.0883, "eval_samples_per_second": 254.528, "eval_steps_per_second": 32.161, "step": 588 }, { "epoch": 3.010204081632653, "grad_norm": 11.727137565612793, "learning_rate": 1.939795918367347e-05, "loss": 0.3525, "step": 590 }, { "epoch": 3.061224489795918, "grad_norm": 6.488338470458984, "learning_rate": 1.9387755102040817e-05, "loss": 0.2955, "step": 600 }, { "epoch": 3.1122448979591835, "grad_norm": 4.388267517089844, "learning_rate": 1.9377551020408166e-05, "loss": 0.2752, "step": 610 }, { "epoch": 3.163265306122449, "grad_norm": 2.686152696609497, "learning_rate": 1.9367346938775512e-05, "loss": 0.2081, "step": 620 }, { "epoch": 3.2142857142857144, "grad_norm": 9.708331108093262, "learning_rate": 1.9357142857142858e-05, "loss": 0.2318, "step": 630 }, { "epoch": 3.2653061224489797, "grad_norm": 11.37060832977295, "learning_rate": 1.9346938775510207e-05, "loss": 0.2549, "step": 640 }, { "epoch": 3.316326530612245, "grad_norm": 5.047173023223877, "learning_rate": 1.9336734693877553e-05, "loss": 0.3084, "step": 650 }, { "epoch": 3.36734693877551, "grad_norm": 26.176551818847656, "learning_rate": 1.9326530612244902e-05, "loss": 0.2798, "step": 660 }, { "epoch": 3.4183673469387754, "grad_norm": 8.97615909576416, "learning_rate": 1.9316326530612248e-05, "loss": 0.2584, "step": 670 }, { "epoch": 3.4693877551020407, "grad_norm": 13.583206176757812, "learning_rate": 1.9306122448979593e-05, "loss": 0.3422, "step": 680 }, { "epoch": 3.520408163265306, "grad_norm": 0.3657621443271637, "learning_rate": 1.929591836734694e-05, "loss": 0.2467, "step": 690 }, { "epoch": 3.571428571428571, "grad_norm": 1.1742420196533203, "learning_rate": 1.928571428571429e-05, "loss": 0.2256, "step": 700 }, { "epoch": 3.622448979591837, "grad_norm": 0.15040180087089539, "learning_rate": 1.9275510204081634e-05, "loss": 0.2438, "step": 710 }, { "epoch": 3.673469387755102, "grad_norm": 15.626485824584961, "learning_rate": 1.926530612244898e-05, "loss": 0.2757, "step": 720 }, { "epoch": 3.7244897959183674, "grad_norm": 11.002825736999512, "learning_rate": 1.925510204081633e-05, "loss": 0.2672, "step": 730 }, { "epoch": 3.7755102040816326, "grad_norm": 17.428829193115234, "learning_rate": 1.9244897959183675e-05, "loss": 0.3696, "step": 740 }, { "epoch": 3.826530612244898, "grad_norm": 0.7305737137794495, "learning_rate": 1.9234693877551024e-05, "loss": 0.6645, "step": 750 }, { "epoch": 3.877551020408163, "grad_norm": 2.158804416656494, "learning_rate": 1.922448979591837e-05, "loss": 0.2754, "step": 760 }, { "epoch": 3.928571428571429, "grad_norm": 7.073704719543457, "learning_rate": 1.9214285714285716e-05, "loss": 0.2142, "step": 770 }, { "epoch": 3.979591836734694, "grad_norm": 0.2575110197067261, "learning_rate": 1.920408163265306e-05, "loss": 0.3421, "step": 780 }, { "epoch": 4.0, "eval_accuracy": 0.9169675090252708, "eval_loss": 0.2612481415271759, "eval_runtime": 1.1075, "eval_samples_per_second": 250.103, "eval_steps_per_second": 31.602, "step": 784 }, { "epoch": 4.030612244897959, "grad_norm": 7.986507415771484, "learning_rate": 1.919387755102041e-05, "loss": 0.2865, "step": 790 }, { "epoch": 4.081632653061225, "grad_norm": 0.27147501707077026, "learning_rate": 1.9183673469387756e-05, "loss": 0.3173, "step": 800 }, { "epoch": 4.13265306122449, "grad_norm": 0.6972355842590332, "learning_rate": 1.9173469387755102e-05, "loss": 0.195, "step": 810 }, { "epoch": 4.183673469387755, "grad_norm": 7.292689800262451, "learning_rate": 1.916326530612245e-05, "loss": 0.2473, "step": 820 }, { "epoch": 4.23469387755102, "grad_norm": 2.2111926078796387, "learning_rate": 1.9153061224489797e-05, "loss": 0.1566, "step": 830 }, { "epoch": 4.285714285714286, "grad_norm": 0.3641502559185028, "learning_rate": 1.9142857142857146e-05, "loss": 0.1351, "step": 840 }, { "epoch": 4.336734693877551, "grad_norm": 0.37421703338623047, "learning_rate": 1.9132653061224492e-05, "loss": 0.2136, "step": 850 }, { "epoch": 4.387755102040816, "grad_norm": 1.0747158527374268, "learning_rate": 1.9122448979591838e-05, "loss": 0.1552, "step": 860 }, { "epoch": 4.438775510204081, "grad_norm": 11.20666217803955, "learning_rate": 1.9112244897959184e-05, "loss": 0.2423, "step": 870 }, { "epoch": 4.489795918367347, "grad_norm": 14.15116024017334, "learning_rate": 1.9102040816326533e-05, "loss": 0.384, "step": 880 }, { "epoch": 4.540816326530612, "grad_norm": 8.69979476928711, "learning_rate": 1.909183673469388e-05, "loss": 0.2787, "step": 890 }, { "epoch": 4.591836734693878, "grad_norm": 11.211145401000977, "learning_rate": 1.9081632653061225e-05, "loss": 0.2566, "step": 900 }, { "epoch": 4.642857142857143, "grad_norm": 8.875814437866211, "learning_rate": 1.9071428571428574e-05, "loss": 0.4062, "step": 910 }, { "epoch": 4.6938775510204085, "grad_norm": 3.876970052719116, "learning_rate": 1.906122448979592e-05, "loss": 0.3204, "step": 920 }, { "epoch": 4.744897959183674, "grad_norm": 0.0954553633928299, "learning_rate": 1.905102040816327e-05, "loss": 0.2081, "step": 930 }, { "epoch": 4.795918367346939, "grad_norm": 0.11895910650491714, "learning_rate": 1.9040816326530614e-05, "loss": 0.2659, "step": 940 }, { "epoch": 4.846938775510204, "grad_norm": 10.455253601074219, "learning_rate": 1.903061224489796e-05, "loss": 0.3347, "step": 950 }, { "epoch": 4.8979591836734695, "grad_norm": 2.3552839756011963, "learning_rate": 1.9020408163265306e-05, "loss": 0.2562, "step": 960 }, { "epoch": 4.948979591836735, "grad_norm": 9.100960731506348, "learning_rate": 1.9010204081632655e-05, "loss": 0.2246, "step": 970 }, { "epoch": 5.0, "grad_norm": 11.723458290100098, "learning_rate": 1.9e-05, "loss": 0.2703, "step": 980 }, { "epoch": 5.0, "eval_accuracy": 0.927797833935018, "eval_loss": 0.20614954829216003, "eval_runtime": 1.0932, "eval_samples_per_second": 253.381, "eval_steps_per_second": 32.016, "step": 980 }, { "epoch": 5.051020408163265, "grad_norm": 3.7358005046844482, "learning_rate": 1.898979591836735e-05, "loss": 0.1603, "step": 990 }, { "epoch": 5.1020408163265305, "grad_norm": 6.193492412567139, "learning_rate": 1.8979591836734696e-05, "loss": 0.4193, "step": 1000 }, { "epoch": 5.153061224489796, "grad_norm": 0.43159356713294983, "learning_rate": 1.8969387755102042e-05, "loss": 0.0858, "step": 1010 }, { "epoch": 5.204081632653061, "grad_norm": 4.994882583618164, "learning_rate": 1.895918367346939e-05, "loss": 0.0647, "step": 1020 }, { "epoch": 5.255102040816326, "grad_norm": 13.521966934204102, "learning_rate": 1.8948979591836737e-05, "loss": 0.2822, "step": 1030 }, { "epoch": 5.3061224489795915, "grad_norm": 3.2106552124023438, "learning_rate": 1.8938775510204083e-05, "loss": 0.1019, "step": 1040 }, { "epoch": 5.357142857142857, "grad_norm": 2.878549337387085, "learning_rate": 1.892857142857143e-05, "loss": 0.1572, "step": 1050 }, { "epoch": 5.408163265306122, "grad_norm": 0.01558644324541092, "learning_rate": 1.8918367346938778e-05, "loss": 0.2504, "step": 1060 }, { "epoch": 5.459183673469388, "grad_norm": 0.09513021260499954, "learning_rate": 1.8908163265306123e-05, "loss": 0.1602, "step": 1070 }, { "epoch": 5.510204081632653, "grad_norm": 7.256558895111084, "learning_rate": 1.8897959183673473e-05, "loss": 0.2183, "step": 1080 }, { "epoch": 5.561224489795919, "grad_norm": 0.5117791891098022, "learning_rate": 1.888775510204082e-05, "loss": 0.2858, "step": 1090 }, { "epoch": 5.612244897959184, "grad_norm": 6.478097438812256, "learning_rate": 1.8877551020408164e-05, "loss": 0.4146, "step": 1100 }, { "epoch": 5.663265306122449, "grad_norm": 1.04160475730896, "learning_rate": 1.8867346938775513e-05, "loss": 0.1616, "step": 1110 }, { "epoch": 5.714285714285714, "grad_norm": 1.311572551727295, "learning_rate": 1.885714285714286e-05, "loss": 0.218, "step": 1120 }, { "epoch": 5.76530612244898, "grad_norm": 2.2785165309906006, "learning_rate": 1.8846938775510205e-05, "loss": 0.2073, "step": 1130 }, { "epoch": 5.816326530612245, "grad_norm": 4.542452335357666, "learning_rate": 1.883673469387755e-05, "loss": 0.2174, "step": 1140 }, { "epoch": 5.86734693877551, "grad_norm": 9.469289779663086, "learning_rate": 1.88265306122449e-05, "loss": 0.228, "step": 1150 }, { "epoch": 5.918367346938775, "grad_norm": 1.4974592924118042, "learning_rate": 1.8816326530612246e-05, "loss": 0.1239, "step": 1160 }, { "epoch": 5.969387755102041, "grad_norm": 2.6563713550567627, "learning_rate": 1.8806122448979595e-05, "loss": 0.1734, "step": 1170 }, { "epoch": 6.0, "eval_accuracy": 0.927797833935018, "eval_loss": 0.2568311393260956, "eval_runtime": 1.1306, "eval_samples_per_second": 244.998, "eval_steps_per_second": 30.956, "step": 1176 }, { "epoch": 6.020408163265306, "grad_norm": 4.085999965667725, "learning_rate": 1.879591836734694e-05, "loss": 0.2931, "step": 1180 }, { "epoch": 6.071428571428571, "grad_norm": 2.6255083084106445, "learning_rate": 1.8785714285714286e-05, "loss": 0.2181, "step": 1190 }, { "epoch": 6.122448979591836, "grad_norm": 0.7216970920562744, "learning_rate": 1.8775510204081636e-05, "loss": 0.1825, "step": 1200 }, { "epoch": 6.173469387755102, "grad_norm": 0.6231855154037476, "learning_rate": 1.876530612244898e-05, "loss": 0.3418, "step": 1210 }, { "epoch": 6.224489795918367, "grad_norm": 0.2662213146686554, "learning_rate": 1.8755102040816327e-05, "loss": 0.1201, "step": 1220 }, { "epoch": 6.275510204081632, "grad_norm": 0.05140956491231918, "learning_rate": 1.8744897959183673e-05, "loss": 0.1012, "step": 1230 }, { "epoch": 6.326530612244898, "grad_norm": 0.5565296411514282, "learning_rate": 1.8734693877551022e-05, "loss": 0.1231, "step": 1240 }, { "epoch": 6.377551020408164, "grad_norm": 3.492568254470825, "learning_rate": 1.8724489795918368e-05, "loss": 0.3298, "step": 1250 }, { "epoch": 6.428571428571429, "grad_norm": 0.6862087249755859, "learning_rate": 1.8714285714285717e-05, "loss": 0.2095, "step": 1260 }, { "epoch": 6.479591836734694, "grad_norm": 0.2907864451408386, "learning_rate": 1.8704081632653063e-05, "loss": 0.1708, "step": 1270 }, { "epoch": 6.530612244897959, "grad_norm": 0.9514654874801636, "learning_rate": 1.869387755102041e-05, "loss": 0.1433, "step": 1280 }, { "epoch": 6.581632653061225, "grad_norm": 0.08231165260076523, "learning_rate": 1.8683673469387758e-05, "loss": 0.0884, "step": 1290 }, { "epoch": 6.63265306122449, "grad_norm": 8.10069465637207, "learning_rate": 1.8673469387755104e-05, "loss": 0.535, "step": 1300 }, { "epoch": 6.683673469387755, "grad_norm": 0.581271231174469, "learning_rate": 1.866326530612245e-05, "loss": 0.2301, "step": 1310 }, { "epoch": 6.73469387755102, "grad_norm": 4.828273296356201, "learning_rate": 1.8653061224489795e-05, "loss": 0.2447, "step": 1320 }, { "epoch": 6.785714285714286, "grad_norm": 2.0581777095794678, "learning_rate": 1.8642857142857144e-05, "loss": 0.2385, "step": 1330 }, { "epoch": 6.836734693877551, "grad_norm": 8.729135513305664, "learning_rate": 1.863265306122449e-05, "loss": 0.2933, "step": 1340 }, { "epoch": 6.887755102040816, "grad_norm": 1.4226897954940796, "learning_rate": 1.862244897959184e-05, "loss": 0.2291, "step": 1350 }, { "epoch": 6.938775510204081, "grad_norm": 18.637195587158203, "learning_rate": 1.8612244897959185e-05, "loss": 0.2219, "step": 1360 }, { "epoch": 6.989795918367347, "grad_norm": 0.1524399071931839, "learning_rate": 1.860204081632653e-05, "loss": 0.1385, "step": 1370 }, { "epoch": 7.0, "eval_accuracy": 0.9205776173285198, "eval_loss": 0.324248731136322, "eval_runtime": 1.1268, "eval_samples_per_second": 245.831, "eval_steps_per_second": 31.062, "step": 1372 }, { "epoch": 7.040816326530612, "grad_norm": 7.494378566741943, "learning_rate": 1.859183673469388e-05, "loss": 0.1273, "step": 1380 }, { "epoch": 7.091836734693878, "grad_norm": 0.700007438659668, "learning_rate": 1.8581632653061226e-05, "loss": 0.0487, "step": 1390 }, { "epoch": 7.142857142857143, "grad_norm": 5.898748874664307, "learning_rate": 1.8571428571428575e-05, "loss": 0.2186, "step": 1400 }, { "epoch": 7.1938775510204085, "grad_norm": 5.224879264831543, "learning_rate": 1.856122448979592e-05, "loss": 0.3189, "step": 1410 }, { "epoch": 7.244897959183674, "grad_norm": 2.707554578781128, "learning_rate": 1.8551020408163267e-05, "loss": 0.0948, "step": 1420 }, { "epoch": 7.295918367346939, "grad_norm": 10.165546417236328, "learning_rate": 1.8540816326530613e-05, "loss": 0.2619, "step": 1430 }, { "epoch": 7.346938775510204, "grad_norm": 0.14978091418743134, "learning_rate": 1.853061224489796e-05, "loss": 0.1523, "step": 1440 }, { "epoch": 7.3979591836734695, "grad_norm": 20.640615463256836, "learning_rate": 1.8520408163265307e-05, "loss": 0.0609, "step": 1450 }, { "epoch": 7.448979591836735, "grad_norm": 9.119196891784668, "learning_rate": 1.8510204081632653e-05, "loss": 0.4619, "step": 1460 }, { "epoch": 7.5, "grad_norm": 4.146620273590088, "learning_rate": 1.8500000000000002e-05, "loss": 0.1289, "step": 1470 }, { "epoch": 7.551020408163265, "grad_norm": 4.590761184692383, "learning_rate": 1.8489795918367348e-05, "loss": 0.2705, "step": 1480 }, { "epoch": 7.6020408163265305, "grad_norm": 1.6500192880630493, "learning_rate": 1.8479591836734697e-05, "loss": 0.154, "step": 1490 }, { "epoch": 7.653061224489796, "grad_norm": 0.5105675458908081, "learning_rate": 1.8469387755102043e-05, "loss": 0.2111, "step": 1500 }, { "epoch": 7.704081632653061, "grad_norm": 10.746613502502441, "learning_rate": 1.845918367346939e-05, "loss": 0.1202, "step": 1510 }, { "epoch": 7.755102040816326, "grad_norm": 11.645087242126465, "learning_rate": 1.8448979591836735e-05, "loss": 0.1716, "step": 1520 }, { "epoch": 7.8061224489795915, "grad_norm": 8.031530380249023, "learning_rate": 1.8438775510204084e-05, "loss": 0.2984, "step": 1530 }, { "epoch": 7.857142857142857, "grad_norm": 0.9529069066047668, "learning_rate": 1.842857142857143e-05, "loss": 0.3926, "step": 1540 }, { "epoch": 7.908163265306122, "grad_norm": 0.053690504282712936, "learning_rate": 1.8418367346938776e-05, "loss": 0.0706, "step": 1550 }, { "epoch": 7.959183673469388, "grad_norm": 13.771781921386719, "learning_rate": 1.8408163265306125e-05, "loss": 0.3237, "step": 1560 }, { "epoch": 8.0, "eval_accuracy": 0.9386281588447654, "eval_loss": 0.29217731952667236, "eval_runtime": 1.0946, "eval_samples_per_second": 253.065, "eval_steps_per_second": 31.976, "step": 1568 }, { "epoch": 8.010204081632653, "grad_norm": 7.22590970993042, "learning_rate": 1.839795918367347e-05, "loss": 0.0606, "step": 1570 }, { "epoch": 8.061224489795919, "grad_norm": 20.33266258239746, "learning_rate": 1.838775510204082e-05, "loss": 0.2946, "step": 1580 }, { "epoch": 8.112244897959183, "grad_norm": 0.23184408247470856, "learning_rate": 1.8377551020408165e-05, "loss": 0.1473, "step": 1590 }, { "epoch": 8.16326530612245, "grad_norm": 0.03063291497528553, "learning_rate": 1.836734693877551e-05, "loss": 0.1242, "step": 1600 }, { "epoch": 8.214285714285714, "grad_norm": 4.882504940032959, "learning_rate": 1.8357142857142857e-05, "loss": 0.2341, "step": 1610 }, { "epoch": 8.26530612244898, "grad_norm": 0.16896574199199677, "learning_rate": 1.8346938775510206e-05, "loss": 0.1384, "step": 1620 }, { "epoch": 8.316326530612244, "grad_norm": 0.20970839262008667, "learning_rate": 1.8336734693877552e-05, "loss": 0.1664, "step": 1630 }, { "epoch": 8.36734693877551, "grad_norm": 0.34165337681770325, "learning_rate": 1.8326530612244898e-05, "loss": 0.1864, "step": 1640 }, { "epoch": 8.418367346938776, "grad_norm": 5.2462615966796875, "learning_rate": 1.8316326530612247e-05, "loss": 0.0757, "step": 1650 }, { "epoch": 8.46938775510204, "grad_norm": 6.203608512878418, "learning_rate": 1.8306122448979593e-05, "loss": 0.1641, "step": 1660 }, { "epoch": 8.520408163265307, "grad_norm": 3.5064170360565186, "learning_rate": 1.8295918367346942e-05, "loss": 0.0636, "step": 1670 }, { "epoch": 8.571428571428571, "grad_norm": 0.3010745644569397, "learning_rate": 1.8285714285714288e-05, "loss": 0.0748, "step": 1680 }, { "epoch": 8.622448979591837, "grad_norm": 0.12958110868930817, "learning_rate": 1.8275510204081634e-05, "loss": 0.123, "step": 1690 }, { "epoch": 8.673469387755102, "grad_norm": 0.1100921481847763, "learning_rate": 1.826530612244898e-05, "loss": 0.1682, "step": 1700 }, { "epoch": 8.724489795918368, "grad_norm": 14.562979698181152, "learning_rate": 1.825510204081633e-05, "loss": 0.1526, "step": 1710 }, { "epoch": 8.775510204081632, "grad_norm": 0.04527562856674194, "learning_rate": 1.8244897959183674e-05, "loss": 0.2321, "step": 1720 }, { "epoch": 8.826530612244898, "grad_norm": 6.943336009979248, "learning_rate": 1.823469387755102e-05, "loss": 0.2089, "step": 1730 }, { "epoch": 8.877551020408163, "grad_norm": 0.48435041308403015, "learning_rate": 1.822448979591837e-05, "loss": 0.1027, "step": 1740 }, { "epoch": 8.928571428571429, "grad_norm": 0.03828850015997887, "learning_rate": 1.8214285714285715e-05, "loss": 0.0944, "step": 1750 }, { "epoch": 8.979591836734693, "grad_norm": 15.109698295593262, "learning_rate": 1.8204081632653064e-05, "loss": 0.236, "step": 1760 }, { "epoch": 9.0, "eval_accuracy": 0.9386281588447654, "eval_loss": 0.3043595254421234, "eval_runtime": 1.0818, "eval_samples_per_second": 256.061, "eval_steps_per_second": 32.354, "step": 1764 }, { "epoch": 9.03061224489796, "grad_norm": 7.278554916381836, "learning_rate": 1.819387755102041e-05, "loss": 0.2078, "step": 1770 }, { "epoch": 9.081632653061224, "grad_norm": 14.484221458435059, "learning_rate": 1.8183673469387756e-05, "loss": 0.1024, "step": 1780 }, { "epoch": 9.13265306122449, "grad_norm": 5.776803016662598, "learning_rate": 1.81734693877551e-05, "loss": 0.0596, "step": 1790 }, { "epoch": 9.183673469387756, "grad_norm": 0.5540585517883301, "learning_rate": 1.816326530612245e-05, "loss": 0.1216, "step": 1800 }, { "epoch": 9.23469387755102, "grad_norm": 0.41855359077453613, "learning_rate": 1.8153061224489797e-05, "loss": 0.101, "step": 1810 }, { "epoch": 9.285714285714286, "grad_norm": 17.29263687133789, "learning_rate": 1.8142857142857146e-05, "loss": 0.1049, "step": 1820 }, { "epoch": 9.33673469387755, "grad_norm": 0.22831247746944427, "learning_rate": 1.813265306122449e-05, "loss": 0.2447, "step": 1830 }, { "epoch": 9.387755102040817, "grad_norm": 7.199280261993408, "learning_rate": 1.8122448979591837e-05, "loss": 0.2071, "step": 1840 }, { "epoch": 9.438775510204081, "grad_norm": 0.9161941409111023, "learning_rate": 1.8112244897959187e-05, "loss": 0.0782, "step": 1850 }, { "epoch": 9.489795918367347, "grad_norm": 0.006657553371042013, "learning_rate": 1.8102040816326532e-05, "loss": 0.0924, "step": 1860 }, { "epoch": 9.540816326530612, "grad_norm": 13.556918144226074, "learning_rate": 1.8091836734693878e-05, "loss": 0.1964, "step": 1870 }, { "epoch": 9.591836734693878, "grad_norm": 1.3914557695388794, "learning_rate": 1.8081632653061224e-05, "loss": 0.1666, "step": 1880 }, { "epoch": 9.642857142857142, "grad_norm": 20.378259658813477, "learning_rate": 1.8071428571428573e-05, "loss": 0.0895, "step": 1890 }, { "epoch": 9.693877551020408, "grad_norm": 0.7154764533042908, "learning_rate": 1.806122448979592e-05, "loss": 0.3664, "step": 1900 }, { "epoch": 9.744897959183673, "grad_norm": 0.21097755432128906, "learning_rate": 1.8051020408163268e-05, "loss": 0.1424, "step": 1910 }, { "epoch": 9.795918367346939, "grad_norm": 0.10001561045646667, "learning_rate": 1.8040816326530614e-05, "loss": 0.1168, "step": 1920 }, { "epoch": 9.846938775510203, "grad_norm": 0.19192112982273102, "learning_rate": 1.803061224489796e-05, "loss": 0.0664, "step": 1930 }, { "epoch": 9.89795918367347, "grad_norm": 3.052227735519409, "learning_rate": 1.802040816326531e-05, "loss": 0.2538, "step": 1940 }, { "epoch": 9.948979591836736, "grad_norm": 0.14643241465091705, "learning_rate": 1.8010204081632655e-05, "loss": 0.1577, "step": 1950 }, { "epoch": 10.0, "grad_norm": 2.229574203491211, "learning_rate": 1.8e-05, "loss": 0.2124, "step": 1960 }, { "epoch": 10.0, "eval_accuracy": 0.9061371841155235, "eval_loss": 0.384831041097641, "eval_runtime": 1.0822, "eval_samples_per_second": 255.965, "eval_steps_per_second": 32.342, "step": 1960 }, { "epoch": 10.051020408163266, "grad_norm": 0.1543138176202774, "learning_rate": 1.7989795918367346e-05, "loss": 0.0973, "step": 1970 }, { "epoch": 10.10204081632653, "grad_norm": 0.06818034499883652, "learning_rate": 1.7979591836734695e-05, "loss": 0.1331, "step": 1980 }, { "epoch": 10.153061224489797, "grad_norm": 2.3080074787139893, "learning_rate": 1.796938775510204e-05, "loss": 0.1329, "step": 1990 }, { "epoch": 10.204081632653061, "grad_norm": 2.1832737922668457, "learning_rate": 1.795918367346939e-05, "loss": 0.0873, "step": 2000 }, { "epoch": 10.255102040816327, "grad_norm": 20.37071418762207, "learning_rate": 1.7948979591836736e-05, "loss": 0.2335, "step": 2010 }, { "epoch": 10.306122448979592, "grad_norm": 0.6018389463424683, "learning_rate": 1.7938775510204082e-05, "loss": 0.1283, "step": 2020 }, { "epoch": 10.357142857142858, "grad_norm": 1.4753310680389404, "learning_rate": 1.792857142857143e-05, "loss": 0.1785, "step": 2030 }, { "epoch": 10.408163265306122, "grad_norm": 1.3111803531646729, "learning_rate": 1.7918367346938777e-05, "loss": 0.0655, "step": 2040 }, { "epoch": 10.459183673469388, "grad_norm": 3.6380674839019775, "learning_rate": 1.7908163265306123e-05, "loss": 0.1624, "step": 2050 }, { "epoch": 10.510204081632653, "grad_norm": 2.4741482734680176, "learning_rate": 1.789795918367347e-05, "loss": 0.0367, "step": 2060 }, { "epoch": 10.561224489795919, "grad_norm": 0.022678235545754433, "learning_rate": 1.7887755102040818e-05, "loss": 0.1777, "step": 2070 }, { "epoch": 10.612244897959183, "grad_norm": 11.971243858337402, "learning_rate": 1.7877551020408164e-05, "loss": 0.0965, "step": 2080 }, { "epoch": 10.66326530612245, "grad_norm": 11.603116989135742, "learning_rate": 1.7867346938775513e-05, "loss": 0.2982, "step": 2090 }, { "epoch": 10.714285714285714, "grad_norm": 11.610998153686523, "learning_rate": 1.785714285714286e-05, "loss": 0.0333, "step": 2100 }, { "epoch": 10.76530612244898, "grad_norm": 5.430611610412598, "learning_rate": 1.7846938775510204e-05, "loss": 0.0881, "step": 2110 }, { "epoch": 10.816326530612244, "grad_norm": 0.03246472030878067, "learning_rate": 1.7836734693877553e-05, "loss": 0.2477, "step": 2120 }, { "epoch": 10.86734693877551, "grad_norm": 9.989664077758789, "learning_rate": 1.78265306122449e-05, "loss": 0.0317, "step": 2130 }, { "epoch": 10.918367346938776, "grad_norm": 0.7854372262954712, "learning_rate": 1.781632653061225e-05, "loss": 0.1334, "step": 2140 }, { "epoch": 10.96938775510204, "grad_norm": 14.79015064239502, "learning_rate": 1.780612244897959e-05, "loss": 0.0454, "step": 2150 }, { "epoch": 11.0, "eval_accuracy": 0.9350180505415162, "eval_loss": 0.35273221135139465, "eval_runtime": 1.0952, "eval_samples_per_second": 252.912, "eval_steps_per_second": 31.956, "step": 2156 }, { "epoch": 11.020408163265307, "grad_norm": 1.9553345441818237, "learning_rate": 1.779591836734694e-05, "loss": 0.1266, "step": 2160 }, { "epoch": 11.071428571428571, "grad_norm": 0.019537143409252167, "learning_rate": 1.7785714285714286e-05, "loss": 0.1632, "step": 2170 }, { "epoch": 11.122448979591837, "grad_norm": 14.975594520568848, "learning_rate": 1.7775510204081635e-05, "loss": 0.1683, "step": 2180 }, { "epoch": 11.173469387755102, "grad_norm": 3.7547662258148193, "learning_rate": 1.776530612244898e-05, "loss": 0.042, "step": 2190 }, { "epoch": 11.224489795918368, "grad_norm": 10.076713562011719, "learning_rate": 1.7755102040816327e-05, "loss": 0.1116, "step": 2200 }, { "epoch": 11.275510204081632, "grad_norm": 22.89598846435547, "learning_rate": 1.7744897959183676e-05, "loss": 0.3212, "step": 2210 }, { "epoch": 11.326530612244898, "grad_norm": 13.761092185974121, "learning_rate": 1.773469387755102e-05, "loss": 0.0981, "step": 2220 }, { "epoch": 11.377551020408163, "grad_norm": 14.772689819335938, "learning_rate": 1.772448979591837e-05, "loss": 0.0533, "step": 2230 }, { "epoch": 11.428571428571429, "grad_norm": 10.945533752441406, "learning_rate": 1.7714285714285717e-05, "loss": 0.0592, "step": 2240 }, { "epoch": 11.479591836734693, "grad_norm": 5.76173210144043, "learning_rate": 1.7704081632653062e-05, "loss": 0.0462, "step": 2250 }, { "epoch": 11.53061224489796, "grad_norm": 14.388745307922363, "learning_rate": 1.7693877551020408e-05, "loss": 0.0991, "step": 2260 }, { "epoch": 11.581632653061224, "grad_norm": 0.008243952877819538, "learning_rate": 1.7683673469387757e-05, "loss": 0.0725, "step": 2270 }, { "epoch": 11.63265306122449, "grad_norm": 0.624907910823822, "learning_rate": 1.7673469387755103e-05, "loss": 0.1539, "step": 2280 }, { "epoch": 11.683673469387756, "grad_norm": 0.7264893054962158, "learning_rate": 1.766326530612245e-05, "loss": 0.039, "step": 2290 }, { "epoch": 11.73469387755102, "grad_norm": 26.08190155029297, "learning_rate": 1.7653061224489798e-05, "loss": 0.2854, "step": 2300 }, { "epoch": 11.785714285714286, "grad_norm": 0.15984481573104858, "learning_rate": 1.7642857142857144e-05, "loss": 0.247, "step": 2310 }, { "epoch": 11.83673469387755, "grad_norm": 0.20649082958698273, "learning_rate": 1.7632653061224493e-05, "loss": 0.243, "step": 2320 }, { "epoch": 11.887755102040817, "grad_norm": 17.853322982788086, "learning_rate": 1.762244897959184e-05, "loss": 0.2227, "step": 2330 }, { "epoch": 11.938775510204081, "grad_norm": 11.218436241149902, "learning_rate": 1.7612244897959185e-05, "loss": 0.0581, "step": 2340 }, { "epoch": 11.989795918367347, "grad_norm": 3.662997245788574, "learning_rate": 1.760204081632653e-05, "loss": 0.0756, "step": 2350 }, { "epoch": 12.0, "eval_accuracy": 0.9350180505415162, "eval_loss": 0.2844396233558655, "eval_runtime": 1.0881, "eval_samples_per_second": 254.568, "eval_steps_per_second": 32.166, "step": 2352 }, { "epoch": 12.040816326530612, "grad_norm": 0.23816126585006714, "learning_rate": 1.759183673469388e-05, "loss": 0.0653, "step": 2360 }, { "epoch": 12.091836734693878, "grad_norm": 15.191213607788086, "learning_rate": 1.7581632653061225e-05, "loss": 0.2628, "step": 2370 }, { "epoch": 12.142857142857142, "grad_norm": 0.43516838550567627, "learning_rate": 1.757142857142857e-05, "loss": 0.0804, "step": 2380 }, { "epoch": 12.193877551020408, "grad_norm": 0.052536483854055405, "learning_rate": 1.756122448979592e-05, "loss": 0.0423, "step": 2390 }, { "epoch": 12.244897959183673, "grad_norm": 0.0405183769762516, "learning_rate": 1.7551020408163266e-05, "loss": 0.1398, "step": 2400 }, { "epoch": 12.295918367346939, "grad_norm": 0.0356355682015419, "learning_rate": 1.7540816326530615e-05, "loss": 0.1984, "step": 2410 }, { "epoch": 12.346938775510203, "grad_norm": 0.8615005612373352, "learning_rate": 1.753061224489796e-05, "loss": 0.058, "step": 2420 }, { "epoch": 12.39795918367347, "grad_norm": 12.601280212402344, "learning_rate": 1.7520408163265307e-05, "loss": 0.1513, "step": 2430 }, { "epoch": 12.448979591836734, "grad_norm": 0.04719792306423187, "learning_rate": 1.7510204081632653e-05, "loss": 0.0789, "step": 2440 }, { "epoch": 12.5, "grad_norm": 0.6252068281173706, "learning_rate": 1.7500000000000002e-05, "loss": 0.156, "step": 2450 }, { "epoch": 12.551020408163264, "grad_norm": 0.0064176819287240505, "learning_rate": 1.748979591836735e-05, "loss": 0.0292, "step": 2460 }, { "epoch": 12.60204081632653, "grad_norm": 0.006039516068994999, "learning_rate": 1.7479591836734693e-05, "loss": 0.0697, "step": 2470 }, { "epoch": 12.653061224489797, "grad_norm": 0.7756688594818115, "learning_rate": 1.7469387755102043e-05, "loss": 0.1369, "step": 2480 }, { "epoch": 12.704081632653061, "grad_norm": 11.478680610656738, "learning_rate": 1.745918367346939e-05, "loss": 0.0637, "step": 2490 }, { "epoch": 12.755102040816327, "grad_norm": 0.38068854808807373, "learning_rate": 1.7448979591836738e-05, "loss": 0.172, "step": 2500 }, { "epoch": 12.806122448979592, "grad_norm": 24.819625854492188, "learning_rate": 1.7438775510204083e-05, "loss": 0.1557, "step": 2510 }, { "epoch": 12.857142857142858, "grad_norm": 0.30620601773262024, "learning_rate": 1.742857142857143e-05, "loss": 0.1083, "step": 2520 }, { "epoch": 12.908163265306122, "grad_norm": 0.040806639939546585, "learning_rate": 1.7418367346938775e-05, "loss": 0.0875, "step": 2530 }, { "epoch": 12.959183673469388, "grad_norm": 6.181074142456055, "learning_rate": 1.7408163265306124e-05, "loss": 0.0605, "step": 2540 }, { "epoch": 13.0, "eval_accuracy": 0.9314079422382672, "eval_loss": 0.3077334761619568, "eval_runtime": 1.0865, "eval_samples_per_second": 254.949, "eval_steps_per_second": 32.214, "step": 2548 }, { "epoch": 13.010204081632653, "grad_norm": 3.649362802505493, "learning_rate": 1.7397959183673473e-05, "loss": 0.1996, "step": 2550 }, { "epoch": 13.061224489795919, "grad_norm": 8.460799217224121, "learning_rate": 1.738775510204082e-05, "loss": 0.175, "step": 2560 }, { "epoch": 13.112244897959183, "grad_norm": 0.353739470243454, "learning_rate": 1.7377551020408165e-05, "loss": 0.0743, "step": 2570 }, { "epoch": 13.16326530612245, "grad_norm": 14.380840301513672, "learning_rate": 1.736734693877551e-05, "loss": 0.0635, "step": 2580 }, { "epoch": 13.214285714285714, "grad_norm": 6.895202159881592, "learning_rate": 1.735714285714286e-05, "loss": 0.121, "step": 2590 }, { "epoch": 13.26530612244898, "grad_norm": 11.851558685302734, "learning_rate": 1.7346938775510206e-05, "loss": 0.1027, "step": 2600 }, { "epoch": 13.316326530612244, "grad_norm": 11.554853439331055, "learning_rate": 1.733673469387755e-05, "loss": 0.2741, "step": 2610 }, { "epoch": 13.36734693877551, "grad_norm": 0.6941150426864624, "learning_rate": 1.7326530612244897e-05, "loss": 0.2284, "step": 2620 }, { "epoch": 13.418367346938776, "grad_norm": 1.5535848140716553, "learning_rate": 1.7316326530612246e-05, "loss": 0.0357, "step": 2630 }, { "epoch": 13.46938775510204, "grad_norm": 0.03214215114712715, "learning_rate": 1.7306122448979596e-05, "loss": 0.0434, "step": 2640 }, { "epoch": 13.520408163265307, "grad_norm": 2.40728497505188, "learning_rate": 1.729591836734694e-05, "loss": 0.1654, "step": 2650 }, { "epoch": 13.571428571428571, "grad_norm": 0.17484958469867706, "learning_rate": 1.7285714285714287e-05, "loss": 0.0379, "step": 2660 }, { "epoch": 13.622448979591837, "grad_norm": 1.2288073301315308, "learning_rate": 1.7275510204081633e-05, "loss": 0.1237, "step": 2670 }, { "epoch": 13.673469387755102, "grad_norm": 12.001127243041992, "learning_rate": 1.7265306122448982e-05, "loss": 0.1388, "step": 2680 }, { "epoch": 13.724489795918368, "grad_norm": 14.433393478393555, "learning_rate": 1.7255102040816328e-05, "loss": 0.1118, "step": 2690 }, { "epoch": 13.775510204081632, "grad_norm": 0.018689775839447975, "learning_rate": 1.7244897959183674e-05, "loss": 0.0355, "step": 2700 }, { "epoch": 13.826530612244898, "grad_norm": 14.766127586364746, "learning_rate": 1.723469387755102e-05, "loss": 0.0885, "step": 2710 }, { "epoch": 13.877551020408163, "grad_norm": 0.3029196560382843, "learning_rate": 1.722448979591837e-05, "loss": 0.1137, "step": 2720 }, { "epoch": 13.928571428571429, "grad_norm": 0.006814345717430115, "learning_rate": 1.7214285714285718e-05, "loss": 0.1058, "step": 2730 }, { "epoch": 13.979591836734693, "grad_norm": 0.014352614991366863, "learning_rate": 1.7204081632653064e-05, "loss": 0.0214, "step": 2740 }, { "epoch": 14.0, "eval_accuracy": 0.9025270758122743, "eval_loss": 0.6295154690742493, "eval_runtime": 1.0784, "eval_samples_per_second": 256.869, "eval_steps_per_second": 32.456, "step": 2744 }, { "epoch": 14.03061224489796, "grad_norm": 12.165445327758789, "learning_rate": 1.719387755102041e-05, "loss": 0.1698, "step": 2750 }, { "epoch": 14.081632653061224, "grad_norm": 0.9271863102912903, "learning_rate": 1.7183673469387755e-05, "loss": 0.0824, "step": 2760 }, { "epoch": 14.13265306122449, "grad_norm": 10.690723419189453, "learning_rate": 1.7173469387755104e-05, "loss": 0.0212, "step": 2770 }, { "epoch": 14.183673469387756, "grad_norm": 0.00257474509999156, "learning_rate": 1.716326530612245e-05, "loss": 0.17, "step": 2780 }, { "epoch": 14.23469387755102, "grad_norm": 0.00675271125510335, "learning_rate": 1.7153061224489796e-05, "loss": 0.0483, "step": 2790 }, { "epoch": 14.285714285714286, "grad_norm": 0.220039963722229, "learning_rate": 1.7142857142857142e-05, "loss": 0.0568, "step": 2800 }, { "epoch": 14.33673469387755, "grad_norm": 0.12012650817632675, "learning_rate": 1.713265306122449e-05, "loss": 0.0801, "step": 2810 }, { "epoch": 14.387755102040817, "grad_norm": 6.478158473968506, "learning_rate": 1.712244897959184e-05, "loss": 0.03, "step": 2820 }, { "epoch": 14.438775510204081, "grad_norm": 18.334325790405273, "learning_rate": 1.7112244897959186e-05, "loss": 0.1009, "step": 2830 }, { "epoch": 14.489795918367347, "grad_norm": 19.94879722595215, "learning_rate": 1.7102040816326532e-05, "loss": 0.0626, "step": 2840 }, { "epoch": 14.540816326530612, "grad_norm": 1.723865032196045, "learning_rate": 1.7091836734693878e-05, "loss": 0.1199, "step": 2850 }, { "epoch": 14.591836734693878, "grad_norm": 0.01016510371118784, "learning_rate": 1.7081632653061227e-05, "loss": 0.0631, "step": 2860 }, { "epoch": 14.642857142857142, "grad_norm": 0.17090550065040588, "learning_rate": 1.7071428571428573e-05, "loss": 0.1654, "step": 2870 }, { "epoch": 14.693877551020408, "grad_norm": 16.334474563598633, "learning_rate": 1.7061224489795922e-05, "loss": 0.1762, "step": 2880 }, { "epoch": 14.744897959183673, "grad_norm": 6.637477397918701, "learning_rate": 1.7051020408163264e-05, "loss": 0.1625, "step": 2890 }, { "epoch": 14.795918367346939, "grad_norm": 2.353304862976074, "learning_rate": 1.7040816326530613e-05, "loss": 0.134, "step": 2900 }, { "epoch": 14.846938775510203, "grad_norm": 27.230615615844727, "learning_rate": 1.7030612244897962e-05, "loss": 0.1221, "step": 2910 }, { "epoch": 14.89795918367347, "grad_norm": 8.546542167663574, "learning_rate": 1.7020408163265308e-05, "loss": 0.1062, "step": 2920 }, { "epoch": 14.948979591836736, "grad_norm": 11.233489990234375, "learning_rate": 1.7010204081632654e-05, "loss": 0.0891, "step": 2930 }, { "epoch": 15.0, "grad_norm": 25.588375091552734, "learning_rate": 1.7e-05, "loss": 0.1816, "step": 2940 }, { "epoch": 15.0, "eval_accuracy": 0.9386281588447654, "eval_loss": 0.29960137605667114, "eval_runtime": 1.0859, "eval_samples_per_second": 255.086, "eval_steps_per_second": 32.231, "step": 2940 }, { "epoch": 15.051020408163266, "grad_norm": 20.615747451782227, "learning_rate": 1.698979591836735e-05, "loss": 0.2646, "step": 2950 }, { "epoch": 15.10204081632653, "grad_norm": 0.06437131017446518, "learning_rate": 1.6979591836734695e-05, "loss": 0.1745, "step": 2960 }, { "epoch": 15.153061224489797, "grad_norm": 0.03643803671002388, "learning_rate": 1.6969387755102044e-05, "loss": 0.0643, "step": 2970 }, { "epoch": 15.204081632653061, "grad_norm": 0.024485215544700623, "learning_rate": 1.695918367346939e-05, "loss": 0.026, "step": 2980 }, { "epoch": 15.255102040816327, "grad_norm": 0.43951642513275146, "learning_rate": 1.6948979591836736e-05, "loss": 0.0018, "step": 2990 }, { "epoch": 15.306122448979592, "grad_norm": 2.285606622695923, "learning_rate": 1.6938775510204085e-05, "loss": 0.0091, "step": 3000 }, { "epoch": 15.357142857142858, "grad_norm": 0.10704608261585236, "learning_rate": 1.692857142857143e-05, "loss": 0.0494, "step": 3010 }, { "epoch": 15.408163265306122, "grad_norm": 0.704951286315918, "learning_rate": 1.6918367346938776e-05, "loss": 0.0182, "step": 3020 }, { "epoch": 15.459183673469388, "grad_norm": 0.2643127143383026, "learning_rate": 1.6908163265306122e-05, "loss": 0.0143, "step": 3030 }, { "epoch": 15.510204081632653, "grad_norm": 16.261783599853516, "learning_rate": 1.689795918367347e-05, "loss": 0.1466, "step": 3040 }, { "epoch": 15.561224489795919, "grad_norm": 0.7178455591201782, "learning_rate": 1.6887755102040817e-05, "loss": 0.0773, "step": 3050 }, { "epoch": 15.612244897959183, "grad_norm": 12.605389595031738, "learning_rate": 1.6877551020408166e-05, "loss": 0.1953, "step": 3060 }, { "epoch": 15.66326530612245, "grad_norm": 0.14849582314491272, "learning_rate": 1.6867346938775512e-05, "loss": 0.0965, "step": 3070 }, { "epoch": 15.714285714285714, "grad_norm": 0.6593707203865051, "learning_rate": 1.6857142857142858e-05, "loss": 0.1183, "step": 3080 }, { "epoch": 15.76530612244898, "grad_norm": 17.250619888305664, "learning_rate": 1.6846938775510207e-05, "loss": 0.0798, "step": 3090 }, { "epoch": 15.816326530612244, "grad_norm": 0.04563483968377113, "learning_rate": 1.6836734693877553e-05, "loss": 0.1091, "step": 3100 }, { "epoch": 15.86734693877551, "grad_norm": 0.596813976764679, "learning_rate": 1.68265306122449e-05, "loss": 0.1421, "step": 3110 }, { "epoch": 15.918367346938776, "grad_norm": 0.006604531779885292, "learning_rate": 1.6816326530612244e-05, "loss": 0.0696, "step": 3120 }, { "epoch": 15.96938775510204, "grad_norm": 0.015337258577346802, "learning_rate": 1.6806122448979594e-05, "loss": 0.0338, "step": 3130 }, { "epoch": 16.0, "eval_accuracy": 0.927797833935018, "eval_loss": 0.3596982955932617, "eval_runtime": 1.0814, "eval_samples_per_second": 256.153, "eval_steps_per_second": 32.366, "step": 3136 }, { "epoch": 16.020408163265305, "grad_norm": 0.020698636770248413, "learning_rate": 1.679591836734694e-05, "loss": 0.1661, "step": 3140 }, { "epoch": 16.071428571428573, "grad_norm": 0.24158287048339844, "learning_rate": 1.678571428571429e-05, "loss": 0.0471, "step": 3150 }, { "epoch": 16.122448979591837, "grad_norm": 0.019348515197634697, "learning_rate": 1.6775510204081634e-05, "loss": 0.174, "step": 3160 }, { "epoch": 16.1734693877551, "grad_norm": 9.088598251342773, "learning_rate": 1.676530612244898e-05, "loss": 0.0353, "step": 3170 }, { "epoch": 16.224489795918366, "grad_norm": 1.041342854499817, "learning_rate": 1.675510204081633e-05, "loss": 0.0277, "step": 3180 }, { "epoch": 16.275510204081634, "grad_norm": 0.007071027532219887, "learning_rate": 1.6744897959183675e-05, "loss": 0.0447, "step": 3190 }, { "epoch": 16.3265306122449, "grad_norm": 0.057512346655130386, "learning_rate": 1.673469387755102e-05, "loss": 0.0566, "step": 3200 }, { "epoch": 16.377551020408163, "grad_norm": 0.0070049758069217205, "learning_rate": 1.6724489795918367e-05, "loss": 0.0071, "step": 3210 }, { "epoch": 16.428571428571427, "grad_norm": 17.923377990722656, "learning_rate": 1.6714285714285716e-05, "loss": 0.134, "step": 3220 }, { "epoch": 16.479591836734695, "grad_norm": 3.667469024658203, "learning_rate": 1.6704081632653062e-05, "loss": 0.1446, "step": 3230 }, { "epoch": 16.53061224489796, "grad_norm": 0.08790452033281326, "learning_rate": 1.669387755102041e-05, "loss": 0.2328, "step": 3240 }, { "epoch": 16.581632653061224, "grad_norm": 0.07376696914434433, "learning_rate": 1.6683673469387757e-05, "loss": 0.115, "step": 3250 }, { "epoch": 16.632653061224488, "grad_norm": 4.208004951477051, "learning_rate": 1.6673469387755102e-05, "loss": 0.1408, "step": 3260 }, { "epoch": 16.683673469387756, "grad_norm": 0.6688640117645264, "learning_rate": 1.666326530612245e-05, "loss": 0.0081, "step": 3270 }, { "epoch": 16.73469387755102, "grad_norm": 13.945034980773926, "learning_rate": 1.6653061224489797e-05, "loss": 0.0777, "step": 3280 }, { "epoch": 16.785714285714285, "grad_norm": 14.318987846374512, "learning_rate": 1.6642857142857147e-05, "loss": 0.0232, "step": 3290 }, { "epoch": 16.836734693877553, "grad_norm": 14.283474922180176, "learning_rate": 1.6632653061224492e-05, "loss": 0.2254, "step": 3300 }, { "epoch": 16.887755102040817, "grad_norm": 25.923355102539062, "learning_rate": 1.6622448979591838e-05, "loss": 0.0943, "step": 3310 }, { "epoch": 16.93877551020408, "grad_norm": 0.262794554233551, "learning_rate": 1.6612244897959184e-05, "loss": 0.0652, "step": 3320 }, { "epoch": 16.989795918367346, "grad_norm": 18.311269760131836, "learning_rate": 1.6602040816326533e-05, "loss": 0.2136, "step": 3330 }, { "epoch": 17.0, "eval_accuracy": 0.9314079422382672, "eval_loss": 0.4069954752922058, "eval_runtime": 1.0812, "eval_samples_per_second": 256.194, "eval_steps_per_second": 32.371, "step": 3332 }, { "epoch": 17.040816326530614, "grad_norm": 0.4477146565914154, "learning_rate": 1.659183673469388e-05, "loss": 0.0609, "step": 3340 }, { "epoch": 17.091836734693878, "grad_norm": 0.008269555866718292, "learning_rate": 1.6581632653061225e-05, "loss": 0.0167, "step": 3350 }, { "epoch": 17.142857142857142, "grad_norm": 3.7902915477752686, "learning_rate": 1.6571428571428574e-05, "loss": 0.0072, "step": 3360 }, { "epoch": 17.193877551020407, "grad_norm": 7.191639423370361, "learning_rate": 1.656122448979592e-05, "loss": 0.0811, "step": 3370 }, { "epoch": 17.244897959183675, "grad_norm": 19.537282943725586, "learning_rate": 1.655102040816327e-05, "loss": 0.0874, "step": 3380 }, { "epoch": 17.29591836734694, "grad_norm": 0.1004362404346466, "learning_rate": 1.6540816326530615e-05, "loss": 0.045, "step": 3390 }, { "epoch": 17.346938775510203, "grad_norm": 0.0025940965861082077, "learning_rate": 1.653061224489796e-05, "loss": 0.0702, "step": 3400 }, { "epoch": 17.397959183673468, "grad_norm": 0.02602744847536087, "learning_rate": 1.6520408163265306e-05, "loss": 0.0602, "step": 3410 }, { "epoch": 17.448979591836736, "grad_norm": 0.019690580666065216, "learning_rate": 1.6510204081632655e-05, "loss": 0.0564, "step": 3420 }, { "epoch": 17.5, "grad_norm": 0.012594260275363922, "learning_rate": 1.65e-05, "loss": 0.0455, "step": 3430 }, { "epoch": 17.551020408163264, "grad_norm": 3.738819122314453, "learning_rate": 1.6489795918367347e-05, "loss": 0.1015, "step": 3440 }, { "epoch": 17.602040816326532, "grad_norm": 1.2673053741455078, "learning_rate": 1.6479591836734696e-05, "loss": 0.0495, "step": 3450 }, { "epoch": 17.653061224489797, "grad_norm": 28.030643463134766, "learning_rate": 1.6469387755102042e-05, "loss": 0.1033, "step": 3460 }, { "epoch": 17.70408163265306, "grad_norm": 0.04187894985079765, "learning_rate": 1.645918367346939e-05, "loss": 0.0661, "step": 3470 }, { "epoch": 17.755102040816325, "grad_norm": 20.57587432861328, "learning_rate": 1.6448979591836737e-05, "loss": 0.0317, "step": 3480 }, { "epoch": 17.806122448979593, "grad_norm": 0.0063165766187012196, "learning_rate": 1.6438775510204083e-05, "loss": 0.0035, "step": 3490 }, { "epoch": 17.857142857142858, "grad_norm": 8.107522964477539, "learning_rate": 1.642857142857143e-05, "loss": 0.1788, "step": 3500 }, { "epoch": 17.908163265306122, "grad_norm": 2.974186420440674, "learning_rate": 1.6418367346938778e-05, "loss": 0.0784, "step": 3510 }, { "epoch": 17.959183673469386, "grad_norm": 29.06330680847168, "learning_rate": 1.6408163265306124e-05, "loss": 0.188, "step": 3520 }, { "epoch": 18.0, "eval_accuracy": 0.9458483754512635, "eval_loss": 0.35322803258895874, "eval_runtime": 1.0824, "eval_samples_per_second": 255.924, "eval_steps_per_second": 32.337, "step": 3528 }, { "epoch": 18.010204081632654, "grad_norm": 0.003397200722247362, "learning_rate": 1.639795918367347e-05, "loss": 0.0342, "step": 3530 }, { "epoch": 18.06122448979592, "grad_norm": 0.11866701394319534, "learning_rate": 1.638775510204082e-05, "loss": 0.0041, "step": 3540 }, { "epoch": 18.112244897959183, "grad_norm": 17.841978073120117, "learning_rate": 1.6377551020408164e-05, "loss": 0.1523, "step": 3550 }, { "epoch": 18.163265306122447, "grad_norm": 19.46847152709961, "learning_rate": 1.6367346938775513e-05, "loss": 0.0851, "step": 3560 }, { "epoch": 18.214285714285715, "grad_norm": 0.02146238461136818, "learning_rate": 1.635714285714286e-05, "loss": 0.075, "step": 3570 }, { "epoch": 18.26530612244898, "grad_norm": 16.66864585876465, "learning_rate": 1.6346938775510205e-05, "loss": 0.0843, "step": 3580 }, { "epoch": 18.316326530612244, "grad_norm": 0.005906553473323584, "learning_rate": 1.633673469387755e-05, "loss": 0.1285, "step": 3590 }, { "epoch": 18.367346938775512, "grad_norm": 23.329856872558594, "learning_rate": 1.63265306122449e-05, "loss": 0.0197, "step": 3600 }, { "epoch": 18.418367346938776, "grad_norm": 0.00302005629055202, "learning_rate": 1.6316326530612246e-05, "loss": 0.0025, "step": 3610 }, { "epoch": 18.46938775510204, "grad_norm": 14.957867622375488, "learning_rate": 1.630612244897959e-05, "loss": 0.1245, "step": 3620 }, { "epoch": 18.520408163265305, "grad_norm": 0.6144952774047852, "learning_rate": 1.629591836734694e-05, "loss": 0.1126, "step": 3630 }, { "epoch": 18.571428571428573, "grad_norm": 0.008064229972660542, "learning_rate": 1.6285714285714287e-05, "loss": 0.0053, "step": 3640 }, { "epoch": 18.622448979591837, "grad_norm": 2.8381404876708984, "learning_rate": 1.6275510204081636e-05, "loss": 0.0896, "step": 3650 }, { "epoch": 18.6734693877551, "grad_norm": 0.015243437141180038, "learning_rate": 1.626530612244898e-05, "loss": 0.0916, "step": 3660 }, { "epoch": 18.724489795918366, "grad_norm": 0.07484673708677292, "learning_rate": 1.6255102040816327e-05, "loss": 0.1214, "step": 3670 }, { "epoch": 18.775510204081634, "grad_norm": 19.52052116394043, "learning_rate": 1.6244897959183673e-05, "loss": 0.0779, "step": 3680 }, { "epoch": 18.8265306122449, "grad_norm": 9.364171028137207, "learning_rate": 1.6234693877551022e-05, "loss": 0.0457, "step": 3690 }, { "epoch": 18.877551020408163, "grad_norm": 0.29549679160118103, "learning_rate": 1.6224489795918368e-05, "loss": 0.0136, "step": 3700 }, { "epoch": 18.928571428571427, "grad_norm": 0.04433099552989006, "learning_rate": 1.6214285714285717e-05, "loss": 0.1046, "step": 3710 }, { "epoch": 18.979591836734695, "grad_norm": 0.11120835691690445, "learning_rate": 1.6204081632653063e-05, "loss": 0.0539, "step": 3720 }, { "epoch": 19.0, "eval_accuracy": 0.9386281588447654, "eval_loss": 0.3842667043209076, "eval_runtime": 1.0912, "eval_samples_per_second": 253.843, "eval_steps_per_second": 32.074, "step": 3724 }, { "epoch": 19.03061224489796, "grad_norm": 18.89168930053711, "learning_rate": 1.619387755102041e-05, "loss": 0.1791, "step": 3730 }, { "epoch": 19.081632653061224, "grad_norm": 0.20775316655635834, "learning_rate": 1.6183673469387758e-05, "loss": 0.0886, "step": 3740 }, { "epoch": 19.132653061224488, "grad_norm": 0.00598714267835021, "learning_rate": 1.6173469387755104e-05, "loss": 0.0211, "step": 3750 }, { "epoch": 19.183673469387756, "grad_norm": 0.03521345555782318, "learning_rate": 1.616326530612245e-05, "loss": 0.1761, "step": 3760 }, { "epoch": 19.23469387755102, "grad_norm": 0.10337071865797043, "learning_rate": 1.6153061224489795e-05, "loss": 0.0588, "step": 3770 }, { "epoch": 19.285714285714285, "grad_norm": 9.74257755279541, "learning_rate": 1.6142857142857145e-05, "loss": 0.2181, "step": 3780 }, { "epoch": 19.336734693877553, "grad_norm": 0.004976754076778889, "learning_rate": 1.613265306122449e-05, "loss": 0.0013, "step": 3790 }, { "epoch": 19.387755102040817, "grad_norm": 0.07156233489513397, "learning_rate": 1.612244897959184e-05, "loss": 0.1816, "step": 3800 }, { "epoch": 19.43877551020408, "grad_norm": 0.01723037101328373, "learning_rate": 1.6112244897959185e-05, "loss": 0.0955, "step": 3810 }, { "epoch": 19.489795918367346, "grad_norm": 0.005089986138045788, "learning_rate": 1.610204081632653e-05, "loss": 0.0018, "step": 3820 }, { "epoch": 19.540816326530614, "grad_norm": 0.011756361462175846, "learning_rate": 1.609183673469388e-05, "loss": 0.1058, "step": 3830 }, { "epoch": 19.591836734693878, "grad_norm": 0.9381488561630249, "learning_rate": 1.6081632653061226e-05, "loss": 0.0395, "step": 3840 }, { "epoch": 19.642857142857142, "grad_norm": 12.116026878356934, "learning_rate": 1.6071428571428572e-05, "loss": 0.03, "step": 3850 }, { "epoch": 19.693877551020407, "grad_norm": 17.397939682006836, "learning_rate": 1.6061224489795918e-05, "loss": 0.2443, "step": 3860 }, { "epoch": 19.744897959183675, "grad_norm": 3.33341383934021, "learning_rate": 1.6051020408163267e-05, "loss": 0.0443, "step": 3870 }, { "epoch": 19.79591836734694, "grad_norm": 0.02881155163049698, "learning_rate": 1.6040816326530613e-05, "loss": 0.0081, "step": 3880 }, { "epoch": 19.846938775510203, "grad_norm": 0.1269502192735672, "learning_rate": 1.6030612244897962e-05, "loss": 0.0312, "step": 3890 }, { "epoch": 19.897959183673468, "grad_norm": 0.020297830924391747, "learning_rate": 1.6020408163265308e-05, "loss": 0.0032, "step": 3900 }, { "epoch": 19.948979591836736, "grad_norm": 0.9172767996788025, "learning_rate": 1.6010204081632653e-05, "loss": 0.0944, "step": 3910 }, { "epoch": 20.0, "grad_norm": 36.7784538269043, "learning_rate": 1.6000000000000003e-05, "loss": 0.0992, "step": 3920 }, { "epoch": 20.0, "eval_accuracy": 0.9422382671480144, "eval_loss": 0.3904285728931427, "eval_runtime": 1.0843, "eval_samples_per_second": 255.462, "eval_steps_per_second": 32.279, "step": 3920 }, { "epoch": 20.051020408163264, "grad_norm": 0.0048736752942204475, "learning_rate": 1.598979591836735e-05, "loss": 0.0715, "step": 3930 }, { "epoch": 20.102040816326532, "grad_norm": 0.0897139459848404, "learning_rate": 1.5979591836734694e-05, "loss": 0.0005, "step": 3940 }, { "epoch": 20.153061224489797, "grad_norm": 0.014886999502778053, "learning_rate": 1.596938775510204e-05, "loss": 0.0978, "step": 3950 }, { "epoch": 20.20408163265306, "grad_norm": 19.357213973999023, "learning_rate": 1.595918367346939e-05, "loss": 0.132, "step": 3960 }, { "epoch": 20.255102040816325, "grad_norm": 19.083843231201172, "learning_rate": 1.5948979591836735e-05, "loss": 0.0157, "step": 3970 }, { "epoch": 20.306122448979593, "grad_norm": 0.005426835268735886, "learning_rate": 1.5938775510204084e-05, "loss": 0.2043, "step": 3980 }, { "epoch": 20.357142857142858, "grad_norm": 0.004939820151776075, "learning_rate": 1.592857142857143e-05, "loss": 0.0122, "step": 3990 }, { "epoch": 20.408163265306122, "grad_norm": 0.012766397558152676, "learning_rate": 1.5918367346938776e-05, "loss": 0.0209, "step": 4000 }, { "epoch": 20.459183673469386, "grad_norm": 0.18527641892433167, "learning_rate": 1.5908163265306125e-05, "loss": 0.145, "step": 4010 }, { "epoch": 20.510204081632654, "grad_norm": 0.013389199040830135, "learning_rate": 1.589795918367347e-05, "loss": 0.0478, "step": 4020 }, { "epoch": 20.56122448979592, "grad_norm": 0.026373926550149918, "learning_rate": 1.588775510204082e-05, "loss": 0.0837, "step": 4030 }, { "epoch": 20.612244897959183, "grad_norm": 20.30522346496582, "learning_rate": 1.5877551020408162e-05, "loss": 0.0205, "step": 4040 }, { "epoch": 20.663265306122447, "grad_norm": 0.0056726750917732716, "learning_rate": 1.586734693877551e-05, "loss": 0.0077, "step": 4050 }, { "epoch": 20.714285714285715, "grad_norm": 0.023087698966264725, "learning_rate": 1.5857142857142857e-05, "loss": 0.001, "step": 4060 }, { "epoch": 20.76530612244898, "grad_norm": 4.620561122894287, "learning_rate": 1.5846938775510206e-05, "loss": 0.1519, "step": 4070 }, { "epoch": 20.816326530612244, "grad_norm": 3.535813808441162, "learning_rate": 1.5836734693877552e-05, "loss": 0.0374, "step": 4080 }, { "epoch": 20.867346938775512, "grad_norm": 0.02633027546107769, "learning_rate": 1.5826530612244898e-05, "loss": 0.0152, "step": 4090 }, { "epoch": 20.918367346938776, "grad_norm": 2.311264753341675, "learning_rate": 1.5816326530612247e-05, "loss": 0.0735, "step": 4100 }, { "epoch": 20.96938775510204, "grad_norm": 0.03760617598891258, "learning_rate": 1.5806122448979593e-05, "loss": 0.0019, "step": 4110 }, { "epoch": 21.0, "eval_accuracy": 0.9458483754512635, "eval_loss": 0.3732448220252991, "eval_runtime": 1.0906, "eval_samples_per_second": 253.98, "eval_steps_per_second": 32.091, "step": 4116 }, { "epoch": 21.020408163265305, "grad_norm": 3.107679605484009, "learning_rate": 1.5795918367346942e-05, "loss": 0.048, "step": 4120 }, { "epoch": 21.071428571428573, "grad_norm": 13.65282154083252, "learning_rate": 1.5785714285714288e-05, "loss": 0.0755, "step": 4130 }, { "epoch": 21.122448979591837, "grad_norm": 0.49130183458328247, "learning_rate": 1.5775510204081634e-05, "loss": 0.0997, "step": 4140 }, { "epoch": 21.1734693877551, "grad_norm": 0.004252033773809671, "learning_rate": 1.576530612244898e-05, "loss": 0.0392, "step": 4150 }, { "epoch": 21.224489795918366, "grad_norm": 19.566246032714844, "learning_rate": 1.575510204081633e-05, "loss": 0.1262, "step": 4160 }, { "epoch": 21.275510204081634, "grad_norm": 0.003313811495900154, "learning_rate": 1.5744897959183675e-05, "loss": 0.0867, "step": 4170 }, { "epoch": 21.3265306122449, "grad_norm": 0.3749760091304779, "learning_rate": 1.573469387755102e-05, "loss": 0.1837, "step": 4180 }, { "epoch": 21.377551020408163, "grad_norm": 1.984567403793335, "learning_rate": 1.572448979591837e-05, "loss": 0.078, "step": 4190 }, { "epoch": 21.428571428571427, "grad_norm": 0.10399512201547623, "learning_rate": 1.5714285714285715e-05, "loss": 0.0045, "step": 4200 }, { "epoch": 21.479591836734695, "grad_norm": 0.002528528915718198, "learning_rate": 1.5704081632653065e-05, "loss": 0.189, "step": 4210 }, { "epoch": 21.53061224489796, "grad_norm": 20.07097053527832, "learning_rate": 1.569387755102041e-05, "loss": 0.0501, "step": 4220 }, { "epoch": 21.581632653061224, "grad_norm": 0.05640505999326706, "learning_rate": 1.5683673469387756e-05, "loss": 0.0684, "step": 4230 }, { "epoch": 21.632653061224488, "grad_norm": 0.009093325585126877, "learning_rate": 1.5673469387755102e-05, "loss": 0.1911, "step": 4240 }, { "epoch": 21.683673469387756, "grad_norm": 0.025773674249649048, "learning_rate": 1.566326530612245e-05, "loss": 0.0511, "step": 4250 }, { "epoch": 21.73469387755102, "grad_norm": 22.773637771606445, "learning_rate": 1.5653061224489797e-05, "loss": 0.1296, "step": 4260 }, { "epoch": 21.785714285714285, "grad_norm": 0.24630846083164215, "learning_rate": 1.5642857142857143e-05, "loss": 0.0996, "step": 4270 }, { "epoch": 21.836734693877553, "grad_norm": 0.052978452295064926, "learning_rate": 1.5632653061224492e-05, "loss": 0.1596, "step": 4280 }, { "epoch": 21.887755102040817, "grad_norm": 0.08490794152021408, "learning_rate": 1.5622448979591838e-05, "loss": 0.0044, "step": 4290 }, { "epoch": 21.93877551020408, "grad_norm": 0.29748037457466125, "learning_rate": 1.5612244897959187e-05, "loss": 0.0714, "step": 4300 }, { "epoch": 21.989795918367346, "grad_norm": 1.1432496309280396, "learning_rate": 1.5602040816326533e-05, "loss": 0.0348, "step": 4310 }, { "epoch": 22.0, "eval_accuracy": 0.9386281588447654, "eval_loss": 0.4021454453468323, "eval_runtime": 1.1035, "eval_samples_per_second": 251.016, "eval_steps_per_second": 31.717, "step": 4312 }, { "epoch": 22.040816326530614, "grad_norm": 14.441971778869629, "learning_rate": 1.559183673469388e-05, "loss": 0.0226, "step": 4320 }, { "epoch": 22.091836734693878, "grad_norm": 0.3996661901473999, "learning_rate": 1.5581632653061224e-05, "loss": 0.2515, "step": 4330 }, { "epoch": 22.142857142857142, "grad_norm": 0.036494385451078415, "learning_rate": 1.5571428571428573e-05, "loss": 0.018, "step": 4340 }, { "epoch": 22.193877551020407, "grad_norm": 10.0978422164917, "learning_rate": 1.556122448979592e-05, "loss": 0.045, "step": 4350 }, { "epoch": 22.244897959183675, "grad_norm": 16.72916603088379, "learning_rate": 1.5551020408163265e-05, "loss": 0.0302, "step": 4360 }, { "epoch": 22.29591836734694, "grad_norm": 0.19142484664916992, "learning_rate": 1.5540816326530614e-05, "loss": 0.0929, "step": 4370 }, { "epoch": 22.346938775510203, "grad_norm": 28.01141357421875, "learning_rate": 1.553061224489796e-05, "loss": 0.1371, "step": 4380 }, { "epoch": 22.397959183673468, "grad_norm": 0.004009624011814594, "learning_rate": 1.552040816326531e-05, "loss": 0.1294, "step": 4390 }, { "epoch": 22.448979591836736, "grad_norm": 0.03476819396018982, "learning_rate": 1.5510204081632655e-05, "loss": 0.0016, "step": 4400 }, { "epoch": 22.5, "grad_norm": 18.83043670654297, "learning_rate": 1.55e-05, "loss": 0.1413, "step": 4410 }, { "epoch": 22.551020408163264, "grad_norm": 2.8161182403564453, "learning_rate": 1.5489795918367346e-05, "loss": 0.0335, "step": 4420 }, { "epoch": 22.602040816326532, "grad_norm": 0.07493819296360016, "learning_rate": 1.5479591836734696e-05, "loss": 0.118, "step": 4430 }, { "epoch": 22.653061224489797, "grad_norm": 21.360933303833008, "learning_rate": 1.546938775510204e-05, "loss": 0.0297, "step": 4440 }, { "epoch": 22.70408163265306, "grad_norm": 0.4809311628341675, "learning_rate": 1.545918367346939e-05, "loss": 0.0293, "step": 4450 }, { "epoch": 22.755102040816325, "grad_norm": 0.01991378143429756, "learning_rate": 1.5448979591836736e-05, "loss": 0.0275, "step": 4460 }, { "epoch": 22.806122448979593, "grad_norm": 0.01184681337326765, "learning_rate": 1.5438775510204082e-05, "loss": 0.007, "step": 4470 }, { "epoch": 22.857142857142858, "grad_norm": 0.00782001856714487, "learning_rate": 1.542857142857143e-05, "loss": 0.0223, "step": 4480 }, { "epoch": 22.908163265306122, "grad_norm": 0.032825034111738205, "learning_rate": 1.5418367346938777e-05, "loss": 0.0043, "step": 4490 }, { "epoch": 22.959183673469386, "grad_norm": 0.32028093934059143, "learning_rate": 1.5408163265306123e-05, "loss": 0.0823, "step": 4500 }, { "epoch": 23.0, "eval_accuracy": 0.9350180505415162, "eval_loss": 0.4216640889644623, "eval_runtime": 1.0779, "eval_samples_per_second": 256.989, "eval_steps_per_second": 32.472, "step": 4508 }, { "epoch": 23.010204081632654, "grad_norm": 0.0210735984146595, "learning_rate": 1.539795918367347e-05, "loss": 0.0444, "step": 4510 }, { "epoch": 23.06122448979592, "grad_norm": 0.0035948522854596376, "learning_rate": 1.5387755102040818e-05, "loss": 0.0021, "step": 4520 }, { "epoch": 23.112244897959183, "grad_norm": 0.03267397731542587, "learning_rate": 1.5377551020408164e-05, "loss": 0.1172, "step": 4530 }, { "epoch": 23.163265306122447, "grad_norm": 0.2000296711921692, "learning_rate": 1.5367346938775513e-05, "loss": 0.0506, "step": 4540 }, { "epoch": 23.214285714285715, "grad_norm": 19.305952072143555, "learning_rate": 1.535714285714286e-05, "loss": 0.0229, "step": 4550 }, { "epoch": 23.26530612244898, "grad_norm": 19.652467727661133, "learning_rate": 1.5346938775510204e-05, "loss": 0.0758, "step": 4560 }, { "epoch": 23.316326530612244, "grad_norm": 0.10466165095567703, "learning_rate": 1.5336734693877554e-05, "loss": 0.0174, "step": 4570 }, { "epoch": 23.367346938775512, "grad_norm": 0.03658609837293625, "learning_rate": 1.53265306122449e-05, "loss": 0.1824, "step": 4580 }, { "epoch": 23.418367346938776, "grad_norm": 0.005342472344636917, "learning_rate": 1.5316326530612245e-05, "loss": 0.0059, "step": 4590 }, { "epoch": 23.46938775510204, "grad_norm": 0.003301397431641817, "learning_rate": 1.530612244897959e-05, "loss": 0.0112, "step": 4600 }, { "epoch": 23.520408163265305, "grad_norm": 22.812711715698242, "learning_rate": 1.529591836734694e-05, "loss": 0.1342, "step": 4610 }, { "epoch": 23.571428571428573, "grad_norm": 0.0017427552957087755, "learning_rate": 1.5285714285714286e-05, "loss": 0.0065, "step": 4620 }, { "epoch": 23.622448979591837, "grad_norm": 0.07252141088247299, "learning_rate": 1.5275510204081635e-05, "loss": 0.0195, "step": 4630 }, { "epoch": 23.6734693877551, "grad_norm": 0.0034201748203486204, "learning_rate": 1.526530612244898e-05, "loss": 0.0044, "step": 4640 }, { "epoch": 23.724489795918366, "grad_norm": 0.003710463410243392, "learning_rate": 1.5255102040816327e-05, "loss": 0.1342, "step": 4650 }, { "epoch": 23.775510204081634, "grad_norm": 0.041329726576805115, "learning_rate": 1.5244897959183676e-05, "loss": 0.001, "step": 4660 }, { "epoch": 23.8265306122449, "grad_norm": 7.312625885009766, "learning_rate": 1.5234693877551022e-05, "loss": 0.0778, "step": 4670 }, { "epoch": 23.877551020408163, "grad_norm": 9.63132095336914, "learning_rate": 1.522448979591837e-05, "loss": 0.0137, "step": 4680 }, { "epoch": 23.928571428571427, "grad_norm": 0.05912260338664055, "learning_rate": 1.5214285714285715e-05, "loss": 0.1853, "step": 4690 }, { "epoch": 23.979591836734695, "grad_norm": 0.0024821078404784203, "learning_rate": 1.5204081632653063e-05, "loss": 0.1125, "step": 4700 }, { "epoch": 24.0, "eval_accuracy": 0.9097472924187726, "eval_loss": 0.47038131952285767, "eval_runtime": 1.0814, "eval_samples_per_second": 256.159, "eval_steps_per_second": 32.367, "step": 4704 }, { "epoch": 24.03061224489796, "grad_norm": 0.004066828638315201, "learning_rate": 1.5193877551020408e-05, "loss": 0.0009, "step": 4710 }, { "epoch": 24.081632653061224, "grad_norm": 0.008945166133344173, "learning_rate": 1.5183673469387756e-05, "loss": 0.0502, "step": 4720 }, { "epoch": 24.132653061224488, "grad_norm": 0.06662328541278839, "learning_rate": 1.5173469387755105e-05, "loss": 0.1419, "step": 4730 }, { "epoch": 24.183673469387756, "grad_norm": 28.455562591552734, "learning_rate": 1.516326530612245e-05, "loss": 0.122, "step": 4740 }, { "epoch": 24.23469387755102, "grad_norm": 0.11682406812906265, "learning_rate": 1.5153061224489798e-05, "loss": 0.1796, "step": 4750 }, { "epoch": 24.285714285714285, "grad_norm": 0.02721104770898819, "learning_rate": 1.5142857142857144e-05, "loss": 0.0523, "step": 4760 }, { "epoch": 24.336734693877553, "grad_norm": 4.006387233734131, "learning_rate": 1.5132653061224492e-05, "loss": 0.0906, "step": 4770 }, { "epoch": 24.387755102040817, "grad_norm": 1.0888190269470215, "learning_rate": 1.5122448979591837e-05, "loss": 0.0199, "step": 4780 }, { "epoch": 24.43877551020408, "grad_norm": 0.011628500185906887, "learning_rate": 1.5112244897959185e-05, "loss": 0.0057, "step": 4790 }, { "epoch": 24.489795918367346, "grad_norm": 0.010286957025527954, "learning_rate": 1.510204081632653e-05, "loss": 0.1457, "step": 4800 }, { "epoch": 24.540816326530614, "grad_norm": 0.32226938009262085, "learning_rate": 1.5091836734693878e-05, "loss": 0.0009, "step": 4810 }, { "epoch": 24.591836734693878, "grad_norm": 0.005303681828081608, "learning_rate": 1.5081632653061227e-05, "loss": 0.026, "step": 4820 }, { "epoch": 24.642857142857142, "grad_norm": 14.81651496887207, "learning_rate": 1.5071428571428573e-05, "loss": 0.0849, "step": 4830 }, { "epoch": 24.693877551020407, "grad_norm": 20.982263565063477, "learning_rate": 1.506122448979592e-05, "loss": 0.0329, "step": 4840 }, { "epoch": 24.744897959183675, "grad_norm": 7.7448954582214355, "learning_rate": 1.5051020408163266e-05, "loss": 0.1013, "step": 4850 }, { "epoch": 24.79591836734694, "grad_norm": 0.06572245061397552, "learning_rate": 1.5040816326530614e-05, "loss": 0.058, "step": 4860 }, { "epoch": 24.846938775510203, "grad_norm": 0.00522198062390089, "learning_rate": 1.503061224489796e-05, "loss": 0.0095, "step": 4870 }, { "epoch": 24.897959183673468, "grad_norm": 32.695030212402344, "learning_rate": 1.5020408163265307e-05, "loss": 0.0216, "step": 4880 }, { "epoch": 24.948979591836736, "grad_norm": 0.03301665931940079, "learning_rate": 1.5010204081632653e-05, "loss": 0.1196, "step": 4890 }, { "epoch": 25.0, "grad_norm": 1.3941667079925537, "learning_rate": 1.5000000000000002e-05, "loss": 0.0173, "step": 4900 }, { "epoch": 25.0, "eval_accuracy": 0.9350180505415162, "eval_loss": 0.37002092599868774, "eval_runtime": 1.0802, "eval_samples_per_second": 256.445, "eval_steps_per_second": 32.403, "step": 4900 }, { "epoch": 25.051020408163264, "grad_norm": 0.1842292845249176, "learning_rate": 1.498979591836735e-05, "loss": 0.0116, "step": 4910 }, { "epoch": 25.102040816326532, "grad_norm": 7.273754596710205, "learning_rate": 1.4979591836734695e-05, "loss": 0.1267, "step": 4920 }, { "epoch": 25.153061224489797, "grad_norm": 0.06846355646848679, "learning_rate": 1.4969387755102043e-05, "loss": 0.1304, "step": 4930 }, { "epoch": 25.20408163265306, "grad_norm": 0.0030682089272886515, "learning_rate": 1.4959183673469389e-05, "loss": 0.2696, "step": 4940 }, { "epoch": 25.255102040816325, "grad_norm": 0.15560543537139893, "learning_rate": 1.4948979591836736e-05, "loss": 0.1626, "step": 4950 }, { "epoch": 25.306122448979593, "grad_norm": 0.0133698470890522, "learning_rate": 1.4938775510204082e-05, "loss": 0.0571, "step": 4960 }, { "epoch": 25.357142857142858, "grad_norm": 0.008196322247385979, "learning_rate": 1.492857142857143e-05, "loss": 0.1697, "step": 4970 }, { "epoch": 25.408163265306122, "grad_norm": 0.008791767060756683, "learning_rate": 1.4918367346938775e-05, "loss": 0.1088, "step": 4980 }, { "epoch": 25.459183673469386, "grad_norm": 0.10973454266786575, "learning_rate": 1.4908163265306124e-05, "loss": 0.0558, "step": 4990 }, { "epoch": 25.510204081632654, "grad_norm": 3.0360498428344727, "learning_rate": 1.4897959183673472e-05, "loss": 0.0523, "step": 5000 }, { "epoch": 25.56122448979592, "grad_norm": 0.002700751880183816, "learning_rate": 1.4887755102040818e-05, "loss": 0.0128, "step": 5010 }, { "epoch": 25.612244897959183, "grad_norm": 12.229923248291016, "learning_rate": 1.4877551020408165e-05, "loss": 0.0797, "step": 5020 }, { "epoch": 25.663265306122447, "grad_norm": 15.881488800048828, "learning_rate": 1.4867346938775511e-05, "loss": 0.0687, "step": 5030 }, { "epoch": 25.714285714285715, "grad_norm": 0.00416394229978323, "learning_rate": 1.4857142857142858e-05, "loss": 0.0225, "step": 5040 }, { "epoch": 25.76530612244898, "grad_norm": 0.07719247043132782, "learning_rate": 1.4846938775510204e-05, "loss": 0.1166, "step": 5050 }, { "epoch": 25.816326530612244, "grad_norm": 0.16827912628650665, "learning_rate": 1.4836734693877552e-05, "loss": 0.0434, "step": 5060 }, { "epoch": 25.867346938775512, "grad_norm": 0.3525458872318268, "learning_rate": 1.4826530612244897e-05, "loss": 0.0028, "step": 5070 }, { "epoch": 25.918367346938776, "grad_norm": 0.09424704313278198, "learning_rate": 1.4816326530612247e-05, "loss": 0.1716, "step": 5080 }, { "epoch": 25.96938775510204, "grad_norm": 9.926280975341797, "learning_rate": 1.4806122448979594e-05, "loss": 0.0442, "step": 5090 }, { "epoch": 26.0, "eval_accuracy": 0.9314079422382672, "eval_loss": 0.37251153588294983, "eval_runtime": 1.1382, "eval_samples_per_second": 243.368, "eval_steps_per_second": 30.75, "step": 5096 }, { "epoch": 26.020408163265305, "grad_norm": 0.0038044501561671495, "learning_rate": 1.479591836734694e-05, "loss": 0.0042, "step": 5100 }, { "epoch": 26.071428571428573, "grad_norm": 0.0032165925949811935, "learning_rate": 1.4785714285714287e-05, "loss": 0.1928, "step": 5110 }, { "epoch": 26.122448979591837, "grad_norm": 0.029962917789816856, "learning_rate": 1.4775510204081633e-05, "loss": 0.0424, "step": 5120 }, { "epoch": 26.1734693877551, "grad_norm": 0.00389060634188354, "learning_rate": 1.476530612244898e-05, "loss": 0.0014, "step": 5130 }, { "epoch": 26.224489795918366, "grad_norm": 0.20665211975574493, "learning_rate": 1.4755102040816326e-05, "loss": 0.0302, "step": 5140 }, { "epoch": 26.275510204081634, "grad_norm": 29.720434188842773, "learning_rate": 1.4744897959183676e-05, "loss": 0.1544, "step": 5150 }, { "epoch": 26.3265306122449, "grad_norm": 0.003939055372029543, "learning_rate": 1.4734693877551021e-05, "loss": 0.0854, "step": 5160 }, { "epoch": 26.377551020408163, "grad_norm": 0.0020340175833553076, "learning_rate": 1.4724489795918369e-05, "loss": 0.0562, "step": 5170 }, { "epoch": 26.428571428571427, "grad_norm": 0.002443284960463643, "learning_rate": 1.4714285714285716e-05, "loss": 0.1435, "step": 5180 }, { "epoch": 26.479591836734695, "grad_norm": 0.36641427874565125, "learning_rate": 1.4704081632653062e-05, "loss": 0.0115, "step": 5190 }, { "epoch": 26.53061224489796, "grad_norm": 0.3134573698043823, "learning_rate": 1.469387755102041e-05, "loss": 0.1414, "step": 5200 }, { "epoch": 26.581632653061224, "grad_norm": 33.40412139892578, "learning_rate": 1.4683673469387756e-05, "loss": 0.0945, "step": 5210 }, { "epoch": 26.632653061224488, "grad_norm": 1.546060562133789, "learning_rate": 1.4673469387755103e-05, "loss": 0.1156, "step": 5220 }, { "epoch": 26.683673469387756, "grad_norm": 0.004960376303642988, "learning_rate": 1.4663265306122449e-05, "loss": 0.0909, "step": 5230 }, { "epoch": 26.73469387755102, "grad_norm": 0.042291510850191116, "learning_rate": 1.4653061224489798e-05, "loss": 0.0938, "step": 5240 }, { "epoch": 26.785714285714285, "grad_norm": 0.04998750239610672, "learning_rate": 1.4642857142857144e-05, "loss": 0.0529, "step": 5250 }, { "epoch": 26.836734693877553, "grad_norm": 0.010813983157277107, "learning_rate": 1.4632653061224491e-05, "loss": 0.1066, "step": 5260 }, { "epoch": 26.887755102040817, "grad_norm": 0.004219889175146818, "learning_rate": 1.4622448979591839e-05, "loss": 0.0628, "step": 5270 }, { "epoch": 26.93877551020408, "grad_norm": 0.022697672247886658, "learning_rate": 1.4612244897959185e-05, "loss": 0.055, "step": 5280 }, { "epoch": 26.989795918367346, "grad_norm": 0.007089439779520035, "learning_rate": 1.4602040816326532e-05, "loss": 0.0009, "step": 5290 }, { "epoch": 27.0, "eval_accuracy": 0.927797833935018, "eval_loss": 0.4819308817386627, "eval_runtime": 1.0784, "eval_samples_per_second": 256.855, "eval_steps_per_second": 32.455, "step": 5292 }, { "epoch": 27.040816326530614, "grad_norm": 0.014885811135172844, "learning_rate": 1.4591836734693878e-05, "loss": 0.1368, "step": 5300 }, { "epoch": 27.091836734693878, "grad_norm": 22.175621032714844, "learning_rate": 1.4581632653061227e-05, "loss": 0.023, "step": 5310 }, { "epoch": 27.142857142857142, "grad_norm": 0.14784382283687592, "learning_rate": 1.4571428571428573e-05, "loss": 0.0336, "step": 5320 }, { "epoch": 27.193877551020407, "grad_norm": 20.872800827026367, "learning_rate": 1.456122448979592e-05, "loss": 0.0217, "step": 5330 }, { "epoch": 27.244897959183675, "grad_norm": 0.012372996658086777, "learning_rate": 1.4551020408163266e-05, "loss": 0.0076, "step": 5340 }, { "epoch": 27.29591836734694, "grad_norm": 36.15734100341797, "learning_rate": 1.4540816326530614e-05, "loss": 0.0903, "step": 5350 }, { "epoch": 27.346938775510203, "grad_norm": 20.528474807739258, "learning_rate": 1.4530612244897961e-05, "loss": 0.0616, "step": 5360 }, { "epoch": 27.397959183673468, "grad_norm": 0.12114187330007553, "learning_rate": 1.4520408163265307e-05, "loss": 0.0006, "step": 5370 }, { "epoch": 27.448979591836736, "grad_norm": 0.0014444569824263453, "learning_rate": 1.4510204081632654e-05, "loss": 0.096, "step": 5380 }, { "epoch": 27.5, "grad_norm": 0.22934114933013916, "learning_rate": 1.45e-05, "loss": 0.0251, "step": 5390 }, { "epoch": 27.551020408163264, "grad_norm": 9.213054656982422, "learning_rate": 1.448979591836735e-05, "loss": 0.1695, "step": 5400 }, { "epoch": 27.602040816326532, "grad_norm": 0.011412393301725388, "learning_rate": 1.4479591836734695e-05, "loss": 0.001, "step": 5410 }, { "epoch": 27.653061224489797, "grad_norm": 29.546188354492188, "learning_rate": 1.4469387755102043e-05, "loss": 0.0858, "step": 5420 }, { "epoch": 27.70408163265306, "grad_norm": 0.011909236200153828, "learning_rate": 1.4459183673469388e-05, "loss": 0.1862, "step": 5430 }, { "epoch": 27.755102040816325, "grad_norm": 26.724388122558594, "learning_rate": 1.4448979591836736e-05, "loss": 0.0865, "step": 5440 }, { "epoch": 27.806122448979593, "grad_norm": 2.0963311195373535, "learning_rate": 1.4438775510204083e-05, "loss": 0.0945, "step": 5450 }, { "epoch": 27.857142857142858, "grad_norm": 0.18771357834339142, "learning_rate": 1.4428571428571429e-05, "loss": 0.009, "step": 5460 }, { "epoch": 27.908163265306122, "grad_norm": 0.5093016028404236, "learning_rate": 1.4418367346938778e-05, "loss": 0.0017, "step": 5470 }, { "epoch": 27.959183673469386, "grad_norm": 0.007935337722301483, "learning_rate": 1.4408163265306122e-05, "loss": 0.0087, "step": 5480 }, { "epoch": 28.0, "eval_accuracy": 0.9169675090252708, "eval_loss": 0.6492011547088623, "eval_runtime": 1.0803, "eval_samples_per_second": 256.42, "eval_steps_per_second": 32.4, "step": 5488 }, { "epoch": 28.010204081632654, "grad_norm": 0.01704835705459118, "learning_rate": 1.4397959183673472e-05, "loss": 0.0452, "step": 5490 }, { "epoch": 28.06122448979592, "grad_norm": 0.00471374299377203, "learning_rate": 1.4387755102040817e-05, "loss": 0.0011, "step": 5500 }, { "epoch": 28.112244897959183, "grad_norm": 1.8450199365615845, "learning_rate": 1.4377551020408165e-05, "loss": 0.0444, "step": 5510 }, { "epoch": 28.163265306122447, "grad_norm": 17.47882652282715, "learning_rate": 1.436734693877551e-05, "loss": 0.0641, "step": 5520 }, { "epoch": 28.214285714285715, "grad_norm": 18.021268844604492, "learning_rate": 1.4357142857142858e-05, "loss": 0.0235, "step": 5530 }, { "epoch": 28.26530612244898, "grad_norm": 0.0034432827960699797, "learning_rate": 1.4346938775510206e-05, "loss": 0.0965, "step": 5540 }, { "epoch": 28.316326530612244, "grad_norm": 0.016300557181239128, "learning_rate": 1.4336734693877551e-05, "loss": 0.0014, "step": 5550 }, { "epoch": 28.367346938775512, "grad_norm": 1.197704553604126, "learning_rate": 1.43265306122449e-05, "loss": 0.0014, "step": 5560 }, { "epoch": 28.418367346938776, "grad_norm": 0.17368820309638977, "learning_rate": 1.4316326530612246e-05, "loss": 0.0737, "step": 5570 }, { "epoch": 28.46938775510204, "grad_norm": 21.89083480834961, "learning_rate": 1.4306122448979594e-05, "loss": 0.1268, "step": 5580 }, { "epoch": 28.520408163265305, "grad_norm": 2.6276450157165527, "learning_rate": 1.429591836734694e-05, "loss": 0.1495, "step": 5590 }, { "epoch": 28.571428571428573, "grad_norm": 0.7949331998825073, "learning_rate": 1.4285714285714287e-05, "loss": 0.245, "step": 5600 }, { "epoch": 28.622448979591837, "grad_norm": 0.021174795925617218, "learning_rate": 1.4275510204081633e-05, "loss": 0.0014, "step": 5610 }, { "epoch": 28.6734693877551, "grad_norm": 0.0018668081611394882, "learning_rate": 1.426530612244898e-05, "loss": 0.2229, "step": 5620 }, { "epoch": 28.724489795918366, "grad_norm": 0.006676795892417431, "learning_rate": 1.425510204081633e-05, "loss": 0.0998, "step": 5630 }, { "epoch": 28.775510204081634, "grad_norm": 0.023391634225845337, "learning_rate": 1.4244897959183674e-05, "loss": 0.121, "step": 5640 }, { "epoch": 28.8265306122449, "grad_norm": 0.048326849937438965, "learning_rate": 1.4234693877551023e-05, "loss": 0.0031, "step": 5650 }, { "epoch": 28.877551020408163, "grad_norm": 0.19087979197502136, "learning_rate": 1.4224489795918369e-05, "loss": 0.0081, "step": 5660 }, { "epoch": 28.928571428571427, "grad_norm": 0.10267847776412964, "learning_rate": 1.4214285714285716e-05, "loss": 0.009, "step": 5670 }, { "epoch": 28.979591836734695, "grad_norm": 0.0035039009526371956, "learning_rate": 1.4204081632653062e-05, "loss": 0.0021, "step": 5680 }, { "epoch": 29.0, "eval_accuracy": 0.924187725631769, "eval_loss": 0.5297299027442932, "eval_runtime": 1.0793, "eval_samples_per_second": 256.647, "eval_steps_per_second": 32.428, "step": 5684 }, { "epoch": 29.03061224489796, "grad_norm": 0.4309476912021637, "learning_rate": 1.419387755102041e-05, "loss": 0.1136, "step": 5690 }, { "epoch": 29.081632653061224, "grad_norm": 7.9662628173828125, "learning_rate": 1.4183673469387755e-05, "loss": 0.0855, "step": 5700 }, { "epoch": 29.132653061224488, "grad_norm": 0.8649994134902954, "learning_rate": 1.4173469387755103e-05, "loss": 0.0722, "step": 5710 }, { "epoch": 29.183673469387756, "grad_norm": 0.4231327772140503, "learning_rate": 1.4163265306122452e-05, "loss": 0.0418, "step": 5720 }, { "epoch": 29.23469387755102, "grad_norm": 0.06898737698793411, "learning_rate": 1.4153061224489798e-05, "loss": 0.1159, "step": 5730 }, { "epoch": 29.285714285714285, "grad_norm": 0.02984306961297989, "learning_rate": 1.4142857142857145e-05, "loss": 0.0751, "step": 5740 }, { "epoch": 29.336734693877553, "grad_norm": 0.026546800509095192, "learning_rate": 1.4132653061224491e-05, "loss": 0.0557, "step": 5750 }, { "epoch": 29.387755102040817, "grad_norm": 17.836233139038086, "learning_rate": 1.4122448979591838e-05, "loss": 0.0151, "step": 5760 }, { "epoch": 29.43877551020408, "grad_norm": 0.0028448128141462803, "learning_rate": 1.4112244897959184e-05, "loss": 0.0779, "step": 5770 }, { "epoch": 29.489795918367346, "grad_norm": 2.767103910446167, "learning_rate": 1.4102040816326532e-05, "loss": 0.071, "step": 5780 }, { "epoch": 29.540816326530614, "grad_norm": 17.4755916595459, "learning_rate": 1.4091836734693877e-05, "loss": 0.223, "step": 5790 }, { "epoch": 29.591836734693878, "grad_norm": 0.010452112182974815, "learning_rate": 1.4081632653061225e-05, "loss": 0.0014, "step": 5800 }, { "epoch": 29.642857142857142, "grad_norm": 0.01051530335098505, "learning_rate": 1.4071428571428574e-05, "loss": 0.1004, "step": 5810 }, { "epoch": 29.693877551020407, "grad_norm": 0.004229763988405466, "learning_rate": 1.406122448979592e-05, "loss": 0.0236, "step": 5820 }, { "epoch": 29.744897959183675, "grad_norm": 11.499141693115234, "learning_rate": 1.4051020408163267e-05, "loss": 0.0846, "step": 5830 }, { "epoch": 29.79591836734694, "grad_norm": 3.29797625541687, "learning_rate": 1.4040816326530613e-05, "loss": 0.0896, "step": 5840 }, { "epoch": 29.846938775510203, "grad_norm": 0.013699451461434364, "learning_rate": 1.403061224489796e-05, "loss": 0.046, "step": 5850 }, { "epoch": 29.897959183673468, "grad_norm": 0.003947919234633446, "learning_rate": 1.4020408163265307e-05, "loss": 0.0016, "step": 5860 }, { "epoch": 29.948979591836736, "grad_norm": 0.0032293214462697506, "learning_rate": 1.4010204081632654e-05, "loss": 0.0149, "step": 5870 }, { "epoch": 30.0, "grad_norm": 0.028891170397400856, "learning_rate": 1.4e-05, "loss": 0.2552, "step": 5880 }, { "epoch": 30.0, "eval_accuracy": 0.9314079422382672, "eval_loss": 0.44820117950439453, "eval_runtime": 1.1261, "eval_samples_per_second": 245.985, "eval_steps_per_second": 31.081, "step": 5880 }, { "epoch": 30.051020408163264, "grad_norm": 0.014046141877770424, "learning_rate": 1.3989795918367349e-05, "loss": 0.0862, "step": 5890 }, { "epoch": 30.102040816326532, "grad_norm": 19.06598663330078, "learning_rate": 1.3979591836734696e-05, "loss": 0.0929, "step": 5900 }, { "epoch": 30.153061224489797, "grad_norm": 18.164772033691406, "learning_rate": 1.3969387755102042e-05, "loss": 0.0391, "step": 5910 }, { "epoch": 30.20408163265306, "grad_norm": 0.003082715207710862, "learning_rate": 1.395918367346939e-05, "loss": 0.02, "step": 5920 }, { "epoch": 30.255102040816325, "grad_norm": 0.5550926327705383, "learning_rate": 1.3948979591836736e-05, "loss": 0.1176, "step": 5930 }, { "epoch": 30.306122448979593, "grad_norm": 0.23072606325149536, "learning_rate": 1.3938775510204083e-05, "loss": 0.0023, "step": 5940 }, { "epoch": 30.357142857142858, "grad_norm": 0.001888998318463564, "learning_rate": 1.3928571428571429e-05, "loss": 0.0093, "step": 5950 }, { "epoch": 30.408163265306122, "grad_norm": 0.0487925261259079, "learning_rate": 1.3918367346938776e-05, "loss": 0.0475, "step": 5960 }, { "epoch": 30.459183673469386, "grad_norm": 1.5973352193832397, "learning_rate": 1.3908163265306122e-05, "loss": 0.0022, "step": 5970 }, { "epoch": 30.510204081632654, "grad_norm": 0.7020425796508789, "learning_rate": 1.3897959183673471e-05, "loss": 0.0295, "step": 5980 }, { "epoch": 30.56122448979592, "grad_norm": 0.004678144119679928, "learning_rate": 1.3887755102040819e-05, "loss": 0.0033, "step": 5990 }, { "epoch": 30.612244897959183, "grad_norm": 0.011463306844234467, "learning_rate": 1.3877551020408165e-05, "loss": 0.0009, "step": 6000 }, { "epoch": 30.663265306122447, "grad_norm": 0.0027208600658923388, "learning_rate": 1.3867346938775512e-05, "loss": 0.0314, "step": 6010 }, { "epoch": 30.714285714285715, "grad_norm": 0.045380089432001114, "learning_rate": 1.3857142857142858e-05, "loss": 0.0411, "step": 6020 }, { "epoch": 30.76530612244898, "grad_norm": 0.00499904528260231, "learning_rate": 1.3846938775510205e-05, "loss": 0.1203, "step": 6030 }, { "epoch": 30.816326530612244, "grad_norm": 0.6247161030769348, "learning_rate": 1.3836734693877551e-05, "loss": 0.0195, "step": 6040 }, { "epoch": 30.867346938775512, "grad_norm": 0.008205076679587364, "learning_rate": 1.38265306122449e-05, "loss": 0.0351, "step": 6050 }, { "epoch": 30.918367346938776, "grad_norm": 3.9026269912719727, "learning_rate": 1.3816326530612244e-05, "loss": 0.1038, "step": 6060 }, { "epoch": 30.96938775510204, "grad_norm": 0.01678004302084446, "learning_rate": 1.3806122448979594e-05, "loss": 0.0154, "step": 6070 }, { "epoch": 31.0, "eval_accuracy": 0.924187725631769, "eval_loss": 0.6074639558792114, "eval_runtime": 1.0927, "eval_samples_per_second": 253.501, "eval_steps_per_second": 32.031, "step": 6076 }, { "epoch": 31.020408163265305, "grad_norm": 0.0011565345339477062, "learning_rate": 1.3795918367346941e-05, "loss": 0.0032, "step": 6080 }, { "epoch": 31.071428571428573, "grad_norm": 0.008425744250416756, "learning_rate": 1.3785714285714287e-05, "loss": 0.0597, "step": 6090 }, { "epoch": 31.122448979591837, "grad_norm": 0.0023939176462590694, "learning_rate": 1.3775510204081634e-05, "loss": 0.0088, "step": 6100 }, { "epoch": 31.1734693877551, "grad_norm": 34.218536376953125, "learning_rate": 1.376530612244898e-05, "loss": 0.1183, "step": 6110 }, { "epoch": 31.224489795918366, "grad_norm": 0.06664474308490753, "learning_rate": 1.3755102040816328e-05, "loss": 0.1702, "step": 6120 }, { "epoch": 31.275510204081634, "grad_norm": 0.028960296884179115, "learning_rate": 1.3744897959183673e-05, "loss": 0.0983, "step": 6130 }, { "epoch": 31.3265306122449, "grad_norm": 0.004296618048101664, "learning_rate": 1.3734693877551023e-05, "loss": 0.026, "step": 6140 }, { "epoch": 31.377551020408163, "grad_norm": 0.009873881936073303, "learning_rate": 1.3724489795918368e-05, "loss": 0.0019, "step": 6150 }, { "epoch": 31.428571428571427, "grad_norm": 0.022070279344916344, "learning_rate": 1.3714285714285716e-05, "loss": 0.0226, "step": 6160 }, { "epoch": 31.479591836734695, "grad_norm": 0.006415162701159716, "learning_rate": 1.3704081632653062e-05, "loss": 0.0026, "step": 6170 }, { "epoch": 31.53061224489796, "grad_norm": 0.042083319276571274, "learning_rate": 1.3693877551020409e-05, "loss": 0.111, "step": 6180 }, { "epoch": 31.581632653061224, "grad_norm": 0.0045786648988723755, "learning_rate": 1.3683673469387757e-05, "loss": 0.0448, "step": 6190 }, { "epoch": 31.632653061224488, "grad_norm": 0.028560718521475792, "learning_rate": 1.3673469387755102e-05, "loss": 0.0767, "step": 6200 }, { "epoch": 31.683673469387756, "grad_norm": 0.008348624221980572, "learning_rate": 1.366326530612245e-05, "loss": 0.0171, "step": 6210 }, { "epoch": 31.73469387755102, "grad_norm": 0.1219976395368576, "learning_rate": 1.3653061224489796e-05, "loss": 0.0106, "step": 6220 }, { "epoch": 31.785714285714285, "grad_norm": 0.0019576449412852526, "learning_rate": 1.3642857142857145e-05, "loss": 0.0262, "step": 6230 }, { "epoch": 31.836734693877553, "grad_norm": 0.009538091719150543, "learning_rate": 1.363265306122449e-05, "loss": 0.0009, "step": 6240 }, { "epoch": 31.887755102040817, "grad_norm": 0.0054687936790287495, "learning_rate": 1.3622448979591838e-05, "loss": 0.0014, "step": 6250 }, { "epoch": 31.93877551020408, "grad_norm": 15.86870002746582, "learning_rate": 1.3612244897959184e-05, "loss": 0.0992, "step": 6260 }, { "epoch": 31.989795918367346, "grad_norm": 0.0018693085294216871, "learning_rate": 1.3602040816326531e-05, "loss": 0.0009, "step": 6270 }, { "epoch": 32.0, "eval_accuracy": 0.9350180505415162, "eval_loss": 0.4100903570652008, "eval_runtime": 1.1346, "eval_samples_per_second": 244.147, "eval_steps_per_second": 30.849, "step": 6272 }, { "epoch": 32.04081632653061, "grad_norm": 0.0012913087848573923, "learning_rate": 1.3591836734693879e-05, "loss": 0.0231, "step": 6280 }, { "epoch": 32.09183673469388, "grad_norm": 0.010441271588206291, "learning_rate": 1.3581632653061225e-05, "loss": 0.0225, "step": 6290 }, { "epoch": 32.142857142857146, "grad_norm": 0.18999116122722626, "learning_rate": 1.3571428571428574e-05, "loss": 0.2475, "step": 6300 }, { "epoch": 32.19387755102041, "grad_norm": 0.011192509904503822, "learning_rate": 1.356122448979592e-05, "loss": 0.0138, "step": 6310 }, { "epoch": 32.244897959183675, "grad_norm": 0.006277843844145536, "learning_rate": 1.3551020408163267e-05, "loss": 0.114, "step": 6320 }, { "epoch": 32.295918367346935, "grad_norm": 0.002191660925745964, "learning_rate": 1.3540816326530613e-05, "loss": 0.0038, "step": 6330 }, { "epoch": 32.3469387755102, "grad_norm": 0.010471176356077194, "learning_rate": 1.353061224489796e-05, "loss": 0.0251, "step": 6340 }, { "epoch": 32.39795918367347, "grad_norm": 0.41251301765441895, "learning_rate": 1.3520408163265306e-05, "loss": 0.0025, "step": 6350 }, { "epoch": 32.44897959183673, "grad_norm": 0.004002411849796772, "learning_rate": 1.3510204081632654e-05, "loss": 0.1102, "step": 6360 }, { "epoch": 32.5, "grad_norm": 0.011486790142953396, "learning_rate": 1.3500000000000001e-05, "loss": 0.0004, "step": 6370 }, { "epoch": 32.55102040816327, "grad_norm": 30.469715118408203, "learning_rate": 1.3489795918367347e-05, "loss": 0.1941, "step": 6380 }, { "epoch": 32.60204081632653, "grad_norm": 0.5846330523490906, "learning_rate": 1.3479591836734696e-05, "loss": 0.0012, "step": 6390 }, { "epoch": 32.6530612244898, "grad_norm": 0.0026396731846034527, "learning_rate": 1.3469387755102042e-05, "loss": 0.0283, "step": 6400 }, { "epoch": 32.704081632653065, "grad_norm": 29.736446380615234, "learning_rate": 1.345918367346939e-05, "loss": 0.0657, "step": 6410 }, { "epoch": 32.755102040816325, "grad_norm": 0.00833828840404749, "learning_rate": 1.3448979591836735e-05, "loss": 0.1125, "step": 6420 }, { "epoch": 32.80612244897959, "grad_norm": 0.002807618584483862, "learning_rate": 1.3438775510204083e-05, "loss": 0.0006, "step": 6430 }, { "epoch": 32.857142857142854, "grad_norm": 0.02780280075967312, "learning_rate": 1.3428571428571429e-05, "loss": 0.0531, "step": 6440 }, { "epoch": 32.90816326530612, "grad_norm": 0.005529416725039482, "learning_rate": 1.3418367346938776e-05, "loss": 0.1206, "step": 6450 }, { "epoch": 32.95918367346939, "grad_norm": 0.008539948612451553, "learning_rate": 1.3408163265306125e-05, "loss": 0.1626, "step": 6460 }, { "epoch": 33.0, "eval_accuracy": 0.9350180505415162, "eval_loss": 0.46525996923446655, "eval_runtime": 1.0828, "eval_samples_per_second": 255.817, "eval_steps_per_second": 32.323, "step": 6468 }, { "epoch": 33.01020408163265, "grad_norm": 0.0022733339574187994, "learning_rate": 1.3397959183673471e-05, "loss": 0.0831, "step": 6470 }, { "epoch": 33.06122448979592, "grad_norm": 0.013245400972664356, "learning_rate": 1.3387755102040818e-05, "loss": 0.0005, "step": 6480 }, { "epoch": 33.11224489795919, "grad_norm": 0.003069472499191761, "learning_rate": 1.3377551020408164e-05, "loss": 0.1701, "step": 6490 }, { "epoch": 33.16326530612245, "grad_norm": 0.01831808313727379, "learning_rate": 1.3367346938775512e-05, "loss": 0.0601, "step": 6500 }, { "epoch": 33.214285714285715, "grad_norm": 0.0012898804852738976, "learning_rate": 1.3357142857142858e-05, "loss": 0.0849, "step": 6510 }, { "epoch": 33.265306122448976, "grad_norm": 1.121724009513855, "learning_rate": 1.3346938775510205e-05, "loss": 0.0266, "step": 6520 }, { "epoch": 33.316326530612244, "grad_norm": 0.008910677395761013, "learning_rate": 1.333673469387755e-05, "loss": 0.1502, "step": 6530 }, { "epoch": 33.36734693877551, "grad_norm": 0.012071785517036915, "learning_rate": 1.3326530612244898e-05, "loss": 0.1133, "step": 6540 }, { "epoch": 33.41836734693877, "grad_norm": 0.016740737482905388, "learning_rate": 1.3316326530612247e-05, "loss": 0.0019, "step": 6550 }, { "epoch": 33.46938775510204, "grad_norm": 0.2254507839679718, "learning_rate": 1.3306122448979593e-05, "loss": 0.1177, "step": 6560 }, { "epoch": 33.52040816326531, "grad_norm": 0.002020282903686166, "learning_rate": 1.329591836734694e-05, "loss": 0.0477, "step": 6570 }, { "epoch": 33.57142857142857, "grad_norm": 0.003004984697327018, "learning_rate": 1.3285714285714287e-05, "loss": 0.1335, "step": 6580 }, { "epoch": 33.62244897959184, "grad_norm": 0.15333491563796997, "learning_rate": 1.3275510204081634e-05, "loss": 0.007, "step": 6590 }, { "epoch": 33.673469387755105, "grad_norm": 0.001080219866707921, "learning_rate": 1.326530612244898e-05, "loss": 0.0161, "step": 6600 }, { "epoch": 33.724489795918366, "grad_norm": 0.0072516873478889465, "learning_rate": 1.3255102040816327e-05, "loss": 0.0628, "step": 6610 }, { "epoch": 33.775510204081634, "grad_norm": 0.0021581093315035105, "learning_rate": 1.3244897959183673e-05, "loss": 0.0182, "step": 6620 }, { "epoch": 33.826530612244895, "grad_norm": 0.09171311557292938, "learning_rate": 1.323469387755102e-05, "loss": 0.0162, "step": 6630 }, { "epoch": 33.87755102040816, "grad_norm": 0.9838017821311951, "learning_rate": 1.322448979591837e-05, "loss": 0.0394, "step": 6640 }, { "epoch": 33.92857142857143, "grad_norm": 0.5027598738670349, "learning_rate": 1.3214285714285716e-05, "loss": 0.0649, "step": 6650 }, { "epoch": 33.97959183673469, "grad_norm": 20.55791473388672, "learning_rate": 1.3204081632653063e-05, "loss": 0.0276, "step": 6660 }, { "epoch": 34.0, "eval_accuracy": 0.9386281588447654, "eval_loss": 0.4173925220966339, "eval_runtime": 1.0906, "eval_samples_per_second": 253.982, "eval_steps_per_second": 32.092, "step": 6664 }, { "epoch": 34.03061224489796, "grad_norm": 1.3319889307022095, "learning_rate": 1.3193877551020409e-05, "loss": 0.0598, "step": 6670 }, { "epoch": 34.08163265306123, "grad_norm": 0.0036518473643809557, "learning_rate": 1.3183673469387756e-05, "loss": 0.0556, "step": 6680 }, { "epoch": 34.13265306122449, "grad_norm": 6.119904041290283, "learning_rate": 1.3173469387755102e-05, "loss": 0.0483, "step": 6690 }, { "epoch": 34.183673469387756, "grad_norm": 0.005274696741253138, "learning_rate": 1.316326530612245e-05, "loss": 0.0103, "step": 6700 }, { "epoch": 34.234693877551024, "grad_norm": 0.05321372300386429, "learning_rate": 1.3153061224489795e-05, "loss": 0.1824, "step": 6710 }, { "epoch": 34.285714285714285, "grad_norm": 0.0330558717250824, "learning_rate": 1.3142857142857145e-05, "loss": 0.0513, "step": 6720 }, { "epoch": 34.33673469387755, "grad_norm": 0.0024037493858486414, "learning_rate": 1.3132653061224492e-05, "loss": 0.0879, "step": 6730 }, { "epoch": 34.38775510204081, "grad_norm": 0.0018719666404649615, "learning_rate": 1.3122448979591838e-05, "loss": 0.0321, "step": 6740 }, { "epoch": 34.43877551020408, "grad_norm": 0.006237344350665808, "learning_rate": 1.3112244897959185e-05, "loss": 0.0007, "step": 6750 }, { "epoch": 34.48979591836735, "grad_norm": 0.002857339335605502, "learning_rate": 1.3102040816326531e-05, "loss": 0.1109, "step": 6760 }, { "epoch": 34.54081632653061, "grad_norm": 0.5727746486663818, "learning_rate": 1.3091836734693879e-05, "loss": 0.083, "step": 6770 }, { "epoch": 34.59183673469388, "grad_norm": 0.0016499439952895045, "learning_rate": 1.3081632653061224e-05, "loss": 0.0981, "step": 6780 }, { "epoch": 34.642857142857146, "grad_norm": 0.012267863377928734, "learning_rate": 1.3071428571428572e-05, "loss": 0.0542, "step": 6790 }, { "epoch": 34.69387755102041, "grad_norm": 0.008558155037462711, "learning_rate": 1.3061224489795918e-05, "loss": 0.0018, "step": 6800 }, { "epoch": 34.744897959183675, "grad_norm": 10.578545570373535, "learning_rate": 1.3051020408163267e-05, "loss": 0.0186, "step": 6810 }, { "epoch": 34.795918367346935, "grad_norm": 0.009608473628759384, "learning_rate": 1.3040816326530614e-05, "loss": 0.1214, "step": 6820 }, { "epoch": 34.8469387755102, "grad_norm": 13.965607643127441, "learning_rate": 1.303061224489796e-05, "loss": 0.0874, "step": 6830 }, { "epoch": 34.89795918367347, "grad_norm": 0.0072668869979679585, "learning_rate": 1.3020408163265308e-05, "loss": 0.0002, "step": 6840 }, { "epoch": 34.94897959183673, "grad_norm": 1.1352980136871338, "learning_rate": 1.3010204081632653e-05, "loss": 0.0226, "step": 6850 }, { "epoch": 35.0, "grad_norm": 0.006007325369864702, "learning_rate": 1.3000000000000001e-05, "loss": 0.0139, "step": 6860 }, { "epoch": 35.0, "eval_accuracy": 0.9422382671480144, "eval_loss": 0.39917948842048645, "eval_runtime": 1.0839, "eval_samples_per_second": 255.551, "eval_steps_per_second": 32.29, "step": 6860 }, { "epoch": 35.05102040816327, "grad_norm": 40.382354736328125, "learning_rate": 1.2989795918367347e-05, "loss": 0.0329, "step": 6870 }, { "epoch": 35.10204081632653, "grad_norm": 0.006964458152651787, "learning_rate": 1.2979591836734696e-05, "loss": 0.0766, "step": 6880 }, { "epoch": 35.1530612244898, "grad_norm": 0.007393226958811283, "learning_rate": 1.2969387755102042e-05, "loss": 0.1791, "step": 6890 }, { "epoch": 35.204081632653065, "grad_norm": 0.009817572310566902, "learning_rate": 1.2959183673469389e-05, "loss": 0.0416, "step": 6900 }, { "epoch": 35.255102040816325, "grad_norm": 0.49033859372138977, "learning_rate": 1.2948979591836737e-05, "loss": 0.0021, "step": 6910 }, { "epoch": 35.30612244897959, "grad_norm": 0.0023106893058866262, "learning_rate": 1.2938775510204082e-05, "loss": 0.0245, "step": 6920 }, { "epoch": 35.357142857142854, "grad_norm": 0.0013510782737284899, "learning_rate": 1.292857142857143e-05, "loss": 0.0024, "step": 6930 }, { "epoch": 35.40816326530612, "grad_norm": 0.0037580933421850204, "learning_rate": 1.2918367346938776e-05, "loss": 0.0007, "step": 6940 }, { "epoch": 35.45918367346939, "grad_norm": 0.010321482084691525, "learning_rate": 1.2908163265306123e-05, "loss": 0.0038, "step": 6950 }, { "epoch": 35.51020408163265, "grad_norm": 0.010143114253878593, "learning_rate": 1.2897959183673469e-05, "loss": 0.0505, "step": 6960 }, { "epoch": 35.56122448979592, "grad_norm": 0.0018113257829099894, "learning_rate": 1.2887755102040818e-05, "loss": 0.0034, "step": 6970 }, { "epoch": 35.61224489795919, "grad_norm": 0.0014400702202692628, "learning_rate": 1.2877551020408164e-05, "loss": 0.0511, "step": 6980 }, { "epoch": 35.66326530612245, "grad_norm": 0.02184654027223587, "learning_rate": 1.2867346938775511e-05, "loss": 0.0257, "step": 6990 }, { "epoch": 35.714285714285715, "grad_norm": 0.011629834771156311, "learning_rate": 1.2857142857142859e-05, "loss": 0.0004, "step": 7000 }, { "epoch": 35.765306122448976, "grad_norm": 9.35701847076416, "learning_rate": 1.2846938775510205e-05, "loss": 0.0521, "step": 7010 }, { "epoch": 35.816326530612244, "grad_norm": 26.783231735229492, "learning_rate": 1.2836734693877552e-05, "loss": 0.0295, "step": 7020 }, { "epoch": 35.86734693877551, "grad_norm": 0.003975849132984877, "learning_rate": 1.2826530612244898e-05, "loss": 0.0026, "step": 7030 }, { "epoch": 35.91836734693877, "grad_norm": 1.7191163301467896, "learning_rate": 1.2816326530612247e-05, "loss": 0.0153, "step": 7040 }, { "epoch": 35.96938775510204, "grad_norm": 0.015699198469519615, "learning_rate": 1.2806122448979591e-05, "loss": 0.0023, "step": 7050 }, { "epoch": 36.0, "eval_accuracy": 0.9169675090252708, "eval_loss": 0.697161078453064, "eval_runtime": 1.0907, "eval_samples_per_second": 253.966, "eval_steps_per_second": 32.09, "step": 7056 }, { "epoch": 36.02040816326531, "grad_norm": 0.010540401563048363, "learning_rate": 1.279591836734694e-05, "loss": 0.0003, "step": 7060 }, { "epoch": 36.07142857142857, "grad_norm": 0.0060667600482702255, "learning_rate": 1.2785714285714286e-05, "loss": 0.007, "step": 7070 }, { "epoch": 36.12244897959184, "grad_norm": 0.0015166534576565027, "learning_rate": 1.2775510204081634e-05, "loss": 0.0497, "step": 7080 }, { "epoch": 36.173469387755105, "grad_norm": 0.011533818207681179, "learning_rate": 1.2765306122448981e-05, "loss": 0.0042, "step": 7090 }, { "epoch": 36.224489795918366, "grad_norm": 0.0015513466205447912, "learning_rate": 1.2755102040816327e-05, "loss": 0.0324, "step": 7100 }, { "epoch": 36.275510204081634, "grad_norm": 36.79665756225586, "learning_rate": 1.2744897959183674e-05, "loss": 0.1177, "step": 7110 }, { "epoch": 36.326530612244895, "grad_norm": 0.0011559132253751159, "learning_rate": 1.273469387755102e-05, "loss": 0.0004, "step": 7120 }, { "epoch": 36.37755102040816, "grad_norm": 0.003966885153204203, "learning_rate": 1.272448979591837e-05, "loss": 0.0004, "step": 7130 }, { "epoch": 36.42857142857143, "grad_norm": 0.011447950266301632, "learning_rate": 1.2714285714285715e-05, "loss": 0.2014, "step": 7140 }, { "epoch": 36.47959183673469, "grad_norm": 0.0026489044539630413, "learning_rate": 1.2704081632653063e-05, "loss": 0.0054, "step": 7150 }, { "epoch": 36.53061224489796, "grad_norm": 0.0035194705706089735, "learning_rate": 1.2693877551020409e-05, "loss": 0.0419, "step": 7160 }, { "epoch": 36.58163265306123, "grad_norm": 0.024103136733174324, "learning_rate": 1.2683673469387756e-05, "loss": 0.0006, "step": 7170 }, { "epoch": 36.63265306122449, "grad_norm": 0.007262670435011387, "learning_rate": 1.2673469387755104e-05, "loss": 0.0126, "step": 7180 }, { "epoch": 36.683673469387756, "grad_norm": 0.14022386074066162, "learning_rate": 1.266326530612245e-05, "loss": 0.0004, "step": 7190 }, { "epoch": 36.734693877551024, "grad_norm": 0.04986849054694176, "learning_rate": 1.2653061224489798e-05, "loss": 0.0484, "step": 7200 }, { "epoch": 36.785714285714285, "grad_norm": 0.020401503890752792, "learning_rate": 1.2642857142857143e-05, "loss": 0.0011, "step": 7210 }, { "epoch": 36.83673469387755, "grad_norm": 0.296879380941391, "learning_rate": 1.2632653061224492e-05, "loss": 0.0006, "step": 7220 }, { "epoch": 36.88775510204081, "grad_norm": 0.0007956013432703912, "learning_rate": 1.2622448979591838e-05, "loss": 0.1226, "step": 7230 }, { "epoch": 36.93877551020408, "grad_norm": 0.0013525192625820637, "learning_rate": 1.2612244897959185e-05, "loss": 0.0702, "step": 7240 }, { "epoch": 36.98979591836735, "grad_norm": 0.0007291435031220317, "learning_rate": 1.260204081632653e-05, "loss": 0.1264, "step": 7250 }, { "epoch": 37.0, "eval_accuracy": 0.9314079422382672, "eval_loss": 0.49799981713294983, "eval_runtime": 1.0825, "eval_samples_per_second": 255.895, "eval_steps_per_second": 32.333, "step": 7252 }, { "epoch": 37.04081632653061, "grad_norm": 0.0008231271058320999, "learning_rate": 1.2591836734693878e-05, "loss": 0.0003, "step": 7260 }, { "epoch": 37.09183673469388, "grad_norm": 0.0009572224225848913, "learning_rate": 1.2581632653061226e-05, "loss": 0.1122, "step": 7270 }, { "epoch": 37.142857142857146, "grad_norm": 28.456127166748047, "learning_rate": 1.2571428571428572e-05, "loss": 0.0855, "step": 7280 }, { "epoch": 37.19387755102041, "grad_norm": 0.01756722666323185, "learning_rate": 1.256122448979592e-05, "loss": 0.0122, "step": 7290 }, { "epoch": 37.244897959183675, "grad_norm": 0.0025213295593857765, "learning_rate": 1.2551020408163267e-05, "loss": 0.0008, "step": 7300 }, { "epoch": 37.295918367346935, "grad_norm": 0.11927001923322678, "learning_rate": 1.2540816326530614e-05, "loss": 0.058, "step": 7310 }, { "epoch": 37.3469387755102, "grad_norm": 0.0014250340173020959, "learning_rate": 1.253061224489796e-05, "loss": 0.0139, "step": 7320 }, { "epoch": 37.39795918367347, "grad_norm": 0.3151053786277771, "learning_rate": 1.2520408163265307e-05, "loss": 0.0225, "step": 7330 }, { "epoch": 37.44897959183673, "grad_norm": 0.0017277757870033383, "learning_rate": 1.2510204081632653e-05, "loss": 0.0064, "step": 7340 }, { "epoch": 37.5, "grad_norm": 0.017651556059718132, "learning_rate": 1.25e-05, "loss": 0.0475, "step": 7350 }, { "epoch": 37.55102040816327, "grad_norm": 0.0035618271213024855, "learning_rate": 1.248979591836735e-05, "loss": 0.1414, "step": 7360 }, { "epoch": 37.60204081632653, "grad_norm": 0.0019024212379008532, "learning_rate": 1.2479591836734694e-05, "loss": 0.1168, "step": 7370 }, { "epoch": 37.6530612244898, "grad_norm": 0.020310290157794952, "learning_rate": 1.2469387755102043e-05, "loss": 0.0001, "step": 7380 }, { "epoch": 37.704081632653065, "grad_norm": 0.09215331822633743, "learning_rate": 1.2459183673469389e-05, "loss": 0.0004, "step": 7390 }, { "epoch": 37.755102040816325, "grad_norm": 0.0007944951066747308, "learning_rate": 1.2448979591836736e-05, "loss": 0.0002, "step": 7400 }, { "epoch": 37.80612244897959, "grad_norm": 0.001632043393328786, "learning_rate": 1.2438775510204082e-05, "loss": 0.118, "step": 7410 }, { "epoch": 37.857142857142854, "grad_norm": 0.009643031284213066, "learning_rate": 1.242857142857143e-05, "loss": 0.0047, "step": 7420 }, { "epoch": 37.90816326530612, "grad_norm": 0.00105887686368078, "learning_rate": 1.2418367346938775e-05, "loss": 0.0031, "step": 7430 }, { "epoch": 37.95918367346939, "grad_norm": 15.97204875946045, "learning_rate": 1.2408163265306123e-05, "loss": 0.0113, "step": 7440 }, { "epoch": 38.0, "eval_accuracy": 0.9169675090252708, "eval_loss": 0.7153842449188232, "eval_runtime": 1.0937, "eval_samples_per_second": 253.268, "eval_steps_per_second": 32.001, "step": 7448 }, { "epoch": 38.01020408163265, "grad_norm": 0.012341351248323917, "learning_rate": 1.2397959183673472e-05, "loss": 0.0072, "step": 7450 }, { "epoch": 38.06122448979592, "grad_norm": 0.027890844270586967, "learning_rate": 1.2387755102040818e-05, "loss": 0.0019, "step": 7460 }, { "epoch": 38.11224489795919, "grad_norm": 0.269838809967041, "learning_rate": 1.2377551020408165e-05, "loss": 0.0002, "step": 7470 }, { "epoch": 38.16326530612245, "grad_norm": 13.986335754394531, "learning_rate": 1.2367346938775511e-05, "loss": 0.3384, "step": 7480 }, { "epoch": 38.214285714285715, "grad_norm": 0.0661822110414505, "learning_rate": 1.2357142857142859e-05, "loss": 0.1083, "step": 7490 }, { "epoch": 38.265306122448976, "grad_norm": 0.31268712878227234, "learning_rate": 1.2346938775510204e-05, "loss": 0.0384, "step": 7500 }, { "epoch": 38.316326530612244, "grad_norm": 7.11651086807251, "learning_rate": 1.2336734693877552e-05, "loss": 0.0555, "step": 7510 }, { "epoch": 38.36734693877551, "grad_norm": 0.01363621186465025, "learning_rate": 1.2326530612244898e-05, "loss": 0.035, "step": 7520 }, { "epoch": 38.41836734693877, "grad_norm": 0.004974448587745428, "learning_rate": 1.2316326530612245e-05, "loss": 0.062, "step": 7530 }, { "epoch": 38.46938775510204, "grad_norm": 0.04702742397785187, "learning_rate": 1.2306122448979594e-05, "loss": 0.0021, "step": 7540 }, { "epoch": 38.52040816326531, "grad_norm": 0.0008039191598072648, "learning_rate": 1.229591836734694e-05, "loss": 0.0714, "step": 7550 }, { "epoch": 38.57142857142857, "grad_norm": 0.0021789586171507835, "learning_rate": 1.2285714285714288e-05, "loss": 0.0277, "step": 7560 }, { "epoch": 38.62244897959184, "grad_norm": 0.005339604336768389, "learning_rate": 1.2275510204081633e-05, "loss": 0.029, "step": 7570 }, { "epoch": 38.673469387755105, "grad_norm": 0.017994316294789314, "learning_rate": 1.2265306122448981e-05, "loss": 0.1415, "step": 7580 }, { "epoch": 38.724489795918366, "grad_norm": 0.0011043621925637126, "learning_rate": 1.2255102040816327e-05, "loss": 0.0002, "step": 7590 }, { "epoch": 38.775510204081634, "grad_norm": 0.0012469463981688023, "learning_rate": 1.2244897959183674e-05, "loss": 0.1146, "step": 7600 }, { "epoch": 38.826530612244895, "grad_norm": 0.0011851087911054492, "learning_rate": 1.223469387755102e-05, "loss": 0.0344, "step": 7610 }, { "epoch": 38.87755102040816, "grad_norm": 0.0006746912258677185, "learning_rate": 1.222448979591837e-05, "loss": 0.0007, "step": 7620 }, { "epoch": 38.92857142857143, "grad_norm": 11.72658634185791, "learning_rate": 1.2214285714285717e-05, "loss": 0.0777, "step": 7630 }, { "epoch": 38.97959183673469, "grad_norm": 1.4807186126708984, "learning_rate": 1.2204081632653062e-05, "loss": 0.0694, "step": 7640 }, { "epoch": 39.0, "eval_accuracy": 0.924187725631769, "eval_loss": 0.5443302392959595, "eval_runtime": 1.0879, "eval_samples_per_second": 254.619, "eval_steps_per_second": 32.172, "step": 7644 }, { "epoch": 39.03061224489796, "grad_norm": 0.0025618697982281446, "learning_rate": 1.219387755102041e-05, "loss": 0.0021, "step": 7650 }, { "epoch": 39.08163265306123, "grad_norm": 2.1901729106903076, "learning_rate": 1.2183673469387756e-05, "loss": 0.014, "step": 7660 }, { "epoch": 39.13265306122449, "grad_norm": 0.012508667074143887, "learning_rate": 1.2173469387755103e-05, "loss": 0.0064, "step": 7670 }, { "epoch": 39.183673469387756, "grad_norm": 0.0016731584910303354, "learning_rate": 1.2163265306122449e-05, "loss": 0.0098, "step": 7680 }, { "epoch": 39.234693877551024, "grad_norm": 0.0006948015070520341, "learning_rate": 1.2153061224489796e-05, "loss": 0.0436, "step": 7690 }, { "epoch": 39.285714285714285, "grad_norm": 0.017837168648838997, "learning_rate": 1.2142857142857142e-05, "loss": 0.0089, "step": 7700 }, { "epoch": 39.33673469387755, "grad_norm": 0.006527338642627001, "learning_rate": 1.2132653061224491e-05, "loss": 0.0233, "step": 7710 }, { "epoch": 39.38775510204081, "grad_norm": 0.0025100342463701963, "learning_rate": 1.2122448979591839e-05, "loss": 0.1696, "step": 7720 }, { "epoch": 39.43877551020408, "grad_norm": 0.03204730525612831, "learning_rate": 1.2112244897959185e-05, "loss": 0.2053, "step": 7730 }, { "epoch": 39.48979591836735, "grad_norm": 5.744424343109131, "learning_rate": 1.2102040816326532e-05, "loss": 0.0115, "step": 7740 }, { "epoch": 39.54081632653061, "grad_norm": 0.022196734324097633, "learning_rate": 1.2091836734693878e-05, "loss": 0.0008, "step": 7750 }, { "epoch": 39.59183673469388, "grad_norm": 0.001119808992370963, "learning_rate": 1.2081632653061225e-05, "loss": 0.0001, "step": 7760 }, { "epoch": 39.642857142857146, "grad_norm": 0.0024749517906457186, "learning_rate": 1.2071428571428571e-05, "loss": 0.0089, "step": 7770 }, { "epoch": 39.69387755102041, "grad_norm": 0.0008865215932019055, "learning_rate": 1.206122448979592e-05, "loss": 0.0005, "step": 7780 }, { "epoch": 39.744897959183675, "grad_norm": 0.7026830911636353, "learning_rate": 1.2051020408163265e-05, "loss": 0.0022, "step": 7790 }, { "epoch": 39.795918367346935, "grad_norm": 0.032258111983537674, "learning_rate": 1.2040816326530614e-05, "loss": 0.1236, "step": 7800 }, { "epoch": 39.8469387755102, "grad_norm": 19.399307250976562, "learning_rate": 1.2030612244897961e-05, "loss": 0.1088, "step": 7810 }, { "epoch": 39.89795918367347, "grad_norm": 7.389267921447754, "learning_rate": 1.2020408163265307e-05, "loss": 0.0191, "step": 7820 }, { "epoch": 39.94897959183673, "grad_norm": 0.006161638535559177, "learning_rate": 1.2010204081632655e-05, "loss": 0.0002, "step": 7830 }, { "epoch": 40.0, "grad_norm": 0.003440586617216468, "learning_rate": 1.2e-05, "loss": 0.0976, "step": 7840 }, { "epoch": 40.0, "eval_accuracy": 0.9350180505415162, "eval_loss": 0.3852378726005554, "eval_runtime": 1.0903, "eval_samples_per_second": 254.064, "eval_steps_per_second": 32.102, "step": 7840 }, { "epoch": 40.05102040816327, "grad_norm": 0.02223014645278454, "learning_rate": 1.1989795918367348e-05, "loss": 0.053, "step": 7850 }, { "epoch": 40.10204081632653, "grad_norm": 0.6741572618484497, "learning_rate": 1.1979591836734694e-05, "loss": 0.062, "step": 7860 }, { "epoch": 40.1530612244898, "grad_norm": 0.0025492815766483545, "learning_rate": 1.1969387755102043e-05, "loss": 0.0066, "step": 7870 }, { "epoch": 40.204081632653065, "grad_norm": 0.2777367830276489, "learning_rate": 1.1959183673469389e-05, "loss": 0.0189, "step": 7880 }, { "epoch": 40.255102040816325, "grad_norm": 0.0011020988458767533, "learning_rate": 1.1948979591836736e-05, "loss": 0.0674, "step": 7890 }, { "epoch": 40.30612244897959, "grad_norm": 0.008376555517315865, "learning_rate": 1.1938775510204084e-05, "loss": 0.084, "step": 7900 }, { "epoch": 40.357142857142854, "grad_norm": 24.430980682373047, "learning_rate": 1.192857142857143e-05, "loss": 0.0347, "step": 7910 }, { "epoch": 40.40816326530612, "grad_norm": 0.005426087416708469, "learning_rate": 1.1918367346938777e-05, "loss": 0.058, "step": 7920 }, { "epoch": 40.45918367346939, "grad_norm": 0.0009207155671902001, "learning_rate": 1.1908163265306123e-05, "loss": 0.0003, "step": 7930 }, { "epoch": 40.51020408163265, "grad_norm": 0.06911341845989227, "learning_rate": 1.189795918367347e-05, "loss": 0.0174, "step": 7940 }, { "epoch": 40.56122448979592, "grad_norm": 18.8804874420166, "learning_rate": 1.1887755102040816e-05, "loss": 0.2298, "step": 7950 }, { "epoch": 40.61224489795919, "grad_norm": 0.39348581433296204, "learning_rate": 1.1877551020408165e-05, "loss": 0.0006, "step": 7960 }, { "epoch": 40.66326530612245, "grad_norm": 0.037014640867710114, "learning_rate": 1.186734693877551e-05, "loss": 0.0003, "step": 7970 }, { "epoch": 40.714285714285715, "grad_norm": 0.032051049172878265, "learning_rate": 1.1857142857142858e-05, "loss": 0.0009, "step": 7980 }, { "epoch": 40.765306122448976, "grad_norm": 0.00889838207513094, "learning_rate": 1.1846938775510206e-05, "loss": 0.0006, "step": 7990 }, { "epoch": 40.816326530612244, "grad_norm": 0.0016339683206751943, "learning_rate": 1.1836734693877552e-05, "loss": 0.0014, "step": 8000 }, { "epoch": 40.86734693877551, "grad_norm": 0.6572158336639404, "learning_rate": 1.1826530612244899e-05, "loss": 0.0005, "step": 8010 }, { "epoch": 40.91836734693877, "grad_norm": 1.6163885593414307, "learning_rate": 1.1816326530612245e-05, "loss": 0.0526, "step": 8020 }, { "epoch": 40.96938775510204, "grad_norm": 35.3330192565918, "learning_rate": 1.1806122448979594e-05, "loss": 0.1191, "step": 8030 }, { "epoch": 41.0, "eval_accuracy": 0.924187725631769, "eval_loss": 0.5397872924804688, "eval_runtime": 1.0866, "eval_samples_per_second": 254.932, "eval_steps_per_second": 32.212, "step": 8036 }, { "epoch": 41.02040816326531, "grad_norm": 0.001767844194546342, "learning_rate": 1.179591836734694e-05, "loss": 0.0009, "step": 8040 }, { "epoch": 41.07142857142857, "grad_norm": 0.4130685031414032, "learning_rate": 1.1785714285714287e-05, "loss": 0.0902, "step": 8050 }, { "epoch": 41.12244897959184, "grad_norm": 0.0023608789779245853, "learning_rate": 1.1775510204081633e-05, "loss": 0.3378, "step": 8060 }, { "epoch": 41.173469387755105, "grad_norm": 0.19113022089004517, "learning_rate": 1.176530612244898e-05, "loss": 0.0628, "step": 8070 }, { "epoch": 41.224489795918366, "grad_norm": 0.12973152101039886, "learning_rate": 1.1755102040816328e-05, "loss": 0.2052, "step": 8080 }, { "epoch": 41.275510204081634, "grad_norm": 0.016558591276407242, "learning_rate": 1.1744897959183674e-05, "loss": 0.0353, "step": 8090 }, { "epoch": 41.326530612244895, "grad_norm": 0.02750926837325096, "learning_rate": 1.1734693877551021e-05, "loss": 0.0012, "step": 8100 }, { "epoch": 41.37755102040816, "grad_norm": 31.035022735595703, "learning_rate": 1.1724489795918367e-05, "loss": 0.0512, "step": 8110 }, { "epoch": 41.42857142857143, "grad_norm": 0.0017843014793470502, "learning_rate": 1.1714285714285716e-05, "loss": 0.2303, "step": 8120 }, { "epoch": 41.47959183673469, "grad_norm": 0.060688044875860214, "learning_rate": 1.1704081632653062e-05, "loss": 0.0234, "step": 8130 }, { "epoch": 41.53061224489796, "grad_norm": 0.0029919047374278307, "learning_rate": 1.169387755102041e-05, "loss": 0.0696, "step": 8140 }, { "epoch": 41.58163265306123, "grad_norm": 0.04906465485692024, "learning_rate": 1.1683673469387755e-05, "loss": 0.0864, "step": 8150 }, { "epoch": 41.63265306122449, "grad_norm": 0.0038798872847110033, "learning_rate": 1.1673469387755103e-05, "loss": 0.0004, "step": 8160 }, { "epoch": 41.683673469387756, "grad_norm": 0.001200053608044982, "learning_rate": 1.166326530612245e-05, "loss": 0.0073, "step": 8170 }, { "epoch": 41.734693877551024, "grad_norm": 0.0035708125215023756, "learning_rate": 1.1653061224489796e-05, "loss": 0.1728, "step": 8180 }, { "epoch": 41.785714285714285, "grad_norm": 0.0050631132908165455, "learning_rate": 1.1642857142857145e-05, "loss": 0.0043, "step": 8190 }, { "epoch": 41.83673469387755, "grad_norm": 0.0015812460333108902, "learning_rate": 1.1632653061224491e-05, "loss": 0.1004, "step": 8200 }, { "epoch": 41.88775510204081, "grad_norm": 0.023328380659222603, "learning_rate": 1.1622448979591839e-05, "loss": 0.0006, "step": 8210 }, { "epoch": 41.93877551020408, "grad_norm": 0.005179435480386019, "learning_rate": 1.1612244897959184e-05, "loss": 0.0136, "step": 8220 }, { "epoch": 41.98979591836735, "grad_norm": 0.026809468865394592, "learning_rate": 1.1602040816326532e-05, "loss": 0.1249, "step": 8230 }, { "epoch": 42.0, "eval_accuracy": 0.9169675090252708, "eval_loss": 0.6196650266647339, "eval_runtime": 1.0939, "eval_samples_per_second": 253.218, "eval_steps_per_second": 31.995, "step": 8232 }, { "epoch": 42.04081632653061, "grad_norm": 0.5640822649002075, "learning_rate": 1.1591836734693878e-05, "loss": 0.015, "step": 8240 }, { "epoch": 42.09183673469388, "grad_norm": 1.1740339994430542, "learning_rate": 1.1581632653061225e-05, "loss": 0.0012, "step": 8250 }, { "epoch": 42.142857142857146, "grad_norm": 0.12013377249240875, "learning_rate": 1.1571428571428573e-05, "loss": 0.0007, "step": 8260 }, { "epoch": 42.19387755102041, "grad_norm": 0.016162415966391563, "learning_rate": 1.1561224489795918e-05, "loss": 0.1285, "step": 8270 }, { "epoch": 42.244897959183675, "grad_norm": 31.674270629882812, "learning_rate": 1.1551020408163268e-05, "loss": 0.0753, "step": 8280 }, { "epoch": 42.295918367346935, "grad_norm": 0.002882919041439891, "learning_rate": 1.1540816326530613e-05, "loss": 0.0025, "step": 8290 }, { "epoch": 42.3469387755102, "grad_norm": 0.0017901996616274118, "learning_rate": 1.1530612244897961e-05, "loss": 0.0003, "step": 8300 }, { "epoch": 42.39795918367347, "grad_norm": 0.021078303456306458, "learning_rate": 1.1520408163265307e-05, "loss": 0.0007, "step": 8310 }, { "epoch": 42.44897959183673, "grad_norm": 0.003345151199027896, "learning_rate": 1.1510204081632654e-05, "loss": 0.0751, "step": 8320 }, { "epoch": 42.5, "grad_norm": 0.04216817021369934, "learning_rate": 1.15e-05, "loss": 0.0051, "step": 8330 }, { "epoch": 42.55102040816327, "grad_norm": 0.0662442222237587, "learning_rate": 1.1489795918367347e-05, "loss": 0.0118, "step": 8340 }, { "epoch": 42.60204081632653, "grad_norm": 0.003338195849210024, "learning_rate": 1.1479591836734697e-05, "loss": 0.0003, "step": 8350 }, { "epoch": 42.6530612244898, "grad_norm": 18.405502319335938, "learning_rate": 1.146938775510204e-05, "loss": 0.0998, "step": 8360 }, { "epoch": 42.704081632653065, "grad_norm": 2.8103973865509033, "learning_rate": 1.145918367346939e-05, "loss": 0.0044, "step": 8370 }, { "epoch": 42.755102040816325, "grad_norm": 3.695430278778076, "learning_rate": 1.1448979591836736e-05, "loss": 0.0104, "step": 8380 }, { "epoch": 42.80612244897959, "grad_norm": 19.71965217590332, "learning_rate": 1.1438775510204083e-05, "loss": 0.0155, "step": 8390 }, { "epoch": 42.857142857142854, "grad_norm": 0.005389121826738119, "learning_rate": 1.1428571428571429e-05, "loss": 0.0123, "step": 8400 }, { "epoch": 42.90816326530612, "grad_norm": 19.24228286743164, "learning_rate": 1.1418367346938777e-05, "loss": 0.0422, "step": 8410 }, { "epoch": 42.95918367346939, "grad_norm": 0.003058884758502245, "learning_rate": 1.1408163265306122e-05, "loss": 0.0002, "step": 8420 }, { "epoch": 43.0, "eval_accuracy": 0.9133574007220217, "eval_loss": 0.6966848969459534, "eval_runtime": 1.0788, "eval_samples_per_second": 256.763, "eval_steps_per_second": 32.443, "step": 8428 }, { "epoch": 43.01020408163265, "grad_norm": 0.03175332769751549, "learning_rate": 1.139795918367347e-05, "loss": 0.0003, "step": 8430 }, { "epoch": 43.06122448979592, "grad_norm": 0.00693126255646348, "learning_rate": 1.1387755102040819e-05, "loss": 0.0451, "step": 8440 }, { "epoch": 43.11224489795919, "grad_norm": 3.32902455329895, "learning_rate": 1.1377551020408165e-05, "loss": 0.0023, "step": 8450 }, { "epoch": 43.16326530612245, "grad_norm": 0.19402101635932922, "learning_rate": 1.1367346938775512e-05, "loss": 0.0782, "step": 8460 }, { "epoch": 43.214285714285715, "grad_norm": 0.0006030689692124724, "learning_rate": 1.1357142857142858e-05, "loss": 0.0037, "step": 8470 }, { "epoch": 43.265306122448976, "grad_norm": 0.001989088486880064, "learning_rate": 1.1346938775510206e-05, "loss": 0.0215, "step": 8480 }, { "epoch": 43.316326530612244, "grad_norm": 0.00047579227248206735, "learning_rate": 1.1336734693877551e-05, "loss": 0.0027, "step": 8490 }, { "epoch": 43.36734693877551, "grad_norm": 0.0010240982519462705, "learning_rate": 1.1326530612244899e-05, "loss": 0.0008, "step": 8500 }, { "epoch": 43.41836734693877, "grad_norm": 0.005392876919358969, "learning_rate": 1.1316326530612245e-05, "loss": 0.0004, "step": 8510 }, { "epoch": 43.46938775510204, "grad_norm": 0.002739696530625224, "learning_rate": 1.1306122448979592e-05, "loss": 0.0006, "step": 8520 }, { "epoch": 43.52040816326531, "grad_norm": 0.00966839399188757, "learning_rate": 1.1295918367346941e-05, "loss": 0.0019, "step": 8530 }, { "epoch": 43.57142857142857, "grad_norm": 3.5476248264312744, "learning_rate": 1.1285714285714287e-05, "loss": 0.1192, "step": 8540 }, { "epoch": 43.62244897959184, "grad_norm": 2.2899904251098633, "learning_rate": 1.1275510204081635e-05, "loss": 0.0726, "step": 8550 }, { "epoch": 43.673469387755105, "grad_norm": 0.0023040808737277985, "learning_rate": 1.126530612244898e-05, "loss": 0.0622, "step": 8560 }, { "epoch": 43.724489795918366, "grad_norm": 0.11149712651968002, "learning_rate": 1.1255102040816328e-05, "loss": 0.0689, "step": 8570 }, { "epoch": 43.775510204081634, "grad_norm": 0.0007823500782251358, "learning_rate": 1.1244897959183674e-05, "loss": 0.0837, "step": 8580 }, { "epoch": 43.826530612244895, "grad_norm": 0.0009514589910395443, "learning_rate": 1.1234693877551021e-05, "loss": 0.0023, "step": 8590 }, { "epoch": 43.87755102040816, "grad_norm": 0.38932502269744873, "learning_rate": 1.1224489795918367e-05, "loss": 0.1081, "step": 8600 }, { "epoch": 43.92857142857143, "grad_norm": 0.0053463163785636425, "learning_rate": 1.1214285714285716e-05, "loss": 0.1072, "step": 8610 }, { "epoch": 43.97959183673469, "grad_norm": 0.14675822854042053, "learning_rate": 1.1204081632653062e-05, "loss": 0.1163, "step": 8620 }, { "epoch": 44.0, "eval_accuracy": 0.924187725631769, "eval_loss": 0.5697125196456909, "eval_runtime": 1.0837, "eval_samples_per_second": 255.614, "eval_steps_per_second": 32.298, "step": 8624 }, { "epoch": 44.03061224489796, "grad_norm": 0.0016547945560887456, "learning_rate": 1.119387755102041e-05, "loss": 0.0031, "step": 8630 }, { "epoch": 44.08163265306123, "grad_norm": 0.2414090633392334, "learning_rate": 1.1183673469387757e-05, "loss": 0.0059, "step": 8640 }, { "epoch": 44.13265306122449, "grad_norm": 0.031783342361450195, "learning_rate": 1.1173469387755103e-05, "loss": 0.1282, "step": 8650 }, { "epoch": 44.183673469387756, "grad_norm": 0.20312932133674622, "learning_rate": 1.116326530612245e-05, "loss": 0.0004, "step": 8660 }, { "epoch": 44.234693877551024, "grad_norm": 0.0017681847093626857, "learning_rate": 1.1153061224489796e-05, "loss": 0.0004, "step": 8670 }, { "epoch": 44.285714285714285, "grad_norm": 0.004451256711035967, "learning_rate": 1.1142857142857143e-05, "loss": 0.0369, "step": 8680 }, { "epoch": 44.33673469387755, "grad_norm": 0.006902993191033602, "learning_rate": 1.113265306122449e-05, "loss": 0.0007, "step": 8690 }, { "epoch": 44.38775510204081, "grad_norm": 4.341804027557373, "learning_rate": 1.1122448979591838e-05, "loss": 0.0027, "step": 8700 }, { "epoch": 44.43877551020408, "grad_norm": 0.47881796956062317, "learning_rate": 1.1112244897959184e-05, "loss": 0.0174, "step": 8710 }, { "epoch": 44.48979591836735, "grad_norm": 0.0018687021220102906, "learning_rate": 1.1102040816326532e-05, "loss": 0.1226, "step": 8720 }, { "epoch": 44.54081632653061, "grad_norm": 0.2001066356897354, "learning_rate": 1.1091836734693879e-05, "loss": 0.0454, "step": 8730 }, { "epoch": 44.59183673469388, "grad_norm": 0.04767950624227524, "learning_rate": 1.1081632653061225e-05, "loss": 0.0002, "step": 8740 }, { "epoch": 44.642857142857146, "grad_norm": 0.0022785020992159843, "learning_rate": 1.1071428571428572e-05, "loss": 0.0592, "step": 8750 }, { "epoch": 44.69387755102041, "grad_norm": 0.005052843596786261, "learning_rate": 1.1061224489795918e-05, "loss": 0.0179, "step": 8760 }, { "epoch": 44.744897959183675, "grad_norm": 0.005047740414738655, "learning_rate": 1.1051020408163267e-05, "loss": 0.0014, "step": 8770 }, { "epoch": 44.795918367346935, "grad_norm": 0.023382876068353653, "learning_rate": 1.1040816326530611e-05, "loss": 0.0001, "step": 8780 }, { "epoch": 44.8469387755102, "grad_norm": 0.21654243767261505, "learning_rate": 1.103061224489796e-05, "loss": 0.1045, "step": 8790 }, { "epoch": 44.89795918367347, "grad_norm": 0.053598444908857346, "learning_rate": 1.1020408163265306e-05, "loss": 0.0001, "step": 8800 }, { "epoch": 44.94897959183673, "grad_norm": 0.0025833400432020426, "learning_rate": 1.1010204081632654e-05, "loss": 0.0155, "step": 8810 }, { "epoch": 45.0, "grad_norm": 0.0010398955782875419, "learning_rate": 1.1000000000000001e-05, "loss": 0.0201, "step": 8820 }, { "epoch": 45.0, "eval_accuracy": 0.9133574007220217, "eval_loss": 0.7221089005470276, "eval_runtime": 1.0874, "eval_samples_per_second": 254.747, "eval_steps_per_second": 32.188, "step": 8820 }, { "epoch": 45.05102040816327, "grad_norm": 0.00840007048100233, "learning_rate": 1.0989795918367347e-05, "loss": 0.0006, "step": 8830 }, { "epoch": 45.10204081632653, "grad_norm": 0.12595318257808685, "learning_rate": 1.0979591836734695e-05, "loss": 0.0113, "step": 8840 }, { "epoch": 45.1530612244898, "grad_norm": 1.0861917734146118, "learning_rate": 1.096938775510204e-05, "loss": 0.0006, "step": 8850 }, { "epoch": 45.204081632653065, "grad_norm": 0.022522887215018272, "learning_rate": 1.095918367346939e-05, "loss": 0.132, "step": 8860 }, { "epoch": 45.255102040816325, "grad_norm": 0.00799714308232069, "learning_rate": 1.0948979591836735e-05, "loss": 0.0004, "step": 8870 }, { "epoch": 45.30612244897959, "grad_norm": 9.358474731445312, "learning_rate": 1.0938775510204083e-05, "loss": 0.0484, "step": 8880 }, { "epoch": 45.357142857142854, "grad_norm": 0.004407347645610571, "learning_rate": 1.0928571428571429e-05, "loss": 0.0105, "step": 8890 }, { "epoch": 45.40816326530612, "grad_norm": 0.022897858172655106, "learning_rate": 1.0918367346938776e-05, "loss": 0.0009, "step": 8900 }, { "epoch": 45.45918367346939, "grad_norm": 0.0016697036335244775, "learning_rate": 1.0908163265306124e-05, "loss": 0.0012, "step": 8910 }, { "epoch": 45.51020408163265, "grad_norm": 0.003171288874000311, "learning_rate": 1.089795918367347e-05, "loss": 0.0784, "step": 8920 }, { "epoch": 45.56122448979592, "grad_norm": 0.024639975279569626, "learning_rate": 1.0887755102040819e-05, "loss": 0.066, "step": 8930 }, { "epoch": 45.61224489795919, "grad_norm": 0.03386716544628143, "learning_rate": 1.0877551020408163e-05, "loss": 0.0111, "step": 8940 }, { "epoch": 45.66326530612245, "grad_norm": 0.007546681445091963, "learning_rate": 1.0867346938775512e-05, "loss": 0.0437, "step": 8950 }, { "epoch": 45.714285714285715, "grad_norm": 0.23721835017204285, "learning_rate": 1.0857142857142858e-05, "loss": 0.0335, "step": 8960 }, { "epoch": 45.765306122448976, "grad_norm": 0.0035565055441111326, "learning_rate": 1.0846938775510205e-05, "loss": 0.086, "step": 8970 }, { "epoch": 45.816326530612244, "grad_norm": 10.809990882873535, "learning_rate": 1.0836734693877551e-05, "loss": 0.0042, "step": 8980 }, { "epoch": 45.86734693877551, "grad_norm": 0.4658706486225128, "learning_rate": 1.0826530612244899e-05, "loss": 0.0009, "step": 8990 }, { "epoch": 45.91836734693877, "grad_norm": 0.0037951404228806496, "learning_rate": 1.0816326530612246e-05, "loss": 0.0015, "step": 9000 }, { "epoch": 45.96938775510204, "grad_norm": 0.03223824501037598, "learning_rate": 1.0806122448979592e-05, "loss": 0.0003, "step": 9010 }, { "epoch": 46.0, "eval_accuracy": 0.9314079422382672, "eval_loss": 0.5253372192382812, "eval_runtime": 1.0902, "eval_samples_per_second": 254.074, "eval_steps_per_second": 32.103, "step": 9016 }, { "epoch": 46.02040816326531, "grad_norm": 5.910157680511475, "learning_rate": 1.0795918367346941e-05, "loss": 0.01, "step": 9020 }, { "epoch": 46.07142857142857, "grad_norm": 0.002357910620048642, "learning_rate": 1.0785714285714287e-05, "loss": 0.0085, "step": 9030 }, { "epoch": 46.12244897959184, "grad_norm": 0.03758726269006729, "learning_rate": 1.0775510204081634e-05, "loss": 0.0197, "step": 9040 }, { "epoch": 46.173469387755105, "grad_norm": 46.112918853759766, "learning_rate": 1.076530612244898e-05, "loss": 0.1066, "step": 9050 }, { "epoch": 46.224489795918366, "grad_norm": 0.002390614477917552, "learning_rate": 1.0755102040816328e-05, "loss": 0.2178, "step": 9060 }, { "epoch": 46.275510204081634, "grad_norm": 0.0037820618599653244, "learning_rate": 1.0744897959183673e-05, "loss": 0.0052, "step": 9070 }, { "epoch": 46.326530612244895, "grad_norm": 0.0011254741111770272, "learning_rate": 1.073469387755102e-05, "loss": 0.2469, "step": 9080 }, { "epoch": 46.37755102040816, "grad_norm": 0.0022717679385095835, "learning_rate": 1.072448979591837e-05, "loss": 0.0955, "step": 9090 }, { "epoch": 46.42857142857143, "grad_norm": 0.057991188019514084, "learning_rate": 1.0714285714285714e-05, "loss": 0.0002, "step": 9100 }, { "epoch": 46.47959183673469, "grad_norm": 0.003109829965978861, "learning_rate": 1.0704081632653063e-05, "loss": 0.0773, "step": 9110 }, { "epoch": 46.53061224489796, "grad_norm": 18.15728759765625, "learning_rate": 1.0693877551020409e-05, "loss": 0.1944, "step": 9120 }, { "epoch": 46.58163265306123, "grad_norm": 1.6146048307418823, "learning_rate": 1.0683673469387757e-05, "loss": 0.0489, "step": 9130 }, { "epoch": 46.63265306122449, "grad_norm": 11.649690628051758, "learning_rate": 1.0673469387755102e-05, "loss": 0.0056, "step": 9140 }, { "epoch": 46.683673469387756, "grad_norm": 0.0011012253817170858, "learning_rate": 1.066326530612245e-05, "loss": 0.0221, "step": 9150 }, { "epoch": 46.734693877551024, "grad_norm": 0.0017702613258734345, "learning_rate": 1.0653061224489796e-05, "loss": 0.0305, "step": 9160 }, { "epoch": 46.785714285714285, "grad_norm": 0.0036554394755512476, "learning_rate": 1.0642857142857143e-05, "loss": 0.0013, "step": 9170 }, { "epoch": 46.83673469387755, "grad_norm": 0.0011540923733264208, "learning_rate": 1.0632653061224492e-05, "loss": 0.0929, "step": 9180 }, { "epoch": 46.88775510204081, "grad_norm": 0.0004780712479259819, "learning_rate": 1.0622448979591838e-05, "loss": 0.0272, "step": 9190 }, { "epoch": 46.93877551020408, "grad_norm": 0.31946223974227905, "learning_rate": 1.0612244897959186e-05, "loss": 0.0323, "step": 9200 }, { "epoch": 46.98979591836735, "grad_norm": 0.19247731566429138, "learning_rate": 1.0602040816326531e-05, "loss": 0.0224, "step": 9210 }, { "epoch": 47.0, "eval_accuracy": 0.9494584837545126, "eval_loss": 0.38171839714050293, "eval_runtime": 1.1194, "eval_samples_per_second": 247.448, "eval_steps_per_second": 31.266, "step": 9212 }, { "epoch": 47.04081632653061, "grad_norm": 0.005759162828326225, "learning_rate": 1.0591836734693879e-05, "loss": 0.0031, "step": 9220 }, { "epoch": 47.09183673469388, "grad_norm": 0.004058485385030508, "learning_rate": 1.0581632653061225e-05, "loss": 0.0545, "step": 9230 }, { "epoch": 47.142857142857146, "grad_norm": 0.00166041636839509, "learning_rate": 1.0571428571428572e-05, "loss": 0.0044, "step": 9240 }, { "epoch": 47.19387755102041, "grad_norm": 0.026204131543636322, "learning_rate": 1.0561224489795918e-05, "loss": 0.0101, "step": 9250 }, { "epoch": 47.244897959183675, "grad_norm": 25.93852996826172, "learning_rate": 1.0551020408163265e-05, "loss": 0.083, "step": 9260 }, { "epoch": 47.295918367346935, "grad_norm": 0.38494938611984253, "learning_rate": 1.0540816326530615e-05, "loss": 0.001, "step": 9270 }, { "epoch": 47.3469387755102, "grad_norm": 0.06725030392408371, "learning_rate": 1.053061224489796e-05, "loss": 0.0663, "step": 9280 }, { "epoch": 47.39795918367347, "grad_norm": 16.13612174987793, "learning_rate": 1.0520408163265308e-05, "loss": 0.1407, "step": 9290 }, { "epoch": 47.44897959183673, "grad_norm": 0.003342223120853305, "learning_rate": 1.0510204081632654e-05, "loss": 0.0731, "step": 9300 }, { "epoch": 47.5, "grad_norm": 0.04980611801147461, "learning_rate": 1.0500000000000001e-05, "loss": 0.11, "step": 9310 }, { "epoch": 47.55102040816327, "grad_norm": 0.0018904568860307336, "learning_rate": 1.0489795918367347e-05, "loss": 0.0034, "step": 9320 }, { "epoch": 47.60204081632653, "grad_norm": 0.0871981531381607, "learning_rate": 1.0479591836734694e-05, "loss": 0.0029, "step": 9330 }, { "epoch": 47.6530612244898, "grad_norm": 0.006501348223537207, "learning_rate": 1.046938775510204e-05, "loss": 0.0018, "step": 9340 }, { "epoch": 47.704081632653065, "grad_norm": 0.04751432687044144, "learning_rate": 1.045918367346939e-05, "loss": 0.014, "step": 9350 }, { "epoch": 47.755102040816325, "grad_norm": 0.0024302105884999037, "learning_rate": 1.0448979591836737e-05, "loss": 0.0869, "step": 9360 }, { "epoch": 47.80612244897959, "grad_norm": 0.0041650948114693165, "learning_rate": 1.0438775510204083e-05, "loss": 0.0111, "step": 9370 }, { "epoch": 47.857142857142854, "grad_norm": 0.006547179538756609, "learning_rate": 1.042857142857143e-05, "loss": 0.0133, "step": 9380 }, { "epoch": 47.90816326530612, "grad_norm": 0.0007857574964873493, "learning_rate": 1.0418367346938776e-05, "loss": 0.0289, "step": 9390 }, { "epoch": 47.95918367346939, "grad_norm": 37.491302490234375, "learning_rate": 1.0408163265306123e-05, "loss": 0.0183, "step": 9400 }, { "epoch": 48.0, "eval_accuracy": 0.924187725631769, "eval_loss": 0.49663597345352173, "eval_runtime": 1.1034, "eval_samples_per_second": 251.053, "eval_steps_per_second": 31.721, "step": 9408 }, { "epoch": 48.01020408163265, "grad_norm": 0.69174724817276, "learning_rate": 1.039795918367347e-05, "loss": 0.2005, "step": 9410 }, { "epoch": 48.06122448979592, "grad_norm": 3.6260340213775635, "learning_rate": 1.0387755102040817e-05, "loss": 0.0345, "step": 9420 }, { "epoch": 48.11224489795919, "grad_norm": 15.992265701293945, "learning_rate": 1.0377551020408162e-05, "loss": 0.085, "step": 9430 }, { "epoch": 48.16326530612245, "grad_norm": 0.7569894194602966, "learning_rate": 1.0367346938775512e-05, "loss": 0.0142, "step": 9440 }, { "epoch": 48.214285714285715, "grad_norm": 31.049386978149414, "learning_rate": 1.0357142857142859e-05, "loss": 0.0378, "step": 9450 }, { "epoch": 48.265306122448976, "grad_norm": 0.03726164251565933, "learning_rate": 1.0346938775510205e-05, "loss": 0.0014, "step": 9460 }, { "epoch": 48.316326530612244, "grad_norm": 0.0010916799074038863, "learning_rate": 1.0336734693877552e-05, "loss": 0.0002, "step": 9470 }, { "epoch": 48.36734693877551, "grad_norm": 0.021485377103090286, "learning_rate": 1.0326530612244898e-05, "loss": 0.1133, "step": 9480 }, { "epoch": 48.41836734693877, "grad_norm": 25.665176391601562, "learning_rate": 1.0316326530612246e-05, "loss": 0.0584, "step": 9490 }, { "epoch": 48.46938775510204, "grad_norm": 0.011568904854357243, "learning_rate": 1.0306122448979591e-05, "loss": 0.0003, "step": 9500 }, { "epoch": 48.52040816326531, "grad_norm": 0.010891436599195004, "learning_rate": 1.029591836734694e-05, "loss": 0.0108, "step": 9510 }, { "epoch": 48.57142857142857, "grad_norm": 0.006526598706841469, "learning_rate": 1.0285714285714285e-05, "loss": 0.1035, "step": 9520 }, { "epoch": 48.62244897959184, "grad_norm": 0.004090987145900726, "learning_rate": 1.0275510204081634e-05, "loss": 0.0002, "step": 9530 }, { "epoch": 48.673469387755105, "grad_norm": 0.003347874153405428, "learning_rate": 1.0265306122448981e-05, "loss": 0.079, "step": 9540 }, { "epoch": 48.724489795918366, "grad_norm": 0.006006390321999788, "learning_rate": 1.0255102040816327e-05, "loss": 0.0008, "step": 9550 }, { "epoch": 48.775510204081634, "grad_norm": 0.012652132660150528, "learning_rate": 1.0244897959183675e-05, "loss": 0.0759, "step": 9560 }, { "epoch": 48.826530612244895, "grad_norm": 0.002586016431450844, "learning_rate": 1.023469387755102e-05, "loss": 0.0017, "step": 9570 }, { "epoch": 48.87755102040816, "grad_norm": 0.007763824891299009, "learning_rate": 1.0224489795918368e-05, "loss": 0.0176, "step": 9580 }, { "epoch": 48.92857142857143, "grad_norm": 0.0009625015081837773, "learning_rate": 1.0214285714285714e-05, "loss": 0.0005, "step": 9590 }, { "epoch": 48.97959183673469, "grad_norm": 6.086246013641357, "learning_rate": 1.0204081632653063e-05, "loss": 0.0077, "step": 9600 }, { "epoch": 49.0, "eval_accuracy": 0.9458483754512635, "eval_loss": 0.43489572405815125, "eval_runtime": 1.0745, "eval_samples_per_second": 257.796, "eval_steps_per_second": 32.573, "step": 9604 }, { "epoch": 49.03061224489796, "grad_norm": 0.004776608198881149, "learning_rate": 1.0193877551020409e-05, "loss": 0.001, "step": 9610 }, { "epoch": 49.08163265306123, "grad_norm": 0.004159730859100819, "learning_rate": 1.0183673469387756e-05, "loss": 0.0265, "step": 9620 }, { "epoch": 49.13265306122449, "grad_norm": 0.15269260108470917, "learning_rate": 1.0173469387755104e-05, "loss": 0.0223, "step": 9630 }, { "epoch": 49.183673469387756, "grad_norm": 31.990114212036133, "learning_rate": 1.016326530612245e-05, "loss": 0.0451, "step": 9640 }, { "epoch": 49.234693877551024, "grad_norm": 0.002044144319370389, "learning_rate": 1.0153061224489797e-05, "loss": 0.0998, "step": 9650 }, { "epoch": 49.285714285714285, "grad_norm": 34.2266845703125, "learning_rate": 1.0142857142857143e-05, "loss": 0.0398, "step": 9660 }, { "epoch": 49.33673469387755, "grad_norm": 0.0017812995938584208, "learning_rate": 1.013265306122449e-05, "loss": 0.0057, "step": 9670 }, { "epoch": 49.38775510204081, "grad_norm": 0.0006386591703630984, "learning_rate": 1.0122448979591836e-05, "loss": 0.0316, "step": 9680 }, { "epoch": 49.43877551020408, "grad_norm": 0.0030134113039821386, "learning_rate": 1.0112244897959185e-05, "loss": 0.1195, "step": 9690 }, { "epoch": 49.48979591836735, "grad_norm": 0.0013231539633125067, "learning_rate": 1.0102040816326531e-05, "loss": 0.0387, "step": 9700 }, { "epoch": 49.54081632653061, "grad_norm": 0.014563554897904396, "learning_rate": 1.0091836734693879e-05, "loss": 0.0002, "step": 9710 }, { "epoch": 49.59183673469388, "grad_norm": 0.0019114126916974783, "learning_rate": 1.0081632653061226e-05, "loss": 0.0004, "step": 9720 }, { "epoch": 49.642857142857146, "grad_norm": 0.01354127936065197, "learning_rate": 1.0071428571428572e-05, "loss": 0.0001, "step": 9730 }, { "epoch": 49.69387755102041, "grad_norm": 0.1543283611536026, "learning_rate": 1.006122448979592e-05, "loss": 0.0171, "step": 9740 }, { "epoch": 49.744897959183675, "grad_norm": 0.0009669380378909409, "learning_rate": 1.0051020408163265e-05, "loss": 0.0001, "step": 9750 }, { "epoch": 49.795918367346935, "grad_norm": 0.0006296084611676633, "learning_rate": 1.0040816326530614e-05, "loss": 0.0102, "step": 9760 }, { "epoch": 49.8469387755102, "grad_norm": 0.002462791046127677, "learning_rate": 1.003061224489796e-05, "loss": 0.0217, "step": 9770 }, { "epoch": 49.89795918367347, "grad_norm": 0.0016805252525955439, "learning_rate": 1.0020408163265308e-05, "loss": 0.0002, "step": 9780 }, { "epoch": 49.94897959183673, "grad_norm": 0.25625964999198914, "learning_rate": 1.0010204081632653e-05, "loss": 0.069, "step": 9790 }, { "epoch": 50.0, "grad_norm": 0.00462955329567194, "learning_rate": 1e-05, "loss": 0.0083, "step": 9800 }, { "epoch": 50.0, "eval_accuracy": 0.924187725631769, "eval_loss": 0.5191228985786438, "eval_runtime": 1.1015, "eval_samples_per_second": 251.47, "eval_steps_per_second": 31.774, "step": 9800 }, { "epoch": 50.05102040816327, "grad_norm": 0.774252712726593, "learning_rate": 9.989795918367348e-06, "loss": 0.2311, "step": 9810 }, { "epoch": 50.10204081632653, "grad_norm": 0.011286312714219093, "learning_rate": 9.979591836734694e-06, "loss": 0.0393, "step": 9820 }, { "epoch": 50.1530612244898, "grad_norm": 0.0007367839571088552, "learning_rate": 9.969387755102042e-06, "loss": 0.0072, "step": 9830 }, { "epoch": 50.204081632653065, "grad_norm": 23.6470890045166, "learning_rate": 9.959183673469387e-06, "loss": 0.0467, "step": 9840 }, { "epoch": 50.255102040816325, "grad_norm": 0.12046942114830017, "learning_rate": 9.948979591836737e-06, "loss": 0.0018, "step": 9850 }, { "epoch": 50.30612244897959, "grad_norm": 0.0007796313730068505, "learning_rate": 9.938775510204082e-06, "loss": 0.0004, "step": 9860 }, { "epoch": 50.357142857142854, "grad_norm": 0.0006711558671668172, "learning_rate": 9.92857142857143e-06, "loss": 0.0107, "step": 9870 }, { "epoch": 50.40816326530612, "grad_norm": 0.0016824215417727828, "learning_rate": 9.918367346938776e-06, "loss": 0.0102, "step": 9880 }, { "epoch": 50.45918367346939, "grad_norm": 0.0023227715864777565, "learning_rate": 9.908163265306123e-06, "loss": 0.0185, "step": 9890 }, { "epoch": 50.51020408163265, "grad_norm": 0.026345064863562584, "learning_rate": 9.89795918367347e-06, "loss": 0.0009, "step": 9900 }, { "epoch": 50.56122448979592, "grad_norm": 1.478464126586914, "learning_rate": 9.887755102040816e-06, "loss": 0.0904, "step": 9910 }, { "epoch": 50.61224489795919, "grad_norm": 0.3955780267715454, "learning_rate": 9.877551020408164e-06, "loss": 0.0448, "step": 9920 }, { "epoch": 50.66326530612245, "grad_norm": 0.018813159316778183, "learning_rate": 9.867346938775511e-06, "loss": 0.0565, "step": 9930 }, { "epoch": 50.714285714285715, "grad_norm": 1.7701644897460938, "learning_rate": 9.857142857142859e-06, "loss": 0.0015, "step": 9940 }, { "epoch": 50.765306122448976, "grad_norm": 13.985889434814453, "learning_rate": 9.846938775510205e-06, "loss": 0.1417, "step": 9950 }, { "epoch": 50.816326530612244, "grad_norm": 0.006177649367600679, "learning_rate": 9.836734693877552e-06, "loss": 0.143, "step": 9960 }, { "epoch": 50.86734693877551, "grad_norm": 26.465219497680664, "learning_rate": 9.8265306122449e-06, "loss": 0.0668, "step": 9970 }, { "epoch": 50.91836734693877, "grad_norm": 0.0019306351896375418, "learning_rate": 9.816326530612245e-06, "loss": 0.0123, "step": 9980 }, { "epoch": 50.96938775510204, "grad_norm": 0.016250109300017357, "learning_rate": 9.806122448979593e-06, "loss": 0.0571, "step": 9990 }, { "epoch": 51.0, "eval_accuracy": 0.9205776173285198, "eval_loss": 0.5826354026794434, "eval_runtime": 1.0778, "eval_samples_per_second": 257.01, "eval_steps_per_second": 32.474, "step": 9996 }, { "epoch": 51.02040816326531, "grad_norm": 0.5698087215423584, "learning_rate": 9.795918367346939e-06, "loss": 0.044, "step": 10000 }, { "epoch": 51.07142857142857, "grad_norm": 0.027690451592206955, "learning_rate": 9.785714285714286e-06, "loss": 0.0131, "step": 10010 }, { "epoch": 51.12244897959184, "grad_norm": 0.03833514451980591, "learning_rate": 9.775510204081634e-06, "loss": 0.0903, "step": 10020 }, { "epoch": 51.173469387755105, "grad_norm": 0.5732561349868774, "learning_rate": 9.765306122448981e-06, "loss": 0.0011, "step": 10030 }, { "epoch": 51.224489795918366, "grad_norm": 0.0030602721963077784, "learning_rate": 9.755102040816327e-06, "loss": 0.1443, "step": 10040 }, { "epoch": 51.275510204081634, "grad_norm": 0.0038297963328659534, "learning_rate": 9.744897959183674e-06, "loss": 0.0256, "step": 10050 }, { "epoch": 51.326530612244895, "grad_norm": 0.37286871671676636, "learning_rate": 9.734693877551022e-06, "loss": 0.0418, "step": 10060 }, { "epoch": 51.37755102040816, "grad_norm": 0.0006284485571086407, "learning_rate": 9.724489795918368e-06, "loss": 0.0003, "step": 10070 }, { "epoch": 51.42857142857143, "grad_norm": 0.0009145958465524018, "learning_rate": 9.714285714285715e-06, "loss": 0.0402, "step": 10080 }, { "epoch": 51.47959183673469, "grad_norm": 0.0007123093237169087, "learning_rate": 9.704081632653061e-06, "loss": 0.0001, "step": 10090 }, { "epoch": 51.53061224489796, "grad_norm": 0.04239608719944954, "learning_rate": 9.693877551020408e-06, "loss": 0.1977, "step": 10100 }, { "epoch": 51.58163265306123, "grad_norm": 0.004421767313033342, "learning_rate": 9.683673469387756e-06, "loss": 0.0014, "step": 10110 }, { "epoch": 51.63265306122449, "grad_norm": 0.0019722820725291967, "learning_rate": 9.673469387755103e-06, "loss": 0.0884, "step": 10120 }, { "epoch": 51.683673469387756, "grad_norm": 0.0012142544146627188, "learning_rate": 9.663265306122451e-06, "loss": 0.0003, "step": 10130 }, { "epoch": 51.734693877551024, "grad_norm": 0.004451120272278786, "learning_rate": 9.653061224489797e-06, "loss": 0.0006, "step": 10140 }, { "epoch": 51.785714285714285, "grad_norm": 0.0017673695692792535, "learning_rate": 9.642857142857144e-06, "loss": 0.0003, "step": 10150 }, { "epoch": 51.83673469387755, "grad_norm": 24.41917610168457, "learning_rate": 9.63265306122449e-06, "loss": 0.1044, "step": 10160 }, { "epoch": 51.88775510204081, "grad_norm": 0.0018762190593406558, "learning_rate": 9.622448979591837e-06, "loss": 0.0002, "step": 10170 }, { "epoch": 51.93877551020408, "grad_norm": 36.48452377319336, "learning_rate": 9.612244897959185e-06, "loss": 0.0848, "step": 10180 }, { "epoch": 51.98979591836735, "grad_norm": 0.08523822575807571, "learning_rate": 9.60204081632653e-06, "loss": 0.0583, "step": 10190 }, { "epoch": 52.0, "eval_accuracy": 0.9169675090252708, "eval_loss": 0.5334702134132385, "eval_runtime": 1.0793, "eval_samples_per_second": 256.64, "eval_steps_per_second": 32.427, "step": 10192 }, { "epoch": 52.04081632653061, "grad_norm": 0.0010746130719780922, "learning_rate": 9.591836734693878e-06, "loss": 0.0004, "step": 10200 }, { "epoch": 52.09183673469388, "grad_norm": 0.0028198871295899153, "learning_rate": 9.581632653061226e-06, "loss": 0.0842, "step": 10210 }, { "epoch": 52.142857142857146, "grad_norm": 0.0008603775640949607, "learning_rate": 9.571428571428573e-06, "loss": 0.0267, "step": 10220 }, { "epoch": 52.19387755102041, "grad_norm": 8.9909086227417, "learning_rate": 9.561224489795919e-06, "loss": 0.0047, "step": 10230 }, { "epoch": 52.244897959183675, "grad_norm": 0.003751999931409955, "learning_rate": 9.551020408163266e-06, "loss": 0.0002, "step": 10240 }, { "epoch": 52.295918367346935, "grad_norm": 0.002932459581643343, "learning_rate": 9.540816326530612e-06, "loss": 0.0001, "step": 10250 }, { "epoch": 52.3469387755102, "grad_norm": 0.0019935150630772114, "learning_rate": 9.53061224489796e-06, "loss": 0.0001, "step": 10260 }, { "epoch": 52.39795918367347, "grad_norm": 0.0009785660076886415, "learning_rate": 9.520408163265307e-06, "loss": 0.0787, "step": 10270 }, { "epoch": 52.44897959183673, "grad_norm": 0.010769267566502094, "learning_rate": 9.510204081632653e-06, "loss": 0.0808, "step": 10280 }, { "epoch": 52.5, "grad_norm": 1.6450892686843872, "learning_rate": 9.5e-06, "loss": 0.0147, "step": 10290 }, { "epoch": 52.55102040816327, "grad_norm": 0.030985673889517784, "learning_rate": 9.489795918367348e-06, "loss": 0.1102, "step": 10300 }, { "epoch": 52.60204081632653, "grad_norm": 0.2882721424102783, "learning_rate": 9.479591836734695e-06, "loss": 0.0011, "step": 10310 }, { "epoch": 52.6530612244898, "grad_norm": 0.003404113696888089, "learning_rate": 9.469387755102041e-06, "loss": 0.0003, "step": 10320 }, { "epoch": 52.704081632653065, "grad_norm": 0.005564435385167599, "learning_rate": 9.459183673469389e-06, "loss": 0.0012, "step": 10330 }, { "epoch": 52.755102040816325, "grad_norm": 0.0028408560901880264, "learning_rate": 9.448979591836736e-06, "loss": 0.0146, "step": 10340 }, { "epoch": 52.80612244897959, "grad_norm": 0.1083548367023468, "learning_rate": 9.438775510204082e-06, "loss": 0.0005, "step": 10350 }, { "epoch": 52.857142857142854, "grad_norm": 0.02419484406709671, "learning_rate": 9.42857142857143e-06, "loss": 0.0003, "step": 10360 }, { "epoch": 52.90816326530612, "grad_norm": 0.07203242927789688, "learning_rate": 9.418367346938775e-06, "loss": 0.0012, "step": 10370 }, { "epoch": 52.95918367346939, "grad_norm": 0.0007067306432873011, "learning_rate": 9.408163265306123e-06, "loss": 0.0019, "step": 10380 }, { "epoch": 53.0, "eval_accuracy": 0.9205776173285198, "eval_loss": 0.5842518210411072, "eval_runtime": 1.0787, "eval_samples_per_second": 256.796, "eval_steps_per_second": 32.447, "step": 10388 }, { "epoch": 53.01020408163265, "grad_norm": 0.04632231593132019, "learning_rate": 9.39795918367347e-06, "loss": 0.0016, "step": 10390 }, { "epoch": 53.06122448979592, "grad_norm": 23.553434371948242, "learning_rate": 9.387755102040818e-06, "loss": 0.0297, "step": 10400 }, { "epoch": 53.11224489795919, "grad_norm": 0.002462690230458975, "learning_rate": 9.377551020408164e-06, "loss": 0.0017, "step": 10410 }, { "epoch": 53.16326530612245, "grad_norm": 0.0016480302438139915, "learning_rate": 9.367346938775511e-06, "loss": 0.0003, "step": 10420 }, { "epoch": 53.214285714285715, "grad_norm": 0.6690643429756165, "learning_rate": 9.357142857142859e-06, "loss": 0.0463, "step": 10430 }, { "epoch": 53.265306122448976, "grad_norm": 0.002313209231942892, "learning_rate": 9.346938775510204e-06, "loss": 0.0016, "step": 10440 }, { "epoch": 53.316326530612244, "grad_norm": 0.04178142175078392, "learning_rate": 9.336734693877552e-06, "loss": 0.0002, "step": 10450 }, { "epoch": 53.36734693877551, "grad_norm": 0.000977375078946352, "learning_rate": 9.326530612244898e-06, "loss": 0.0246, "step": 10460 }, { "epoch": 53.41836734693877, "grad_norm": 0.6785934567451477, "learning_rate": 9.316326530612245e-06, "loss": 0.0075, "step": 10470 }, { "epoch": 53.46938775510204, "grad_norm": 23.148418426513672, "learning_rate": 9.306122448979593e-06, "loss": 0.0533, "step": 10480 }, { "epoch": 53.52040816326531, "grad_norm": 0.0018429437186568975, "learning_rate": 9.29591836734694e-06, "loss": 0.0003, "step": 10490 }, { "epoch": 53.57142857142857, "grad_norm": 0.0036811435129493475, "learning_rate": 9.285714285714288e-06, "loss": 0.0004, "step": 10500 }, { "epoch": 53.62244897959184, "grad_norm": 0.1689847856760025, "learning_rate": 9.275510204081633e-06, "loss": 0.011, "step": 10510 }, { "epoch": 53.673469387755105, "grad_norm": 0.0014095149235799909, "learning_rate": 9.26530612244898e-06, "loss": 0.0002, "step": 10520 }, { "epoch": 53.724489795918366, "grad_norm": 0.00196831370703876, "learning_rate": 9.255102040816327e-06, "loss": 0.0007, "step": 10530 }, { "epoch": 53.775510204081634, "grad_norm": 0.14258316159248352, "learning_rate": 9.244897959183674e-06, "loss": 0.0839, "step": 10540 }, { "epoch": 53.826530612244895, "grad_norm": 0.016741791740059853, "learning_rate": 9.234693877551022e-06, "loss": 0.0009, "step": 10550 }, { "epoch": 53.87755102040816, "grad_norm": 0.0005224159103818238, "learning_rate": 9.224489795918367e-06, "loss": 0.0235, "step": 10560 }, { "epoch": 53.92857142857143, "grad_norm": 0.0006043571047484875, "learning_rate": 9.214285714285715e-06, "loss": 0.0011, "step": 10570 }, { "epoch": 53.97959183673469, "grad_norm": 0.01142693217843771, "learning_rate": 9.204081632653062e-06, "loss": 0.0044, "step": 10580 }, { "epoch": 54.0, "eval_accuracy": 0.9205776173285198, "eval_loss": 0.5894723534584045, "eval_runtime": 1.0811, "eval_samples_per_second": 256.231, "eval_steps_per_second": 32.376, "step": 10584 }, { "epoch": 54.03061224489796, "grad_norm": 0.001169680617749691, "learning_rate": 9.19387755102041e-06, "loss": 0.123, "step": 10590 }, { "epoch": 54.08163265306123, "grad_norm": 0.010936969891190529, "learning_rate": 9.183673469387756e-06, "loss": 0.0005, "step": 10600 }, { "epoch": 54.13265306122449, "grad_norm": 0.0009202130604535341, "learning_rate": 9.173469387755103e-06, "loss": 0.01, "step": 10610 }, { "epoch": 54.183673469387756, "grad_norm": 0.010210862383246422, "learning_rate": 9.163265306122449e-06, "loss": 0.0001, "step": 10620 }, { "epoch": 54.234693877551024, "grad_norm": 0.0007355593261308968, "learning_rate": 9.153061224489796e-06, "loss": 0.0722, "step": 10630 }, { "epoch": 54.285714285714285, "grad_norm": 0.0015544629422947764, "learning_rate": 9.142857142857144e-06, "loss": 0.0006, "step": 10640 }, { "epoch": 54.33673469387755, "grad_norm": 0.0009080583113245666, "learning_rate": 9.13265306122449e-06, "loss": 0.0063, "step": 10650 }, { "epoch": 54.38775510204081, "grad_norm": 0.2235211431980133, "learning_rate": 9.122448979591837e-06, "loss": 0.0002, "step": 10660 }, { "epoch": 54.43877551020408, "grad_norm": 0.0005023422418162227, "learning_rate": 9.112244897959185e-06, "loss": 0.0014, "step": 10670 }, { "epoch": 54.48979591836735, "grad_norm": 0.0008095937664620578, "learning_rate": 9.102040816326532e-06, "loss": 0.128, "step": 10680 }, { "epoch": 54.54081632653061, "grad_norm": 0.39649301767349243, "learning_rate": 9.091836734693878e-06, "loss": 0.0536, "step": 10690 }, { "epoch": 54.59183673469388, "grad_norm": 0.0019005000358447433, "learning_rate": 9.081632653061225e-06, "loss": 0.0348, "step": 10700 }, { "epoch": 54.642857142857146, "grad_norm": 0.10276588052511215, "learning_rate": 9.071428571428573e-06, "loss": 0.0023, "step": 10710 }, { "epoch": 54.69387755102041, "grad_norm": 0.0008410296286456287, "learning_rate": 9.061224489795919e-06, "loss": 0.0001, "step": 10720 }, { "epoch": 54.744897959183675, "grad_norm": 0.005163641646504402, "learning_rate": 9.051020408163266e-06, "loss": 0.0009, "step": 10730 }, { "epoch": 54.795918367346935, "grad_norm": 18.602264404296875, "learning_rate": 9.040816326530612e-06, "loss": 0.05, "step": 10740 }, { "epoch": 54.8469387755102, "grad_norm": 0.0015052856178954244, "learning_rate": 9.03061224489796e-06, "loss": 0.0943, "step": 10750 }, { "epoch": 54.89795918367347, "grad_norm": 0.009039022959768772, "learning_rate": 9.020408163265307e-06, "loss": 0.034, "step": 10760 }, { "epoch": 54.94897959183673, "grad_norm": 0.0017823094967752695, "learning_rate": 9.010204081632654e-06, "loss": 0.0015, "step": 10770 }, { "epoch": 55.0, "grad_norm": 0.0845731869339943, "learning_rate": 9e-06, "loss": 0.0065, "step": 10780 }, { "epoch": 55.0, "eval_accuracy": 0.9350180505415162, "eval_loss": 0.4486790597438812, "eval_runtime": 1.0762, "eval_samples_per_second": 257.397, "eval_steps_per_second": 32.523, "step": 10780 }, { "epoch": 55.05102040816327, "grad_norm": 0.009737039916217327, "learning_rate": 8.989795918367348e-06, "loss": 0.0001, "step": 10790 }, { "epoch": 55.10204081632653, "grad_norm": 3.418036460876465, "learning_rate": 8.979591836734695e-06, "loss": 0.0009, "step": 10800 }, { "epoch": 55.1530612244898, "grad_norm": 0.0022670028265565634, "learning_rate": 8.969387755102041e-06, "loss": 0.0265, "step": 10810 }, { "epoch": 55.204081632653065, "grad_norm": 0.06527600437402725, "learning_rate": 8.959183673469388e-06, "loss": 0.0337, "step": 10820 }, { "epoch": 55.255102040816325, "grad_norm": 0.00039897262468002737, "learning_rate": 8.948979591836734e-06, "loss": 0.0015, "step": 10830 }, { "epoch": 55.30612244897959, "grad_norm": 22.725341796875, "learning_rate": 8.938775510204082e-06, "loss": 0.0843, "step": 10840 }, { "epoch": 55.357142857142854, "grad_norm": 0.0012160622281953692, "learning_rate": 8.92857142857143e-06, "loss": 0.0626, "step": 10850 }, { "epoch": 55.40816326530612, "grad_norm": 1.4059704542160034, "learning_rate": 8.918367346938777e-06, "loss": 0.0028, "step": 10860 }, { "epoch": 55.45918367346939, "grad_norm": 0.17656470835208893, "learning_rate": 8.908163265306124e-06, "loss": 0.0003, "step": 10870 }, { "epoch": 55.51020408163265, "grad_norm": 19.98672866821289, "learning_rate": 8.89795918367347e-06, "loss": 0.0263, "step": 10880 }, { "epoch": 55.56122448979592, "grad_norm": 0.0058608935214579105, "learning_rate": 8.887755102040817e-06, "loss": 0.0002, "step": 10890 }, { "epoch": 55.61224489795919, "grad_norm": 0.010634215548634529, "learning_rate": 8.877551020408163e-06, "loss": 0.0004, "step": 10900 }, { "epoch": 55.66326530612245, "grad_norm": 0.0015789840836077929, "learning_rate": 8.86734693877551e-06, "loss": 0.0268, "step": 10910 }, { "epoch": 55.714285714285715, "grad_norm": 0.0014255768619477749, "learning_rate": 8.857142857142858e-06, "loss": 0.0061, "step": 10920 }, { "epoch": 55.765306122448976, "grad_norm": 0.020340051501989365, "learning_rate": 8.846938775510204e-06, "loss": 0.0013, "step": 10930 }, { "epoch": 55.816326530612244, "grad_norm": 0.0006444318569265306, "learning_rate": 8.836734693877552e-06, "loss": 0.0366, "step": 10940 }, { "epoch": 55.86734693877551, "grad_norm": 0.002159450901672244, "learning_rate": 8.826530612244899e-06, "loss": 0.0009, "step": 10950 }, { "epoch": 55.91836734693877, "grad_norm": 0.002897097496315837, "learning_rate": 8.816326530612247e-06, "loss": 0.0145, "step": 10960 }, { "epoch": 55.96938775510204, "grad_norm": 28.371925354003906, "learning_rate": 8.806122448979592e-06, "loss": 0.0126, "step": 10970 }, { "epoch": 56.0, "eval_accuracy": 0.9314079422382672, "eval_loss": 0.6220578551292419, "eval_runtime": 1.0753, "eval_samples_per_second": 257.608, "eval_steps_per_second": 32.55, "step": 10976 }, { "epoch": 56.02040816326531, "grad_norm": 0.05436180531978607, "learning_rate": 8.79591836734694e-06, "loss": 0.0236, "step": 10980 }, { "epoch": 56.07142857142857, "grad_norm": 0.0007396138389594853, "learning_rate": 8.785714285714286e-06, "loss": 0.0299, "step": 10990 }, { "epoch": 56.12244897959184, "grad_norm": 0.0007208554306998849, "learning_rate": 8.775510204081633e-06, "loss": 0.0844, "step": 11000 }, { "epoch": 56.173469387755105, "grad_norm": 0.002780457492917776, "learning_rate": 8.76530612244898e-06, "loss": 0.0012, "step": 11010 }, { "epoch": 56.224489795918366, "grad_norm": 0.007315227296203375, "learning_rate": 8.755102040816326e-06, "loss": 0.067, "step": 11020 }, { "epoch": 56.275510204081634, "grad_norm": 0.0010656971717253327, "learning_rate": 8.744897959183676e-06, "loss": 0.0003, "step": 11030 }, { "epoch": 56.326530612244895, "grad_norm": 0.005156094674021006, "learning_rate": 8.734693877551021e-06, "loss": 0.0007, "step": 11040 }, { "epoch": 56.37755102040816, "grad_norm": 0.0020376748871058226, "learning_rate": 8.724489795918369e-06, "loss": 0.0184, "step": 11050 }, { "epoch": 56.42857142857143, "grad_norm": 0.007411920465528965, "learning_rate": 8.714285714285715e-06, "loss": 0.0013, "step": 11060 }, { "epoch": 56.47959183673469, "grad_norm": 39.1966667175293, "learning_rate": 8.704081632653062e-06, "loss": 0.1442, "step": 11070 }, { "epoch": 56.53061224489796, "grad_norm": 0.13524441421031952, "learning_rate": 8.69387755102041e-06, "loss": 0.0009, "step": 11080 }, { "epoch": 56.58163265306123, "grad_norm": 0.038729358464479446, "learning_rate": 8.683673469387755e-06, "loss": 0.0005, "step": 11090 }, { "epoch": 56.63265306122449, "grad_norm": 0.04754778370261192, "learning_rate": 8.673469387755103e-06, "loss": 0.0069, "step": 11100 }, { "epoch": 56.683673469387756, "grad_norm": 0.0030663185752928257, "learning_rate": 8.663265306122449e-06, "loss": 0.1094, "step": 11110 }, { "epoch": 56.734693877551024, "grad_norm": 0.4384918808937073, "learning_rate": 8.653061224489798e-06, "loss": 0.0474, "step": 11120 }, { "epoch": 56.785714285714285, "grad_norm": 0.006999153643846512, "learning_rate": 8.642857142857144e-06, "loss": 0.0935, "step": 11130 }, { "epoch": 56.83673469387755, "grad_norm": 0.0020574056543409824, "learning_rate": 8.632653061224491e-06, "loss": 0.0021, "step": 11140 }, { "epoch": 56.88775510204081, "grad_norm": 0.03286987543106079, "learning_rate": 8.622448979591837e-06, "loss": 0.0001, "step": 11150 }, { "epoch": 56.93877551020408, "grad_norm": 30.671478271484375, "learning_rate": 8.612244897959184e-06, "loss": 0.1333, "step": 11160 }, { "epoch": 56.98979591836735, "grad_norm": 0.0015075260307639837, "learning_rate": 8.602040816326532e-06, "loss": 0.0093, "step": 11170 }, { "epoch": 57.0, "eval_accuracy": 0.9314079422382672, "eval_loss": 0.51384037733078, "eval_runtime": 1.0803, "eval_samples_per_second": 256.401, "eval_steps_per_second": 32.397, "step": 11172 }, { "epoch": 57.04081632653061, "grad_norm": 12.403755187988281, "learning_rate": 8.591836734693878e-06, "loss": 0.0888, "step": 11180 }, { "epoch": 57.09183673469388, "grad_norm": 0.001510613365098834, "learning_rate": 8.581632653061225e-06, "loss": 0.0391, "step": 11190 }, { "epoch": 57.142857142857146, "grad_norm": 0.012250826694071293, "learning_rate": 8.571428571428571e-06, "loss": 0.0003, "step": 11200 }, { "epoch": 57.19387755102041, "grad_norm": 8.519612312316895, "learning_rate": 8.56122448979592e-06, "loss": 0.0044, "step": 11210 }, { "epoch": 57.244897959183675, "grad_norm": 0.0035421387292444706, "learning_rate": 8.551020408163266e-06, "loss": 0.0002, "step": 11220 }, { "epoch": 57.295918367346935, "grad_norm": 29.73356819152832, "learning_rate": 8.540816326530613e-06, "loss": 0.0924, "step": 11230 }, { "epoch": 57.3469387755102, "grad_norm": 17.875030517578125, "learning_rate": 8.530612244897961e-06, "loss": 0.0063, "step": 11240 }, { "epoch": 57.39795918367347, "grad_norm": 8.741540908813477, "learning_rate": 8.520408163265307e-06, "loss": 0.0063, "step": 11250 }, { "epoch": 57.44897959183673, "grad_norm": 0.0027055738028138876, "learning_rate": 8.510204081632654e-06, "loss": 0.2214, "step": 11260 }, { "epoch": 57.5, "grad_norm": 0.03664912283420563, "learning_rate": 8.5e-06, "loss": 0.135, "step": 11270 }, { "epoch": 57.55102040816327, "grad_norm": 0.04475510120391846, "learning_rate": 8.489795918367347e-06, "loss": 0.0001, "step": 11280 }, { "epoch": 57.60204081632653, "grad_norm": 8.990615844726562, "learning_rate": 8.479591836734695e-06, "loss": 0.1122, "step": 11290 }, { "epoch": 57.6530612244898, "grad_norm": 38.610565185546875, "learning_rate": 8.469387755102042e-06, "loss": 0.0865, "step": 11300 }, { "epoch": 57.704081632653065, "grad_norm": 0.0015925682382658124, "learning_rate": 8.459183673469388e-06, "loss": 0.0013, "step": 11310 }, { "epoch": 57.755102040816325, "grad_norm": 0.0018991163233295083, "learning_rate": 8.448979591836736e-06, "loss": 0.0002, "step": 11320 }, { "epoch": 57.80612244897959, "grad_norm": 0.9396018385887146, "learning_rate": 8.438775510204083e-06, "loss": 0.0083, "step": 11330 }, { "epoch": 57.857142857142854, "grad_norm": 0.0639747902750969, "learning_rate": 8.428571428571429e-06, "loss": 0.0034, "step": 11340 }, { "epoch": 57.90816326530612, "grad_norm": 1.4883151054382324, "learning_rate": 8.418367346938776e-06, "loss": 0.001, "step": 11350 }, { "epoch": 57.95918367346939, "grad_norm": 0.0070408182218670845, "learning_rate": 8.408163265306122e-06, "loss": 0.0004, "step": 11360 }, { "epoch": 58.0, "eval_accuracy": 0.9314079422382672, "eval_loss": 0.5161513686180115, "eval_runtime": 1.0919, "eval_samples_per_second": 253.695, "eval_steps_per_second": 32.055, "step": 11368 }, { "epoch": 58.01020408163265, "grad_norm": 0.0072437613271176815, "learning_rate": 8.39795918367347e-06, "loss": 0.095, "step": 11370 }, { "epoch": 58.06122448979592, "grad_norm": 0.008636312559247017, "learning_rate": 8.387755102040817e-06, "loss": 0.0015, "step": 11380 }, { "epoch": 58.11224489795919, "grad_norm": 0.0346841923892498, "learning_rate": 8.377551020408165e-06, "loss": 0.0307, "step": 11390 }, { "epoch": 58.16326530612245, "grad_norm": 0.00649836752563715, "learning_rate": 8.36734693877551e-06, "loss": 0.0197, "step": 11400 }, { "epoch": 58.214285714285715, "grad_norm": 0.05089600756764412, "learning_rate": 8.357142857142858e-06, "loss": 0.0005, "step": 11410 }, { "epoch": 58.265306122448976, "grad_norm": 12.186833381652832, "learning_rate": 8.346938775510205e-06, "loss": 0.0609, "step": 11420 }, { "epoch": 58.316326530612244, "grad_norm": 0.0013033733703196049, "learning_rate": 8.336734693877551e-06, "loss": 0.0497, "step": 11430 }, { "epoch": 58.36734693877551, "grad_norm": 0.0020918340887874365, "learning_rate": 8.326530612244899e-06, "loss": 0.0001, "step": 11440 }, { "epoch": 58.41836734693877, "grad_norm": 0.0003076460852753371, "learning_rate": 8.316326530612246e-06, "loss": 0.0053, "step": 11450 }, { "epoch": 58.46938775510204, "grad_norm": 1.6228913068771362, "learning_rate": 8.306122448979592e-06, "loss": 0.0013, "step": 11460 }, { "epoch": 58.52040816326531, "grad_norm": 3.717069625854492, "learning_rate": 8.29591836734694e-06, "loss": 0.1706, "step": 11470 }, { "epoch": 58.57142857142857, "grad_norm": 0.1767139881849289, "learning_rate": 8.285714285714287e-06, "loss": 0.0102, "step": 11480 }, { "epoch": 58.62244897959184, "grad_norm": 0.0006755400099791586, "learning_rate": 8.275510204081634e-06, "loss": 0.0056, "step": 11490 }, { "epoch": 58.673469387755105, "grad_norm": 0.0008632587851025164, "learning_rate": 8.26530612244898e-06, "loss": 0.0383, "step": 11500 }, { "epoch": 58.724489795918366, "grad_norm": 0.0006241640658117831, "learning_rate": 8.255102040816328e-06, "loss": 0.0004, "step": 11510 }, { "epoch": 58.775510204081634, "grad_norm": 0.00045678409514948726, "learning_rate": 8.244897959183674e-06, "loss": 0.0001, "step": 11520 }, { "epoch": 58.826530612244895, "grad_norm": 0.008961795829236507, "learning_rate": 8.234693877551021e-06, "loss": 0.0001, "step": 11530 }, { "epoch": 58.87755102040816, "grad_norm": 0.0008870940655469894, "learning_rate": 8.224489795918369e-06, "loss": 0.1522, "step": 11540 }, { "epoch": 58.92857142857143, "grad_norm": 0.02654801309108734, "learning_rate": 8.214285714285714e-06, "loss": 0.0487, "step": 11550 }, { "epoch": 58.97959183673469, "grad_norm": 0.0029196005780249834, "learning_rate": 8.204081632653062e-06, "loss": 0.0002, "step": 11560 }, { "epoch": 59.0, "eval_accuracy": 0.9350180505415162, "eval_loss": 0.4513862133026123, "eval_runtime": 1.0878, "eval_samples_per_second": 254.642, "eval_steps_per_second": 32.175, "step": 11564 }, { "epoch": 59.03061224489796, "grad_norm": 0.012186354957520962, "learning_rate": 8.19387755102041e-06, "loss": 0.0013, "step": 11570 }, { "epoch": 59.08163265306123, "grad_norm": 0.002869348507374525, "learning_rate": 8.183673469387757e-06, "loss": 0.0002, "step": 11580 }, { "epoch": 59.13265306122449, "grad_norm": 0.0009631278808228672, "learning_rate": 8.173469387755103e-06, "loss": 0.0004, "step": 11590 }, { "epoch": 59.183673469387756, "grad_norm": 0.0012889080680906773, "learning_rate": 8.16326530612245e-06, "loss": 0.0001, "step": 11600 }, { "epoch": 59.234693877551024, "grad_norm": 0.019055675715208054, "learning_rate": 8.153061224489796e-06, "loss": 0.0006, "step": 11610 }, { "epoch": 59.285714285714285, "grad_norm": 0.0025214364286512136, "learning_rate": 8.142857142857143e-06, "loss": 0.0016, "step": 11620 }, { "epoch": 59.33673469387755, "grad_norm": 0.004679705016314983, "learning_rate": 8.13265306122449e-06, "loss": 0.1477, "step": 11630 }, { "epoch": 59.38775510204081, "grad_norm": 0.0013368760701268911, "learning_rate": 8.122448979591837e-06, "loss": 0.0044, "step": 11640 }, { "epoch": 59.43877551020408, "grad_norm": 0.0008905482245609164, "learning_rate": 8.112244897959184e-06, "loss": 0.0001, "step": 11650 }, { "epoch": 59.48979591836735, "grad_norm": 0.05714056268334389, "learning_rate": 8.102040816326532e-06, "loss": 0.0465, "step": 11660 }, { "epoch": 59.54081632653061, "grad_norm": 0.0014684011694043875, "learning_rate": 8.091836734693879e-06, "loss": 0.0854, "step": 11670 }, { "epoch": 59.59183673469388, "grad_norm": 0.0007029020925983787, "learning_rate": 8.081632653061225e-06, "loss": 0.0625, "step": 11680 }, { "epoch": 59.642857142857146, "grad_norm": 0.0018458092818036675, "learning_rate": 8.071428571428572e-06, "loss": 0.0002, "step": 11690 }, { "epoch": 59.69387755102041, "grad_norm": 0.005803353153169155, "learning_rate": 8.06122448979592e-06, "loss": 0.0001, "step": 11700 }, { "epoch": 59.744897959183675, "grad_norm": 0.0011216498678550124, "learning_rate": 8.051020408163266e-06, "loss": 0.0055, "step": 11710 }, { "epoch": 59.795918367346935, "grad_norm": 0.0014340392081066966, "learning_rate": 8.040816326530613e-06, "loss": 0.0004, "step": 11720 }, { "epoch": 59.8469387755102, "grad_norm": 0.0006265795673243701, "learning_rate": 8.030612244897959e-06, "loss": 0.064, "step": 11730 }, { "epoch": 59.89795918367347, "grad_norm": 0.02402559481561184, "learning_rate": 8.020408163265306e-06, "loss": 0.0002, "step": 11740 }, { "epoch": 59.94897959183673, "grad_norm": 0.0005986476899124682, "learning_rate": 8.010204081632654e-06, "loss": 0.0317, "step": 11750 }, { "epoch": 60.0, "grad_norm": 0.0016358025604858994, "learning_rate": 8.000000000000001e-06, "loss": 0.1463, "step": 11760 }, { "epoch": 60.0, "eval_accuracy": 0.9386281588447654, "eval_loss": 0.47439315915107727, "eval_runtime": 1.0984, "eval_samples_per_second": 252.176, "eval_steps_per_second": 31.863, "step": 11760 }, { "epoch": 60.05102040816327, "grad_norm": 0.0014308940153568983, "learning_rate": 7.989795918367347e-06, "loss": 0.0001, "step": 11770 }, { "epoch": 60.10204081632653, "grad_norm": 0.0009721182868815958, "learning_rate": 7.979591836734695e-06, "loss": 0.0001, "step": 11780 }, { "epoch": 60.1530612244898, "grad_norm": 0.00835713092237711, "learning_rate": 7.969387755102042e-06, "loss": 0.0001, "step": 11790 }, { "epoch": 60.204081632653065, "grad_norm": 0.18029354512691498, "learning_rate": 7.959183673469388e-06, "loss": 0.1026, "step": 11800 }, { "epoch": 60.255102040816325, "grad_norm": 0.004692603833973408, "learning_rate": 7.948979591836735e-06, "loss": 0.0001, "step": 11810 }, { "epoch": 60.30612244897959, "grad_norm": 2.533399820327759, "learning_rate": 7.938775510204081e-06, "loss": 0.0034, "step": 11820 }, { "epoch": 60.357142857142854, "grad_norm": 0.0012155831791460514, "learning_rate": 7.928571428571429e-06, "loss": 0.0002, "step": 11830 }, { "epoch": 60.40816326530612, "grad_norm": 0.005300146993249655, "learning_rate": 7.918367346938776e-06, "loss": 0.054, "step": 11840 }, { "epoch": 60.45918367346939, "grad_norm": 0.001332037034444511, "learning_rate": 7.908163265306124e-06, "loss": 0.1344, "step": 11850 }, { "epoch": 60.51020408163265, "grad_norm": 0.003332866821438074, "learning_rate": 7.897959183673471e-06, "loss": 0.1152, "step": 11860 }, { "epoch": 60.56122448979592, "grad_norm": 0.034427445381879807, "learning_rate": 7.887755102040817e-06, "loss": 0.0005, "step": 11870 }, { "epoch": 60.61224489795919, "grad_norm": 18.173721313476562, "learning_rate": 7.877551020408164e-06, "loss": 0.0221, "step": 11880 }, { "epoch": 60.66326530612245, "grad_norm": 0.0013922007055953145, "learning_rate": 7.86734693877551e-06, "loss": 0.0004, "step": 11890 }, { "epoch": 60.714285714285715, "grad_norm": 0.0025625822599977255, "learning_rate": 7.857142857142858e-06, "loss": 0.0661, "step": 11900 }, { "epoch": 60.765306122448976, "grad_norm": 11.145613670349121, "learning_rate": 7.846938775510205e-06, "loss": 0.019, "step": 11910 }, { "epoch": 60.816326530612244, "grad_norm": 0.0010683988220989704, "learning_rate": 7.836734693877551e-06, "loss": 0.0004, "step": 11920 }, { "epoch": 60.86734693877551, "grad_norm": 0.0038273765239864588, "learning_rate": 7.826530612244898e-06, "loss": 0.0017, "step": 11930 }, { "epoch": 60.91836734693877, "grad_norm": 0.004336085170507431, "learning_rate": 7.816326530612246e-06, "loss": 0.0002, "step": 11940 }, { "epoch": 60.96938775510204, "grad_norm": 0.010450436733663082, "learning_rate": 7.806122448979593e-06, "loss": 0.0001, "step": 11950 }, { "epoch": 61.0, "eval_accuracy": 0.9314079422382672, "eval_loss": 0.5337720513343811, "eval_runtime": 1.0888, "eval_samples_per_second": 254.419, "eval_steps_per_second": 32.147, "step": 11956 }, { "epoch": 61.02040816326531, "grad_norm": 0.001237039570696652, "learning_rate": 7.79591836734694e-06, "loss": 0.0001, "step": 11960 }, { "epoch": 61.07142857142857, "grad_norm": 0.0008212309912778437, "learning_rate": 7.785714285714287e-06, "loss": 0.0011, "step": 11970 }, { "epoch": 61.12244897959184, "grad_norm": 0.0006884504109621048, "learning_rate": 7.775510204081632e-06, "loss": 0.0001, "step": 11980 }, { "epoch": 61.173469387755105, "grad_norm": 0.015512478537857533, "learning_rate": 7.76530612244898e-06, "loss": 0.1007, "step": 11990 }, { "epoch": 61.224489795918366, "grad_norm": 0.2063184529542923, "learning_rate": 7.755102040816327e-06, "loss": 0.0184, "step": 12000 }, { "epoch": 61.275510204081634, "grad_norm": 42.287010192871094, "learning_rate": 7.744897959183673e-06, "loss": 0.1516, "step": 12010 }, { "epoch": 61.326530612244895, "grad_norm": 0.0006978824967518449, "learning_rate": 7.73469387755102e-06, "loss": 0.0004, "step": 12020 }, { "epoch": 61.37755102040816, "grad_norm": 0.00962745863944292, "learning_rate": 7.724489795918368e-06, "loss": 0.0075, "step": 12030 }, { "epoch": 61.42857142857143, "grad_norm": 0.0013585771666839719, "learning_rate": 7.714285714285716e-06, "loss": 0.1473, "step": 12040 }, { "epoch": 61.47959183673469, "grad_norm": 0.0005578490090556443, "learning_rate": 7.704081632653061e-06, "loss": 0.0001, "step": 12050 }, { "epoch": 61.53061224489796, "grad_norm": 0.020713916048407555, "learning_rate": 7.693877551020409e-06, "loss": 0.1126, "step": 12060 }, { "epoch": 61.58163265306123, "grad_norm": 0.028666259720921516, "learning_rate": 7.683673469387756e-06, "loss": 0.061, "step": 12070 }, { "epoch": 61.63265306122449, "grad_norm": 0.0044141653925180435, "learning_rate": 7.673469387755102e-06, "loss": 0.0001, "step": 12080 }, { "epoch": 61.683673469387756, "grad_norm": 0.02224322222173214, "learning_rate": 7.66326530612245e-06, "loss": 0.0636, "step": 12090 }, { "epoch": 61.734693877551024, "grad_norm": 0.45546767115592957, "learning_rate": 7.653061224489796e-06, "loss": 0.0004, "step": 12100 }, { "epoch": 61.785714285714285, "grad_norm": 0.015211665071547031, "learning_rate": 7.642857142857143e-06, "loss": 0.0004, "step": 12110 }, { "epoch": 61.83673469387755, "grad_norm": 0.025683023035526276, "learning_rate": 7.63265306122449e-06, "loss": 0.0003, "step": 12120 }, { "epoch": 61.88775510204081, "grad_norm": 0.0011293612187728286, "learning_rate": 7.622448979591838e-06, "loss": 0.0025, "step": 12130 }, { "epoch": 61.93877551020408, "grad_norm": 0.002530501689761877, "learning_rate": 7.612244897959185e-06, "loss": 0.0001, "step": 12140 }, { "epoch": 61.98979591836735, "grad_norm": 1.2625882625579834, "learning_rate": 7.602040816326531e-06, "loss": 0.0006, "step": 12150 }, { "epoch": 62.0, "eval_accuracy": 0.927797833935018, "eval_loss": 0.5788304805755615, "eval_runtime": 1.0955, "eval_samples_per_second": 252.863, "eval_steps_per_second": 31.95, "step": 12152 }, { "epoch": 62.04081632653061, "grad_norm": 0.0012302438262850046, "learning_rate": 7.591836734693878e-06, "loss": 0.0001, "step": 12160 }, { "epoch": 62.09183673469388, "grad_norm": 0.0002833695325534791, "learning_rate": 7.581632653061225e-06, "loss": 0.0001, "step": 12170 }, { "epoch": 62.142857142857146, "grad_norm": 5.891306400299072, "learning_rate": 7.571428571428572e-06, "loss": 0.1115, "step": 12180 }, { "epoch": 62.19387755102041, "grad_norm": 0.0024651968851685524, "learning_rate": 7.561224489795919e-06, "loss": 0.0061, "step": 12190 }, { "epoch": 62.244897959183675, "grad_norm": 0.056724824011325836, "learning_rate": 7.551020408163265e-06, "loss": 0.0002, "step": 12200 }, { "epoch": 62.295918367346935, "grad_norm": 0.0012926298659294844, "learning_rate": 7.540816326530614e-06, "loss": 0.0001, "step": 12210 }, { "epoch": 62.3469387755102, "grad_norm": 0.0532744824886322, "learning_rate": 7.53061224489796e-06, "loss": 0.0547, "step": 12220 }, { "epoch": 62.39795918367347, "grad_norm": 0.003796725533902645, "learning_rate": 7.520408163265307e-06, "loss": 0.0095, "step": 12230 }, { "epoch": 62.44897959183673, "grad_norm": 0.09812874346971512, "learning_rate": 7.5102040816326536e-06, "loss": 0.0654, "step": 12240 }, { "epoch": 62.5, "grad_norm": 0.005811839830130339, "learning_rate": 7.500000000000001e-06, "loss": 0.0002, "step": 12250 }, { "epoch": 62.55102040816327, "grad_norm": 0.0004320720618125051, "learning_rate": 7.489795918367348e-06, "loss": 0.0045, "step": 12260 }, { "epoch": 62.60204081632653, "grad_norm": 0.027087802067399025, "learning_rate": 7.479591836734694e-06, "loss": 0.0551, "step": 12270 }, { "epoch": 62.6530612244898, "grad_norm": 0.0022713462822139263, "learning_rate": 7.469387755102041e-06, "loss": 0.0089, "step": 12280 }, { "epoch": 62.704081632653065, "grad_norm": 0.22178992629051208, "learning_rate": 7.459183673469388e-06, "loss": 0.0004, "step": 12290 }, { "epoch": 62.755102040816325, "grad_norm": 0.0014946759911254048, "learning_rate": 7.448979591836736e-06, "loss": 0.0001, "step": 12300 }, { "epoch": 62.80612244897959, "grad_norm": 0.0007464837981387973, "learning_rate": 7.4387755102040826e-06, "loss": 0.0003, "step": 12310 }, { "epoch": 62.857142857142854, "grad_norm": 0.0022405448835343122, "learning_rate": 7.428571428571429e-06, "loss": 0.1118, "step": 12320 }, { "epoch": 62.90816326530612, "grad_norm": 1.9994844198226929, "learning_rate": 7.418367346938776e-06, "loss": 0.0006, "step": 12330 }, { "epoch": 62.95918367346939, "grad_norm": 0.0004937660414725542, "learning_rate": 7.408163265306123e-06, "loss": 0.0269, "step": 12340 }, { "epoch": 63.0, "eval_accuracy": 0.927797833935018, "eval_loss": 0.5499926209449768, "eval_runtime": 1.08, "eval_samples_per_second": 256.48, "eval_steps_per_second": 32.407, "step": 12348 }, { "epoch": 63.01020408163265, "grad_norm": 0.001896741334348917, "learning_rate": 7.39795918367347e-06, "loss": 0.0383, "step": 12350 }, { "epoch": 63.06122448979592, "grad_norm": 0.0009329813183285296, "learning_rate": 7.387755102040817e-06, "loss": 0.0002, "step": 12360 }, { "epoch": 63.11224489795919, "grad_norm": 0.014004690572619438, "learning_rate": 7.377551020408163e-06, "loss": 0.0581, "step": 12370 }, { "epoch": 63.16326530612245, "grad_norm": 0.010309996083378792, "learning_rate": 7.367346938775511e-06, "loss": 0.0, "step": 12380 }, { "epoch": 63.214285714285715, "grad_norm": 34.72095489501953, "learning_rate": 7.357142857142858e-06, "loss": 0.0131, "step": 12390 }, { "epoch": 63.265306122448976, "grad_norm": 0.0009893177775666118, "learning_rate": 7.346938775510205e-06, "loss": 0.0002, "step": 12400 }, { "epoch": 63.316326530612244, "grad_norm": 0.0030334554612636566, "learning_rate": 7.3367346938775515e-06, "loss": 0.0001, "step": 12410 }, { "epoch": 63.36734693877551, "grad_norm": 0.037118230015039444, "learning_rate": 7.326530612244899e-06, "loss": 0.0004, "step": 12420 }, { "epoch": 63.41836734693877, "grad_norm": 41.940547943115234, "learning_rate": 7.316326530612246e-06, "loss": 0.0616, "step": 12430 }, { "epoch": 63.46938775510204, "grad_norm": 0.08743254095315933, "learning_rate": 7.306122448979592e-06, "loss": 0.0008, "step": 12440 }, { "epoch": 63.52040816326531, "grad_norm": 0.0003081340400967747, "learning_rate": 7.295918367346939e-06, "loss": 0.0384, "step": 12450 }, { "epoch": 63.57142857142857, "grad_norm": 0.0018472468946129084, "learning_rate": 7.285714285714286e-06, "loss": 0.0033, "step": 12460 }, { "epoch": 63.62244897959184, "grad_norm": 0.0005271555273793638, "learning_rate": 7.275510204081633e-06, "loss": 0.043, "step": 12470 }, { "epoch": 63.673469387755105, "grad_norm": 0.003033218439668417, "learning_rate": 7.2653061224489805e-06, "loss": 0.0005, "step": 12480 }, { "epoch": 63.724489795918366, "grad_norm": 0.004401716403663158, "learning_rate": 7.255102040816327e-06, "loss": 0.0001, "step": 12490 }, { "epoch": 63.775510204081634, "grad_norm": 0.004580669570714235, "learning_rate": 7.244897959183675e-06, "loss": 0.1382, "step": 12500 }, { "epoch": 63.826530612244895, "grad_norm": 0.0015654556918889284, "learning_rate": 7.234693877551021e-06, "loss": 0.0249, "step": 12510 }, { "epoch": 63.87755102040816, "grad_norm": 0.004439481068402529, "learning_rate": 7.224489795918368e-06, "loss": 0.0791, "step": 12520 }, { "epoch": 63.92857142857143, "grad_norm": 11.162091255187988, "learning_rate": 7.2142857142857145e-06, "loss": 0.1252, "step": 12530 }, { "epoch": 63.97959183673469, "grad_norm": 50.49294662475586, "learning_rate": 7.204081632653061e-06, "loss": 0.1, "step": 12540 }, { "epoch": 64.0, "eval_accuracy": 0.9205776173285198, "eval_loss": 0.6466786861419678, "eval_runtime": 1.0788, "eval_samples_per_second": 256.758, "eval_steps_per_second": 32.442, "step": 12544 }, { "epoch": 64.03061224489795, "grad_norm": 0.3174396753311157, "learning_rate": 7.193877551020409e-06, "loss": 0.0002, "step": 12550 }, { "epoch": 64.08163265306122, "grad_norm": 24.110004425048828, "learning_rate": 7.183673469387755e-06, "loss": 0.0124, "step": 12560 }, { "epoch": 64.13265306122449, "grad_norm": 0.0022637853398919106, "learning_rate": 7.173469387755103e-06, "loss": 0.0045, "step": 12570 }, { "epoch": 64.18367346938776, "grad_norm": 12.9771146774292, "learning_rate": 7.16326530612245e-06, "loss": 0.0095, "step": 12580 }, { "epoch": 64.23469387755102, "grad_norm": 0.001837348099797964, "learning_rate": 7.153061224489797e-06, "loss": 0.0187, "step": 12590 }, { "epoch": 64.28571428571429, "grad_norm": 0.032073747366666794, "learning_rate": 7.1428571428571436e-06, "loss": 0.1076, "step": 12600 }, { "epoch": 64.33673469387755, "grad_norm": 0.0014201452722772956, "learning_rate": 7.13265306122449e-06, "loss": 0.0062, "step": 12610 }, { "epoch": 64.38775510204081, "grad_norm": 0.0007771974778734148, "learning_rate": 7.122448979591837e-06, "loss": 0.0093, "step": 12620 }, { "epoch": 64.43877551020408, "grad_norm": 0.001709192874841392, "learning_rate": 7.112244897959184e-06, "loss": 0.0154, "step": 12630 }, { "epoch": 64.48979591836735, "grad_norm": 0.0027038834523409605, "learning_rate": 7.102040816326531e-06, "loss": 0.1222, "step": 12640 }, { "epoch": 64.54081632653062, "grad_norm": 0.0018568458035588264, "learning_rate": 7.091836734693878e-06, "loss": 0.0819, "step": 12650 }, { "epoch": 64.59183673469387, "grad_norm": 0.03192492946982384, "learning_rate": 7.081632653061226e-06, "loss": 0.0015, "step": 12660 }, { "epoch": 64.64285714285714, "grad_norm": 0.0014545799931511283, "learning_rate": 7.0714285714285726e-06, "loss": 0.0001, "step": 12670 }, { "epoch": 64.6938775510204, "grad_norm": 0.03471424803137779, "learning_rate": 7.061224489795919e-06, "loss": 0.0039, "step": 12680 }, { "epoch": 64.74489795918367, "grad_norm": 0.5363325476646423, "learning_rate": 7.051020408163266e-06, "loss": 0.0839, "step": 12690 }, { "epoch": 64.79591836734694, "grad_norm": 0.14592291414737701, "learning_rate": 7.0408163265306125e-06, "loss": 0.0001, "step": 12700 }, { "epoch": 64.84693877551021, "grad_norm": 0.002070327987894416, "learning_rate": 7.03061224489796e-06, "loss": 0.0001, "step": 12710 }, { "epoch": 64.89795918367346, "grad_norm": 0.0016950189601629972, "learning_rate": 7.020408163265307e-06, "loss": 0.0631, "step": 12720 }, { "epoch": 64.94897959183673, "grad_norm": 0.07179731130599976, "learning_rate": 7.010204081632653e-06, "loss": 0.007, "step": 12730 }, { "epoch": 65.0, "grad_norm": 0.0011688038939610124, "learning_rate": 7e-06, "loss": 0.0004, "step": 12740 }, { "epoch": 65.0, "eval_accuracy": 0.924187725631769, "eval_loss": 0.5827694535255432, "eval_runtime": 1.0938, "eval_samples_per_second": 253.238, "eval_steps_per_second": 31.998, "step": 12740 }, { "epoch": 65.05102040816327, "grad_norm": 0.0011357966577634215, "learning_rate": 6.989795918367348e-06, "loss": 0.06, "step": 12750 }, { "epoch": 65.10204081632654, "grad_norm": 0.0015703201061114669, "learning_rate": 6.979591836734695e-06, "loss": 0.0, "step": 12760 }, { "epoch": 65.15306122448979, "grad_norm": 0.126028373837471, "learning_rate": 6.9693877551020415e-06, "loss": 0.0009, "step": 12770 }, { "epoch": 65.20408163265306, "grad_norm": 0.003637620247900486, "learning_rate": 6.959183673469388e-06, "loss": 0.0007, "step": 12780 }, { "epoch": 65.25510204081633, "grad_norm": 0.001561914337798953, "learning_rate": 6.948979591836736e-06, "loss": 0.0002, "step": 12790 }, { "epoch": 65.3061224489796, "grad_norm": 0.001797153614461422, "learning_rate": 6.938775510204082e-06, "loss": 0.0002, "step": 12800 }, { "epoch": 65.35714285714286, "grad_norm": 0.0017791077261790633, "learning_rate": 6.928571428571429e-06, "loss": 0.0106, "step": 12810 }, { "epoch": 65.40816326530613, "grad_norm": 0.0006706606945954263, "learning_rate": 6.9183673469387755e-06, "loss": 0.0004, "step": 12820 }, { "epoch": 65.45918367346938, "grad_norm": 0.29097989201545715, "learning_rate": 6.908163265306122e-06, "loss": 0.0009, "step": 12830 }, { "epoch": 65.51020408163265, "grad_norm": 0.096458800137043, "learning_rate": 6.8979591836734705e-06, "loss": 0.0001, "step": 12840 }, { "epoch": 65.56122448979592, "grad_norm": 0.0008508774917572737, "learning_rate": 6.887755102040817e-06, "loss": 0.0001, "step": 12850 }, { "epoch": 65.61224489795919, "grad_norm": 0.033212557435035706, "learning_rate": 6.877551020408164e-06, "loss": 0.0021, "step": 12860 }, { "epoch": 65.66326530612245, "grad_norm": 7.343283653259277, "learning_rate": 6.867346938775511e-06, "loss": 0.0042, "step": 12870 }, { "epoch": 65.71428571428571, "grad_norm": 0.00293795601464808, "learning_rate": 6.857142857142858e-06, "loss": 0.001, "step": 12880 }, { "epoch": 65.76530612244898, "grad_norm": 0.04084068536758423, "learning_rate": 6.8469387755102046e-06, "loss": 0.018, "step": 12890 }, { "epoch": 65.81632653061224, "grad_norm": 0.0005856331554241478, "learning_rate": 6.836734693877551e-06, "loss": 0.0003, "step": 12900 }, { "epoch": 65.86734693877551, "grad_norm": 31.835289001464844, "learning_rate": 6.826530612244898e-06, "loss": 0.1251, "step": 12910 }, { "epoch": 65.91836734693878, "grad_norm": 0.0050123403780162334, "learning_rate": 6.816326530612245e-06, "loss": 0.1534, "step": 12920 }, { "epoch": 65.96938775510205, "grad_norm": 0.0017807442927733064, "learning_rate": 6.806122448979592e-06, "loss": 0.0001, "step": 12930 }, { "epoch": 66.0, "eval_accuracy": 0.9314079422382672, "eval_loss": 0.5283100008964539, "eval_runtime": 1.0859, "eval_samples_per_second": 255.094, "eval_steps_per_second": 32.232, "step": 12936 }, { "epoch": 66.0204081632653, "grad_norm": 0.10015212744474411, "learning_rate": 6.7959183673469394e-06, "loss": 0.0002, "step": 12940 }, { "epoch": 66.07142857142857, "grad_norm": 0.019825097173452377, "learning_rate": 6.785714285714287e-06, "loss": 0.0092, "step": 12950 }, { "epoch": 66.12244897959184, "grad_norm": 0.0005336967296898365, "learning_rate": 6.7755102040816336e-06, "loss": 0.0003, "step": 12960 }, { "epoch": 66.1734693877551, "grad_norm": 0.002440831158310175, "learning_rate": 6.76530612244898e-06, "loss": 0.0002, "step": 12970 }, { "epoch": 66.22448979591837, "grad_norm": 0.0005563469021581113, "learning_rate": 6.755102040816327e-06, "loss": 0.014, "step": 12980 }, { "epoch": 66.27551020408163, "grad_norm": 0.0009195200400426984, "learning_rate": 6.7448979591836735e-06, "loss": 0.0008, "step": 12990 }, { "epoch": 66.3265306122449, "grad_norm": 9.574272155761719, "learning_rate": 6.734693877551021e-06, "loss": 0.0465, "step": 13000 }, { "epoch": 66.37755102040816, "grad_norm": 0.003237582743167877, "learning_rate": 6.724489795918368e-06, "loss": 0.06, "step": 13010 }, { "epoch": 66.42857142857143, "grad_norm": 0.013962351717054844, "learning_rate": 6.714285714285714e-06, "loss": 0.0001, "step": 13020 }, { "epoch": 66.4795918367347, "grad_norm": 0.008406028151512146, "learning_rate": 6.704081632653063e-06, "loss": 0.0011, "step": 13030 }, { "epoch": 66.53061224489795, "grad_norm": 0.0006518092122860253, "learning_rate": 6.693877551020409e-06, "loss": 0.0001, "step": 13040 }, { "epoch": 66.58163265306122, "grad_norm": 0.012106046080589294, "learning_rate": 6.683673469387756e-06, "loss": 0.0068, "step": 13050 }, { "epoch": 66.63265306122449, "grad_norm": 0.07244808971881866, "learning_rate": 6.6734693877551025e-06, "loss": 0.1103, "step": 13060 }, { "epoch": 66.68367346938776, "grad_norm": 0.019703442230820656, "learning_rate": 6.663265306122449e-06, "loss": 0.0001, "step": 13070 }, { "epoch": 66.73469387755102, "grad_norm": 18.24056053161621, "learning_rate": 6.653061224489797e-06, "loss": 0.0139, "step": 13080 }, { "epoch": 66.78571428571429, "grad_norm": 0.000664177758153528, "learning_rate": 6.642857142857143e-06, "loss": 0.0071, "step": 13090 }, { "epoch": 66.83673469387755, "grad_norm": 0.0017546509625390172, "learning_rate": 6.63265306122449e-06, "loss": 0.0026, "step": 13100 }, { "epoch": 66.88775510204081, "grad_norm": 0.0007789704250171781, "learning_rate": 6.6224489795918365e-06, "loss": 0.0172, "step": 13110 }, { "epoch": 66.93877551020408, "grad_norm": 0.03885313495993614, "learning_rate": 6.612244897959185e-06, "loss": 0.0015, "step": 13120 }, { "epoch": 66.98979591836735, "grad_norm": 0.0009417658438906074, "learning_rate": 6.6020408163265315e-06, "loss": 0.0001, "step": 13130 }, { "epoch": 67.0, "eval_accuracy": 0.9205776173285198, "eval_loss": 0.6211773753166199, "eval_runtime": 1.0788, "eval_samples_per_second": 256.776, "eval_steps_per_second": 32.445, "step": 13132 }, { "epoch": 67.04081632653062, "grad_norm": 3.2461624145507812, "learning_rate": 6.591836734693878e-06, "loss": 0.0017, "step": 13140 }, { "epoch": 67.09183673469387, "grad_norm": 0.009441860020160675, "learning_rate": 6.581632653061225e-06, "loss": 0.1559, "step": 13150 }, { "epoch": 67.14285714285714, "grad_norm": 0.0004475672612898052, "learning_rate": 6.571428571428572e-06, "loss": 0.0019, "step": 13160 }, { "epoch": 67.1938775510204, "grad_norm": 25.655916213989258, "learning_rate": 6.561224489795919e-06, "loss": 0.0247, "step": 13170 }, { "epoch": 67.24489795918367, "grad_norm": 0.002604785840958357, "learning_rate": 6.5510204081632656e-06, "loss": 0.0, "step": 13180 }, { "epoch": 67.29591836734694, "grad_norm": 0.007296546828001738, "learning_rate": 6.540816326530612e-06, "loss": 0.1148, "step": 13190 }, { "epoch": 67.34693877551021, "grad_norm": 0.0016531578730791807, "learning_rate": 6.530612244897959e-06, "loss": 0.0003, "step": 13200 }, { "epoch": 67.39795918367346, "grad_norm": 0.031078562140464783, "learning_rate": 6.520408163265307e-06, "loss": 0.0001, "step": 13210 }, { "epoch": 67.44897959183673, "grad_norm": 0.0008519267430528998, "learning_rate": 6.510204081632654e-06, "loss": 0.0002, "step": 13220 }, { "epoch": 67.5, "grad_norm": 0.0006317342049442232, "learning_rate": 6.5000000000000004e-06, "loss": 0.0016, "step": 13230 }, { "epoch": 67.55102040816327, "grad_norm": 0.001069535850547254, "learning_rate": 6.489795918367348e-06, "loss": 0.0013, "step": 13240 }, { "epoch": 67.60204081632654, "grad_norm": 27.43927764892578, "learning_rate": 6.4795918367346946e-06, "loss": 0.048, "step": 13250 }, { "epoch": 67.65306122448979, "grad_norm": 0.0031886622309684753, "learning_rate": 6.469387755102041e-06, "loss": 0.0001, "step": 13260 }, { "epoch": 67.70408163265306, "grad_norm": 0.0076958192512393, "learning_rate": 6.459183673469388e-06, "loss": 0.0244, "step": 13270 }, { "epoch": 67.75510204081633, "grad_norm": 13.080926895141602, "learning_rate": 6.4489795918367345e-06, "loss": 0.1058, "step": 13280 }, { "epoch": 67.8061224489796, "grad_norm": 0.003534428309649229, "learning_rate": 6.438775510204082e-06, "loss": 0.1156, "step": 13290 }, { "epoch": 67.85714285714286, "grad_norm": 0.0006483325851149857, "learning_rate": 6.4285714285714295e-06, "loss": 0.0806, "step": 13300 }, { "epoch": 67.90816326530613, "grad_norm": 0.007327963598072529, "learning_rate": 6.418367346938776e-06, "loss": 0.0001, "step": 13310 }, { "epoch": 67.95918367346938, "grad_norm": 0.01653578132390976, "learning_rate": 6.408163265306124e-06, "loss": 0.0002, "step": 13320 }, { "epoch": 68.0, "eval_accuracy": 0.924187725631769, "eval_loss": 0.49729979038238525, "eval_runtime": 1.0826, "eval_samples_per_second": 255.87, "eval_steps_per_second": 32.33, "step": 13328 }, { "epoch": 68.01020408163265, "grad_norm": 0.0015546072972938418, "learning_rate": 6.39795918367347e-06, "loss": 0.0684, "step": 13330 }, { "epoch": 68.06122448979592, "grad_norm": 0.16754399240016937, "learning_rate": 6.387755102040817e-06, "loss": 0.0904, "step": 13340 }, { "epoch": 68.11224489795919, "grad_norm": 0.5623511075973511, "learning_rate": 6.3775510204081635e-06, "loss": 0.0973, "step": 13350 }, { "epoch": 68.16326530612245, "grad_norm": 0.017039529979228973, "learning_rate": 6.36734693877551e-06, "loss": 0.1459, "step": 13360 }, { "epoch": 68.21428571428571, "grad_norm": 0.03762345016002655, "learning_rate": 6.357142857142858e-06, "loss": 0.0007, "step": 13370 }, { "epoch": 68.26530612244898, "grad_norm": 0.002080440754070878, "learning_rate": 6.346938775510204e-06, "loss": 0.0001, "step": 13380 }, { "epoch": 68.31632653061224, "grad_norm": 0.0007440683548338711, "learning_rate": 6.336734693877552e-06, "loss": 0.0854, "step": 13390 }, { "epoch": 68.36734693877551, "grad_norm": 0.17591404914855957, "learning_rate": 6.326530612244899e-06, "loss": 0.0044, "step": 13400 }, { "epoch": 68.41836734693878, "grad_norm": 0.0015963518526405096, "learning_rate": 6.316326530612246e-06, "loss": 0.0002, "step": 13410 }, { "epoch": 68.46938775510205, "grad_norm": 2.2939441204071045, "learning_rate": 6.3061224489795925e-06, "loss": 0.0007, "step": 13420 }, { "epoch": 68.5204081632653, "grad_norm": 38.51103973388672, "learning_rate": 6.295918367346939e-06, "loss": 0.0345, "step": 13430 }, { "epoch": 68.57142857142857, "grad_norm": 0.17458097636699677, "learning_rate": 6.285714285714286e-06, "loss": 0.0563, "step": 13440 }, { "epoch": 68.62244897959184, "grad_norm": 0.5256828665733337, "learning_rate": 6.275510204081633e-06, "loss": 0.0141, "step": 13450 }, { "epoch": 68.6734693877551, "grad_norm": 0.0002658106677699834, "learning_rate": 6.26530612244898e-06, "loss": 0.0042, "step": 13460 }, { "epoch": 68.72448979591837, "grad_norm": 0.0024421957787126303, "learning_rate": 6.2551020408163266e-06, "loss": 0.0444, "step": 13470 }, { "epoch": 68.77551020408163, "grad_norm": 0.0006208324339240789, "learning_rate": 6.244897959183675e-06, "loss": 0.02, "step": 13480 }, { "epoch": 68.8265306122449, "grad_norm": 0.0017610067734494805, "learning_rate": 6.2346938775510215e-06, "loss": 0.0133, "step": 13490 }, { "epoch": 68.87755102040816, "grad_norm": 0.0021017007529735565, "learning_rate": 6.224489795918368e-06, "loss": 0.023, "step": 13500 }, { "epoch": 68.92857142857143, "grad_norm": 0.0007662806310690939, "learning_rate": 6.214285714285715e-06, "loss": 0.0001, "step": 13510 }, { "epoch": 68.9795918367347, "grad_norm": 0.0003395264793653041, "learning_rate": 6.2040816326530614e-06, "loss": 0.0058, "step": 13520 }, { "epoch": 69.0, "eval_accuracy": 0.927797833935018, "eval_loss": 0.5021236538887024, "eval_runtime": 1.085, "eval_samples_per_second": 255.302, "eval_steps_per_second": 32.258, "step": 13524 }, { "epoch": 69.03061224489795, "grad_norm": 0.0007607376319356263, "learning_rate": 6.193877551020409e-06, "loss": 0.0055, "step": 13530 }, { "epoch": 69.08163265306122, "grad_norm": 0.007833562791347504, "learning_rate": 6.1836734693877556e-06, "loss": 0.0006, "step": 13540 }, { "epoch": 69.13265306122449, "grad_norm": 0.0008038674714043736, "learning_rate": 6.173469387755102e-06, "loss": 0.0002, "step": 13550 }, { "epoch": 69.18367346938776, "grad_norm": 0.003031996078789234, "learning_rate": 6.163265306122449e-06, "loss": 0.0001, "step": 13560 }, { "epoch": 69.23469387755102, "grad_norm": 0.008537117391824722, "learning_rate": 6.153061224489797e-06, "loss": 0.0011, "step": 13570 }, { "epoch": 69.28571428571429, "grad_norm": 3.0747671127319336, "learning_rate": 6.142857142857144e-06, "loss": 0.0021, "step": 13580 }, { "epoch": 69.33673469387755, "grad_norm": 1.774642825126648, "learning_rate": 6.1326530612244905e-06, "loss": 0.0039, "step": 13590 }, { "epoch": 69.38775510204081, "grad_norm": 0.0007778594736009836, "learning_rate": 6.122448979591837e-06, "loss": 0.0521, "step": 13600 }, { "epoch": 69.43877551020408, "grad_norm": 0.0017788363620638847, "learning_rate": 6.112244897959185e-06, "loss": 0.0786, "step": 13610 }, { "epoch": 69.48979591836735, "grad_norm": 0.0006191099528223276, "learning_rate": 6.102040816326531e-06, "loss": 0.0001, "step": 13620 }, { "epoch": 69.54081632653062, "grad_norm": 0.0005208913935348392, "learning_rate": 6.091836734693878e-06, "loss": 0.0002, "step": 13630 }, { "epoch": 69.59183673469387, "grad_norm": 0.0005669071688316762, "learning_rate": 6.0816326530612245e-06, "loss": 0.0634, "step": 13640 }, { "epoch": 69.64285714285714, "grad_norm": 0.00409251032397151, "learning_rate": 6.071428571428571e-06, "loss": 0.0728, "step": 13650 }, { "epoch": 69.6938775510204, "grad_norm": 0.00046774715883657336, "learning_rate": 6.0612244897959195e-06, "loss": 0.0001, "step": 13660 }, { "epoch": 69.74489795918367, "grad_norm": 0.002229816047474742, "learning_rate": 6.051020408163266e-06, "loss": 0.0004, "step": 13670 }, { "epoch": 69.79591836734694, "grad_norm": 0.0006731147295795381, "learning_rate": 6.040816326530613e-06, "loss": 0.001, "step": 13680 }, { "epoch": 69.84693877551021, "grad_norm": 0.0006926387432031333, "learning_rate": 6.03061224489796e-06, "loss": 0.1095, "step": 13690 }, { "epoch": 69.89795918367346, "grad_norm": 0.0005957155954092741, "learning_rate": 6.020408163265307e-06, "loss": 0.0003, "step": 13700 }, { "epoch": 69.94897959183673, "grad_norm": 11.699542045593262, "learning_rate": 6.0102040816326535e-06, "loss": 0.0068, "step": 13710 }, { "epoch": 70.0, "grad_norm": 0.001246853731572628, "learning_rate": 6e-06, "loss": 0.0605, "step": 13720 }, { "epoch": 70.0, "eval_accuracy": 0.9169675090252708, "eval_loss": 0.6981845498085022, "eval_runtime": 1.0835, "eval_samples_per_second": 255.646, "eval_steps_per_second": 32.302, "step": 13720 }, { "epoch": 70.05102040816327, "grad_norm": 0.01808277890086174, "learning_rate": 5.989795918367347e-06, "loss": 0.0014, "step": 13730 }, { "epoch": 70.10204081632654, "grad_norm": 0.0016726941103115678, "learning_rate": 5.979591836734694e-06, "loss": 0.0135, "step": 13740 }, { "epoch": 70.15306122448979, "grad_norm": 0.0031059379689395428, "learning_rate": 5.969387755102042e-06, "loss": 0.0166, "step": 13750 }, { "epoch": 70.20408163265306, "grad_norm": 0.0014469457091763616, "learning_rate": 5.959183673469388e-06, "loss": 0.0001, "step": 13760 }, { "epoch": 70.25510204081633, "grad_norm": 0.0009005493484437466, "learning_rate": 5.948979591836735e-06, "loss": 0.1097, "step": 13770 }, { "epoch": 70.3061224489796, "grad_norm": 0.002801652532070875, "learning_rate": 5.9387755102040825e-06, "loss": 0.069, "step": 13780 }, { "epoch": 70.35714285714286, "grad_norm": 0.0030414480715990067, "learning_rate": 5.928571428571429e-06, "loss": 0.1269, "step": 13790 }, { "epoch": 70.40816326530613, "grad_norm": 0.0041852472350001335, "learning_rate": 5.918367346938776e-06, "loss": 0.0154, "step": 13800 }, { "epoch": 70.45918367346938, "grad_norm": 0.000529597164131701, "learning_rate": 5.9081632653061224e-06, "loss": 0.0014, "step": 13810 }, { "epoch": 70.51020408163265, "grad_norm": 0.0010508056730031967, "learning_rate": 5.89795918367347e-06, "loss": 0.1232, "step": 13820 }, { "epoch": 70.56122448979592, "grad_norm": 0.0025549293495714664, "learning_rate": 5.8877551020408166e-06, "loss": 0.0001, "step": 13830 }, { "epoch": 70.61224489795919, "grad_norm": 0.0022436375729739666, "learning_rate": 5.877551020408164e-06, "loss": 0.0098, "step": 13840 }, { "epoch": 70.66326530612245, "grad_norm": 0.001773953321389854, "learning_rate": 5.867346938775511e-06, "loss": 0.0, "step": 13850 }, { "epoch": 70.71428571428571, "grad_norm": 0.004221509210765362, "learning_rate": 5.857142857142858e-06, "loss": 0.0001, "step": 13860 }, { "epoch": 70.76530612244898, "grad_norm": 0.003939969930797815, "learning_rate": 5.846938775510205e-06, "loss": 0.0001, "step": 13870 }, { "epoch": 70.81632653061224, "grad_norm": 0.0014852830208837986, "learning_rate": 5.8367346938775515e-06, "loss": 0.01, "step": 13880 }, { "epoch": 70.86734693877551, "grad_norm": 0.25884443521499634, "learning_rate": 5.826530612244898e-06, "loss": 0.0068, "step": 13890 }, { "epoch": 70.91836734693878, "grad_norm": 0.0034037751611322165, "learning_rate": 5.816326530612246e-06, "loss": 0.0002, "step": 13900 }, { "epoch": 70.96938775510205, "grad_norm": 0.03086518682539463, "learning_rate": 5.806122448979592e-06, "loss": 0.0006, "step": 13910 }, { "epoch": 71.0, "eval_accuracy": 0.9350180505415162, "eval_loss": 0.46024465560913086, "eval_runtime": 1.0802, "eval_samples_per_second": 256.438, "eval_steps_per_second": 32.402, "step": 13916 }, { "epoch": 71.0204081632653, "grad_norm": 0.007064112927764654, "learning_rate": 5.795918367346939e-06, "loss": 0.0109, "step": 13920 }, { "epoch": 71.07142857142857, "grad_norm": 1.5306414365768433, "learning_rate": 5.785714285714286e-06, "loss": 0.0018, "step": 13930 }, { "epoch": 71.12244897959184, "grad_norm": 0.0004420735058374703, "learning_rate": 5.775510204081634e-06, "loss": 0.001, "step": 13940 }, { "epoch": 71.1734693877551, "grad_norm": 0.0007512482698075473, "learning_rate": 5.7653061224489805e-06, "loss": 0.0295, "step": 13950 }, { "epoch": 71.22448979591837, "grad_norm": 0.010687971487641335, "learning_rate": 5.755102040816327e-06, "loss": 0.0001, "step": 13960 }, { "epoch": 71.27551020408163, "grad_norm": 0.002488783560693264, "learning_rate": 5.744897959183674e-06, "loss": 0.0001, "step": 13970 }, { "epoch": 71.3265306122449, "grad_norm": 1.5924909114837646, "learning_rate": 5.73469387755102e-06, "loss": 0.0316, "step": 13980 }, { "epoch": 71.37755102040816, "grad_norm": 0.003192701144143939, "learning_rate": 5.724489795918368e-06, "loss": 0.0001, "step": 13990 }, { "epoch": 71.42857142857143, "grad_norm": 0.002065510954707861, "learning_rate": 5.7142857142857145e-06, "loss": 0.0006, "step": 14000 }, { "epoch": 71.4795918367347, "grad_norm": 51.37974548339844, "learning_rate": 5.704081632653061e-06, "loss": 0.0927, "step": 14010 }, { "epoch": 71.53061224489795, "grad_norm": 0.000698354619089514, "learning_rate": 5.6938775510204095e-06, "loss": 0.0064, "step": 14020 }, { "epoch": 71.58163265306122, "grad_norm": 0.001019397284835577, "learning_rate": 5.683673469387756e-06, "loss": 0.0167, "step": 14030 }, { "epoch": 71.63265306122449, "grad_norm": 0.0013389576924964786, "learning_rate": 5.673469387755103e-06, "loss": 0.0003, "step": 14040 }, { "epoch": 71.68367346938776, "grad_norm": 0.0037247114814817905, "learning_rate": 5.663265306122449e-06, "loss": 0.0153, "step": 14050 }, { "epoch": 71.73469387755102, "grad_norm": 0.001026379643008113, "learning_rate": 5.653061224489796e-06, "loss": 0.0005, "step": 14060 }, { "epoch": 71.78571428571429, "grad_norm": 0.0023309008684009314, "learning_rate": 5.6428571428571435e-06, "loss": 0.0005, "step": 14070 }, { "epoch": 71.83673469387755, "grad_norm": 2.518416404724121, "learning_rate": 5.63265306122449e-06, "loss": 0.004, "step": 14080 }, { "epoch": 71.88775510204081, "grad_norm": 0.0023508963640779257, "learning_rate": 5.622448979591837e-06, "loss": 0.0001, "step": 14090 }, { "epoch": 71.93877551020408, "grad_norm": 0.002550177276134491, "learning_rate": 5.6122448979591834e-06, "loss": 0.0021, "step": 14100 }, { "epoch": 71.98979591836735, "grad_norm": 0.0027450546622276306, "learning_rate": 5.602040816326531e-06, "loss": 0.0021, "step": 14110 }, { "epoch": 72.0, "eval_accuracy": 0.9314079422382672, "eval_loss": 0.559477686882019, "eval_runtime": 1.0854, "eval_samples_per_second": 255.199, "eval_steps_per_second": 32.245, "step": 14112 }, { "epoch": 72.04081632653062, "grad_norm": 0.00036486852332018316, "learning_rate": 5.591836734693878e-06, "loss": 0.0009, "step": 14120 }, { "epoch": 72.09183673469387, "grad_norm": 0.001150712138041854, "learning_rate": 5.581632653061225e-06, "loss": 0.0029, "step": 14130 }, { "epoch": 72.14285714285714, "grad_norm": 0.0017207561759278178, "learning_rate": 5.571428571428572e-06, "loss": 0.0004, "step": 14140 }, { "epoch": 72.1938775510204, "grad_norm": 0.0006864034221507609, "learning_rate": 5.561224489795919e-06, "loss": 0.0306, "step": 14150 }, { "epoch": 72.24489795918367, "grad_norm": 0.0003824463055934757, "learning_rate": 5.551020408163266e-06, "loss": 0.0, "step": 14160 }, { "epoch": 72.29591836734694, "grad_norm": 0.002324274042621255, "learning_rate": 5.5408163265306125e-06, "loss": 0.0001, "step": 14170 }, { "epoch": 72.34693877551021, "grad_norm": 0.0007837857701815665, "learning_rate": 5.530612244897959e-06, "loss": 0.0322, "step": 14180 }, { "epoch": 72.39795918367346, "grad_norm": 0.01286349818110466, "learning_rate": 5.520408163265306e-06, "loss": 0.0003, "step": 14190 }, { "epoch": 72.44897959183673, "grad_norm": 0.010271753184497356, "learning_rate": 5.510204081632653e-06, "loss": 0.0001, "step": 14200 }, { "epoch": 72.5, "grad_norm": 0.003656284883618355, "learning_rate": 5.500000000000001e-06, "loss": 0.0778, "step": 14210 }, { "epoch": 72.55102040816327, "grad_norm": 18.93934440612793, "learning_rate": 5.489795918367347e-06, "loss": 0.102, "step": 14220 }, { "epoch": 72.60204081632654, "grad_norm": 0.06587009876966476, "learning_rate": 5.479591836734695e-06, "loss": 0.0001, "step": 14230 }, { "epoch": 72.65306122448979, "grad_norm": 0.022085756063461304, "learning_rate": 5.4693877551020415e-06, "loss": 0.0278, "step": 14240 }, { "epoch": 72.70408163265306, "grad_norm": 0.0004267058102414012, "learning_rate": 5.459183673469388e-06, "loss": 0.0003, "step": 14250 }, { "epoch": 72.75510204081633, "grad_norm": 0.00581092294305563, "learning_rate": 5.448979591836735e-06, "loss": 0.0567, "step": 14260 }, { "epoch": 72.8061224489796, "grad_norm": 0.16200384497642517, "learning_rate": 5.438775510204081e-06, "loss": 0.0107, "step": 14270 }, { "epoch": 72.85714285714286, "grad_norm": 16.48029136657715, "learning_rate": 5.428571428571429e-06, "loss": 0.0949, "step": 14280 }, { "epoch": 72.90816326530613, "grad_norm": 0.021228225901722908, "learning_rate": 5.4183673469387755e-06, "loss": 0.0001, "step": 14290 }, { "epoch": 72.95918367346938, "grad_norm": 0.004662848077714443, "learning_rate": 5.408163265306123e-06, "loss": 0.0004, "step": 14300 }, { "epoch": 73.0, "eval_accuracy": 0.9386281588447654, "eval_loss": 0.43656104803085327, "eval_runtime": 1.1298, "eval_samples_per_second": 245.178, "eval_steps_per_second": 30.979, "step": 14308 }, { "epoch": 73.01020408163265, "grad_norm": 0.6572969555854797, "learning_rate": 5.3979591836734705e-06, "loss": 0.001, "step": 14310 }, { "epoch": 73.06122448979592, "grad_norm": 0.00045627326471731067, "learning_rate": 5.387755102040817e-06, "loss": 0.1349, "step": 14320 }, { "epoch": 73.11224489795919, "grad_norm": 0.0024611428380012512, "learning_rate": 5.377551020408164e-06, "loss": 0.0003, "step": 14330 }, { "epoch": 73.16326530612245, "grad_norm": 8.631586074829102, "learning_rate": 5.36734693877551e-06, "loss": 0.0038, "step": 14340 }, { "epoch": 73.21428571428571, "grad_norm": 4.385893821716309, "learning_rate": 5.357142857142857e-06, "loss": 0.0556, "step": 14350 }, { "epoch": 73.26530612244898, "grad_norm": 0.0011136539978906512, "learning_rate": 5.3469387755102045e-06, "loss": 0.0003, "step": 14360 }, { "epoch": 73.31632653061224, "grad_norm": 8.160663604736328, "learning_rate": 5.336734693877551e-06, "loss": 0.002, "step": 14370 }, { "epoch": 73.36734693877551, "grad_norm": 0.0009015538962557912, "learning_rate": 5.326530612244898e-06, "loss": 0.0, "step": 14380 }, { "epoch": 73.41836734693878, "grad_norm": 0.0028277600649744272, "learning_rate": 5.316326530612246e-06, "loss": 0.0473, "step": 14390 }, { "epoch": 73.46938775510205, "grad_norm": 0.07307971268892288, "learning_rate": 5.306122448979593e-06, "loss": 0.0071, "step": 14400 }, { "epoch": 73.5204081632653, "grad_norm": 0.09868597984313965, "learning_rate": 5.295918367346939e-06, "loss": 0.0009, "step": 14410 }, { "epoch": 73.57142857142857, "grad_norm": 0.10393614321947098, "learning_rate": 5.285714285714286e-06, "loss": 0.0023, "step": 14420 }, { "epoch": 73.62244897959184, "grad_norm": 0.0008125916356220841, "learning_rate": 5.275510204081633e-06, "loss": 0.1959, "step": 14430 }, { "epoch": 73.6734693877551, "grad_norm": 0.00472773052752018, "learning_rate": 5.26530612244898e-06, "loss": 0.1606, "step": 14440 }, { "epoch": 73.72448979591837, "grad_norm": 0.0039938632398843765, "learning_rate": 5.255102040816327e-06, "loss": 0.0708, "step": 14450 }, { "epoch": 73.77551020408163, "grad_norm": 0.0014945559669286013, "learning_rate": 5.2448979591836735e-06, "loss": 0.0002, "step": 14460 }, { "epoch": 73.8265306122449, "grad_norm": 0.0026692699175328016, "learning_rate": 5.23469387755102e-06, "loss": 0.0076, "step": 14470 }, { "epoch": 73.87755102040816, "grad_norm": 0.002656379481777549, "learning_rate": 5.2244897959183684e-06, "loss": 0.088, "step": 14480 }, { "epoch": 73.92857142857143, "grad_norm": 0.0017336404416710138, "learning_rate": 5.214285714285715e-06, "loss": 0.0031, "step": 14490 }, { "epoch": 73.9795918367347, "grad_norm": 0.014629676938056946, "learning_rate": 5.204081632653062e-06, "loss": 0.0124, "step": 14500 }, { "epoch": 74.0, "eval_accuracy": 0.9133574007220217, "eval_loss": 0.7611564993858337, "eval_runtime": 1.0911, "eval_samples_per_second": 253.881, "eval_steps_per_second": 32.079, "step": 14504 }, { "epoch": 74.03061224489795, "grad_norm": 14.214078903198242, "learning_rate": 5.193877551020408e-06, "loss": 0.0074, "step": 14510 }, { "epoch": 74.08163265306122, "grad_norm": 0.003779330989345908, "learning_rate": 5.183673469387756e-06, "loss": 0.0864, "step": 14520 }, { "epoch": 74.13265306122449, "grad_norm": 0.001556128729134798, "learning_rate": 5.1734693877551025e-06, "loss": 0.0002, "step": 14530 }, { "epoch": 74.18367346938776, "grad_norm": 0.0016911375569179654, "learning_rate": 5.163265306122449e-06, "loss": 0.0003, "step": 14540 }, { "epoch": 74.23469387755102, "grad_norm": 0.022783048450946808, "learning_rate": 5.153061224489796e-06, "loss": 0.0001, "step": 14550 }, { "epoch": 74.28571428571429, "grad_norm": 0.0023381069768220186, "learning_rate": 5.142857142857142e-06, "loss": 0.0939, "step": 14560 }, { "epoch": 74.33673469387755, "grad_norm": 0.01028367131948471, "learning_rate": 5.132653061224491e-06, "loss": 0.0002, "step": 14570 }, { "epoch": 74.38775510204081, "grad_norm": 0.0008671150426380336, "learning_rate": 5.122448979591837e-06, "loss": 0.0936, "step": 14580 }, { "epoch": 74.43877551020408, "grad_norm": 0.008581283502280712, "learning_rate": 5.112244897959184e-06, "loss": 0.0059, "step": 14590 }, { "epoch": 74.48979591836735, "grad_norm": 0.004132489673793316, "learning_rate": 5.1020408163265315e-06, "loss": 0.0002, "step": 14600 }, { "epoch": 74.54081632653062, "grad_norm": 0.002309389179572463, "learning_rate": 5.091836734693878e-06, "loss": 0.0018, "step": 14610 }, { "epoch": 74.59183673469387, "grad_norm": 0.004606122151017189, "learning_rate": 5.081632653061225e-06, "loss": 0.0809, "step": 14620 }, { "epoch": 74.64285714285714, "grad_norm": 0.6175808906555176, "learning_rate": 5.071428571428571e-06, "loss": 0.0085, "step": 14630 }, { "epoch": 74.6938775510204, "grad_norm": 0.004183734301477671, "learning_rate": 5.061224489795918e-06, "loss": 0.0057, "step": 14640 }, { "epoch": 74.74489795918367, "grad_norm": 0.012128970585763454, "learning_rate": 5.0510204081632655e-06, "loss": 0.006, "step": 14650 }, { "epoch": 74.79591836734694, "grad_norm": 0.0026174013037234545, "learning_rate": 5.040816326530613e-06, "loss": 0.0261, "step": 14660 }, { "epoch": 74.84693877551021, "grad_norm": 0.0013418736634775996, "learning_rate": 5.03061224489796e-06, "loss": 0.0005, "step": 14670 }, { "epoch": 74.89795918367346, "grad_norm": 0.0016789283836260438, "learning_rate": 5.020408163265307e-06, "loss": 0.0013, "step": 14680 }, { "epoch": 74.94897959183673, "grad_norm": 0.06226319819688797, "learning_rate": 5.010204081632654e-06, "loss": 0.0002, "step": 14690 }, { "epoch": 75.0, "grad_norm": 0.001314781722612679, "learning_rate": 5e-06, "loss": 0.0284, "step": 14700 }, { "epoch": 75.0, "eval_accuracy": 0.9205776173285198, "eval_loss": 0.6053860783576965, "eval_runtime": 1.0893, "eval_samples_per_second": 254.294, "eval_steps_per_second": 32.131, "step": 14700 }, { "epoch": 75.05102040816327, "grad_norm": 0.017246130853891373, "learning_rate": 4.989795918367347e-06, "loss": 0.0001, "step": 14710 }, { "epoch": 75.10204081632654, "grad_norm": 0.0022281906567513943, "learning_rate": 4.979591836734694e-06, "loss": 0.0072, "step": 14720 }, { "epoch": 75.15306122448979, "grad_norm": 0.0020866349805146456, "learning_rate": 4.969387755102041e-06, "loss": 0.0, "step": 14730 }, { "epoch": 75.20408163265306, "grad_norm": 0.007508859969675541, "learning_rate": 4.959183673469388e-06, "loss": 0.0011, "step": 14740 }, { "epoch": 75.25510204081633, "grad_norm": 0.0008581402944400907, "learning_rate": 4.948979591836735e-06, "loss": 0.0001, "step": 14750 }, { "epoch": 75.3061224489796, "grad_norm": 0.040425512939691544, "learning_rate": 4.938775510204082e-06, "loss": 0.0314, "step": 14760 }, { "epoch": 75.35714285714286, "grad_norm": 0.011455115862190723, "learning_rate": 4.928571428571429e-06, "loss": 0.0004, "step": 14770 }, { "epoch": 75.40816326530613, "grad_norm": 0.022549083456397057, "learning_rate": 4.918367346938776e-06, "loss": 0.1887, "step": 14780 }, { "epoch": 75.45918367346938, "grad_norm": 0.00208433554507792, "learning_rate": 4.908163265306123e-06, "loss": 0.0046, "step": 14790 }, { "epoch": 75.51020408163265, "grad_norm": 22.658855438232422, "learning_rate": 4.897959183673469e-06, "loss": 0.0548, "step": 14800 }, { "epoch": 75.56122448979592, "grad_norm": 0.0009922325843945146, "learning_rate": 4.887755102040817e-06, "loss": 0.0, "step": 14810 }, { "epoch": 75.61224489795919, "grad_norm": 0.016239939257502556, "learning_rate": 4.8775510204081635e-06, "loss": 0.0002, "step": 14820 }, { "epoch": 75.66326530612245, "grad_norm": 0.02285103313624859, "learning_rate": 4.867346938775511e-06, "loss": 0.0019, "step": 14830 }, { "epoch": 75.71428571428571, "grad_norm": 0.016967713832855225, "learning_rate": 4.857142857142858e-06, "loss": 0.0002, "step": 14840 }, { "epoch": 75.76530612244898, "grad_norm": 0.0015098198782652617, "learning_rate": 4.846938775510204e-06, "loss": 0.1062, "step": 14850 }, { "epoch": 75.81632653061224, "grad_norm": 33.90650177001953, "learning_rate": 4.836734693877552e-06, "loss": 0.0902, "step": 14860 }, { "epoch": 75.86734693877551, "grad_norm": 0.04325437173247337, "learning_rate": 4.826530612244898e-06, "loss": 0.1195, "step": 14870 }, { "epoch": 75.91836734693878, "grad_norm": 0.11708974838256836, "learning_rate": 4.816326530612245e-06, "loss": 0.0002, "step": 14880 }, { "epoch": 75.96938775510205, "grad_norm": 0.0072238692082464695, "learning_rate": 4.8061224489795925e-06, "loss": 0.0001, "step": 14890 }, { "epoch": 76.0, "eval_accuracy": 0.924187725631769, "eval_loss": 0.5922021269798279, "eval_runtime": 1.1047, "eval_samples_per_second": 250.751, "eval_steps_per_second": 31.683, "step": 14896 }, { "epoch": 76.0204081632653, "grad_norm": 0.001690774573944509, "learning_rate": 4.795918367346939e-06, "loss": 0.0004, "step": 14900 }, { "epoch": 76.07142857142857, "grad_norm": 0.002286632312461734, "learning_rate": 4.785714285714287e-06, "loss": 0.0004, "step": 14910 }, { "epoch": 76.12244897959184, "grad_norm": 0.000625822227448225, "learning_rate": 4.775510204081633e-06, "loss": 0.0709, "step": 14920 }, { "epoch": 76.1734693877551, "grad_norm": 0.0073734368197619915, "learning_rate": 4.76530612244898e-06, "loss": 0.005, "step": 14930 }, { "epoch": 76.22448979591837, "grad_norm": 0.008224919438362122, "learning_rate": 4.7551020408163265e-06, "loss": 0.0001, "step": 14940 }, { "epoch": 76.27551020408163, "grad_norm": 0.003785711480304599, "learning_rate": 4.744897959183674e-06, "loss": 0.0992, "step": 14950 }, { "epoch": 76.3265306122449, "grad_norm": 0.00034129800042137504, "learning_rate": 4.734693877551021e-06, "loss": 0.0, "step": 14960 }, { "epoch": 76.37755102040816, "grad_norm": 0.002319997875019908, "learning_rate": 4.724489795918368e-06, "loss": 0.0002, "step": 14970 }, { "epoch": 76.42857142857143, "grad_norm": 0.10855195671319962, "learning_rate": 4.714285714285715e-06, "loss": 0.0004, "step": 14980 }, { "epoch": 76.4795918367347, "grad_norm": 0.0005766889080405235, "learning_rate": 4.704081632653061e-06, "loss": 0.0, "step": 14990 }, { "epoch": 76.53061224489795, "grad_norm": 0.0008817859925329685, "learning_rate": 4.693877551020409e-06, "loss": 0.0003, "step": 15000 }, { "epoch": 76.58163265306122, "grad_norm": 0.0007610672037117183, "learning_rate": 4.6836734693877555e-06, "loss": 0.0001, "step": 15010 }, { "epoch": 76.63265306122449, "grad_norm": 0.0009014653041958809, "learning_rate": 4.673469387755102e-06, "loss": 0.0651, "step": 15020 }, { "epoch": 76.68367346938776, "grad_norm": 0.0035539474338293076, "learning_rate": 4.663265306122449e-06, "loss": 0.0788, "step": 15030 }, { "epoch": 76.73469387755102, "grad_norm": 0.0031320129055529833, "learning_rate": 4.653061224489796e-06, "loss": 0.0001, "step": 15040 }, { "epoch": 76.78571428571429, "grad_norm": 0.0010606256546452641, "learning_rate": 4.642857142857144e-06, "loss": 0.0002, "step": 15050 }, { "epoch": 76.83673469387755, "grad_norm": 0.0016601895913481712, "learning_rate": 4.63265306122449e-06, "loss": 0.0211, "step": 15060 }, { "epoch": 76.88775510204081, "grad_norm": 0.004060484003275633, "learning_rate": 4.622448979591837e-06, "loss": 0.0001, "step": 15070 }, { "epoch": 76.93877551020408, "grad_norm": 0.010187773033976555, "learning_rate": 4.612244897959184e-06, "loss": 0.0001, "step": 15080 }, { "epoch": 76.98979591836735, "grad_norm": 0.0017449766164645553, "learning_rate": 4.602040816326531e-06, "loss": 0.0119, "step": 15090 }, { "epoch": 77.0, "eval_accuracy": 0.924187725631769, "eval_loss": 0.5496028065681458, "eval_runtime": 1.0982, "eval_samples_per_second": 252.219, "eval_steps_per_second": 31.869, "step": 15092 }, { "epoch": 77.04081632653062, "grad_norm": 0.0014577321708202362, "learning_rate": 4.591836734693878e-06, "loss": 0.0341, "step": 15100 }, { "epoch": 77.09183673469387, "grad_norm": 0.0016758479177951813, "learning_rate": 4.5816326530612245e-06, "loss": 0.0038, "step": 15110 }, { "epoch": 77.14285714285714, "grad_norm": 0.0016968995332717896, "learning_rate": 4.571428571428572e-06, "loss": 0.0831, "step": 15120 }, { "epoch": 77.1938775510204, "grad_norm": 0.028017479926347733, "learning_rate": 4.561224489795919e-06, "loss": 0.0017, "step": 15130 }, { "epoch": 77.24489795918367, "grad_norm": 0.0003469182120170444, "learning_rate": 4.551020408163266e-06, "loss": 0.0002, "step": 15140 }, { "epoch": 77.29591836734694, "grad_norm": 0.0016192300245165825, "learning_rate": 4.540816326530613e-06, "loss": 0.0103, "step": 15150 }, { "epoch": 77.34693877551021, "grad_norm": 0.011507781222462654, "learning_rate": 4.530612244897959e-06, "loss": 0.0012, "step": 15160 }, { "epoch": 77.39795918367346, "grad_norm": 0.0009392569190822542, "learning_rate": 4.520408163265306e-06, "loss": 0.0071, "step": 15170 }, { "epoch": 77.44897959183673, "grad_norm": 0.0006552474806085229, "learning_rate": 4.5102040816326535e-06, "loss": 0.0443, "step": 15180 }, { "epoch": 77.5, "grad_norm": 0.001189417322166264, "learning_rate": 4.5e-06, "loss": 0.0063, "step": 15190 }, { "epoch": 77.55102040816327, "grad_norm": 70.85895538330078, "learning_rate": 4.489795918367348e-06, "loss": 0.0368, "step": 15200 }, { "epoch": 77.60204081632654, "grad_norm": 5.045065879821777, "learning_rate": 4.479591836734694e-06, "loss": 0.006, "step": 15210 }, { "epoch": 77.65306122448979, "grad_norm": 0.00043268149602226913, "learning_rate": 4.469387755102041e-06, "loss": 0.001, "step": 15220 }, { "epoch": 77.70408163265306, "grad_norm": 0.0012495230184867978, "learning_rate": 4.459183673469388e-06, "loss": 0.0001, "step": 15230 }, { "epoch": 77.75510204081633, "grad_norm": 0.0003033694811165333, "learning_rate": 4.448979591836735e-06, "loss": 0.0003, "step": 15240 }, { "epoch": 77.8061224489796, "grad_norm": 0.08307422697544098, "learning_rate": 4.438775510204082e-06, "loss": 0.0986, "step": 15250 }, { "epoch": 77.85714285714286, "grad_norm": 0.004372878465801477, "learning_rate": 4.428571428571429e-06, "loss": 0.0647, "step": 15260 }, { "epoch": 77.90816326530613, "grad_norm": 0.00046887315693311393, "learning_rate": 4.418367346938776e-06, "loss": 0.0001, "step": 15270 }, { "epoch": 77.95918367346938, "grad_norm": 0.00606790604069829, "learning_rate": 4.408163265306123e-06, "loss": 0.0006, "step": 15280 }, { "epoch": 78.0, "eval_accuracy": 0.9205776173285198, "eval_loss": 0.6326822638511658, "eval_runtime": 1.0754, "eval_samples_per_second": 257.582, "eval_steps_per_second": 32.546, "step": 15288 }, { "epoch": 78.01020408163265, "grad_norm": 0.0024251583963632584, "learning_rate": 4.39795918367347e-06, "loss": 0.1329, "step": 15290 }, { "epoch": 78.06122448979592, "grad_norm": 0.0040822383016347885, "learning_rate": 4.3877551020408165e-06, "loss": 0.0001, "step": 15300 }, { "epoch": 78.11224489795919, "grad_norm": 0.3164103031158447, "learning_rate": 4.377551020408163e-06, "loss": 0.0012, "step": 15310 }, { "epoch": 78.16326530612245, "grad_norm": 0.000998248695395887, "learning_rate": 4.367346938775511e-06, "loss": 0.0014, "step": 15320 }, { "epoch": 78.21428571428571, "grad_norm": 0.007188987918198109, "learning_rate": 4.357142857142857e-06, "loss": 0.0001, "step": 15330 }, { "epoch": 78.26530612244898, "grad_norm": 0.007898544892668724, "learning_rate": 4.346938775510205e-06, "loss": 0.0055, "step": 15340 }, { "epoch": 78.31632653061224, "grad_norm": 0.0044418745674192905, "learning_rate": 4.336734693877551e-06, "loss": 0.0003, "step": 15350 }, { "epoch": 78.36734693877551, "grad_norm": 0.004286288749426603, "learning_rate": 4.326530612244899e-06, "loss": 0.1421, "step": 15360 }, { "epoch": 78.41836734693878, "grad_norm": 0.0009252499439753592, "learning_rate": 4.3163265306122455e-06, "loss": 0.0024, "step": 15370 }, { "epoch": 78.46938775510205, "grad_norm": 0.001535197370685637, "learning_rate": 4.306122448979592e-06, "loss": 0.0, "step": 15380 }, { "epoch": 78.5204081632653, "grad_norm": 0.0010684464359655976, "learning_rate": 4.295918367346939e-06, "loss": 0.0002, "step": 15390 }, { "epoch": 78.57142857142857, "grad_norm": 0.002064766362309456, "learning_rate": 4.2857142857142855e-06, "loss": 0.064, "step": 15400 }, { "epoch": 78.62244897959184, "grad_norm": 0.0007320994627662003, "learning_rate": 4.275510204081633e-06, "loss": 0.0001, "step": 15410 }, { "epoch": 78.6734693877551, "grad_norm": 0.013717442750930786, "learning_rate": 4.2653061224489804e-06, "loss": 0.0002, "step": 15420 }, { "epoch": 78.72448979591837, "grad_norm": 0.0024749492295086384, "learning_rate": 4.255102040816327e-06, "loss": 0.0003, "step": 15430 }, { "epoch": 78.77551020408163, "grad_norm": 0.0008448545704595745, "learning_rate": 4.244897959183674e-06, "loss": 0.0622, "step": 15440 }, { "epoch": 78.8265306122449, "grad_norm": 0.003372789826244116, "learning_rate": 4.234693877551021e-06, "loss": 0.0967, "step": 15450 }, { "epoch": 78.87755102040816, "grad_norm": 0.0017138301627710462, "learning_rate": 4.224489795918368e-06, "loss": 0.0002, "step": 15460 }, { "epoch": 78.92857142857143, "grad_norm": 0.0010195611976087093, "learning_rate": 4.2142857142857145e-06, "loss": 0.0001, "step": 15470 }, { "epoch": 78.9795918367347, "grad_norm": 5.828068256378174, "learning_rate": 4.204081632653061e-06, "loss": 0.0711, "step": 15480 }, { "epoch": 79.0, "eval_accuracy": 0.9386281588447654, "eval_loss": 0.5177193284034729, "eval_runtime": 1.0822, "eval_samples_per_second": 255.96, "eval_steps_per_second": 32.341, "step": 15484 }, { "epoch": 79.03061224489795, "grad_norm": 0.0005103123839944601, "learning_rate": 4.193877551020409e-06, "loss": 0.0027, "step": 15490 }, { "epoch": 79.08163265306122, "grad_norm": 0.003480609506368637, "learning_rate": 4.183673469387755e-06, "loss": 0.1439, "step": 15500 }, { "epoch": 79.13265306122449, "grad_norm": 0.0034314803779125214, "learning_rate": 4.173469387755103e-06, "loss": 0.0038, "step": 15510 }, { "epoch": 79.18367346938776, "grad_norm": 0.21085740625858307, "learning_rate": 4.163265306122449e-06, "loss": 0.0011, "step": 15520 }, { "epoch": 79.23469387755102, "grad_norm": 0.005711423698812723, "learning_rate": 4.153061224489796e-06, "loss": 0.0001, "step": 15530 }, { "epoch": 79.28571428571429, "grad_norm": 0.00074528600089252, "learning_rate": 4.1428571428571435e-06, "loss": 0.0002, "step": 15540 }, { "epoch": 79.33673469387755, "grad_norm": 0.0042790998704731464, "learning_rate": 4.13265306122449e-06, "loss": 0.0001, "step": 15550 }, { "epoch": 79.38775510204081, "grad_norm": 0.06146891042590141, "learning_rate": 4.122448979591837e-06, "loss": 0.0036, "step": 15560 }, { "epoch": 79.43877551020408, "grad_norm": 23.362375259399414, "learning_rate": 4.112244897959184e-06, "loss": 0.0776, "step": 15570 }, { "epoch": 79.48979591836735, "grad_norm": 0.0016490630805492401, "learning_rate": 4.102040816326531e-06, "loss": 0.0003, "step": 15580 }, { "epoch": 79.54081632653062, "grad_norm": 25.160037994384766, "learning_rate": 4.091836734693878e-06, "loss": 0.0166, "step": 15590 }, { "epoch": 79.59183673469387, "grad_norm": 0.0071771834045648575, "learning_rate": 4.081632653061225e-06, "loss": 0.0002, "step": 15600 }, { "epoch": 79.64285714285714, "grad_norm": 0.0006621835636906326, "learning_rate": 4.071428571428572e-06, "loss": 0.0003, "step": 15610 }, { "epoch": 79.6938775510204, "grad_norm": 0.0017756756860762835, "learning_rate": 4.061224489795918e-06, "loss": 0.0001, "step": 15620 }, { "epoch": 79.74489795918367, "grad_norm": 0.005591052118688822, "learning_rate": 4.051020408163266e-06, "loss": 0.0001, "step": 15630 }, { "epoch": 79.79591836734694, "grad_norm": 56.970863342285156, "learning_rate": 4.040816326530612e-06, "loss": 0.0465, "step": 15640 }, { "epoch": 79.84693877551021, "grad_norm": 0.021803749725222588, "learning_rate": 4.03061224489796e-06, "loss": 0.0053, "step": 15650 }, { "epoch": 79.89795918367346, "grad_norm": 0.0007582844118587673, "learning_rate": 4.0204081632653065e-06, "loss": 0.0001, "step": 15660 }, { "epoch": 79.94897959183673, "grad_norm": 0.0007920601055957377, "learning_rate": 4.010204081632653e-06, "loss": 0.0002, "step": 15670 }, { "epoch": 80.0, "grad_norm": 0.00025248422753065825, "learning_rate": 4.000000000000001e-06, "loss": 0.0001, "step": 15680 }, { "epoch": 80.0, "eval_accuracy": 0.9133574007220217, "eval_loss": 0.7390635013580322, "eval_runtime": 1.0836, "eval_samples_per_second": 255.629, "eval_steps_per_second": 32.3, "step": 15680 }, { "epoch": 80.05102040816327, "grad_norm": 0.16532659530639648, "learning_rate": 3.989795918367347e-06, "loss": 0.0003, "step": 15690 }, { "epoch": 80.10204081632654, "grad_norm": 0.07838960736989975, "learning_rate": 3.979591836734694e-06, "loss": 0.0058, "step": 15700 }, { "epoch": 80.15306122448979, "grad_norm": 0.0013953743036836386, "learning_rate": 3.969387755102041e-06, "loss": 0.0001, "step": 15710 }, { "epoch": 80.20408163265306, "grad_norm": 0.000838958949316293, "learning_rate": 3.959183673469388e-06, "loss": 0.1283, "step": 15720 }, { "epoch": 80.25510204081633, "grad_norm": 0.028232138603925705, "learning_rate": 3.9489795918367356e-06, "loss": 0.0001, "step": 15730 }, { "epoch": 80.3061224489796, "grad_norm": 0.036748163402080536, "learning_rate": 3.938775510204082e-06, "loss": 0.0035, "step": 15740 }, { "epoch": 80.35714285714286, "grad_norm": 0.005808426532894373, "learning_rate": 3.928571428571429e-06, "loss": 0.0331, "step": 15750 }, { "epoch": 80.40816326530613, "grad_norm": 0.013694155029952526, "learning_rate": 3.9183673469387755e-06, "loss": 0.0525, "step": 15760 }, { "epoch": 80.45918367346938, "grad_norm": 0.00041695384425111115, "learning_rate": 3.908163265306123e-06, "loss": 0.0017, "step": 15770 }, { "epoch": 80.51020408163265, "grad_norm": 0.06154480576515198, "learning_rate": 3.89795918367347e-06, "loss": 0.0044, "step": 15780 }, { "epoch": 80.56122448979592, "grad_norm": 0.00043679706868715584, "learning_rate": 3.887755102040816e-06, "loss": 0.0, "step": 15790 }, { "epoch": 80.61224489795919, "grad_norm": 0.015825308859348297, "learning_rate": 3.877551020408164e-06, "loss": 0.0001, "step": 15800 }, { "epoch": 80.66326530612245, "grad_norm": 0.0009145226213149726, "learning_rate": 3.86734693877551e-06, "loss": 0.0003, "step": 15810 }, { "epoch": 80.71428571428571, "grad_norm": 0.0026916523929685354, "learning_rate": 3.857142857142858e-06, "loss": 0.0006, "step": 15820 }, { "epoch": 80.76530612244898, "grad_norm": 0.0004271497018635273, "learning_rate": 3.8469387755102045e-06, "loss": 0.0001, "step": 15830 }, { "epoch": 80.81632653061224, "grad_norm": 0.005669012665748596, "learning_rate": 3.836734693877551e-06, "loss": 0.0005, "step": 15840 }, { "epoch": 80.86734693877551, "grad_norm": 0.0003490884555503726, "learning_rate": 3.826530612244898e-06, "loss": 0.0103, "step": 15850 }, { "epoch": 80.91836734693878, "grad_norm": 0.0014083468122407794, "learning_rate": 3.816326530612245e-06, "loss": 0.0, "step": 15860 }, { "epoch": 80.96938775510205, "grad_norm": 0.0009305182611569762, "learning_rate": 3.8061224489795923e-06, "loss": 0.0985, "step": 15870 }, { "epoch": 81.0, "eval_accuracy": 0.924187725631769, "eval_loss": 0.5683252215385437, "eval_runtime": 1.0792, "eval_samples_per_second": 256.661, "eval_steps_per_second": 32.43, "step": 15876 }, { "epoch": 81.0204081632653, "grad_norm": 0.0009478586725890636, "learning_rate": 3.795918367346939e-06, "loss": 0.0001, "step": 15880 }, { "epoch": 81.07142857142857, "grad_norm": 0.0027497070841491222, "learning_rate": 3.785714285714286e-06, "loss": 0.0003, "step": 15890 }, { "epoch": 81.12244897959184, "grad_norm": 1.8935362100601196, "learning_rate": 3.7755102040816327e-06, "loss": 0.0008, "step": 15900 }, { "epoch": 81.1734693877551, "grad_norm": 0.0005016218055970967, "learning_rate": 3.76530612244898e-06, "loss": 0.0001, "step": 15910 }, { "epoch": 81.22448979591837, "grad_norm": 0.0006974856369197369, "learning_rate": 3.7551020408163268e-06, "loss": 0.1031, "step": 15920 }, { "epoch": 81.27551020408163, "grad_norm": 0.0008804520475678146, "learning_rate": 3.744897959183674e-06, "loss": 0.0005, "step": 15930 }, { "epoch": 81.3265306122449, "grad_norm": 0.03394754230976105, "learning_rate": 3.7346938775510205e-06, "loss": 0.0523, "step": 15940 }, { "epoch": 81.37755102040816, "grad_norm": 0.0005235446733422577, "learning_rate": 3.724489795918368e-06, "loss": 0.0052, "step": 15950 }, { "epoch": 81.42857142857143, "grad_norm": 0.08732754737138748, "learning_rate": 3.7142857142857146e-06, "loss": 0.0015, "step": 15960 }, { "epoch": 81.4795918367347, "grad_norm": 0.008718066848814487, "learning_rate": 3.7040816326530617e-06, "loss": 0.0008, "step": 15970 }, { "epoch": 81.53061224489795, "grad_norm": 0.0009734350605867803, "learning_rate": 3.6938775510204083e-06, "loss": 0.0, "step": 15980 }, { "epoch": 81.58163265306122, "grad_norm": 17.896821975708008, "learning_rate": 3.6836734693877554e-06, "loss": 0.007, "step": 15990 }, { "epoch": 81.63265306122449, "grad_norm": 0.0004061071085743606, "learning_rate": 3.6734693877551024e-06, "loss": 0.005, "step": 16000 }, { "epoch": 81.68367346938776, "grad_norm": 0.00048405694542452693, "learning_rate": 3.6632653061224495e-06, "loss": 0.0006, "step": 16010 }, { "epoch": 81.73469387755102, "grad_norm": 0.14911587536334991, "learning_rate": 3.653061224489796e-06, "loss": 0.0001, "step": 16020 }, { "epoch": 81.78571428571429, "grad_norm": 0.002278613857924938, "learning_rate": 3.642857142857143e-06, "loss": 0.0001, "step": 16030 }, { "epoch": 81.83673469387755, "grad_norm": 0.0003544982464518398, "learning_rate": 3.6326530612244903e-06, "loss": 0.0003, "step": 16040 }, { "epoch": 81.88775510204081, "grad_norm": 0.0008703051717020571, "learning_rate": 3.6224489795918373e-06, "loss": 0.053, "step": 16050 }, { "epoch": 81.93877551020408, "grad_norm": 10.087508201599121, "learning_rate": 3.612244897959184e-06, "loss": 0.0037, "step": 16060 }, { "epoch": 81.98979591836735, "grad_norm": 0.0005200792220421135, "learning_rate": 3.6020408163265306e-06, "loss": 0.0001, "step": 16070 }, { "epoch": 82.0, "eval_accuracy": 0.9205776173285198, "eval_loss": 0.610588550567627, "eval_runtime": 1.0707, "eval_samples_per_second": 258.712, "eval_steps_per_second": 32.689, "step": 16072 }, { "epoch": 82.04081632653062, "grad_norm": 0.22808292508125305, "learning_rate": 3.5918367346938777e-06, "loss": 0.0002, "step": 16080 }, { "epoch": 82.09183673469387, "grad_norm": 0.000528380332980305, "learning_rate": 3.581632653061225e-06, "loss": 0.0006, "step": 16090 }, { "epoch": 82.14285714285714, "grad_norm": 0.01142851822078228, "learning_rate": 3.5714285714285718e-06, "loss": 0.0906, "step": 16100 }, { "epoch": 82.1938775510204, "grad_norm": 0.0007205039146356285, "learning_rate": 3.5612244897959184e-06, "loss": 0.0002, "step": 16110 }, { "epoch": 82.24489795918367, "grad_norm": 0.02454567141830921, "learning_rate": 3.5510204081632655e-06, "loss": 0.0184, "step": 16120 }, { "epoch": 82.29591836734694, "grad_norm": 0.018092699348926544, "learning_rate": 3.540816326530613e-06, "loss": 0.0001, "step": 16130 }, { "epoch": 82.34693877551021, "grad_norm": 0.010026969015598297, "learning_rate": 3.5306122448979596e-06, "loss": 0.0001, "step": 16140 }, { "epoch": 82.39795918367346, "grad_norm": 0.0017730246763676405, "learning_rate": 3.5204081632653062e-06, "loss": 0.0008, "step": 16150 }, { "epoch": 82.44897959183673, "grad_norm": 0.0022774541284888983, "learning_rate": 3.5102040816326533e-06, "loss": 0.0207, "step": 16160 }, { "epoch": 82.5, "grad_norm": 0.0034177061170339584, "learning_rate": 3.5e-06, "loss": 0.0003, "step": 16170 }, { "epoch": 82.55102040816327, "grad_norm": 0.0008815902401693165, "learning_rate": 3.4897959183673474e-06, "loss": 0.1538, "step": 16180 }, { "epoch": 82.60204081632654, "grad_norm": 0.0022889720275998116, "learning_rate": 3.479591836734694e-06, "loss": 0.002, "step": 16190 }, { "epoch": 82.65306122448979, "grad_norm": 2.974851131439209, "learning_rate": 3.469387755102041e-06, "loss": 0.0012, "step": 16200 }, { "epoch": 82.70408163265306, "grad_norm": 14.788187980651855, "learning_rate": 3.4591836734693878e-06, "loss": 0.0055, "step": 16210 }, { "epoch": 82.75510204081633, "grad_norm": 0.06460939347743988, "learning_rate": 3.4489795918367353e-06, "loss": 0.0009, "step": 16220 }, { "epoch": 82.8061224489796, "grad_norm": 0.016392122954130173, "learning_rate": 3.438775510204082e-06, "loss": 0.0019, "step": 16230 }, { "epoch": 82.85714285714286, "grad_norm": 0.0008081716950982809, "learning_rate": 3.428571428571429e-06, "loss": 0.0, "step": 16240 }, { "epoch": 82.90816326530613, "grad_norm": 0.0014016545610502362, "learning_rate": 3.4183673469387756e-06, "loss": 0.0001, "step": 16250 }, { "epoch": 82.95918367346938, "grad_norm": 0.0007072300650179386, "learning_rate": 3.4081632653061227e-06, "loss": 0.0, "step": 16260 }, { "epoch": 83.0, "eval_accuracy": 0.924187725631769, "eval_loss": 0.6235185265541077, "eval_runtime": 1.0717, "eval_samples_per_second": 258.474, "eval_steps_per_second": 32.659, "step": 16268 }, { "epoch": 83.01020408163265, "grad_norm": 0.028477124869823456, "learning_rate": 3.3979591836734697e-06, "loss": 0.0004, "step": 16270 }, { "epoch": 83.06122448979592, "grad_norm": 0.0006861508591100574, "learning_rate": 3.3877551020408168e-06, "loss": 0.0004, "step": 16280 }, { "epoch": 83.11224489795919, "grad_norm": 0.0008571436046622694, "learning_rate": 3.3775510204081634e-06, "loss": 0.0001, "step": 16290 }, { "epoch": 83.16326530612245, "grad_norm": 0.010397214442491531, "learning_rate": 3.3673469387755105e-06, "loss": 0.0001, "step": 16300 }, { "epoch": 83.21428571428571, "grad_norm": 0.001611132174730301, "learning_rate": 3.357142857142857e-06, "loss": 0.0006, "step": 16310 }, { "epoch": 83.26530612244898, "grad_norm": 0.003106019226834178, "learning_rate": 3.3469387755102046e-06, "loss": 0.0845, "step": 16320 }, { "epoch": 83.31632653061224, "grad_norm": 0.009200221858918667, "learning_rate": 3.3367346938775513e-06, "loss": 0.0014, "step": 16330 }, { "epoch": 83.36734693877551, "grad_norm": 0.0010124070104211569, "learning_rate": 3.3265306122448983e-06, "loss": 0.0002, "step": 16340 }, { "epoch": 83.41836734693878, "grad_norm": 0.00034463393967598677, "learning_rate": 3.316326530612245e-06, "loss": 0.0028, "step": 16350 }, { "epoch": 83.46938775510205, "grad_norm": 0.0023424443788826466, "learning_rate": 3.3061224489795924e-06, "loss": 0.0001, "step": 16360 }, { "epoch": 83.5204081632653, "grad_norm": 0.0004294893878977746, "learning_rate": 3.295918367346939e-06, "loss": 0.0011, "step": 16370 }, { "epoch": 83.57142857142857, "grad_norm": 0.0007893657893873751, "learning_rate": 3.285714285714286e-06, "loss": 0.0001, "step": 16380 }, { "epoch": 83.62244897959184, "grad_norm": 0.0032500827219337225, "learning_rate": 3.2755102040816328e-06, "loss": 0.0133, "step": 16390 }, { "epoch": 83.6734693877551, "grad_norm": 0.005109312944114208, "learning_rate": 3.2653061224489794e-06, "loss": 0.007, "step": 16400 }, { "epoch": 83.72448979591837, "grad_norm": 0.0004992606700398028, "learning_rate": 3.255102040816327e-06, "loss": 0.0188, "step": 16410 }, { "epoch": 83.77551020408163, "grad_norm": 0.0013952411245554686, "learning_rate": 3.244897959183674e-06, "loss": 0.0, "step": 16420 }, { "epoch": 83.8265306122449, "grad_norm": 0.00041128721204586327, "learning_rate": 3.2346938775510206e-06, "loss": 0.0007, "step": 16430 }, { "epoch": 83.87755102040816, "grad_norm": 0.0014655032427981496, "learning_rate": 3.2244897959183672e-06, "loss": 0.0001, "step": 16440 }, { "epoch": 83.92857142857143, "grad_norm": 63.23695755004883, "learning_rate": 3.2142857142857147e-06, "loss": 0.1913, "step": 16450 }, { "epoch": 83.9795918367347, "grad_norm": 0.0005484999273903668, "learning_rate": 3.204081632653062e-06, "loss": 0.0006, "step": 16460 }, { "epoch": 84.0, "eval_accuracy": 0.9061371841155235, "eval_loss": 0.791436493396759, "eval_runtime": 1.0852, "eval_samples_per_second": 255.264, "eval_steps_per_second": 32.254, "step": 16464 }, { "epoch": 84.03061224489795, "grad_norm": 0.00029899098444730043, "learning_rate": 3.1938775510204084e-06, "loss": 0.0869, "step": 16470 }, { "epoch": 84.08163265306122, "grad_norm": 0.000354474555933848, "learning_rate": 3.183673469387755e-06, "loss": 0.0001, "step": 16480 }, { "epoch": 84.13265306122449, "grad_norm": 0.0003581519122235477, "learning_rate": 3.173469387755102e-06, "loss": 0.1156, "step": 16490 }, { "epoch": 84.18367346938776, "grad_norm": 0.15128546953201294, "learning_rate": 3.1632653061224496e-06, "loss": 0.0001, "step": 16500 }, { "epoch": 84.23469387755102, "grad_norm": 0.0036563859321177006, "learning_rate": 3.1530612244897963e-06, "loss": 0.0003, "step": 16510 }, { "epoch": 84.28571428571429, "grad_norm": 0.001038924790918827, "learning_rate": 3.142857142857143e-06, "loss": 0.0044, "step": 16520 }, { "epoch": 84.33673469387755, "grad_norm": 12.563940048217773, "learning_rate": 3.13265306122449e-06, "loss": 0.0207, "step": 16530 }, { "epoch": 84.38775510204081, "grad_norm": 0.02270456589758396, "learning_rate": 3.1224489795918374e-06, "loss": 0.0001, "step": 16540 }, { "epoch": 84.43877551020408, "grad_norm": 0.0036377739161252975, "learning_rate": 3.112244897959184e-06, "loss": 0.0001, "step": 16550 }, { "epoch": 84.48979591836735, "grad_norm": 0.0005686595686711371, "learning_rate": 3.1020408163265307e-06, "loss": 0.0001, "step": 16560 }, { "epoch": 84.54081632653062, "grad_norm": 0.0005101625574752688, "learning_rate": 3.0918367346938778e-06, "loss": 0.085, "step": 16570 }, { "epoch": 84.59183673469387, "grad_norm": 0.0028208934236317873, "learning_rate": 3.0816326530612244e-06, "loss": 0.0001, "step": 16580 }, { "epoch": 84.64285714285714, "grad_norm": 0.0006114314892329276, "learning_rate": 3.071428571428572e-06, "loss": 0.0, "step": 16590 }, { "epoch": 84.6938775510204, "grad_norm": 0.029959795996546745, "learning_rate": 3.0612244897959185e-06, "loss": 0.0066, "step": 16600 }, { "epoch": 84.74489795918367, "grad_norm": 0.0011018848745152354, "learning_rate": 3.0510204081632656e-06, "loss": 0.0, "step": 16610 }, { "epoch": 84.79591836734694, "grad_norm": 0.00042971764924004674, "learning_rate": 3.0408163265306122e-06, "loss": 0.0042, "step": 16620 }, { "epoch": 84.84693877551021, "grad_norm": 0.000296957470709458, "learning_rate": 3.0306122448979597e-06, "loss": 0.0021, "step": 16630 }, { "epoch": 84.89795918367346, "grad_norm": 18.961952209472656, "learning_rate": 3.0204081632653064e-06, "loss": 0.0062, "step": 16640 }, { "epoch": 84.94897959183673, "grad_norm": 0.0007443376234732568, "learning_rate": 3.0102040816326534e-06, "loss": 0.0006, "step": 16650 }, { "epoch": 85.0, "grad_norm": 0.0005438339430838823, "learning_rate": 3e-06, "loss": 0.0001, "step": 16660 }, { "epoch": 85.0, "eval_accuracy": 0.9314079422382672, "eval_loss": 0.5649476051330566, "eval_runtime": 1.1196, "eval_samples_per_second": 247.411, "eval_steps_per_second": 31.261, "step": 16660 }, { "epoch": 85.05102040816327, "grad_norm": 0.001318538561463356, "learning_rate": 2.989795918367347e-06, "loss": 0.0005, "step": 16670 }, { "epoch": 85.10204081632654, "grad_norm": 0.0013827024959027767, "learning_rate": 2.979591836734694e-06, "loss": 0.0874, "step": 16680 }, { "epoch": 85.15306122448979, "grad_norm": 0.007082667201757431, "learning_rate": 2.9693877551020413e-06, "loss": 0.0, "step": 16690 }, { "epoch": 85.20408163265306, "grad_norm": 0.0008667759248055518, "learning_rate": 2.959183673469388e-06, "loss": 0.0001, "step": 16700 }, { "epoch": 85.25510204081633, "grad_norm": 0.001552125089801848, "learning_rate": 2.948979591836735e-06, "loss": 0.0002, "step": 16710 }, { "epoch": 85.3061224489796, "grad_norm": 0.003595917019993067, "learning_rate": 2.938775510204082e-06, "loss": 0.1037, "step": 16720 }, { "epoch": 85.35714285714286, "grad_norm": 0.0009035568218678236, "learning_rate": 2.928571428571429e-06, "loss": 0.0914, "step": 16730 }, { "epoch": 85.40816326530613, "grad_norm": 0.0013159025693312287, "learning_rate": 2.9183673469387757e-06, "loss": 0.0013, "step": 16740 }, { "epoch": 85.45918367346938, "grad_norm": 0.000666959211230278, "learning_rate": 2.908163265306123e-06, "loss": 0.0002, "step": 16750 }, { "epoch": 85.51020408163265, "grad_norm": 0.002168916864320636, "learning_rate": 2.8979591836734694e-06, "loss": 0.0001, "step": 16760 }, { "epoch": 85.56122448979592, "grad_norm": 0.007884960621595383, "learning_rate": 2.887755102040817e-06, "loss": 0.0002, "step": 16770 }, { "epoch": 85.61224489795919, "grad_norm": 0.0006330261239781976, "learning_rate": 2.8775510204081636e-06, "loss": 0.1168, "step": 16780 }, { "epoch": 85.66326530612245, "grad_norm": 0.02550496906042099, "learning_rate": 2.86734693877551e-06, "loss": 0.0001, "step": 16790 }, { "epoch": 85.71428571428571, "grad_norm": 0.004017403814941645, "learning_rate": 2.8571428571428573e-06, "loss": 0.0076, "step": 16800 }, { "epoch": 85.76530612244898, "grad_norm": 0.003952109720557928, "learning_rate": 2.8469387755102047e-06, "loss": 0.0084, "step": 16810 }, { "epoch": 85.81632653061224, "grad_norm": 0.022375956177711487, "learning_rate": 2.8367346938775514e-06, "loss": 0.0001, "step": 16820 }, { "epoch": 85.86734693877551, "grad_norm": 0.004634643904864788, "learning_rate": 2.826530612244898e-06, "loss": 0.0001, "step": 16830 }, { "epoch": 85.91836734693878, "grad_norm": 0.0013172129401937127, "learning_rate": 2.816326530612245e-06, "loss": 0.09, "step": 16840 }, { "epoch": 85.96938775510205, "grad_norm": 0.0009228216367773712, "learning_rate": 2.8061224489795917e-06, "loss": 0.0, "step": 16850 }, { "epoch": 86.0, "eval_accuracy": 0.9350180505415162, "eval_loss": 0.5512405633926392, "eval_runtime": 1.0888, "eval_samples_per_second": 254.417, "eval_steps_per_second": 32.147, "step": 16856 }, { "epoch": 86.0204081632653, "grad_norm": 0.0007092113373801112, "learning_rate": 2.795918367346939e-06, "loss": 0.0459, "step": 16860 }, { "epoch": 86.07142857142857, "grad_norm": 0.014404451474547386, "learning_rate": 2.785714285714286e-06, "loss": 0.0004, "step": 16870 }, { "epoch": 86.12244897959184, "grad_norm": 39.835716247558594, "learning_rate": 2.775510204081633e-06, "loss": 0.0124, "step": 16880 }, { "epoch": 86.1734693877551, "grad_norm": 0.004022069741040468, "learning_rate": 2.7653061224489795e-06, "loss": 0.0, "step": 16890 }, { "epoch": 86.22448979591837, "grad_norm": 0.001902677584439516, "learning_rate": 2.7551020408163266e-06, "loss": 0.0108, "step": 16900 }, { "epoch": 86.27551020408163, "grad_norm": 0.002769744722172618, "learning_rate": 2.7448979591836737e-06, "loss": 0.0001, "step": 16910 }, { "epoch": 86.3265306122449, "grad_norm": 0.003424712922424078, "learning_rate": 2.7346938775510207e-06, "loss": 0.0389, "step": 16920 }, { "epoch": 86.37755102040816, "grad_norm": 0.0010579436784610152, "learning_rate": 2.7244897959183674e-06, "loss": 0.0001, "step": 16930 }, { "epoch": 86.42857142857143, "grad_norm": 0.0006266564014367759, "learning_rate": 2.7142857142857144e-06, "loss": 0.0001, "step": 16940 }, { "epoch": 86.4795918367347, "grad_norm": 0.0005798305501230061, "learning_rate": 2.7040816326530615e-06, "loss": 0.0002, "step": 16950 }, { "epoch": 86.53061224489795, "grad_norm": 0.004349791444838047, "learning_rate": 2.6938775510204086e-06, "loss": 0.0517, "step": 16960 }, { "epoch": 86.58163265306122, "grad_norm": 7.679243087768555, "learning_rate": 2.683673469387755e-06, "loss": 0.0043, "step": 16970 }, { "epoch": 86.63265306122449, "grad_norm": 0.003425444243475795, "learning_rate": 2.6734693877551023e-06, "loss": 0.0024, "step": 16980 }, { "epoch": 86.68367346938776, "grad_norm": 0.0003848330525215715, "learning_rate": 2.663265306122449e-06, "loss": 0.0, "step": 16990 }, { "epoch": 86.73469387755102, "grad_norm": 0.0005235043936409056, "learning_rate": 2.6530612244897964e-06, "loss": 0.0913, "step": 17000 }, { "epoch": 86.78571428571429, "grad_norm": 0.0007512976881116629, "learning_rate": 2.642857142857143e-06, "loss": 0.0002, "step": 17010 }, { "epoch": 86.83673469387755, "grad_norm": 0.0010601053945720196, "learning_rate": 2.63265306122449e-06, "loss": 0.0006, "step": 17020 }, { "epoch": 86.88775510204081, "grad_norm": 20.133331298828125, "learning_rate": 2.6224489795918367e-06, "loss": 0.032, "step": 17030 }, { "epoch": 86.93877551020408, "grad_norm": 0.0006032405653968453, "learning_rate": 2.6122448979591842e-06, "loss": 0.0024, "step": 17040 }, { "epoch": 86.98979591836735, "grad_norm": 0.0023844274692237377, "learning_rate": 2.602040816326531e-06, "loss": 0.066, "step": 17050 }, { "epoch": 87.0, "eval_accuracy": 0.9350180505415162, "eval_loss": 0.547265350818634, "eval_runtime": 1.0887, "eval_samples_per_second": 254.439, "eval_steps_per_second": 32.149, "step": 17052 }, { "epoch": 87.04081632653062, "grad_norm": 0.0028474205173552036, "learning_rate": 2.591836734693878e-06, "loss": 0.0, "step": 17060 }, { "epoch": 87.09183673469387, "grad_norm": 0.00038483846583403647, "learning_rate": 2.5816326530612246e-06, "loss": 0.0002, "step": 17070 }, { "epoch": 87.14285714285714, "grad_norm": 0.00721361767500639, "learning_rate": 2.571428571428571e-06, "loss": 0.0708, "step": 17080 }, { "epoch": 87.1938775510204, "grad_norm": 0.030635815113782883, "learning_rate": 2.5612244897959187e-06, "loss": 0.0001, "step": 17090 }, { "epoch": 87.24489795918367, "grad_norm": 0.007359732873737812, "learning_rate": 2.5510204081632657e-06, "loss": 0.003, "step": 17100 }, { "epoch": 87.29591836734694, "grad_norm": 0.001952233025804162, "learning_rate": 2.5408163265306124e-06, "loss": 0.0191, "step": 17110 }, { "epoch": 87.34693877551021, "grad_norm": 0.00048027155571617186, "learning_rate": 2.530612244897959e-06, "loss": 0.0002, "step": 17120 }, { "epoch": 87.39795918367346, "grad_norm": 0.0005371647421270609, "learning_rate": 2.5204081632653065e-06, "loss": 0.0, "step": 17130 }, { "epoch": 87.44897959183673, "grad_norm": 0.004265504889190197, "learning_rate": 2.5102040816326536e-06, "loss": 0.0301, "step": 17140 }, { "epoch": 87.5, "grad_norm": 0.00041383958887308836, "learning_rate": 2.5e-06, "loss": 0.0023, "step": 17150 }, { "epoch": 87.55102040816327, "grad_norm": 0.006341009866446257, "learning_rate": 2.489795918367347e-06, "loss": 0.1055, "step": 17160 }, { "epoch": 87.60204081632654, "grad_norm": 0.013607253320515156, "learning_rate": 2.479591836734694e-06, "loss": 0.1289, "step": 17170 }, { "epoch": 87.65306122448979, "grad_norm": 0.007136788684874773, "learning_rate": 2.469387755102041e-06, "loss": 0.0001, "step": 17180 }, { "epoch": 87.70408163265306, "grad_norm": 0.0019289840711280704, "learning_rate": 2.459183673469388e-06, "loss": 0.0844, "step": 17190 }, { "epoch": 87.75510204081633, "grad_norm": 0.0010303023736923933, "learning_rate": 2.4489795918367347e-06, "loss": 0.004, "step": 17200 }, { "epoch": 87.8061224489796, "grad_norm": 47.05202102661133, "learning_rate": 2.4387755102040817e-06, "loss": 0.0337, "step": 17210 }, { "epoch": 87.85714285714286, "grad_norm": 0.03291342779994011, "learning_rate": 2.428571428571429e-06, "loss": 0.0229, "step": 17220 }, { "epoch": 87.90816326530613, "grad_norm": 0.00033585139317438006, "learning_rate": 2.418367346938776e-06, "loss": 0.0518, "step": 17230 }, { "epoch": 87.95918367346938, "grad_norm": 0.0008860923699103296, "learning_rate": 2.4081632653061225e-06, "loss": 0.0189, "step": 17240 }, { "epoch": 88.0, "eval_accuracy": 0.9386281588447654, "eval_loss": 0.48660141229629517, "eval_runtime": 1.0836, "eval_samples_per_second": 255.633, "eval_steps_per_second": 32.3, "step": 17248 }, { "epoch": 88.01020408163265, "grad_norm": 0.00781255029141903, "learning_rate": 2.3979591836734696e-06, "loss": 0.0001, "step": 17250 }, { "epoch": 88.06122448979592, "grad_norm": 6.823309898376465, "learning_rate": 2.3877551020408166e-06, "loss": 0.0043, "step": 17260 }, { "epoch": 88.11224489795919, "grad_norm": 0.0020987829193472862, "learning_rate": 2.3775510204081633e-06, "loss": 0.0003, "step": 17270 }, { "epoch": 88.16326530612245, "grad_norm": 0.09770765155553818, "learning_rate": 2.3673469387755103e-06, "loss": 0.0001, "step": 17280 }, { "epoch": 88.21428571428571, "grad_norm": 0.0009081694297492504, "learning_rate": 2.3571428571428574e-06, "loss": 0.0035, "step": 17290 }, { "epoch": 88.26530612244898, "grad_norm": 0.004696785006672144, "learning_rate": 2.3469387755102044e-06, "loss": 0.0001, "step": 17300 }, { "epoch": 88.31632653061224, "grad_norm": 0.0005367195117287338, "learning_rate": 2.336734693877551e-06, "loss": 0.0007, "step": 17310 }, { "epoch": 88.36734693877551, "grad_norm": 0.0006380550912581384, "learning_rate": 2.326530612244898e-06, "loss": 0.0, "step": 17320 }, { "epoch": 88.41836734693878, "grad_norm": 0.0005197112914174795, "learning_rate": 2.316326530612245e-06, "loss": 0.0, "step": 17330 }, { "epoch": 88.46938775510205, "grad_norm": 0.0021705967374145985, "learning_rate": 2.306122448979592e-06, "loss": 0.0145, "step": 17340 }, { "epoch": 88.5204081632653, "grad_norm": 0.0023973756469786167, "learning_rate": 2.295918367346939e-06, "loss": 0.0002, "step": 17350 }, { "epoch": 88.57142857142857, "grad_norm": 0.000825838535092771, "learning_rate": 2.285714285714286e-06, "loss": 0.0, "step": 17360 }, { "epoch": 88.62244897959184, "grad_norm": 29.48289680480957, "learning_rate": 2.275510204081633e-06, "loss": 0.0514, "step": 17370 }, { "epoch": 88.6734693877551, "grad_norm": 0.006796309724450111, "learning_rate": 2.2653061224489797e-06, "loss": 0.1307, "step": 17380 }, { "epoch": 88.72448979591837, "grad_norm": 0.030332809314131737, "learning_rate": 2.2551020408163267e-06, "loss": 0.0001, "step": 17390 }, { "epoch": 88.77551020408163, "grad_norm": 0.0007284952444024384, "learning_rate": 2.244897959183674e-06, "loss": 0.0001, "step": 17400 }, { "epoch": 88.8265306122449, "grad_norm": 0.001431124983355403, "learning_rate": 2.2346938775510204e-06, "loss": 0.0002, "step": 17410 }, { "epoch": 88.87755102040816, "grad_norm": 0.0006692196475341916, "learning_rate": 2.2244897959183675e-06, "loss": 0.0034, "step": 17420 }, { "epoch": 88.92857142857143, "grad_norm": 0.0008285000221803784, "learning_rate": 2.2142857142857146e-06, "loss": 0.0364, "step": 17430 }, { "epoch": 88.9795918367347, "grad_norm": 0.001175836194306612, "learning_rate": 2.2040816326530616e-06, "loss": 0.0, "step": 17440 }, { "epoch": 89.0, "eval_accuracy": 0.9386281588447654, "eval_loss": 0.5135785341262817, "eval_runtime": 1.0848, "eval_samples_per_second": 255.35, "eval_steps_per_second": 32.264, "step": 17444 }, { "epoch": 89.03061224489795, "grad_norm": 0.015034779906272888, "learning_rate": 2.1938775510204083e-06, "loss": 0.0002, "step": 17450 }, { "epoch": 89.08163265306122, "grad_norm": 0.00048784297541715205, "learning_rate": 2.1836734693877553e-06, "loss": 0.0048, "step": 17460 }, { "epoch": 89.13265306122449, "grad_norm": 0.005646976642310619, "learning_rate": 2.1734693877551024e-06, "loss": 0.0009, "step": 17470 }, { "epoch": 89.18367346938776, "grad_norm": 1.0820720195770264, "learning_rate": 2.1632653061224495e-06, "loss": 0.0006, "step": 17480 }, { "epoch": 89.23469387755102, "grad_norm": 0.00029209599597379565, "learning_rate": 2.153061224489796e-06, "loss": 0.0002, "step": 17490 }, { "epoch": 89.28571428571429, "grad_norm": 0.0038033067248761654, "learning_rate": 2.1428571428571427e-06, "loss": 0.0009, "step": 17500 }, { "epoch": 89.33673469387755, "grad_norm": 0.0016370756784453988, "learning_rate": 2.1326530612244902e-06, "loss": 0.0001, "step": 17510 }, { "epoch": 89.38775510204081, "grad_norm": 0.0004264684394001961, "learning_rate": 2.122448979591837e-06, "loss": 0.0021, "step": 17520 }, { "epoch": 89.43877551020408, "grad_norm": 0.003561575897037983, "learning_rate": 2.112244897959184e-06, "loss": 0.0001, "step": 17530 }, { "epoch": 89.48979591836735, "grad_norm": 0.0028887272346764803, "learning_rate": 2.1020408163265306e-06, "loss": 0.0001, "step": 17540 }, { "epoch": 89.54081632653062, "grad_norm": 0.009618128649890423, "learning_rate": 2.0918367346938776e-06, "loss": 0.0969, "step": 17550 }, { "epoch": 89.59183673469387, "grad_norm": 0.0036190368700772524, "learning_rate": 2.0816326530612247e-06, "loss": 0.0001, "step": 17560 }, { "epoch": 89.64285714285714, "grad_norm": 0.0024666478857398033, "learning_rate": 2.0714285714285717e-06, "loss": 0.0002, "step": 17570 }, { "epoch": 89.6938775510204, "grad_norm": 0.0010146555723622441, "learning_rate": 2.0612244897959184e-06, "loss": 0.0006, "step": 17580 }, { "epoch": 89.74489795918367, "grad_norm": 0.0011051897890865803, "learning_rate": 2.0510204081632654e-06, "loss": 0.0461, "step": 17590 }, { "epoch": 89.79591836734694, "grad_norm": 0.00044943380635231733, "learning_rate": 2.0408163265306125e-06, "loss": 0.0013, "step": 17600 }, { "epoch": 89.84693877551021, "grad_norm": 0.004551333375275135, "learning_rate": 2.030612244897959e-06, "loss": 0.0586, "step": 17610 }, { "epoch": 89.89795918367346, "grad_norm": 0.0005936112720519304, "learning_rate": 2.020408163265306e-06, "loss": 0.1189, "step": 17620 }, { "epoch": 89.94897959183673, "grad_norm": 0.00040862939204089344, "learning_rate": 2.0102040816326533e-06, "loss": 0.0, "step": 17630 }, { "epoch": 90.0, "grad_norm": 0.0037128387484699488, "learning_rate": 2.0000000000000003e-06, "loss": 0.0001, "step": 17640 }, { "epoch": 90.0, "eval_accuracy": 0.9350180505415162, "eval_loss": 0.524640679359436, "eval_runtime": 1.0798, "eval_samples_per_second": 256.539, "eval_steps_per_second": 32.415, "step": 17640 }, { "epoch": 90.05102040816327, "grad_norm": 0.0007691020728088915, "learning_rate": 1.989795918367347e-06, "loss": 0.0002, "step": 17650 }, { "epoch": 90.10204081632654, "grad_norm": 0.0008152734371833503, "learning_rate": 1.979591836734694e-06, "loss": 0.0004, "step": 17660 }, { "epoch": 90.15306122448979, "grad_norm": 0.0003245431580580771, "learning_rate": 1.969387755102041e-06, "loss": 0.0001, "step": 17670 }, { "epoch": 90.20408163265306, "grad_norm": 0.0005253622657619417, "learning_rate": 1.9591836734693877e-06, "loss": 0.0001, "step": 17680 }, { "epoch": 90.25510204081633, "grad_norm": 0.000440177449490875, "learning_rate": 1.948979591836735e-06, "loss": 0.0001, "step": 17690 }, { "epoch": 90.3061224489796, "grad_norm": 0.39447104930877686, "learning_rate": 1.938775510204082e-06, "loss": 0.0004, "step": 17700 }, { "epoch": 90.35714285714286, "grad_norm": 0.0013608720619231462, "learning_rate": 1.928571428571429e-06, "loss": 0.004, "step": 17710 }, { "epoch": 90.40816326530613, "grad_norm": 0.009410234168171883, "learning_rate": 1.9183673469387756e-06, "loss": 0.0001, "step": 17720 }, { "epoch": 90.45918367346938, "grad_norm": 0.0002899194078054279, "learning_rate": 1.9081632653061226e-06, "loss": 0.0, "step": 17730 }, { "epoch": 90.51020408163265, "grad_norm": 0.019250724464654922, "learning_rate": 1.8979591836734695e-06, "loss": 0.0256, "step": 17740 }, { "epoch": 90.56122448979592, "grad_norm": 0.0014948390889912844, "learning_rate": 1.8877551020408163e-06, "loss": 0.0002, "step": 17750 }, { "epoch": 90.61224489795919, "grad_norm": 0.0005530890775844455, "learning_rate": 1.8775510204081634e-06, "loss": 0.0002, "step": 17760 }, { "epoch": 90.66326530612245, "grad_norm": 0.0010858520399779081, "learning_rate": 1.8673469387755102e-06, "loss": 0.0002, "step": 17770 }, { "epoch": 90.71428571428571, "grad_norm": 0.004605590365827084, "learning_rate": 1.8571428571428573e-06, "loss": 0.0003, "step": 17780 }, { "epoch": 90.76530612244898, "grad_norm": 0.0006175026064738631, "learning_rate": 1.8469387755102042e-06, "loss": 0.0002, "step": 17790 }, { "epoch": 90.81632653061224, "grad_norm": 0.0020875169429928064, "learning_rate": 1.8367346938775512e-06, "loss": 0.0042, "step": 17800 }, { "epoch": 90.86734693877551, "grad_norm": 0.0015989365056157112, "learning_rate": 1.826530612244898e-06, "loss": 0.0005, "step": 17810 }, { "epoch": 90.91836734693878, "grad_norm": 0.0008654359844513237, "learning_rate": 1.8163265306122451e-06, "loss": 0.0001, "step": 17820 }, { "epoch": 90.96938775510205, "grad_norm": 0.05170920118689537, "learning_rate": 1.806122448979592e-06, "loss": 0.0001, "step": 17830 }, { "epoch": 91.0, "eval_accuracy": 0.9314079422382672, "eval_loss": 0.5626211762428284, "eval_runtime": 1.0798, "eval_samples_per_second": 256.538, "eval_steps_per_second": 32.415, "step": 17836 }, { "epoch": 91.0204081632653, "grad_norm": 1.2442771196365356, "learning_rate": 1.7959183673469388e-06, "loss": 0.0023, "step": 17840 }, { "epoch": 91.07142857142857, "grad_norm": 0.0024343181867152452, "learning_rate": 1.7857142857142859e-06, "loss": 0.0, "step": 17850 }, { "epoch": 91.12244897959184, "grad_norm": 0.001072353101335466, "learning_rate": 1.7755102040816327e-06, "loss": 0.0003, "step": 17860 }, { "epoch": 91.1734693877551, "grad_norm": 18.939407348632812, "learning_rate": 1.7653061224489798e-06, "loss": 0.0055, "step": 17870 }, { "epoch": 91.22448979591837, "grad_norm": 0.0004119381483178586, "learning_rate": 1.7551020408163267e-06, "loss": 0.0531, "step": 17880 }, { "epoch": 91.27551020408163, "grad_norm": 0.0006503881886601448, "learning_rate": 1.7448979591836737e-06, "loss": 0.0, "step": 17890 }, { "epoch": 91.3265306122449, "grad_norm": 38.06752014160156, "learning_rate": 1.7346938775510206e-06, "loss": 0.0297, "step": 17900 }, { "epoch": 91.37755102040816, "grad_norm": 0.00042768768616952, "learning_rate": 1.7244897959183676e-06, "loss": 0.0001, "step": 17910 }, { "epoch": 91.42857142857143, "grad_norm": 0.0006707253633067012, "learning_rate": 1.7142857142857145e-06, "loss": 0.0, "step": 17920 }, { "epoch": 91.4795918367347, "grad_norm": 0.0007442046189680696, "learning_rate": 1.7040816326530613e-06, "loss": 0.0026, "step": 17930 }, { "epoch": 91.53061224489795, "grad_norm": 0.0002912504132837057, "learning_rate": 1.6938775510204084e-06, "loss": 0.0, "step": 17940 }, { "epoch": 91.58163265306122, "grad_norm": 0.0006401717546395957, "learning_rate": 1.6836734693877552e-06, "loss": 0.0, "step": 17950 }, { "epoch": 91.63265306122449, "grad_norm": 0.09557076543569565, "learning_rate": 1.6734693877551023e-06, "loss": 0.0483, "step": 17960 }, { "epoch": 91.68367346938776, "grad_norm": 0.0011875201016664505, "learning_rate": 1.6632653061224492e-06, "loss": 0.0002, "step": 17970 }, { "epoch": 91.73469387755102, "grad_norm": 0.0015232325531542301, "learning_rate": 1.6530612244897962e-06, "loss": 0.0001, "step": 17980 }, { "epoch": 91.78571428571429, "grad_norm": 0.0008706047083251178, "learning_rate": 1.642857142857143e-06, "loss": 0.0001, "step": 17990 }, { "epoch": 91.83673469387755, "grad_norm": 0.001088135875761509, "learning_rate": 1.6326530612244897e-06, "loss": 0.0688, "step": 18000 }, { "epoch": 91.88775510204081, "grad_norm": 0.0015986982034519315, "learning_rate": 1.622448979591837e-06, "loss": 0.0017, "step": 18010 }, { "epoch": 91.93877551020408, "grad_norm": 0.0005098647670820355, "learning_rate": 1.6122448979591836e-06, "loss": 0.0001, "step": 18020 }, { "epoch": 91.98979591836735, "grad_norm": 0.0006992301787249744, "learning_rate": 1.602040816326531e-06, "loss": 0.0037, "step": 18030 }, { "epoch": 92.0, "eval_accuracy": 0.9350180505415162, "eval_loss": 0.5334672927856445, "eval_runtime": 1.0869, "eval_samples_per_second": 254.843, "eval_steps_per_second": 32.2, "step": 18032 }, { "epoch": 92.04081632653062, "grad_norm": 0.0025642879772931337, "learning_rate": 1.5918367346938775e-06, "loss": 0.0009, "step": 18040 }, { "epoch": 92.09183673469387, "grad_norm": 0.03112534061074257, "learning_rate": 1.5816326530612248e-06, "loss": 0.0001, "step": 18050 }, { "epoch": 92.14285714285714, "grad_norm": 0.0011495293583720922, "learning_rate": 1.5714285714285714e-06, "loss": 0.0102, "step": 18060 }, { "epoch": 92.1938775510204, "grad_norm": 0.0018035119865089655, "learning_rate": 1.5612244897959187e-06, "loss": 0.0008, "step": 18070 }, { "epoch": 92.24489795918367, "grad_norm": 0.0005472283228300512, "learning_rate": 1.5510204081632654e-06, "loss": 0.102, "step": 18080 }, { "epoch": 92.29591836734694, "grad_norm": 0.005956082604825497, "learning_rate": 1.5408163265306122e-06, "loss": 0.0006, "step": 18090 }, { "epoch": 92.34693877551021, "grad_norm": 0.0006637598271481693, "learning_rate": 1.5306122448979593e-06, "loss": 0.0002, "step": 18100 }, { "epoch": 92.39795918367346, "grad_norm": 0.015481573529541492, "learning_rate": 1.5204081632653061e-06, "loss": 0.0182, "step": 18110 }, { "epoch": 92.44897959183673, "grad_norm": 0.0006527500227093697, "learning_rate": 1.5102040816326532e-06, "loss": 0.0, "step": 18120 }, { "epoch": 92.5, "grad_norm": 0.0012338627129793167, "learning_rate": 1.5e-06, "loss": 0.0496, "step": 18130 }, { "epoch": 92.55102040816327, "grad_norm": 0.0011780742788687348, "learning_rate": 1.489795918367347e-06, "loss": 0.0004, "step": 18140 }, { "epoch": 92.60204081632654, "grad_norm": 0.002981587778776884, "learning_rate": 1.479591836734694e-06, "loss": 0.0067, "step": 18150 }, { "epoch": 92.65306122448979, "grad_norm": 0.0033729374408721924, "learning_rate": 1.469387755102041e-06, "loss": 0.0001, "step": 18160 }, { "epoch": 92.70408163265306, "grad_norm": 0.0008722987840883434, "learning_rate": 1.4591836734693879e-06, "loss": 0.0046, "step": 18170 }, { "epoch": 92.75510204081633, "grad_norm": 0.010068514384329319, "learning_rate": 1.4489795918367347e-06, "loss": 0.0006, "step": 18180 }, { "epoch": 92.8061224489796, "grad_norm": 0.0006730770110152662, "learning_rate": 1.4387755102040818e-06, "loss": 0.0012, "step": 18190 }, { "epoch": 92.85714285714286, "grad_norm": 0.0010180239332839847, "learning_rate": 1.4285714285714286e-06, "loss": 0.0, "step": 18200 }, { "epoch": 92.90816326530613, "grad_norm": 0.00089266401482746, "learning_rate": 1.4183673469387757e-06, "loss": 0.0009, "step": 18210 }, { "epoch": 92.95918367346938, "grad_norm": 0.009324053302407265, "learning_rate": 1.4081632653061225e-06, "loss": 0.0999, "step": 18220 }, { "epoch": 93.0, "eval_accuracy": 0.924187725631769, "eval_loss": 0.6356784105300903, "eval_runtime": 1.092, "eval_samples_per_second": 253.653, "eval_steps_per_second": 32.05, "step": 18228 }, { "epoch": 93.01020408163265, "grad_norm": 0.0018933649407699704, "learning_rate": 1.3979591836734696e-06, "loss": 0.0017, "step": 18230 }, { "epoch": 93.06122448979592, "grad_norm": 0.025904294103384018, "learning_rate": 1.3877551020408165e-06, "loss": 0.0002, "step": 18240 }, { "epoch": 93.11224489795919, "grad_norm": 0.000891715579200536, "learning_rate": 1.3775510204081633e-06, "loss": 0.0012, "step": 18250 }, { "epoch": 93.16326530612245, "grad_norm": 0.0023826761171221733, "learning_rate": 1.3673469387755104e-06, "loss": 0.0, "step": 18260 }, { "epoch": 93.21428571428571, "grad_norm": 0.0005038994131609797, "learning_rate": 1.3571428571428572e-06, "loss": 0.0, "step": 18270 }, { "epoch": 93.26530612244898, "grad_norm": 0.00231083482503891, "learning_rate": 1.3469387755102043e-06, "loss": 0.0894, "step": 18280 }, { "epoch": 93.31632653061224, "grad_norm": 1.040708065032959, "learning_rate": 1.3367346938775511e-06, "loss": 0.0035, "step": 18290 }, { "epoch": 93.36734693877551, "grad_norm": 0.003825167892500758, "learning_rate": 1.3265306122448982e-06, "loss": 0.0002, "step": 18300 }, { "epoch": 93.41836734693878, "grad_norm": 0.0013739422429352999, "learning_rate": 1.316326530612245e-06, "loss": 0.0, "step": 18310 }, { "epoch": 93.46938775510205, "grad_norm": 0.00032948836451396346, "learning_rate": 1.3061224489795921e-06, "loss": 0.0034, "step": 18320 }, { "epoch": 93.5204081632653, "grad_norm": 0.002375826006755233, "learning_rate": 1.295918367346939e-06, "loss": 0.0, "step": 18330 }, { "epoch": 93.57142857142857, "grad_norm": 0.004834245424717665, "learning_rate": 1.2857142857142856e-06, "loss": 0.0003, "step": 18340 }, { "epoch": 93.62244897959184, "grad_norm": 0.00031381373992189765, "learning_rate": 1.2755102040816329e-06, "loss": 0.0001, "step": 18350 }, { "epoch": 93.6734693877551, "grad_norm": 0.01649618148803711, "learning_rate": 1.2653061224489795e-06, "loss": 0.0001, "step": 18360 }, { "epoch": 93.72448979591837, "grad_norm": 0.00026429592981003225, "learning_rate": 1.2551020408163268e-06, "loss": 0.0015, "step": 18370 }, { "epoch": 93.77551020408163, "grad_norm": 0.0038391277194023132, "learning_rate": 1.2448979591836734e-06, "loss": 0.0536, "step": 18380 }, { "epoch": 93.8265306122449, "grad_norm": 0.0032683941535651684, "learning_rate": 1.2346938775510205e-06, "loss": 0.0714, "step": 18390 }, { "epoch": 93.87755102040816, "grad_norm": 0.01570449396967888, "learning_rate": 1.2244897959183673e-06, "loss": 0.0001, "step": 18400 }, { "epoch": 93.92857142857143, "grad_norm": 0.027679676190018654, "learning_rate": 1.2142857142857144e-06, "loss": 0.0, "step": 18410 }, { "epoch": 93.9795918367347, "grad_norm": 0.005994068458676338, "learning_rate": 1.2040816326530612e-06, "loss": 0.1124, "step": 18420 }, { "epoch": 94.0, "eval_accuracy": 0.927797833935018, "eval_loss": 0.590476930141449, "eval_runtime": 1.0817, "eval_samples_per_second": 256.087, "eval_steps_per_second": 32.358, "step": 18424 }, { "epoch": 94.03061224489795, "grad_norm": 0.0011859724763780832, "learning_rate": 1.1938775510204083e-06, "loss": 0.0, "step": 18430 }, { "epoch": 94.08163265306122, "grad_norm": 0.06980699300765991, "learning_rate": 1.1836734693877552e-06, "loss": 0.0386, "step": 18440 }, { "epoch": 94.13265306122449, "grad_norm": 0.0037249717861413956, "learning_rate": 1.1734693877551022e-06, "loss": 0.0, "step": 18450 }, { "epoch": 94.18367346938776, "grad_norm": 0.000398153904825449, "learning_rate": 1.163265306122449e-06, "loss": 0.0391, "step": 18460 }, { "epoch": 94.23469387755102, "grad_norm": 2.7329695224761963, "learning_rate": 1.153061224489796e-06, "loss": 0.0007, "step": 18470 }, { "epoch": 94.28571428571429, "grad_norm": 0.002940528094768524, "learning_rate": 1.142857142857143e-06, "loss": 0.0, "step": 18480 }, { "epoch": 94.33673469387755, "grad_norm": 0.0005318316980265081, "learning_rate": 1.1326530612244898e-06, "loss": 0.0, "step": 18490 }, { "epoch": 94.38775510204081, "grad_norm": 0.003667190670967102, "learning_rate": 1.122448979591837e-06, "loss": 0.0256, "step": 18500 }, { "epoch": 94.43877551020408, "grad_norm": 0.0011388729326426983, "learning_rate": 1.1122448979591838e-06, "loss": 0.0008, "step": 18510 }, { "epoch": 94.48979591836735, "grad_norm": 0.0004734453686978668, "learning_rate": 1.1020408163265308e-06, "loss": 0.0001, "step": 18520 }, { "epoch": 94.54081632653062, "grad_norm": 0.00046554498840123415, "learning_rate": 1.0918367346938777e-06, "loss": 0.0, "step": 18530 }, { "epoch": 94.59183673469387, "grad_norm": 0.006730195134878159, "learning_rate": 1.0816326530612247e-06, "loss": 0.0006, "step": 18540 }, { "epoch": 94.64285714285714, "grad_norm": 1.2452219724655151, "learning_rate": 1.0714285714285714e-06, "loss": 0.0012, "step": 18550 }, { "epoch": 94.6938775510204, "grad_norm": 0.0050456034950912, "learning_rate": 1.0612244897959184e-06, "loss": 0.0, "step": 18560 }, { "epoch": 94.74489795918367, "grad_norm": 0.022612031549215317, "learning_rate": 1.0510204081632653e-06, "loss": 0.0001, "step": 18570 }, { "epoch": 94.79591836734694, "grad_norm": 0.0007621384575031698, "learning_rate": 1.0408163265306123e-06, "loss": 0.1282, "step": 18580 }, { "epoch": 94.84693877551021, "grad_norm": 0.00040719835669733584, "learning_rate": 1.0306122448979592e-06, "loss": 0.0001, "step": 18590 }, { "epoch": 94.89795918367346, "grad_norm": 0.00458616903051734, "learning_rate": 1.0204081632653063e-06, "loss": 0.0007, "step": 18600 }, { "epoch": 94.94897959183673, "grad_norm": 0.05395274609327316, "learning_rate": 1.010204081632653e-06, "loss": 0.0001, "step": 18610 }, { "epoch": 95.0, "grad_norm": 0.0007162753609009087, "learning_rate": 1.0000000000000002e-06, "loss": 0.0175, "step": 18620 }, { "epoch": 95.0, "eval_accuracy": 0.9205776173285198, "eval_loss": 0.661800742149353, "eval_runtime": 1.1187, "eval_samples_per_second": 247.61, "eval_steps_per_second": 31.286, "step": 18620 }, { "epoch": 95.05102040816327, "grad_norm": 0.0006321434048004448, "learning_rate": 9.89795918367347e-07, "loss": 0.0, "step": 18630 }, { "epoch": 95.10204081632654, "grad_norm": 0.0008710910915397108, "learning_rate": 9.795918367346939e-07, "loss": 0.0497, "step": 18640 }, { "epoch": 95.15306122448979, "grad_norm": 0.0034613830503076315, "learning_rate": 9.69387755102041e-07, "loss": 0.0001, "step": 18650 }, { "epoch": 95.20408163265306, "grad_norm": 0.005640481133013964, "learning_rate": 9.591836734693878e-07, "loss": 0.0004, "step": 18660 }, { "epoch": 95.25510204081633, "grad_norm": 0.0019760699942708015, "learning_rate": 9.489795918367347e-07, "loss": 0.003, "step": 18670 }, { "epoch": 95.3061224489796, "grad_norm": 0.00034776542452163994, "learning_rate": 9.387755102040817e-07, "loss": 0.0015, "step": 18680 }, { "epoch": 95.35714285714286, "grad_norm": 20.253816604614258, "learning_rate": 9.285714285714287e-07, "loss": 0.006, "step": 18690 }, { "epoch": 95.40816326530613, "grad_norm": 0.0007858517928980291, "learning_rate": 9.183673469387756e-07, "loss": 0.0004, "step": 18700 }, { "epoch": 95.45918367346938, "grad_norm": 0.8080747723579407, "learning_rate": 9.081632653061226e-07, "loss": 0.0003, "step": 18710 }, { "epoch": 95.51020408163265, "grad_norm": 0.0011322839418426156, "learning_rate": 8.979591836734694e-07, "loss": 0.0, "step": 18720 }, { "epoch": 95.56122448979592, "grad_norm": 0.0006596882012672722, "learning_rate": 8.877551020408164e-07, "loss": 0.0026, "step": 18730 }, { "epoch": 95.61224489795919, "grad_norm": 0.010485788807272911, "learning_rate": 8.775510204081633e-07, "loss": 0.0613, "step": 18740 }, { "epoch": 95.66326530612245, "grad_norm": 0.0018889378989115357, "learning_rate": 8.673469387755103e-07, "loss": 0.0, "step": 18750 }, { "epoch": 95.71428571428571, "grad_norm": 0.001992079196497798, "learning_rate": 8.571428571428572e-07, "loss": 0.0013, "step": 18760 }, { "epoch": 95.76530612244898, "grad_norm": 0.0006261126836761832, "learning_rate": 8.469387755102042e-07, "loss": 0.0, "step": 18770 }, { "epoch": 95.81632653061224, "grad_norm": 0.005070496816188097, "learning_rate": 8.367346938775512e-07, "loss": 0.0001, "step": 18780 }, { "epoch": 95.86734693877551, "grad_norm": 0.0008687849040143192, "learning_rate": 8.265306122448981e-07, "loss": 0.0045, "step": 18790 }, { "epoch": 95.91836734693878, "grad_norm": 0.0007648968021385372, "learning_rate": 8.163265306122449e-07, "loss": 0.0, "step": 18800 }, { "epoch": 95.96938775510205, "grad_norm": 0.0007862557540647686, "learning_rate": 8.061224489795918e-07, "loss": 0.0001, "step": 18810 }, { "epoch": 96.0, "eval_accuracy": 0.9386281588447654, "eval_loss": 0.5588254332542419, "eval_runtime": 1.0772, "eval_samples_per_second": 257.143, "eval_steps_per_second": 32.491, "step": 18816 }, { "epoch": 96.0204081632653, "grad_norm": 0.0003080127062276006, "learning_rate": 7.959183673469388e-07, "loss": 0.0, "step": 18820 }, { "epoch": 96.07142857142857, "grad_norm": 0.00046985066728666425, "learning_rate": 7.857142857142857e-07, "loss": 0.0, "step": 18830 }, { "epoch": 96.12244897959184, "grad_norm": 0.005389807745814323, "learning_rate": 7.755102040816327e-07, "loss": 0.0522, "step": 18840 }, { "epoch": 96.1734693877551, "grad_norm": 0.06740324944257736, "learning_rate": 7.653061224489796e-07, "loss": 0.0534, "step": 18850 }, { "epoch": 96.22448979591837, "grad_norm": 0.0003619295603130013, "learning_rate": 7.551020408163266e-07, "loss": 0.0019, "step": 18860 }, { "epoch": 96.27551020408163, "grad_norm": 0.0006231485167518258, "learning_rate": 7.448979591836736e-07, "loss": 0.0059, "step": 18870 }, { "epoch": 96.3265306122449, "grad_norm": 0.0005269271205179393, "learning_rate": 7.346938775510205e-07, "loss": 0.0274, "step": 18880 }, { "epoch": 96.37755102040816, "grad_norm": 0.0017044926062226295, "learning_rate": 7.244897959183674e-07, "loss": 0.0712, "step": 18890 }, { "epoch": 96.42857142857143, "grad_norm": 0.00026990927290171385, "learning_rate": 7.142857142857143e-07, "loss": 0.0001, "step": 18900 }, { "epoch": 96.4795918367347, "grad_norm": 0.0010163065744563937, "learning_rate": 7.040816326530613e-07, "loss": 0.0001, "step": 18910 }, { "epoch": 96.53061224489795, "grad_norm": 0.00048644471098668873, "learning_rate": 6.938775510204082e-07, "loss": 0.0283, "step": 18920 }, { "epoch": 96.58163265306122, "grad_norm": 0.0008096566307358444, "learning_rate": 6.836734693877552e-07, "loss": 0.0, "step": 18930 }, { "epoch": 96.63265306122449, "grad_norm": 0.010710473172366619, "learning_rate": 6.734693877551021e-07, "loss": 0.0082, "step": 18940 }, { "epoch": 96.68367346938776, "grad_norm": 0.003333403030410409, "learning_rate": 6.632653061224491e-07, "loss": 0.0, "step": 18950 }, { "epoch": 96.73469387755102, "grad_norm": 0.001861305208876729, "learning_rate": 6.530612244897961e-07, "loss": 0.0008, "step": 18960 }, { "epoch": 96.78571428571429, "grad_norm": 0.003332825843244791, "learning_rate": 6.428571428571428e-07, "loss": 0.0, "step": 18970 }, { "epoch": 96.83673469387755, "grad_norm": 0.0011864847037941217, "learning_rate": 6.326530612244898e-07, "loss": 0.0001, "step": 18980 }, { "epoch": 96.88775510204081, "grad_norm": 0.001021908363327384, "learning_rate": 6.224489795918367e-07, "loss": 0.0132, "step": 18990 }, { "epoch": 96.93877551020408, "grad_norm": 0.000425991223892197, "learning_rate": 6.122448979591837e-07, "loss": 0.0489, "step": 19000 }, { "epoch": 96.98979591836735, "grad_norm": 0.00045770313590765, "learning_rate": 6.020408163265306e-07, "loss": 0.0259, "step": 19010 }, { "epoch": 97.0, "eval_accuracy": 0.9350180505415162, "eval_loss": 0.5548804998397827, "eval_runtime": 1.0796, "eval_samples_per_second": 256.574, "eval_steps_per_second": 32.419, "step": 19012 }, { "epoch": 97.04081632653062, "grad_norm": 0.00049464131006971, "learning_rate": 5.918367346938776e-07, "loss": 0.0001, "step": 19020 }, { "epoch": 97.09183673469387, "grad_norm": 0.001187681220471859, "learning_rate": 5.816326530612245e-07, "loss": 0.0, "step": 19030 }, { "epoch": 97.14285714285714, "grad_norm": 0.0005556918913498521, "learning_rate": 5.714285714285715e-07, "loss": 0.0088, "step": 19040 }, { "epoch": 97.1938775510204, "grad_norm": 0.0007009373512119055, "learning_rate": 5.612244897959184e-07, "loss": 0.0001, "step": 19050 }, { "epoch": 97.24489795918367, "grad_norm": 0.0018012009095400572, "learning_rate": 5.510204081632654e-07, "loss": 0.0, "step": 19060 }, { "epoch": 97.29591836734694, "grad_norm": 0.011059354990720749, "learning_rate": 5.408163265306124e-07, "loss": 0.0, "step": 19070 }, { "epoch": 97.34693877551021, "grad_norm": 0.002653836039826274, "learning_rate": 5.306122448979592e-07, "loss": 0.0, "step": 19080 }, { "epoch": 97.39795918367346, "grad_norm": 0.0004589575110003352, "learning_rate": 5.204081632653062e-07, "loss": 0.0074, "step": 19090 }, { "epoch": 97.44897959183673, "grad_norm": 0.0008633316028863192, "learning_rate": 5.102040816326531e-07, "loss": 0.0008, "step": 19100 }, { "epoch": 97.5, "grad_norm": 0.002689096610993147, "learning_rate": 5.000000000000001e-07, "loss": 0.0001, "step": 19110 }, { "epoch": 97.55102040816327, "grad_norm": 0.000376744254026562, "learning_rate": 4.897959183673469e-07, "loss": 0.0, "step": 19120 }, { "epoch": 97.60204081632654, "grad_norm": 32.96564483642578, "learning_rate": 4.795918367346939e-07, "loss": 0.1432, "step": 19130 }, { "epoch": 97.65306122448979, "grad_norm": 0.001013111905194819, "learning_rate": 4.6938775510204085e-07, "loss": 0.0001, "step": 19140 }, { "epoch": 97.70408163265306, "grad_norm": 0.0009411570499651134, "learning_rate": 4.591836734693878e-07, "loss": 0.0455, "step": 19150 }, { "epoch": 97.75510204081633, "grad_norm": 0.0005728753749281168, "learning_rate": 4.489795918367347e-07, "loss": 0.0055, "step": 19160 }, { "epoch": 97.8061224489796, "grad_norm": 0.03133363276720047, "learning_rate": 4.3877551020408166e-07, "loss": 0.0021, "step": 19170 }, { "epoch": 97.85714285714286, "grad_norm": 0.0009139064350165427, "learning_rate": 4.285714285714286e-07, "loss": 0.0, "step": 19180 }, { "epoch": 97.90816326530613, "grad_norm": 0.00043925695354118943, "learning_rate": 4.183673469387756e-07, "loss": 0.0, "step": 19190 }, { "epoch": 97.95918367346938, "grad_norm": 0.0006421167054213583, "learning_rate": 4.0816326530612243e-07, "loss": 0.0001, "step": 19200 }, { "epoch": 98.0, "eval_accuracy": 0.9350180505415162, "eval_loss": 0.5599140524864197, "eval_runtime": 1.0759, "eval_samples_per_second": 257.46, "eval_steps_per_second": 32.531, "step": 19208 }, { "epoch": 98.01020408163265, "grad_norm": 0.001159628969617188, "learning_rate": 3.979591836734694e-07, "loss": 0.0002, "step": 19210 }, { "epoch": 98.06122448979592, "grad_norm": 0.0009000562131404877, "learning_rate": 3.8775510204081634e-07, "loss": 0.0008, "step": 19220 }, { "epoch": 98.11224489795919, "grad_norm": 0.0074297478422522545, "learning_rate": 3.775510204081633e-07, "loss": 0.0588, "step": 19230 }, { "epoch": 98.16326530612245, "grad_norm": 0.0023160367272794247, "learning_rate": 3.6734693877551025e-07, "loss": 0.0116, "step": 19240 }, { "epoch": 98.21428571428571, "grad_norm": 0.0034650848247110844, "learning_rate": 3.5714285714285716e-07, "loss": 0.0133, "step": 19250 }, { "epoch": 98.26530612244898, "grad_norm": 0.0022028449457138777, "learning_rate": 3.469387755102041e-07, "loss": 0.003, "step": 19260 }, { "epoch": 98.31632653061224, "grad_norm": 0.0004120208614040166, "learning_rate": 3.3673469387755107e-07, "loss": 0.0107, "step": 19270 }, { "epoch": 98.36734693877551, "grad_norm": 0.01532533299177885, "learning_rate": 3.2653061224489803e-07, "loss": 0.1561, "step": 19280 }, { "epoch": 98.41836734693878, "grad_norm": 0.0009321342222392559, "learning_rate": 3.163265306122449e-07, "loss": 0.0, "step": 19290 }, { "epoch": 98.46938775510205, "grad_norm": 0.000532894569914788, "learning_rate": 3.0612244897959183e-07, "loss": 0.0564, "step": 19300 }, { "epoch": 98.5204081632653, "grad_norm": 0.0024218480102717876, "learning_rate": 2.959183673469388e-07, "loss": 0.0022, "step": 19310 }, { "epoch": 98.57142857142857, "grad_norm": 0.0003754945646505803, "learning_rate": 2.8571428571428575e-07, "loss": 0.0, "step": 19320 }, { "epoch": 98.62244897959184, "grad_norm": 0.0026696028653532267, "learning_rate": 2.755102040816327e-07, "loss": 0.0001, "step": 19330 }, { "epoch": 98.6734693877551, "grad_norm": 0.003208919195458293, "learning_rate": 2.653061224489796e-07, "loss": 0.0, "step": 19340 }, { "epoch": 98.72448979591837, "grad_norm": 0.0012073633261024952, "learning_rate": 2.5510204081632656e-07, "loss": 0.0973, "step": 19350 }, { "epoch": 98.77551020408163, "grad_norm": 0.0007839403115212917, "learning_rate": 2.4489795918367347e-07, "loss": 0.0045, "step": 19360 }, { "epoch": 98.8265306122449, "grad_norm": 0.0002693181158974767, "learning_rate": 2.3469387755102042e-07, "loss": 0.0, "step": 19370 }, { "epoch": 98.87755102040816, "grad_norm": 0.0004924468230456114, "learning_rate": 2.2448979591836735e-07, "loss": 0.0, "step": 19380 }, { "epoch": 98.92857142857143, "grad_norm": 0.000353965355316177, "learning_rate": 2.142857142857143e-07, "loss": 0.0005, "step": 19390 }, { "epoch": 98.9795918367347, "grad_norm": 0.0012441278668120503, "learning_rate": 2.0408163265306121e-07, "loss": 0.0285, "step": 19400 }, { "epoch": 99.0, "eval_accuracy": 0.9350180505415162, "eval_loss": 0.5517168045043945, "eval_runtime": 1.1267, "eval_samples_per_second": 245.849, "eval_steps_per_second": 31.064, "step": 19404 }, { "epoch": 99.03061224489795, "grad_norm": 0.0031274007633328438, "learning_rate": 1.9387755102040817e-07, "loss": 0.0064, "step": 19410 }, { "epoch": 99.08163265306122, "grad_norm": 0.0034188437275588512, "learning_rate": 1.8367346938775513e-07, "loss": 0.0001, "step": 19420 }, { "epoch": 99.13265306122449, "grad_norm": 31.40038299560547, "learning_rate": 1.7346938775510206e-07, "loss": 0.0085, "step": 19430 }, { "epoch": 99.18367346938776, "grad_norm": 0.0013962392695248127, "learning_rate": 1.6326530612244901e-07, "loss": 0.0001, "step": 19440 }, { "epoch": 99.23469387755102, "grad_norm": 0.0009413012303411961, "learning_rate": 1.5306122448979592e-07, "loss": 0.0, "step": 19450 }, { "epoch": 99.28571428571429, "grad_norm": 0.0034365213941782713, "learning_rate": 1.4285714285714287e-07, "loss": 0.0011, "step": 19460 }, { "epoch": 99.33673469387755, "grad_norm": 0.05414808541536331, "learning_rate": 1.326530612244898e-07, "loss": 0.0199, "step": 19470 }, { "epoch": 99.38775510204081, "grad_norm": 0.0008096377132460475, "learning_rate": 1.2244897959183673e-07, "loss": 0.0005, "step": 19480 }, { "epoch": 99.43877551020408, "grad_norm": 0.04069012030959129, "learning_rate": 1.1224489795918368e-07, "loss": 0.0001, "step": 19490 }, { "epoch": 99.48979591836735, "grad_norm": 0.00032034626929089427, "learning_rate": 1.0204081632653061e-07, "loss": 0.0001, "step": 19500 }, { "epoch": 99.54081632653062, "grad_norm": 0.0009445060277357697, "learning_rate": 9.183673469387756e-08, "loss": 0.0001, "step": 19510 }, { "epoch": 99.59183673469387, "grad_norm": 0.000807897886261344, "learning_rate": 8.163265306122451e-08, "loss": 0.0001, "step": 19520 }, { "epoch": 99.64285714285714, "grad_norm": 0.0014714981662109494, "learning_rate": 7.142857142857144e-08, "loss": 0.0099, "step": 19530 }, { "epoch": 99.6938775510204, "grad_norm": 0.00042553915409371257, "learning_rate": 6.122448979591837e-08, "loss": 0.0004, "step": 19540 }, { "epoch": 99.74489795918367, "grad_norm": 0.00044042442459613085, "learning_rate": 5.1020408163265303e-08, "loss": 0.0, "step": 19550 }, { "epoch": 99.79591836734694, "grad_norm": 0.0014019225491210818, "learning_rate": 4.0816326530612253e-08, "loss": 0.0046, "step": 19560 }, { "epoch": 99.84693877551021, "grad_norm": 0.0007698306581005454, "learning_rate": 3.0612244897959183e-08, "loss": 0.0, "step": 19570 }, { "epoch": 99.89795918367346, "grad_norm": 0.00033522816374897957, "learning_rate": 2.0408163265306127e-08, "loss": 0.0, "step": 19580 }, { "epoch": 99.94897959183673, "grad_norm": 0.002832184312865138, "learning_rate": 1.0204081632653063e-08, "loss": 0.0, "step": 19590 }, { "epoch": 100.0, "grad_norm": 0.0014840251533314586, "learning_rate": 0.0, "loss": 0.003, "step": 19600 }, { "epoch": 100.0, "eval_accuracy": 0.9350180505415162, "eval_loss": 0.5570487976074219, "eval_runtime": 1.1313, "eval_samples_per_second": 244.855, "eval_steps_per_second": 30.938, "step": 19600 }, { "epoch": 100.0, "step": 19600, "total_flos": 1.2135680631115776e+19, "train_loss": 0.0, "train_runtime": 1.4004, "train_samples_per_second": 111828.931, "train_steps_per_second": 13996.469 } ], "logging_steps": 10, "max_steps": 19600, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2135680631115776e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }