{ "best_metric": null, "best_model_checkpoint": null, "epoch": 118.93805309734513, "eval_steps": 500, "global_step": 3360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.35398230088495575, "grad_norm": 8.875021934509277, "learning_rate": 1.4880952380952381e-06, "loss": 2.2984, "step": 10 }, { "epoch": 0.7079646017699115, "grad_norm": 10.44489860534668, "learning_rate": 2.9761904761904763e-06, "loss": 2.302, "step": 20 }, { "epoch": 0.9911504424778761, "eval_accuracy": 0.1575, "eval_loss": 2.266624689102173, "eval_runtime": 7.4116, "eval_samples_per_second": 53.97, "eval_steps_per_second": 1.754, "step": 28 }, { "epoch": 1.0619469026548674, "grad_norm": 11.131507873535156, "learning_rate": 4.464285714285715e-06, "loss": 2.2827, "step": 30 }, { "epoch": 1.415929203539823, "grad_norm": 11.846382141113281, "learning_rate": 5.9523809523809525e-06, "loss": 2.2517, "step": 40 }, { "epoch": 1.7699115044247788, "grad_norm": 8.047672271728516, "learning_rate": 7.4404761904761905e-06, "loss": 2.2226, "step": 50 }, { "epoch": 1.9823008849557522, "eval_accuracy": 0.315, "eval_loss": 2.1654422283172607, "eval_runtime": 5.7675, "eval_samples_per_second": 69.354, "eval_steps_per_second": 2.254, "step": 56 }, { "epoch": 2.1238938053097347, "grad_norm": 7.9003190994262695, "learning_rate": 8.92857142857143e-06, "loss": 2.1848, "step": 60 }, { "epoch": 2.47787610619469, "grad_norm": 14.892489433288574, "learning_rate": 1.0416666666666668e-05, "loss": 2.1238, "step": 70 }, { "epoch": 2.831858407079646, "grad_norm": 13.308501243591309, "learning_rate": 1.1904761904761905e-05, "loss": 2.0639, "step": 80 }, { "epoch": 2.9734513274336285, "eval_accuracy": 0.445, "eval_loss": 1.9969619512557983, "eval_runtime": 5.7209, "eval_samples_per_second": 69.919, "eval_steps_per_second": 2.272, "step": 84 }, { "epoch": 3.185840707964602, "grad_norm": 16.818639755249023, "learning_rate": 1.3392857142857144e-05, "loss": 2.005, "step": 90 }, { "epoch": 3.5398230088495577, "grad_norm": 13.497295379638672, "learning_rate": 1.4880952380952381e-05, "loss": 1.9232, "step": 100 }, { "epoch": 3.893805309734513, "grad_norm": 16.994335174560547, "learning_rate": 1.636904761904762e-05, "loss": 1.8559, "step": 110 }, { "epoch": 4.0, "eval_accuracy": 0.56, "eval_loss": 1.7373383045196533, "eval_runtime": 5.8519, "eval_samples_per_second": 68.354, "eval_steps_per_second": 2.222, "step": 113 }, { "epoch": 4.247787610619469, "grad_norm": 25.03571891784668, "learning_rate": 1.785714285714286e-05, "loss": 1.7371, "step": 120 }, { "epoch": 4.601769911504425, "grad_norm": 27.68413734436035, "learning_rate": 1.9345238095238097e-05, "loss": 1.6511, "step": 130 }, { "epoch": 4.95575221238938, "grad_norm": 21.25657081604004, "learning_rate": 2.0833333333333336e-05, "loss": 1.5966, "step": 140 }, { "epoch": 4.991150442477876, "eval_accuracy": 0.605, "eval_loss": 1.48233962059021, "eval_runtime": 5.791, "eval_samples_per_second": 69.072, "eval_steps_per_second": 2.245, "step": 141 }, { "epoch": 5.3097345132743365, "grad_norm": 33.357173919677734, "learning_rate": 2.2321428571428575e-05, "loss": 1.4767, "step": 150 }, { "epoch": 5.663716814159292, "grad_norm": 21.856212615966797, "learning_rate": 2.380952380952381e-05, "loss": 1.3967, "step": 160 }, { "epoch": 5.982300884955752, "eval_accuracy": 0.6125, "eval_loss": 1.2925149202346802, "eval_runtime": 5.8558, "eval_samples_per_second": 68.308, "eval_steps_per_second": 2.22, "step": 169 }, { "epoch": 6.017699115044247, "grad_norm": 34.79808807373047, "learning_rate": 2.529761904761905e-05, "loss": 1.3046, "step": 170 }, { "epoch": 6.371681415929204, "grad_norm": 27.307743072509766, "learning_rate": 2.6785714285714288e-05, "loss": 1.2429, "step": 180 }, { "epoch": 6.725663716814159, "grad_norm": 42.35990524291992, "learning_rate": 2.8273809523809523e-05, "loss": 1.204, "step": 190 }, { "epoch": 6.9734513274336285, "eval_accuracy": 0.68, "eval_loss": 1.0512183904647827, "eval_runtime": 6.0412, "eval_samples_per_second": 66.212, "eval_steps_per_second": 2.152, "step": 197 }, { "epoch": 7.079646017699115, "grad_norm": 35.177921295166016, "learning_rate": 2.9761904761904762e-05, "loss": 1.1471, "step": 200 }, { "epoch": 7.433628318584071, "grad_norm": 42.960391998291016, "learning_rate": 3.125e-05, "loss": 1.1087, "step": 210 }, { "epoch": 7.787610619469026, "grad_norm": 28.81920623779297, "learning_rate": 3.273809523809524e-05, "loss": 1.0206, "step": 220 }, { "epoch": 8.0, "eval_accuracy": 0.7025, "eval_loss": 0.930656909942627, "eval_runtime": 5.9577, "eval_samples_per_second": 67.14, "eval_steps_per_second": 2.182, "step": 226 }, { "epoch": 8.141592920353983, "grad_norm": 27.41872787475586, "learning_rate": 3.422619047619048e-05, "loss": 0.9949, "step": 230 }, { "epoch": 8.495575221238939, "grad_norm": 30.292728424072266, "learning_rate": 3.571428571428572e-05, "loss": 0.9248, "step": 240 }, { "epoch": 8.849557522123893, "grad_norm": 41.145442962646484, "learning_rate": 3.7202380952380956e-05, "loss": 0.9408, "step": 250 }, { "epoch": 8.991150442477876, "eval_accuracy": 0.7425, "eval_loss": 0.8286006450653076, "eval_runtime": 6.1346, "eval_samples_per_second": 65.204, "eval_steps_per_second": 2.119, "step": 254 }, { "epoch": 9.20353982300885, "grad_norm": 100.8248519897461, "learning_rate": 3.8690476190476195e-05, "loss": 0.9164, "step": 260 }, { "epoch": 9.557522123893806, "grad_norm": 33.74626541137695, "learning_rate": 4.017857142857143e-05, "loss": 0.8392, "step": 270 }, { "epoch": 9.91150442477876, "grad_norm": 23.371883392333984, "learning_rate": 4.166666666666667e-05, "loss": 0.8501, "step": 280 }, { "epoch": 9.982300884955752, "eval_accuracy": 0.6975, "eval_loss": 0.8589820265769958, "eval_runtime": 5.7507, "eval_samples_per_second": 69.557, "eval_steps_per_second": 2.261, "step": 282 }, { "epoch": 10.265486725663717, "grad_norm": 27.10701560974121, "learning_rate": 4.315476190476191e-05, "loss": 0.8698, "step": 290 }, { "epoch": 10.619469026548673, "grad_norm": 31.57968521118164, "learning_rate": 4.464285714285715e-05, "loss": 0.7621, "step": 300 }, { "epoch": 10.973451327433628, "grad_norm": 16.75609588623047, "learning_rate": 4.613095238095239e-05, "loss": 0.7545, "step": 310 }, { "epoch": 10.973451327433628, "eval_accuracy": 0.7475, "eval_loss": 0.7702187895774841, "eval_runtime": 5.9752, "eval_samples_per_second": 66.944, "eval_steps_per_second": 2.176, "step": 310 }, { "epoch": 11.327433628318584, "grad_norm": 38.2448616027832, "learning_rate": 4.761904761904762e-05, "loss": 0.7884, "step": 320 }, { "epoch": 11.68141592920354, "grad_norm": 55.255489349365234, "learning_rate": 4.910714285714286e-05, "loss": 0.7484, "step": 330 }, { "epoch": 12.0, "eval_accuracy": 0.765, "eval_loss": 0.7738745212554932, "eval_runtime": 5.7936, "eval_samples_per_second": 69.041, "eval_steps_per_second": 2.244, "step": 339 }, { "epoch": 12.035398230088495, "grad_norm": 48.28477478027344, "learning_rate": 4.993386243386244e-05, "loss": 0.7274, "step": 340 }, { "epoch": 12.389380530973451, "grad_norm": 29.47681999206543, "learning_rate": 4.976851851851852e-05, "loss": 0.6793, "step": 350 }, { "epoch": 12.743362831858407, "grad_norm": 25.996002197265625, "learning_rate": 4.960317460317461e-05, "loss": 0.6909, "step": 360 }, { "epoch": 12.991150442477876, "eval_accuracy": 0.75, "eval_loss": 0.7344464063644409, "eval_runtime": 5.7939, "eval_samples_per_second": 69.038, "eval_steps_per_second": 2.244, "step": 367 }, { "epoch": 13.097345132743364, "grad_norm": 17.77817153930664, "learning_rate": 4.943783068783069e-05, "loss": 0.6684, "step": 370 }, { "epoch": 13.451327433628318, "grad_norm": 42.287635803222656, "learning_rate": 4.927248677248678e-05, "loss": 0.6309, "step": 380 }, { "epoch": 13.805309734513274, "grad_norm": 23.582502365112305, "learning_rate": 4.910714285714286e-05, "loss": 0.6558, "step": 390 }, { "epoch": 13.982300884955752, "eval_accuracy": 0.775, "eval_loss": 0.6874340772628784, "eval_runtime": 5.9948, "eval_samples_per_second": 66.725, "eval_steps_per_second": 2.169, "step": 395 }, { "epoch": 14.15929203539823, "grad_norm": 23.51905059814453, "learning_rate": 4.894179894179895e-05, "loss": 0.5937, "step": 400 }, { "epoch": 14.513274336283185, "grad_norm": 23.072551727294922, "learning_rate": 4.8776455026455034e-05, "loss": 0.6672, "step": 410 }, { "epoch": 14.867256637168142, "grad_norm": 20.066308975219727, "learning_rate": 4.8611111111111115e-05, "loss": 0.5923, "step": 420 }, { "epoch": 14.973451327433628, "eval_accuracy": 0.7675, "eval_loss": 0.6640682816505432, "eval_runtime": 5.7834, "eval_samples_per_second": 69.163, "eval_steps_per_second": 2.248, "step": 423 }, { "epoch": 15.221238938053098, "grad_norm": 82.714599609375, "learning_rate": 4.84457671957672e-05, "loss": 0.6061, "step": 430 }, { "epoch": 15.575221238938052, "grad_norm": 23.3370361328125, "learning_rate": 4.8280423280423284e-05, "loss": 0.5595, "step": 440 }, { "epoch": 15.929203539823009, "grad_norm": 39.12126159667969, "learning_rate": 4.811507936507937e-05, "loss": 0.5764, "step": 450 }, { "epoch": 16.0, "eval_accuracy": 0.7925, "eval_loss": 0.6109621524810791, "eval_runtime": 5.8219, "eval_samples_per_second": 68.706, "eval_steps_per_second": 2.233, "step": 452 }, { "epoch": 16.283185840707965, "grad_norm": 18.160350799560547, "learning_rate": 4.794973544973545e-05, "loss": 0.564, "step": 460 }, { "epoch": 16.63716814159292, "grad_norm": 22.667816162109375, "learning_rate": 4.778439153439154e-05, "loss": 0.5416, "step": 470 }, { "epoch": 16.991150442477878, "grad_norm": 43.60499954223633, "learning_rate": 4.761904761904762e-05, "loss": 0.5235, "step": 480 }, { "epoch": 16.991150442477878, "eval_accuracy": 0.76, "eval_loss": 0.680573582649231, "eval_runtime": 6.0278, "eval_samples_per_second": 66.359, "eval_steps_per_second": 2.157, "step": 480 }, { "epoch": 17.345132743362832, "grad_norm": 17.05644416809082, "learning_rate": 4.745370370370371e-05, "loss": 0.5301, "step": 490 }, { "epoch": 17.699115044247787, "grad_norm": 14.023980140686035, "learning_rate": 4.7288359788359796e-05, "loss": 0.4883, "step": 500 }, { "epoch": 17.98230088495575, "eval_accuracy": 0.76, "eval_loss": 0.7902570962905884, "eval_runtime": 5.8085, "eval_samples_per_second": 68.865, "eval_steps_per_second": 2.238, "step": 508 }, { "epoch": 18.053097345132745, "grad_norm": 14.874748229980469, "learning_rate": 4.7123015873015876e-05, "loss": 0.529, "step": 510 }, { "epoch": 18.4070796460177, "grad_norm": 17.90399742126465, "learning_rate": 4.6957671957671964e-05, "loss": 0.455, "step": 520 }, { "epoch": 18.761061946902654, "grad_norm": 17.8863468170166, "learning_rate": 4.6792328042328045e-05, "loss": 0.4682, "step": 530 }, { "epoch": 18.97345132743363, "eval_accuracy": 0.7825, "eval_loss": 0.6469230055809021, "eval_runtime": 5.8117, "eval_samples_per_second": 68.826, "eval_steps_per_second": 2.237, "step": 536 }, { "epoch": 19.115044247787612, "grad_norm": 25.429996490478516, "learning_rate": 4.662698412698413e-05, "loss": 0.4268, "step": 540 }, { "epoch": 19.469026548672566, "grad_norm": 28.328603744506836, "learning_rate": 4.646164021164021e-05, "loss": 0.4233, "step": 550 }, { "epoch": 19.82300884955752, "grad_norm": 18.160737991333008, "learning_rate": 4.62962962962963e-05, "loss": 0.441, "step": 560 }, { "epoch": 20.0, "eval_accuracy": 0.7825, "eval_loss": 0.6693841814994812, "eval_runtime": 5.8278, "eval_samples_per_second": 68.637, "eval_steps_per_second": 2.231, "step": 565 }, { "epoch": 20.17699115044248, "grad_norm": 18.324892044067383, "learning_rate": 4.613095238095239e-05, "loss": 0.4421, "step": 570 }, { "epoch": 20.530973451327434, "grad_norm": 27.71529769897461, "learning_rate": 4.596560846560847e-05, "loss": 0.3583, "step": 580 }, { "epoch": 20.884955752212388, "grad_norm": 10.921418190002441, "learning_rate": 4.580026455026456e-05, "loss": 0.4201, "step": 590 }, { "epoch": 20.991150442477878, "eval_accuracy": 0.7625, "eval_loss": 0.7144609689712524, "eval_runtime": 5.725, "eval_samples_per_second": 69.869, "eval_steps_per_second": 2.271, "step": 593 }, { "epoch": 21.238938053097346, "grad_norm": 27.364168167114258, "learning_rate": 4.563492063492064e-05, "loss": 0.3886, "step": 600 }, { "epoch": 21.5929203539823, "grad_norm": 22.475797653198242, "learning_rate": 4.5469576719576725e-05, "loss": 0.366, "step": 610 }, { "epoch": 21.946902654867255, "grad_norm": 14.525550842285156, "learning_rate": 4.5304232804232806e-05, "loss": 0.387, "step": 620 }, { "epoch": 21.98230088495575, "eval_accuracy": 0.7775, "eval_loss": 0.6505405902862549, "eval_runtime": 6.0629, "eval_samples_per_second": 65.975, "eval_steps_per_second": 2.144, "step": 621 }, { "epoch": 22.300884955752213, "grad_norm": 15.278279304504395, "learning_rate": 4.5138888888888894e-05, "loss": 0.3681, "step": 630 }, { "epoch": 22.654867256637168, "grad_norm": 23.836380004882812, "learning_rate": 4.4973544973544974e-05, "loss": 0.4034, "step": 640 }, { "epoch": 22.97345132743363, "eval_accuracy": 0.7875, "eval_loss": 0.6168544292449951, "eval_runtime": 5.8701, "eval_samples_per_second": 68.143, "eval_steps_per_second": 2.215, "step": 649 }, { "epoch": 23.008849557522122, "grad_norm": 15.120473861694336, "learning_rate": 4.480820105820106e-05, "loss": 0.3424, "step": 650 }, { "epoch": 23.36283185840708, "grad_norm": 12.84585189819336, "learning_rate": 4.464285714285715e-05, "loss": 0.3669, "step": 660 }, { "epoch": 23.716814159292035, "grad_norm": 24.2416934967041, "learning_rate": 4.447751322751323e-05, "loss": 0.3041, "step": 670 }, { "epoch": 24.0, "eval_accuracy": 0.795, "eval_loss": 0.6416059136390686, "eval_runtime": 5.7657, "eval_samples_per_second": 69.375, "eval_steps_per_second": 2.255, "step": 678 }, { "epoch": 24.07079646017699, "grad_norm": 17.557270050048828, "learning_rate": 4.431216931216932e-05, "loss": 0.3344, "step": 680 }, { "epoch": 24.424778761061948, "grad_norm": 12.692173957824707, "learning_rate": 4.41468253968254e-05, "loss": 0.2928, "step": 690 }, { "epoch": 24.778761061946902, "grad_norm": 15.073899269104004, "learning_rate": 4.3981481481481486e-05, "loss": 0.3021, "step": 700 }, { "epoch": 24.991150442477878, "eval_accuracy": 0.775, "eval_loss": 0.6992344856262207, "eval_runtime": 5.8878, "eval_samples_per_second": 67.937, "eval_steps_per_second": 2.208, "step": 706 }, { "epoch": 25.13274336283186, "grad_norm": 10.598650932312012, "learning_rate": 4.381613756613757e-05, "loss": 0.3147, "step": 710 }, { "epoch": 25.486725663716815, "grad_norm": 17.572126388549805, "learning_rate": 4.3650793650793655e-05, "loss": 0.302, "step": 720 }, { "epoch": 25.84070796460177, "grad_norm": 30.176406860351562, "learning_rate": 4.3485449735449736e-05, "loss": 0.2853, "step": 730 }, { "epoch": 25.98230088495575, "eval_accuracy": 0.7975, "eval_loss": 0.6566324830055237, "eval_runtime": 5.7476, "eval_samples_per_second": 69.594, "eval_steps_per_second": 2.262, "step": 734 }, { "epoch": 26.194690265486727, "grad_norm": 18.816850662231445, "learning_rate": 4.332010582010582e-05, "loss": 0.297, "step": 740 }, { "epoch": 26.548672566371682, "grad_norm": 14.217058181762695, "learning_rate": 4.315476190476191e-05, "loss": 0.2849, "step": 750 }, { "epoch": 26.902654867256636, "grad_norm": 17.27300453186035, "learning_rate": 4.298941798941799e-05, "loss": 0.27, "step": 760 }, { "epoch": 26.97345132743363, "eval_accuracy": 0.7825, "eval_loss": 0.696976900100708, "eval_runtime": 6.0434, "eval_samples_per_second": 66.188, "eval_steps_per_second": 2.151, "step": 762 }, { "epoch": 27.256637168141594, "grad_norm": 10.05718994140625, "learning_rate": 4.282407407407408e-05, "loss": 0.2331, "step": 770 }, { "epoch": 27.61061946902655, "grad_norm": 12.319819450378418, "learning_rate": 4.265873015873016e-05, "loss": 0.2462, "step": 780 }, { "epoch": 27.964601769911503, "grad_norm": 14.027650833129883, "learning_rate": 4.249338624338625e-05, "loss": 0.2722, "step": 790 }, { "epoch": 28.0, "eval_accuracy": 0.785, "eval_loss": 0.6862995624542236, "eval_runtime": 5.7837, "eval_samples_per_second": 69.16, "eval_steps_per_second": 2.248, "step": 791 }, { "epoch": 28.31858407079646, "grad_norm": 11.350334167480469, "learning_rate": 4.232804232804233e-05, "loss": 0.2366, "step": 800 }, { "epoch": 28.672566371681416, "grad_norm": 8.22404670715332, "learning_rate": 4.2162698412698416e-05, "loss": 0.2143, "step": 810 }, { "epoch": 28.991150442477878, "eval_accuracy": 0.795, "eval_loss": 0.679432213306427, "eval_runtime": 5.7657, "eval_samples_per_second": 69.376, "eval_steps_per_second": 2.255, "step": 819 }, { "epoch": 29.02654867256637, "grad_norm": 11.07326889038086, "learning_rate": 4.1997354497354504e-05, "loss": 0.2443, "step": 820 }, { "epoch": 29.38053097345133, "grad_norm": 28.747331619262695, "learning_rate": 4.1832010582010584e-05, "loss": 0.2424, "step": 830 }, { "epoch": 29.734513274336283, "grad_norm": 12.262460708618164, "learning_rate": 4.166666666666667e-05, "loss": 0.2238, "step": 840 }, { "epoch": 29.98230088495575, "eval_accuracy": 0.7975, "eval_loss": 0.6781703233718872, "eval_runtime": 5.7261, "eval_samples_per_second": 69.856, "eval_steps_per_second": 2.27, "step": 847 }, { "epoch": 30.088495575221238, "grad_norm": 12.248478889465332, "learning_rate": 4.150132275132275e-05, "loss": 0.2414, "step": 850 }, { "epoch": 30.442477876106196, "grad_norm": 26.66759490966797, "learning_rate": 4.133597883597884e-05, "loss": 0.222, "step": 860 }, { "epoch": 30.79646017699115, "grad_norm": 17.73442268371582, "learning_rate": 4.117063492063492e-05, "loss": 0.2387, "step": 870 }, { "epoch": 30.97345132743363, "eval_accuracy": 0.81, "eval_loss": 0.6944553256034851, "eval_runtime": 5.9266, "eval_samples_per_second": 67.492, "eval_steps_per_second": 2.193, "step": 875 }, { "epoch": 31.150442477876105, "grad_norm": 10.954143524169922, "learning_rate": 4.100529100529101e-05, "loss": 0.1897, "step": 880 }, { "epoch": 31.504424778761063, "grad_norm": 11.53641414642334, "learning_rate": 4.083994708994709e-05, "loss": 0.215, "step": 890 }, { "epoch": 31.858407079646017, "grad_norm": 19.236658096313477, "learning_rate": 4.067460317460318e-05, "loss": 0.223, "step": 900 }, { "epoch": 32.0, "eval_accuracy": 0.7825, "eval_loss": 0.7377186417579651, "eval_runtime": 5.6881, "eval_samples_per_second": 70.322, "eval_steps_per_second": 2.285, "step": 904 }, { "epoch": 32.21238938053097, "grad_norm": 18.005434036254883, "learning_rate": 4.0509259259259265e-05, "loss": 0.1703, "step": 910 }, { "epoch": 32.56637168141593, "grad_norm": 10.189879417419434, "learning_rate": 4.0343915343915346e-05, "loss": 0.1882, "step": 920 }, { "epoch": 32.92035398230089, "grad_norm": 23.737546920776367, "learning_rate": 4.017857142857143e-05, "loss": 0.2211, "step": 930 }, { "epoch": 32.991150442477874, "eval_accuracy": 0.7775, "eval_loss": 0.7430591583251953, "eval_runtime": 5.8045, "eval_samples_per_second": 68.911, "eval_steps_per_second": 2.24, "step": 932 }, { "epoch": 33.27433628318584, "grad_norm": 12.777314186096191, "learning_rate": 4.0013227513227514e-05, "loss": 0.2187, "step": 940 }, { "epoch": 33.6283185840708, "grad_norm": 11.4727783203125, "learning_rate": 3.98478835978836e-05, "loss": 0.1952, "step": 950 }, { "epoch": 33.982300884955755, "grad_norm": 9.033596992492676, "learning_rate": 3.968253968253968e-05, "loss": 0.1882, "step": 960 }, { "epoch": 33.982300884955755, "eval_accuracy": 0.815, "eval_loss": 0.702938437461853, "eval_runtime": 5.7972, "eval_samples_per_second": 68.998, "eval_steps_per_second": 2.242, "step": 960 }, { "epoch": 34.336283185840706, "grad_norm": 10.962681770324707, "learning_rate": 3.951719576719577e-05, "loss": 0.1788, "step": 970 }, { "epoch": 34.690265486725664, "grad_norm": 9.644322395324707, "learning_rate": 3.935185185185186e-05, "loss": 0.1562, "step": 980 }, { "epoch": 34.97345132743363, "eval_accuracy": 0.815, "eval_loss": 0.6886518597602844, "eval_runtime": 5.7736, "eval_samples_per_second": 69.281, "eval_steps_per_second": 2.252, "step": 988 }, { "epoch": 35.04424778761062, "grad_norm": 7.208992004394531, "learning_rate": 3.918650793650794e-05, "loss": 0.1779, "step": 990 }, { "epoch": 35.39823008849557, "grad_norm": 15.789375305175781, "learning_rate": 3.9021164021164026e-05, "loss": 0.1652, "step": 1000 }, { "epoch": 35.75221238938053, "grad_norm": 9.542279243469238, "learning_rate": 3.885582010582011e-05, "loss": 0.1689, "step": 1010 }, { "epoch": 36.0, "eval_accuracy": 0.7975, "eval_loss": 0.7189816236495972, "eval_runtime": 5.7578, "eval_samples_per_second": 69.47, "eval_steps_per_second": 2.258, "step": 1017 }, { "epoch": 36.10619469026549, "grad_norm": 12.757901191711426, "learning_rate": 3.8690476190476195e-05, "loss": 0.178, "step": 1020 }, { "epoch": 36.46017699115044, "grad_norm": 8.646611213684082, "learning_rate": 3.8525132275132275e-05, "loss": 0.1613, "step": 1030 }, { "epoch": 36.8141592920354, "grad_norm": 12.245649337768555, "learning_rate": 3.835978835978836e-05, "loss": 0.1886, "step": 1040 }, { "epoch": 36.991150442477874, "eval_accuracy": 0.795, "eval_loss": 0.7677862048149109, "eval_runtime": 5.9107, "eval_samples_per_second": 67.674, "eval_steps_per_second": 2.199, "step": 1045 }, { "epoch": 37.16814159292036, "grad_norm": 16.76270866394043, "learning_rate": 3.8194444444444444e-05, "loss": 0.1777, "step": 1050 }, { "epoch": 37.52212389380531, "grad_norm": 12.296770095825195, "learning_rate": 3.802910052910053e-05, "loss": 0.182, "step": 1060 }, { "epoch": 37.876106194690266, "grad_norm": 19.789642333984375, "learning_rate": 3.786375661375662e-05, "loss": 0.1887, "step": 1070 }, { "epoch": 37.982300884955755, "eval_accuracy": 0.81, "eval_loss": 0.7334153652191162, "eval_runtime": 5.7675, "eval_samples_per_second": 69.354, "eval_steps_per_second": 2.254, "step": 1073 }, { "epoch": 38.230088495575224, "grad_norm": 9.019023895263672, "learning_rate": 3.76984126984127e-05, "loss": 0.1377, "step": 1080 }, { "epoch": 38.584070796460175, "grad_norm": 8.896963119506836, "learning_rate": 3.753306878306879e-05, "loss": 0.1751, "step": 1090 }, { "epoch": 38.93805309734513, "grad_norm": 8.166861534118652, "learning_rate": 3.736772486772487e-05, "loss": 0.1531, "step": 1100 }, { "epoch": 38.97345132743363, "eval_accuracy": 0.7925, "eval_loss": 0.7359188199043274, "eval_runtime": 5.8237, "eval_samples_per_second": 68.685, "eval_steps_per_second": 2.232, "step": 1101 }, { "epoch": 39.29203539823009, "grad_norm": 11.483717918395996, "learning_rate": 3.7202380952380956e-05, "loss": 0.1703, "step": 1110 }, { "epoch": 39.64601769911504, "grad_norm": 7.077027797698975, "learning_rate": 3.7037037037037037e-05, "loss": 0.1376, "step": 1120 }, { "epoch": 40.0, "grad_norm": 10.199649810791016, "learning_rate": 3.6871693121693124e-05, "loss": 0.1662, "step": 1130 }, { "epoch": 40.0, "eval_accuracy": 0.8075, "eval_loss": 0.7593528032302856, "eval_runtime": 5.7627, "eval_samples_per_second": 69.411, "eval_steps_per_second": 2.256, "step": 1130 }, { "epoch": 40.35398230088496, "grad_norm": 12.18074893951416, "learning_rate": 3.6706349206349205e-05, "loss": 0.1502, "step": 1140 }, { "epoch": 40.70796460176991, "grad_norm": 13.122712135314941, "learning_rate": 3.654100529100529e-05, "loss": 0.1273, "step": 1150 }, { "epoch": 40.991150442477874, "eval_accuracy": 0.81, "eval_loss": 0.7341694831848145, "eval_runtime": 5.7253, "eval_samples_per_second": 69.866, "eval_steps_per_second": 2.271, "step": 1158 }, { "epoch": 41.06194690265487, "grad_norm": 12.024120330810547, "learning_rate": 3.637566137566138e-05, "loss": 0.1334, "step": 1160 }, { "epoch": 41.415929203539825, "grad_norm": 13.965011596679688, "learning_rate": 3.621031746031746e-05, "loss": 0.1394, "step": 1170 }, { "epoch": 41.769911504424776, "grad_norm": 17.121511459350586, "learning_rate": 3.604497354497355e-05, "loss": 0.1986, "step": 1180 }, { "epoch": 41.982300884955755, "eval_accuracy": 0.805, "eval_loss": 0.7780522108078003, "eval_runtime": 5.9228, "eval_samples_per_second": 67.536, "eval_steps_per_second": 2.195, "step": 1186 }, { "epoch": 42.123893805309734, "grad_norm": 11.281728744506836, "learning_rate": 3.587962962962963e-05, "loss": 0.1563, "step": 1190 }, { "epoch": 42.47787610619469, "grad_norm": 6.99351167678833, "learning_rate": 3.571428571428572e-05, "loss": 0.1541, "step": 1200 }, { "epoch": 42.83185840707964, "grad_norm": 12.189485549926758, "learning_rate": 3.55489417989418e-05, "loss": 0.1891, "step": 1210 }, { "epoch": 42.97345132743363, "eval_accuracy": 0.8225, "eval_loss": 0.7375677227973938, "eval_runtime": 5.736, "eval_samples_per_second": 69.735, "eval_steps_per_second": 2.266, "step": 1214 }, { "epoch": 43.1858407079646, "grad_norm": 7.004004955291748, "learning_rate": 3.5383597883597885e-05, "loss": 0.1332, "step": 1220 }, { "epoch": 43.53982300884956, "grad_norm": 15.816964149475098, "learning_rate": 3.521825396825397e-05, "loss": 0.1231, "step": 1230 }, { "epoch": 43.89380530973451, "grad_norm": 8.720746040344238, "learning_rate": 3.5052910052910054e-05, "loss": 0.1573, "step": 1240 }, { "epoch": 44.0, "eval_accuracy": 0.815, "eval_loss": 0.730354905128479, "eval_runtime": 5.7537, "eval_samples_per_second": 69.52, "eval_steps_per_second": 2.259, "step": 1243 }, { "epoch": 44.24778761061947, "grad_norm": 11.798288345336914, "learning_rate": 3.488756613756614e-05, "loss": 0.1337, "step": 1250 }, { "epoch": 44.60176991150443, "grad_norm": 8.698972702026367, "learning_rate": 3.472222222222222e-05, "loss": 0.1536, "step": 1260 }, { "epoch": 44.95575221238938, "grad_norm": 14.468975067138672, "learning_rate": 3.455687830687831e-05, "loss": 0.1536, "step": 1270 }, { "epoch": 44.991150442477874, "eval_accuracy": 0.8, "eval_loss": 0.777264416217804, "eval_runtime": 5.7994, "eval_samples_per_second": 68.972, "eval_steps_per_second": 2.242, "step": 1271 }, { "epoch": 45.309734513274336, "grad_norm": 5.724658966064453, "learning_rate": 3.439153439153439e-05, "loss": 0.1362, "step": 1280 }, { "epoch": 45.663716814159294, "grad_norm": 15.177201271057129, "learning_rate": 3.422619047619048e-05, "loss": 0.1562, "step": 1290 }, { "epoch": 45.982300884955755, "eval_accuracy": 0.8, "eval_loss": 0.7622714042663574, "eval_runtime": 5.6562, "eval_samples_per_second": 70.719, "eval_steps_per_second": 2.298, "step": 1299 }, { "epoch": 46.017699115044245, "grad_norm": 14.262038230895996, "learning_rate": 3.406084656084656e-05, "loss": 0.1609, "step": 1300 }, { "epoch": 46.3716814159292, "grad_norm": 10.63355541229248, "learning_rate": 3.3895502645502647e-05, "loss": 0.1297, "step": 1310 }, { "epoch": 46.72566371681416, "grad_norm": 11.54215145111084, "learning_rate": 3.3730158730158734e-05, "loss": 0.1264, "step": 1320 }, { "epoch": 46.97345132743363, "eval_accuracy": 0.7925, "eval_loss": 0.8314040899276733, "eval_runtime": 5.9374, "eval_samples_per_second": 67.369, "eval_steps_per_second": 2.189, "step": 1327 }, { "epoch": 47.07964601769911, "grad_norm": 7.798260688781738, "learning_rate": 3.3564814814814815e-05, "loss": 0.1552, "step": 1330 }, { "epoch": 47.43362831858407, "grad_norm": 10.773923873901367, "learning_rate": 3.33994708994709e-05, "loss": 0.1188, "step": 1340 }, { "epoch": 47.78761061946903, "grad_norm": 5.990432262420654, "learning_rate": 3.3234126984126983e-05, "loss": 0.1596, "step": 1350 }, { "epoch": 48.0, "eval_accuracy": 0.8175, "eval_loss": 0.7830512523651123, "eval_runtime": 5.8001, "eval_samples_per_second": 68.964, "eval_steps_per_second": 2.241, "step": 1356 }, { "epoch": 48.14159292035398, "grad_norm": 11.099681854248047, "learning_rate": 3.306878306878307e-05, "loss": 0.1342, "step": 1360 }, { "epoch": 48.49557522123894, "grad_norm": 7.706020355224609, "learning_rate": 3.290343915343915e-05, "loss": 0.1382, "step": 1370 }, { "epoch": 48.849557522123895, "grad_norm": 5.685822010040283, "learning_rate": 3.273809523809524e-05, "loss": 0.1237, "step": 1380 }, { "epoch": 48.991150442477874, "eval_accuracy": 0.8, "eval_loss": 0.7949317097663879, "eval_runtime": 5.6938, "eval_samples_per_second": 70.252, "eval_steps_per_second": 2.283, "step": 1384 }, { "epoch": 49.203539823008846, "grad_norm": 10.283050537109375, "learning_rate": 3.257275132275133e-05, "loss": 0.1161, "step": 1390 }, { "epoch": 49.557522123893804, "grad_norm": 6.40301513671875, "learning_rate": 3.240740740740741e-05, "loss": 0.1363, "step": 1400 }, { "epoch": 49.91150442477876, "grad_norm": 17.212697982788086, "learning_rate": 3.2242063492063495e-05, "loss": 0.1355, "step": 1410 }, { "epoch": 49.982300884955755, "eval_accuracy": 0.795, "eval_loss": 0.781341552734375, "eval_runtime": 5.872, "eval_samples_per_second": 68.12, "eval_steps_per_second": 2.214, "step": 1412 }, { "epoch": 50.26548672566372, "grad_norm": 4.185158729553223, "learning_rate": 3.2076719576719576e-05, "loss": 0.1294, "step": 1420 }, { "epoch": 50.61946902654867, "grad_norm": 10.888021469116211, "learning_rate": 3.1911375661375664e-05, "loss": 0.163, "step": 1430 }, { "epoch": 50.97345132743363, "grad_norm": 8.630338668823242, "learning_rate": 3.1746031746031745e-05, "loss": 0.1251, "step": 1440 }, { "epoch": 50.97345132743363, "eval_accuracy": 0.81, "eval_loss": 0.7647480964660645, "eval_runtime": 5.8333, "eval_samples_per_second": 68.571, "eval_steps_per_second": 2.229, "step": 1440 }, { "epoch": 51.32743362831859, "grad_norm": 19.542953491210938, "learning_rate": 3.158068783068783e-05, "loss": 0.1289, "step": 1450 }, { "epoch": 51.68141592920354, "grad_norm": 14.15263843536377, "learning_rate": 3.141534391534391e-05, "loss": 0.1181, "step": 1460 }, { "epoch": 52.0, "eval_accuracy": 0.8175, "eval_loss": 0.7552239894866943, "eval_runtime": 6.0471, "eval_samples_per_second": 66.148, "eval_steps_per_second": 2.15, "step": 1469 }, { "epoch": 52.0353982300885, "grad_norm": 8.696867942810059, "learning_rate": 3.125e-05, "loss": 0.1232, "step": 1470 }, { "epoch": 52.389380530973455, "grad_norm": 15.115714073181152, "learning_rate": 3.108465608465609e-05, "loss": 0.128, "step": 1480 }, { "epoch": 52.743362831858406, "grad_norm": 26.04839515686035, "learning_rate": 3.091931216931217e-05, "loss": 0.1224, "step": 1490 }, { "epoch": 52.991150442477874, "eval_accuracy": 0.795, "eval_loss": 0.8345909118652344, "eval_runtime": 5.7656, "eval_samples_per_second": 69.377, "eval_steps_per_second": 2.255, "step": 1497 }, { "epoch": 53.097345132743364, "grad_norm": 6.820947647094727, "learning_rate": 3.075396825396826e-05, "loss": 0.1088, "step": 1500 }, { "epoch": 53.45132743362832, "grad_norm": 10.287090301513672, "learning_rate": 3.058862433862434e-05, "loss": 0.1094, "step": 1510 }, { "epoch": 53.80530973451327, "grad_norm": 12.619726181030273, "learning_rate": 3.0423280423280425e-05, "loss": 0.1201, "step": 1520 }, { "epoch": 53.982300884955755, "eval_accuracy": 0.7975, "eval_loss": 0.7740535140037537, "eval_runtime": 5.8431, "eval_samples_per_second": 68.457, "eval_steps_per_second": 2.225, "step": 1525 }, { "epoch": 54.15929203539823, "grad_norm": 3.6249613761901855, "learning_rate": 3.0257936507936506e-05, "loss": 0.1278, "step": 1530 }, { "epoch": 54.51327433628319, "grad_norm": 8.490986824035645, "learning_rate": 3.0092592592592593e-05, "loss": 0.1359, "step": 1540 }, { "epoch": 54.86725663716814, "grad_norm": 8.873104095458984, "learning_rate": 2.9927248677248678e-05, "loss": 0.1109, "step": 1550 }, { "epoch": 54.97345132743363, "eval_accuracy": 0.785, "eval_loss": 0.772351861000061, "eval_runtime": 5.8419, "eval_samples_per_second": 68.47, "eval_steps_per_second": 2.225, "step": 1553 }, { "epoch": 55.2212389380531, "grad_norm": 3.0876898765563965, "learning_rate": 2.9761904761904762e-05, "loss": 0.1108, "step": 1560 }, { "epoch": 55.575221238938056, "grad_norm": 6.528329372406006, "learning_rate": 2.959656084656085e-05, "loss": 0.1236, "step": 1570 }, { "epoch": 55.92920353982301, "grad_norm": 11.930560111999512, "learning_rate": 2.943121693121693e-05, "loss": 0.1084, "step": 1580 }, { "epoch": 56.0, "eval_accuracy": 0.805, "eval_loss": 0.7904237508773804, "eval_runtime": 5.7739, "eval_samples_per_second": 69.277, "eval_steps_per_second": 2.252, "step": 1582 }, { "epoch": 56.283185840707965, "grad_norm": 6.980546951293945, "learning_rate": 2.9265873015873018e-05, "loss": 0.1144, "step": 1590 }, { "epoch": 56.63716814159292, "grad_norm": 11.986266136169434, "learning_rate": 2.91005291005291e-05, "loss": 0.1084, "step": 1600 }, { "epoch": 56.991150442477874, "grad_norm": 6.527588367462158, "learning_rate": 2.8935185185185186e-05, "loss": 0.1187, "step": 1610 }, { "epoch": 56.991150442477874, "eval_accuracy": 0.8125, "eval_loss": 0.7423826456069946, "eval_runtime": 5.9051, "eval_samples_per_second": 67.738, "eval_steps_per_second": 2.201, "step": 1610 }, { "epoch": 57.34513274336283, "grad_norm": 6.542221546173096, "learning_rate": 2.876984126984127e-05, "loss": 0.1191, "step": 1620 }, { "epoch": 57.69911504424779, "grad_norm": 6.406174659729004, "learning_rate": 2.8604497354497355e-05, "loss": 0.0935, "step": 1630 }, { "epoch": 57.982300884955755, "eval_accuracy": 0.815, "eval_loss": 0.7410502433776855, "eval_runtime": 6.0733, "eval_samples_per_second": 65.862, "eval_steps_per_second": 2.141, "step": 1638 }, { "epoch": 58.05309734513274, "grad_norm": 5.907907485961914, "learning_rate": 2.8439153439153442e-05, "loss": 0.0776, "step": 1640 }, { "epoch": 58.4070796460177, "grad_norm": 7.151067733764648, "learning_rate": 2.8273809523809523e-05, "loss": 0.1022, "step": 1650 }, { "epoch": 58.76106194690266, "grad_norm": 6.808224201202393, "learning_rate": 2.810846560846561e-05, "loss": 0.1023, "step": 1660 }, { "epoch": 58.97345132743363, "eval_accuracy": 0.81, "eval_loss": 0.7475782632827759, "eval_runtime": 5.7079, "eval_samples_per_second": 70.078, "eval_steps_per_second": 2.278, "step": 1666 }, { "epoch": 59.11504424778761, "grad_norm": 7.42250919342041, "learning_rate": 2.7943121693121695e-05, "loss": 0.1206, "step": 1670 }, { "epoch": 59.469026548672566, "grad_norm": 11.038393020629883, "learning_rate": 2.777777777777778e-05, "loss": 0.1148, "step": 1680 }, { "epoch": 59.823008849557525, "grad_norm": 6.563925266265869, "learning_rate": 2.7612433862433863e-05, "loss": 0.1166, "step": 1690 }, { "epoch": 60.0, "eval_accuracy": 0.8175, "eval_loss": 0.7742089033126831, "eval_runtime": 5.6896, "eval_samples_per_second": 70.304, "eval_steps_per_second": 2.285, "step": 1695 }, { "epoch": 60.176991150442475, "grad_norm": 8.62690544128418, "learning_rate": 2.7447089947089948e-05, "loss": 0.1133, "step": 1700 }, { "epoch": 60.530973451327434, "grad_norm": 8.701354026794434, "learning_rate": 2.7281746031746032e-05, "loss": 0.0961, "step": 1710 }, { "epoch": 60.88495575221239, "grad_norm": 5.880581855773926, "learning_rate": 2.7116402116402116e-05, "loss": 0.099, "step": 1720 }, { "epoch": 60.991150442477874, "eval_accuracy": 0.815, "eval_loss": 0.7696540951728821, "eval_runtime": 5.8242, "eval_samples_per_second": 68.679, "eval_steps_per_second": 2.232, "step": 1723 }, { "epoch": 61.23893805309734, "grad_norm": 6.893290996551514, "learning_rate": 2.6951058201058204e-05, "loss": 0.0968, "step": 1730 }, { "epoch": 61.5929203539823, "grad_norm": 9.383642196655273, "learning_rate": 2.6785714285714288e-05, "loss": 0.0927, "step": 1740 }, { "epoch": 61.94690265486726, "grad_norm": 10.903635025024414, "learning_rate": 2.6620370370370372e-05, "loss": 0.1157, "step": 1750 }, { "epoch": 61.982300884955755, "eval_accuracy": 0.8, "eval_loss": 0.8537997603416443, "eval_runtime": 5.862, "eval_samples_per_second": 68.236, "eval_steps_per_second": 2.218, "step": 1751 }, { "epoch": 62.30088495575221, "grad_norm": 11.495246887207031, "learning_rate": 2.6455026455026456e-05, "loss": 0.1071, "step": 1760 }, { "epoch": 62.65486725663717, "grad_norm": 8.735861778259277, "learning_rate": 2.628968253968254e-05, "loss": 0.1137, "step": 1770 }, { "epoch": 62.97345132743363, "eval_accuracy": 0.8125, "eval_loss": 0.8544909954071045, "eval_runtime": 5.719, "eval_samples_per_second": 69.942, "eval_steps_per_second": 2.273, "step": 1779 }, { "epoch": 63.008849557522126, "grad_norm": 9.511134147644043, "learning_rate": 2.6124338624338625e-05, "loss": 0.0962, "step": 1780 }, { "epoch": 63.36283185840708, "grad_norm": 9.274235725402832, "learning_rate": 2.5958994708994712e-05, "loss": 0.09, "step": 1790 }, { "epoch": 63.716814159292035, "grad_norm": 17.82819175720215, "learning_rate": 2.5793650793650796e-05, "loss": 0.094, "step": 1800 }, { "epoch": 64.0, "eval_accuracy": 0.7925, "eval_loss": 0.8463137745857239, "eval_runtime": 5.7366, "eval_samples_per_second": 69.727, "eval_steps_per_second": 2.266, "step": 1808 }, { "epoch": 64.070796460177, "grad_norm": 13.545799255371094, "learning_rate": 2.562830687830688e-05, "loss": 0.1186, "step": 1810 }, { "epoch": 64.42477876106194, "grad_norm": 8.884902954101562, "learning_rate": 2.5462962962962965e-05, "loss": 0.1027, "step": 1820 }, { "epoch": 64.77876106194691, "grad_norm": 9.229199409484863, "learning_rate": 2.529761904761905e-05, "loss": 0.1161, "step": 1830 }, { "epoch": 64.99115044247787, "eval_accuracy": 0.81, "eval_loss": 0.8350917100906372, "eval_runtime": 5.7053, "eval_samples_per_second": 70.11, "eval_steps_per_second": 2.279, "step": 1836 }, { "epoch": 65.13274336283186, "grad_norm": 4.687648296356201, "learning_rate": 2.5132275132275137e-05, "loss": 0.0954, "step": 1840 }, { "epoch": 65.48672566371681, "grad_norm": 8.967106819152832, "learning_rate": 2.496693121693122e-05, "loss": 0.1102, "step": 1850 }, { "epoch": 65.84070796460178, "grad_norm": 7.873806953430176, "learning_rate": 2.4801587301587305e-05, "loss": 0.08, "step": 1860 }, { "epoch": 65.98230088495575, "eval_accuracy": 0.7925, "eval_loss": 0.8610497117042542, "eval_runtime": 5.7201, "eval_samples_per_second": 69.929, "eval_steps_per_second": 2.273, "step": 1864 }, { "epoch": 66.19469026548673, "grad_norm": 7.836842060089111, "learning_rate": 2.463624338624339e-05, "loss": 0.1068, "step": 1870 }, { "epoch": 66.54867256637168, "grad_norm": 8.14975357055664, "learning_rate": 2.4470899470899473e-05, "loss": 0.0787, "step": 1880 }, { "epoch": 66.90265486725664, "grad_norm": 5.568120956420898, "learning_rate": 2.4305555555555558e-05, "loss": 0.0799, "step": 1890 }, { "epoch": 66.97345132743362, "eval_accuracy": 0.8075, "eval_loss": 0.8592771887779236, "eval_runtime": 5.7815, "eval_samples_per_second": 69.186, "eval_steps_per_second": 2.249, "step": 1892 }, { "epoch": 67.2566371681416, "grad_norm": 5.271720886230469, "learning_rate": 2.4140211640211642e-05, "loss": 0.0781, "step": 1900 }, { "epoch": 67.61061946902655, "grad_norm": 3.000176429748535, "learning_rate": 2.3974867724867726e-05, "loss": 0.0927, "step": 1910 }, { "epoch": 67.96460176991151, "grad_norm": 1.825054407119751, "learning_rate": 2.380952380952381e-05, "loss": 0.0783, "step": 1920 }, { "epoch": 68.0, "eval_accuracy": 0.815, "eval_loss": 0.8423022627830505, "eval_runtime": 5.6528, "eval_samples_per_second": 70.762, "eval_steps_per_second": 2.3, "step": 1921 }, { "epoch": 68.31858407079646, "grad_norm": 9.14376449584961, "learning_rate": 2.3644179894179898e-05, "loss": 0.1037, "step": 1930 }, { "epoch": 68.67256637168141, "grad_norm": 11.533547401428223, "learning_rate": 2.3478835978835982e-05, "loss": 0.0851, "step": 1940 }, { "epoch": 68.99115044247787, "eval_accuracy": 0.82, "eval_loss": 0.8265037536621094, "eval_runtime": 5.9247, "eval_samples_per_second": 67.514, "eval_steps_per_second": 2.194, "step": 1949 }, { "epoch": 69.02654867256638, "grad_norm": 8.8108549118042, "learning_rate": 2.3313492063492066e-05, "loss": 0.1005, "step": 1950 }, { "epoch": 69.38053097345133, "grad_norm": 7.721718788146973, "learning_rate": 2.314814814814815e-05, "loss": 0.0839, "step": 1960 }, { "epoch": 69.73451327433628, "grad_norm": 9.032380104064941, "learning_rate": 2.2982804232804235e-05, "loss": 0.0775, "step": 1970 }, { "epoch": 69.98230088495575, "eval_accuracy": 0.805, "eval_loss": 0.8707697987556458, "eval_runtime": 5.6882, "eval_samples_per_second": 70.321, "eval_steps_per_second": 2.285, "step": 1977 }, { "epoch": 70.08849557522124, "grad_norm": 7.697175979614258, "learning_rate": 2.281746031746032e-05, "loss": 0.1005, "step": 1980 }, { "epoch": 70.4424778761062, "grad_norm": 11.258265495300293, "learning_rate": 2.2652116402116403e-05, "loss": 0.1152, "step": 1990 }, { "epoch": 70.79646017699115, "grad_norm": 6.715971946716309, "learning_rate": 2.2486772486772487e-05, "loss": 0.0902, "step": 2000 }, { "epoch": 70.97345132743362, "eval_accuracy": 0.81, "eval_loss": 0.8180540204048157, "eval_runtime": 5.7974, "eval_samples_per_second": 68.996, "eval_steps_per_second": 2.242, "step": 2005 }, { "epoch": 71.15044247787611, "grad_norm": 8.447669982910156, "learning_rate": 2.2321428571428575e-05, "loss": 0.0664, "step": 2010 }, { "epoch": 71.50442477876106, "grad_norm": 10.469520568847656, "learning_rate": 2.215608465608466e-05, "loss": 0.0893, "step": 2020 }, { "epoch": 71.85840707964601, "grad_norm": 6.971242904663086, "learning_rate": 2.1990740740740743e-05, "loss": 0.0904, "step": 2030 }, { "epoch": 72.0, "eval_accuracy": 0.82, "eval_loss": 0.8296825885772705, "eval_runtime": 5.7586, "eval_samples_per_second": 69.462, "eval_steps_per_second": 2.258, "step": 2034 }, { "epoch": 72.21238938053098, "grad_norm": 6.059171199798584, "learning_rate": 2.1825396825396827e-05, "loss": 0.0767, "step": 2040 }, { "epoch": 72.56637168141593, "grad_norm": 9.134629249572754, "learning_rate": 2.166005291005291e-05, "loss": 0.0897, "step": 2050 }, { "epoch": 72.92035398230088, "grad_norm": 6.5583415031433105, "learning_rate": 2.1494708994708996e-05, "loss": 0.0898, "step": 2060 }, { "epoch": 72.99115044247787, "eval_accuracy": 0.82, "eval_loss": 0.8464323282241821, "eval_runtime": 5.7076, "eval_samples_per_second": 70.082, "eval_steps_per_second": 2.278, "step": 2062 }, { "epoch": 73.27433628318585, "grad_norm": 12.021257400512695, "learning_rate": 2.132936507936508e-05, "loss": 0.1061, "step": 2070 }, { "epoch": 73.6283185840708, "grad_norm": 4.3469367027282715, "learning_rate": 2.1164021164021164e-05, "loss": 0.0744, "step": 2080 }, { "epoch": 73.98230088495575, "grad_norm": 9.244330406188965, "learning_rate": 2.0998677248677252e-05, "loss": 0.1013, "step": 2090 }, { "epoch": 73.98230088495575, "eval_accuracy": 0.81, "eval_loss": 0.8324652314186096, "eval_runtime": 5.8164, "eval_samples_per_second": 68.771, "eval_steps_per_second": 2.235, "step": 2090 }, { "epoch": 74.33628318584071, "grad_norm": 4.678669452667236, "learning_rate": 2.0833333333333336e-05, "loss": 0.1001, "step": 2100 }, { "epoch": 74.69026548672566, "grad_norm": 9.78695297241211, "learning_rate": 2.066798941798942e-05, "loss": 0.0726, "step": 2110 }, { "epoch": 74.97345132743362, "eval_accuracy": 0.8, "eval_loss": 0.8771929144859314, "eval_runtime": 5.7555, "eval_samples_per_second": 69.499, "eval_steps_per_second": 2.259, "step": 2118 }, { "epoch": 75.04424778761062, "grad_norm": 3.598501443862915, "learning_rate": 2.0502645502645504e-05, "loss": 0.0975, "step": 2120 }, { "epoch": 75.39823008849558, "grad_norm": 10.049243927001953, "learning_rate": 2.033730158730159e-05, "loss": 0.0858, "step": 2130 }, { "epoch": 75.75221238938053, "grad_norm": 10.887565612792969, "learning_rate": 2.0171957671957673e-05, "loss": 0.0745, "step": 2140 }, { "epoch": 76.0, "eval_accuracy": 0.8125, "eval_loss": 0.8505265712738037, "eval_runtime": 5.6825, "eval_samples_per_second": 70.391, "eval_steps_per_second": 2.288, "step": 2147 }, { "epoch": 76.10619469026548, "grad_norm": 9.399345397949219, "learning_rate": 2.0006613756613757e-05, "loss": 0.0909, "step": 2150 }, { "epoch": 76.46017699115045, "grad_norm": 3.6555662155151367, "learning_rate": 1.984126984126984e-05, "loss": 0.0799, "step": 2160 }, { "epoch": 76.8141592920354, "grad_norm": 5.8622894287109375, "learning_rate": 1.967592592592593e-05, "loss": 0.0891, "step": 2170 }, { "epoch": 76.99115044247787, "eval_accuracy": 0.81, "eval_loss": 0.8693811297416687, "eval_runtime": 5.7342, "eval_samples_per_second": 69.757, "eval_steps_per_second": 2.267, "step": 2175 }, { "epoch": 77.16814159292035, "grad_norm": 3.6685290336608887, "learning_rate": 1.9510582010582013e-05, "loss": 0.0909, "step": 2180 }, { "epoch": 77.52212389380531, "grad_norm": 9.96608829498291, "learning_rate": 1.9345238095238097e-05, "loss": 0.0962, "step": 2190 }, { "epoch": 77.87610619469027, "grad_norm": 7.857000827789307, "learning_rate": 1.917989417989418e-05, "loss": 0.0791, "step": 2200 }, { "epoch": 77.98230088495575, "eval_accuracy": 0.81, "eval_loss": 0.8765752911567688, "eval_runtime": 5.7248, "eval_samples_per_second": 69.872, "eval_steps_per_second": 2.271, "step": 2203 }, { "epoch": 78.23008849557522, "grad_norm": 10.403280258178711, "learning_rate": 1.9014550264550266e-05, "loss": 0.0622, "step": 2210 }, { "epoch": 78.58407079646018, "grad_norm": 6.770401477813721, "learning_rate": 1.884920634920635e-05, "loss": 0.0689, "step": 2220 }, { "epoch": 78.93805309734513, "grad_norm": 10.228433609008789, "learning_rate": 1.8683862433862434e-05, "loss": 0.0639, "step": 2230 }, { "epoch": 78.97345132743362, "eval_accuracy": 0.8125, "eval_loss": 0.8461715579032898, "eval_runtime": 6.0121, "eval_samples_per_second": 66.532, "eval_steps_per_second": 2.162, "step": 2231 }, { "epoch": 79.29203539823008, "grad_norm": 6.610928535461426, "learning_rate": 1.8518518518518518e-05, "loss": 0.0705, "step": 2240 }, { "epoch": 79.64601769911505, "grad_norm": 9.066596984863281, "learning_rate": 1.8353174603174602e-05, "loss": 0.0795, "step": 2250 }, { "epoch": 80.0, "grad_norm": 11.537944793701172, "learning_rate": 1.818783068783069e-05, "loss": 0.0676, "step": 2260 }, { "epoch": 80.0, "eval_accuracy": 0.8075, "eval_loss": 0.8991250395774841, "eval_runtime": 5.7259, "eval_samples_per_second": 69.858, "eval_steps_per_second": 2.27, "step": 2260 }, { "epoch": 80.35398230088495, "grad_norm": 5.221861839294434, "learning_rate": 1.8022486772486774e-05, "loss": 0.0932, "step": 2270 }, { "epoch": 80.70796460176992, "grad_norm": 6.904067039489746, "learning_rate": 1.785714285714286e-05, "loss": 0.0904, "step": 2280 }, { "epoch": 80.99115044247787, "eval_accuracy": 0.815, "eval_loss": 0.8550813794136047, "eval_runtime": 5.7321, "eval_samples_per_second": 69.782, "eval_steps_per_second": 2.268, "step": 2288 }, { "epoch": 81.06194690265487, "grad_norm": 10.940287590026855, "learning_rate": 1.7691798941798943e-05, "loss": 0.0942, "step": 2290 }, { "epoch": 81.41592920353982, "grad_norm": 4.109130382537842, "learning_rate": 1.7526455026455027e-05, "loss": 0.0791, "step": 2300 }, { "epoch": 81.76991150442478, "grad_norm": 10.844812393188477, "learning_rate": 1.736111111111111e-05, "loss": 0.0788, "step": 2310 }, { "epoch": 81.98230088495575, "eval_accuracy": 0.795, "eval_loss": 0.9301651120185852, "eval_runtime": 5.6474, "eval_samples_per_second": 70.829, "eval_steps_per_second": 2.302, "step": 2316 }, { "epoch": 82.12389380530973, "grad_norm": 3.5678551197052, "learning_rate": 1.7195767195767195e-05, "loss": 0.0603, "step": 2320 }, { "epoch": 82.47787610619469, "grad_norm": 7.562065124511719, "learning_rate": 1.703042328042328e-05, "loss": 0.093, "step": 2330 }, { "epoch": 82.83185840707965, "grad_norm": 10.092254638671875, "learning_rate": 1.6865079365079367e-05, "loss": 0.0787, "step": 2340 }, { "epoch": 82.97345132743362, "eval_accuracy": 0.8025, "eval_loss": 0.8706057071685791, "eval_runtime": 5.8345, "eval_samples_per_second": 68.558, "eval_steps_per_second": 2.228, "step": 2344 }, { "epoch": 83.1858407079646, "grad_norm": 4.765111923217773, "learning_rate": 1.669973544973545e-05, "loss": 0.0579, "step": 2350 }, { "epoch": 83.53982300884955, "grad_norm": 6.426796913146973, "learning_rate": 1.6534391534391536e-05, "loss": 0.0697, "step": 2360 }, { "epoch": 83.89380530973452, "grad_norm": 7.382542610168457, "learning_rate": 1.636904761904762e-05, "loss": 0.0918, "step": 2370 }, { "epoch": 84.0, "eval_accuracy": 0.805, "eval_loss": 0.868044912815094, "eval_runtime": 5.7723, "eval_samples_per_second": 69.297, "eval_steps_per_second": 2.252, "step": 2373 }, { "epoch": 84.24778761061947, "grad_norm": 5.388473987579346, "learning_rate": 1.6203703703703704e-05, "loss": 0.0752, "step": 2380 }, { "epoch": 84.60176991150442, "grad_norm": 6.751432418823242, "learning_rate": 1.6038359788359788e-05, "loss": 0.0671, "step": 2390 }, { "epoch": 84.95575221238938, "grad_norm": 8.372601509094238, "learning_rate": 1.5873015873015872e-05, "loss": 0.0681, "step": 2400 }, { "epoch": 84.99115044247787, "eval_accuracy": 0.8125, "eval_loss": 0.8481296300888062, "eval_runtime": 5.711, "eval_samples_per_second": 70.04, "eval_steps_per_second": 2.276, "step": 2401 }, { "epoch": 85.30973451327434, "grad_norm": 6.79984712600708, "learning_rate": 1.5707671957671957e-05, "loss": 0.0634, "step": 2410 }, { "epoch": 85.66371681415929, "grad_norm": 9.60888671875, "learning_rate": 1.5542328042328044e-05, "loss": 0.115, "step": 2420 }, { "epoch": 85.98230088495575, "eval_accuracy": 0.8025, "eval_loss": 0.8552606105804443, "eval_runtime": 6.0227, "eval_samples_per_second": 66.415, "eval_steps_per_second": 2.158, "step": 2429 }, { "epoch": 86.01769911504425, "grad_norm": 9.006217956542969, "learning_rate": 1.537698412698413e-05, "loss": 0.0741, "step": 2430 }, { "epoch": 86.3716814159292, "grad_norm": 8.767806053161621, "learning_rate": 1.5211640211640213e-05, "loss": 0.0652, "step": 2440 }, { "epoch": 86.72566371681415, "grad_norm": 6.8285675048828125, "learning_rate": 1.5046296296296297e-05, "loss": 0.0599, "step": 2450 }, { "epoch": 86.97345132743362, "eval_accuracy": 0.805, "eval_loss": 0.8886809349060059, "eval_runtime": 5.6679, "eval_samples_per_second": 70.573, "eval_steps_per_second": 2.294, "step": 2457 }, { "epoch": 87.07964601769912, "grad_norm": 7.630321502685547, "learning_rate": 1.4880952380952381e-05, "loss": 0.0805, "step": 2460 }, { "epoch": 87.43362831858407, "grad_norm": 8.364428520202637, "learning_rate": 1.4715608465608465e-05, "loss": 0.0743, "step": 2470 }, { "epoch": 87.78761061946902, "grad_norm": 8.15882682800293, "learning_rate": 1.455026455026455e-05, "loss": 0.0774, "step": 2480 }, { "epoch": 88.0, "eval_accuracy": 0.81, "eval_loss": 0.9255210161209106, "eval_runtime": 5.684, "eval_samples_per_second": 70.374, "eval_steps_per_second": 2.287, "step": 2486 }, { "epoch": 88.14159292035399, "grad_norm": 3.7120983600616455, "learning_rate": 1.4384920634920635e-05, "loss": 0.0635, "step": 2490 }, { "epoch": 88.49557522123894, "grad_norm": 3.117091655731201, "learning_rate": 1.4219576719576721e-05, "loss": 0.0722, "step": 2500 }, { "epoch": 88.84955752212389, "grad_norm": 4.884605407714844, "learning_rate": 1.4054232804232805e-05, "loss": 0.0701, "step": 2510 }, { "epoch": 88.99115044247787, "eval_accuracy": 0.81, "eval_loss": 0.8794758319854736, "eval_runtime": 5.697, "eval_samples_per_second": 70.213, "eval_steps_per_second": 2.282, "step": 2514 }, { "epoch": 89.20353982300885, "grad_norm": 6.620547294616699, "learning_rate": 1.388888888888889e-05, "loss": 0.0626, "step": 2520 }, { "epoch": 89.5575221238938, "grad_norm": 6.801345348358154, "learning_rate": 1.3723544973544974e-05, "loss": 0.0682, "step": 2530 }, { "epoch": 89.91150442477876, "grad_norm": 5.619492053985596, "learning_rate": 1.3558201058201058e-05, "loss": 0.074, "step": 2540 }, { "epoch": 89.98230088495575, "eval_accuracy": 0.8175, "eval_loss": 0.8634124994277954, "eval_runtime": 5.7593, "eval_samples_per_second": 69.453, "eval_steps_per_second": 2.257, "step": 2542 }, { "epoch": 90.26548672566372, "grad_norm": 10.153610229492188, "learning_rate": 1.3392857142857144e-05, "loss": 0.0794, "step": 2550 }, { "epoch": 90.61946902654867, "grad_norm": 5.089029312133789, "learning_rate": 1.3227513227513228e-05, "loss": 0.0611, "step": 2560 }, { "epoch": 90.97345132743362, "grad_norm": 6.003979206085205, "learning_rate": 1.3062169312169312e-05, "loss": 0.0497, "step": 2570 }, { "epoch": 90.97345132743362, "eval_accuracy": 0.82, "eval_loss": 0.8793442249298096, "eval_runtime": 6.0414, "eval_samples_per_second": 66.21, "eval_steps_per_second": 2.152, "step": 2570 }, { "epoch": 91.32743362831859, "grad_norm": 6.3248090744018555, "learning_rate": 1.2896825396825398e-05, "loss": 0.0557, "step": 2580 }, { "epoch": 91.68141592920354, "grad_norm": 4.216904640197754, "learning_rate": 1.2731481481481482e-05, "loss": 0.0569, "step": 2590 }, { "epoch": 92.0, "eval_accuracy": 0.7925, "eval_loss": 0.9006764888763428, "eval_runtime": 5.7399, "eval_samples_per_second": 69.688, "eval_steps_per_second": 2.265, "step": 2599 }, { "epoch": 92.03539823008849, "grad_norm": 6.895070552825928, "learning_rate": 1.2566137566137568e-05, "loss": 0.0769, "step": 2600 }, { "epoch": 92.38938053097345, "grad_norm": 3.7500102519989014, "learning_rate": 1.2400793650793652e-05, "loss": 0.0543, "step": 2610 }, { "epoch": 92.7433628318584, "grad_norm": 10.693458557128906, "learning_rate": 1.2235449735449737e-05, "loss": 0.0722, "step": 2620 }, { "epoch": 92.99115044247787, "eval_accuracy": 0.815, "eval_loss": 0.8700942993164062, "eval_runtime": 5.6593, "eval_samples_per_second": 70.681, "eval_steps_per_second": 2.297, "step": 2627 }, { "epoch": 93.09734513274336, "grad_norm": 5.407220363616943, "learning_rate": 1.2070105820105821e-05, "loss": 0.0822, "step": 2630 }, { "epoch": 93.45132743362832, "grad_norm": 2.9195971488952637, "learning_rate": 1.1904761904761905e-05, "loss": 0.0624, "step": 2640 }, { "epoch": 93.80530973451327, "grad_norm": 7.752827167510986, "learning_rate": 1.1739417989417991e-05, "loss": 0.0674, "step": 2650 }, { "epoch": 93.98230088495575, "eval_accuracy": 0.8225, "eval_loss": 0.8879609704017639, "eval_runtime": 5.7797, "eval_samples_per_second": 69.208, "eval_steps_per_second": 2.249, "step": 2655 }, { "epoch": 94.15929203539822, "grad_norm": 6.314310550689697, "learning_rate": 1.1574074074074075e-05, "loss": 0.0685, "step": 2660 }, { "epoch": 94.51327433628319, "grad_norm": 9.070263862609863, "learning_rate": 1.140873015873016e-05, "loss": 0.0649, "step": 2670 }, { "epoch": 94.86725663716814, "grad_norm": 4.211071968078613, "learning_rate": 1.1243386243386244e-05, "loss": 0.0643, "step": 2680 }, { "epoch": 94.97345132743362, "eval_accuracy": 0.8075, "eval_loss": 0.8854994177818298, "eval_runtime": 5.6723, "eval_samples_per_second": 70.518, "eval_steps_per_second": 2.292, "step": 2683 }, { "epoch": 95.22123893805309, "grad_norm": 6.701349258422852, "learning_rate": 1.107804232804233e-05, "loss": 0.054, "step": 2690 }, { "epoch": 95.57522123893806, "grad_norm": 9.44869327545166, "learning_rate": 1.0912698412698414e-05, "loss": 0.0594, "step": 2700 }, { "epoch": 95.929203539823, "grad_norm": 5.749889373779297, "learning_rate": 1.0747354497354498e-05, "loss": 0.0583, "step": 2710 }, { "epoch": 96.0, "eval_accuracy": 0.815, "eval_loss": 0.8918322920799255, "eval_runtime": 5.8266, "eval_samples_per_second": 68.651, "eval_steps_per_second": 2.231, "step": 2712 }, { "epoch": 96.28318584070796, "grad_norm": 9.790497779846191, "learning_rate": 1.0582010582010582e-05, "loss": 0.0731, "step": 2720 }, { "epoch": 96.63716814159292, "grad_norm": 10.20504379272461, "learning_rate": 1.0416666666666668e-05, "loss": 0.067, "step": 2730 }, { "epoch": 96.99115044247787, "grad_norm": 9.304971694946289, "learning_rate": 1.0251322751322752e-05, "loss": 0.0558, "step": 2740 }, { "epoch": 96.99115044247787, "eval_accuracy": 0.8275, "eval_loss": 0.8735535144805908, "eval_runtime": 5.748, "eval_samples_per_second": 69.59, "eval_steps_per_second": 2.262, "step": 2740 }, { "epoch": 97.34513274336283, "grad_norm": 2.155658483505249, "learning_rate": 1.0085978835978836e-05, "loss": 0.0952, "step": 2750 }, { "epoch": 97.69911504424779, "grad_norm": 4.216080665588379, "learning_rate": 9.92063492063492e-06, "loss": 0.0622, "step": 2760 }, { "epoch": 97.98230088495575, "eval_accuracy": 0.815, "eval_loss": 0.905790388584137, "eval_runtime": 6.0004, "eval_samples_per_second": 66.662, "eval_steps_per_second": 2.167, "step": 2768 }, { "epoch": 98.05309734513274, "grad_norm": 5.462803840637207, "learning_rate": 9.755291005291007e-06, "loss": 0.0576, "step": 2770 }, { "epoch": 98.40707964601769, "grad_norm": 6.204135894775391, "learning_rate": 9.58994708994709e-06, "loss": 0.0871, "step": 2780 }, { "epoch": 98.76106194690266, "grad_norm": 7.025479316711426, "learning_rate": 9.424603174603175e-06, "loss": 0.0689, "step": 2790 }, { "epoch": 98.97345132743362, "eval_accuracy": 0.8075, "eval_loss": 0.9006683230400085, "eval_runtime": 5.6446, "eval_samples_per_second": 70.864, "eval_steps_per_second": 2.303, "step": 2796 }, { "epoch": 99.11504424778761, "grad_norm": 12.175108909606934, "learning_rate": 9.259259259259259e-06, "loss": 0.0653, "step": 2800 }, { "epoch": 99.46902654867256, "grad_norm": 2.277102470397949, "learning_rate": 9.093915343915345e-06, "loss": 0.0533, "step": 2810 }, { "epoch": 99.82300884955752, "grad_norm": 3.374624252319336, "learning_rate": 8.92857142857143e-06, "loss": 0.0782, "step": 2820 }, { "epoch": 100.0, "eval_accuracy": 0.8025, "eval_loss": 0.9216282367706299, "eval_runtime": 5.6935, "eval_samples_per_second": 70.255, "eval_steps_per_second": 2.283, "step": 2825 }, { "epoch": 100.17699115044248, "grad_norm": 4.110525131225586, "learning_rate": 8.763227513227513e-06, "loss": 0.0678, "step": 2830 }, { "epoch": 100.53097345132744, "grad_norm": 1.8253668546676636, "learning_rate": 8.597883597883598e-06, "loss": 0.0506, "step": 2840 }, { "epoch": 100.88495575221239, "grad_norm": 5.427482604980469, "learning_rate": 8.432539682539684e-06, "loss": 0.0696, "step": 2850 }, { "epoch": 100.99115044247787, "eval_accuracy": 0.8075, "eval_loss": 0.9158985614776611, "eval_runtime": 5.7392, "eval_samples_per_second": 69.696, "eval_steps_per_second": 2.265, "step": 2853 }, { "epoch": 101.23893805309734, "grad_norm": 11.669960975646973, "learning_rate": 8.267195767195768e-06, "loss": 0.0424, "step": 2860 }, { "epoch": 101.59292035398231, "grad_norm": 3.504544496536255, "learning_rate": 8.101851851851852e-06, "loss": 0.0676, "step": 2870 }, { "epoch": 101.94690265486726, "grad_norm": 4.86131477355957, "learning_rate": 7.936507936507936e-06, "loss": 0.0554, "step": 2880 }, { "epoch": 101.98230088495575, "eval_accuracy": 0.8125, "eval_loss": 0.9194761514663696, "eval_runtime": 5.7011, "eval_samples_per_second": 70.162, "eval_steps_per_second": 2.28, "step": 2881 }, { "epoch": 102.30088495575221, "grad_norm": 10.070198059082031, "learning_rate": 7.771164021164022e-06, "loss": 0.0784, "step": 2890 }, { "epoch": 102.65486725663717, "grad_norm": 6.48520040512085, "learning_rate": 7.605820105820106e-06, "loss": 0.0585, "step": 2900 }, { "epoch": 102.97345132743362, "eval_accuracy": 0.8125, "eval_loss": 0.9314340949058533, "eval_runtime": 5.7273, "eval_samples_per_second": 69.841, "eval_steps_per_second": 2.27, "step": 2909 }, { "epoch": 103.00884955752213, "grad_norm": 3.9195964336395264, "learning_rate": 7.4404761904761905e-06, "loss": 0.068, "step": 2910 }, { "epoch": 103.36283185840708, "grad_norm": 7.709494590759277, "learning_rate": 7.275132275132275e-06, "loss": 0.0555, "step": 2920 }, { "epoch": 103.71681415929204, "grad_norm": 5.181375026702881, "learning_rate": 7.1097883597883606e-06, "loss": 0.0541, "step": 2930 }, { "epoch": 104.0, "eval_accuracy": 0.825, "eval_loss": 0.893872082233429, "eval_runtime": 5.7025, "eval_samples_per_second": 70.145, "eval_steps_per_second": 2.28, "step": 2938 }, { "epoch": 104.070796460177, "grad_norm": 11.922988891601562, "learning_rate": 6.944444444444445e-06, "loss": 0.0656, "step": 2940 }, { "epoch": 104.42477876106194, "grad_norm": 6.153034687042236, "learning_rate": 6.779100529100529e-06, "loss": 0.0668, "step": 2950 }, { "epoch": 104.77876106194691, "grad_norm": 7.586311340332031, "learning_rate": 6.613756613756614e-06, "loss": 0.0636, "step": 2960 }, { "epoch": 104.99115044247787, "eval_accuracy": 0.8025, "eval_loss": 0.9045028686523438, "eval_runtime": 5.7995, "eval_samples_per_second": 68.972, "eval_steps_per_second": 2.242, "step": 2966 }, { "epoch": 105.13274336283186, "grad_norm": 7.499364852905273, "learning_rate": 6.448412698412699e-06, "loss": 0.0578, "step": 2970 }, { "epoch": 105.48672566371681, "grad_norm": 5.914554595947266, "learning_rate": 6.283068783068784e-06, "loss": 0.0683, "step": 2980 }, { "epoch": 105.84070796460178, "grad_norm": 5.446691513061523, "learning_rate": 6.117724867724868e-06, "loss": 0.0684, "step": 2990 }, { "epoch": 105.98230088495575, "eval_accuracy": 0.8075, "eval_loss": 0.8892062306404114, "eval_runtime": 5.7229, "eval_samples_per_second": 69.895, "eval_steps_per_second": 2.272, "step": 2994 }, { "epoch": 106.19469026548673, "grad_norm": 5.555883407592773, "learning_rate": 5.9523809523809525e-06, "loss": 0.0629, "step": 3000 }, { "epoch": 106.54867256637168, "grad_norm": 6.0038042068481445, "learning_rate": 5.787037037037038e-06, "loss": 0.0637, "step": 3010 }, { "epoch": 106.90265486725664, "grad_norm": 4.660765647888184, "learning_rate": 5.621693121693122e-06, "loss": 0.0608, "step": 3020 }, { "epoch": 106.97345132743362, "eval_accuracy": 0.8075, "eval_loss": 0.8998861908912659, "eval_runtime": 5.6896, "eval_samples_per_second": 70.303, "eval_steps_per_second": 2.285, "step": 3022 }, { "epoch": 107.2566371681416, "grad_norm": 2.3250956535339355, "learning_rate": 5.456349206349207e-06, "loss": 0.065, "step": 3030 }, { "epoch": 107.61061946902655, "grad_norm": 3.4695749282836914, "learning_rate": 5.291005291005291e-06, "loss": 0.0614, "step": 3040 }, { "epoch": 107.96460176991151, "grad_norm": 1.7842631340026855, "learning_rate": 5.125661375661376e-06, "loss": 0.0663, "step": 3050 }, { "epoch": 108.0, "eval_accuracy": 0.8075, "eval_loss": 0.9033371210098267, "eval_runtime": 6.1003, "eval_samples_per_second": 65.571, "eval_steps_per_second": 2.131, "step": 3051 }, { "epoch": 108.31858407079646, "grad_norm": 4.970130920410156, "learning_rate": 4.96031746031746e-06, "loss": 0.0466, "step": 3060 }, { "epoch": 108.67256637168141, "grad_norm": 6.935737609863281, "learning_rate": 4.794973544973545e-06, "loss": 0.054, "step": 3070 }, { "epoch": 108.99115044247787, "eval_accuracy": 0.805, "eval_loss": 0.9248512387275696, "eval_runtime": 5.6759, "eval_samples_per_second": 70.473, "eval_steps_per_second": 2.29, "step": 3079 }, { "epoch": 109.02654867256638, "grad_norm": 1.8442103862762451, "learning_rate": 4.6296296296296296e-06, "loss": 0.0681, "step": 3080 }, { "epoch": 109.38053097345133, "grad_norm": 6.584358215332031, "learning_rate": 4.464285714285715e-06, "loss": 0.0551, "step": 3090 }, { "epoch": 109.73451327433628, "grad_norm": 2.5174403190612793, "learning_rate": 4.298941798941799e-06, "loss": 0.0538, "step": 3100 }, { "epoch": 109.98230088495575, "eval_accuracy": 0.81, "eval_loss": 0.9065310955047607, "eval_runtime": 5.7721, "eval_samples_per_second": 69.299, "eval_steps_per_second": 2.252, "step": 3107 }, { "epoch": 110.08849557522124, "grad_norm": 3.406076669692993, "learning_rate": 4.133597883597884e-06, "loss": 0.0768, "step": 3110 }, { "epoch": 110.4424778761062, "grad_norm": 3.832432746887207, "learning_rate": 3.968253968253968e-06, "loss": 0.0674, "step": 3120 }, { "epoch": 110.79646017699115, "grad_norm": 2.3623971939086914, "learning_rate": 3.802910052910053e-06, "loss": 0.0696, "step": 3130 }, { "epoch": 110.97345132743362, "eval_accuracy": 0.8175, "eval_loss": 0.9002352952957153, "eval_runtime": 5.823, "eval_samples_per_second": 68.693, "eval_steps_per_second": 2.233, "step": 3135 }, { "epoch": 111.15044247787611, "grad_norm": 8.914955139160156, "learning_rate": 3.6375661375661373e-06, "loss": 0.059, "step": 3140 }, { "epoch": 111.50442477876106, "grad_norm": 6.295753479003906, "learning_rate": 3.4722222222222224e-06, "loss": 0.0496, "step": 3150 }, { "epoch": 111.85840707964601, "grad_norm": 4.604882717132568, "learning_rate": 3.306878306878307e-06, "loss": 0.0585, "step": 3160 }, { "epoch": 112.0, "eval_accuracy": 0.8025, "eval_loss": 0.9105786085128784, "eval_runtime": 5.6843, "eval_samples_per_second": 70.369, "eval_steps_per_second": 2.287, "step": 3164 }, { "epoch": 112.21238938053098, "grad_norm": 6.111964702606201, "learning_rate": 3.141534391534392e-06, "loss": 0.0387, "step": 3170 }, { "epoch": 112.56637168141593, "grad_norm": 9.29433536529541, "learning_rate": 2.9761904761904763e-06, "loss": 0.0623, "step": 3180 }, { "epoch": 112.92035398230088, "grad_norm": 10.9270601272583, "learning_rate": 2.810846560846561e-06, "loss": 0.0641, "step": 3190 }, { "epoch": 112.99115044247787, "eval_accuracy": 0.81, "eval_loss": 0.9088242053985596, "eval_runtime": 5.7312, "eval_samples_per_second": 69.794, "eval_steps_per_second": 2.268, "step": 3192 }, { "epoch": 113.27433628318585, "grad_norm": 9.549851417541504, "learning_rate": 2.6455026455026455e-06, "loss": 0.0555, "step": 3200 }, { "epoch": 113.6283185840708, "grad_norm": 7.800489902496338, "learning_rate": 2.48015873015873e-06, "loss": 0.0527, "step": 3210 }, { "epoch": 113.98230088495575, "grad_norm": 4.22268533706665, "learning_rate": 2.3148148148148148e-06, "loss": 0.0611, "step": 3220 }, { "epoch": 113.98230088495575, "eval_accuracy": 0.8075, "eval_loss": 0.915170431137085, "eval_runtime": 5.6521, "eval_samples_per_second": 70.77, "eval_steps_per_second": 2.3, "step": 3220 }, { "epoch": 114.33628318584071, "grad_norm": 6.939643383026123, "learning_rate": 2.1494708994708994e-06, "loss": 0.092, "step": 3230 }, { "epoch": 114.69026548672566, "grad_norm": 4.18217658996582, "learning_rate": 1.984126984126984e-06, "loss": 0.0528, "step": 3240 }, { "epoch": 114.97345132743362, "eval_accuracy": 0.8125, "eval_loss": 0.9140109419822693, "eval_runtime": 5.9093, "eval_samples_per_second": 67.69, "eval_steps_per_second": 2.2, "step": 3248 }, { "epoch": 115.04424778761062, "grad_norm": 2.470015287399292, "learning_rate": 1.8187830687830687e-06, "loss": 0.0664, "step": 3250 }, { "epoch": 115.39823008849558, "grad_norm": 5.625057220458984, "learning_rate": 1.6534391534391535e-06, "loss": 0.0567, "step": 3260 }, { "epoch": 115.75221238938053, "grad_norm": 8.182403564453125, "learning_rate": 1.4880952380952381e-06, "loss": 0.0631, "step": 3270 }, { "epoch": 116.0, "eval_accuracy": 0.81, "eval_loss": 0.9184489250183105, "eval_runtime": 5.7237, "eval_samples_per_second": 69.885, "eval_steps_per_second": 2.271, "step": 3277 }, { "epoch": 116.10619469026548, "grad_norm": 3.7127795219421387, "learning_rate": 1.3227513227513228e-06, "loss": 0.0667, "step": 3280 }, { "epoch": 116.46017699115045, "grad_norm": 9.592253684997559, "learning_rate": 1.1574074074074074e-06, "loss": 0.0559, "step": 3290 }, { "epoch": 116.8141592920354, "grad_norm": 12.001859664916992, "learning_rate": 9.92063492063492e-07, "loss": 0.0744, "step": 3300 }, { "epoch": 116.99115044247787, "eval_accuracy": 0.8125, "eval_loss": 0.9216000437736511, "eval_runtime": 5.7416, "eval_samples_per_second": 69.667, "eval_steps_per_second": 2.264, "step": 3305 }, { "epoch": 117.16814159292035, "grad_norm": 5.99811315536499, "learning_rate": 8.267195767195768e-07, "loss": 0.051, "step": 3310 }, { "epoch": 117.52212389380531, "grad_norm": 4.772040367126465, "learning_rate": 6.613756613756614e-07, "loss": 0.0705, "step": 3320 }, { "epoch": 117.87610619469027, "grad_norm": 3.2538766860961914, "learning_rate": 4.96031746031746e-07, "loss": 0.0407, "step": 3330 }, { "epoch": 117.98230088495575, "eval_accuracy": 0.8125, "eval_loss": 0.9210975766181946, "eval_runtime": 5.6807, "eval_samples_per_second": 70.414, "eval_steps_per_second": 2.288, "step": 3333 }, { "epoch": 118.23008849557522, "grad_norm": 5.481634140014648, "learning_rate": 3.306878306878307e-07, "loss": 0.0682, "step": 3340 }, { "epoch": 118.58407079646018, "grad_norm": 7.475958824157715, "learning_rate": 1.6534391534391535e-07, "loss": 0.058, "step": 3350 }, { "epoch": 118.93805309734513, "grad_norm": 12.294817924499512, "learning_rate": 0.0, "loss": 0.0573, "step": 3360 }, { "epoch": 118.93805309734513, "eval_accuracy": 0.81, "eval_loss": 0.92020583152771, "eval_runtime": 5.7978, "eval_samples_per_second": 68.992, "eval_steps_per_second": 2.242, "step": 3360 }, { "epoch": 118.93805309734513, "step": 3360, "total_flos": 1.0779764781475824e+19, "train_loss": 0.2818299961143306, "train_runtime": 9749.4057, "train_samples_per_second": 44.31, "train_steps_per_second": 0.345 } ], "logging_steps": 10, "max_steps": 3360, "num_input_tokens_seen": 0, "num_train_epochs": 120, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0779764781475824e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }