{ "best_metric": 0.74, "best_model_checkpoint": "swinv2-base-patch4-window16-256-finetuned-eurosat/checkpoint-1446", "epoch": 29.657794676806084, "eval_steps": 500, "global_step": 1950, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15, "grad_norm": 10.112361907958984, "learning_rate": 2.564102564102564e-06, "loss": 6.2407, "step": 10 }, { "epoch": 0.3, "grad_norm": 8.693501472473145, "learning_rate": 5.128205128205128e-06, "loss": 6.2346, "step": 20 }, { "epoch": 0.46, "grad_norm": 31.615943908691406, "learning_rate": 7.692307692307694e-06, "loss": 6.2382, "step": 30 }, { "epoch": 0.61, "grad_norm": 4.278875827789307, "learning_rate": 1.0256410256410256e-05, "loss": 6.2402, "step": 40 }, { "epoch": 0.76, "grad_norm": 5.3635783195495605, "learning_rate": 1.282051282051282e-05, "loss": 6.2228, "step": 50 }, { "epoch": 0.91, "grad_norm": 3.80220890045166, "learning_rate": 1.5384615384615387e-05, "loss": 6.2125, "step": 60 }, { "epoch": 0.99, "eval_accuracy": 0.0, "eval_loss": 6.20700216293335, "eval_runtime": 9.9729, "eval_samples_per_second": 35.095, "eval_steps_per_second": 3.008, "step": 65 }, { "epoch": 1.06, "grad_norm": 4.892258167266846, "learning_rate": 1.794871794871795e-05, "loss": 6.197, "step": 70 }, { "epoch": 1.22, "grad_norm": 5.695966720581055, "learning_rate": 2.0512820512820512e-05, "loss": 6.161, "step": 80 }, { "epoch": 1.37, "grad_norm": 5.43502950668335, "learning_rate": 2.307692307692308e-05, "loss": 6.1137, "step": 90 }, { "epoch": 1.52, "grad_norm": 10.649894714355469, "learning_rate": 2.564102564102564e-05, "loss": 6.1187, "step": 100 }, { "epoch": 1.67, "grad_norm": 6.482412815093994, "learning_rate": 2.8205128205128207e-05, "loss": 6.0495, "step": 110 }, { "epoch": 1.83, "grad_norm": 20.60057830810547, "learning_rate": 3.0769230769230774e-05, "loss": 6.0249, "step": 120 }, { "epoch": 1.98, "grad_norm": 10.727930068969727, "learning_rate": 3.3333333333333335e-05, "loss": 5.9584, "step": 130 }, { "epoch": 1.99, "eval_accuracy": 0.054285714285714284, "eval_loss": 5.894497871398926, "eval_runtime": 8.019, "eval_samples_per_second": 43.646, "eval_steps_per_second": 3.741, "step": 131 }, { "epoch": 2.13, "grad_norm": 9.321084976196289, "learning_rate": 3.58974358974359e-05, "loss": 5.6353, "step": 140 }, { "epoch": 2.28, "grad_norm": 15.699618339538574, "learning_rate": 3.846153846153846e-05, "loss": 5.4549, "step": 150 }, { "epoch": 2.43, "grad_norm": 13.360058784484863, "learning_rate": 4.1025641025641023e-05, "loss": 5.1768, "step": 160 }, { "epoch": 2.59, "grad_norm": 14.356024742126465, "learning_rate": 4.358974358974359e-05, "loss": 5.08, "step": 170 }, { "epoch": 2.74, "grad_norm": 12.42605209350586, "learning_rate": 4.615384615384616e-05, "loss": 4.8681, "step": 180 }, { "epoch": 2.89, "grad_norm": 13.120949745178223, "learning_rate": 4.871794871794872e-05, "loss": 4.7047, "step": 190 }, { "epoch": 3.0, "eval_accuracy": 0.3028571428571429, "eval_loss": 4.368250370025635, "eval_runtime": 8.0905, "eval_samples_per_second": 43.26, "eval_steps_per_second": 3.708, "step": 197 }, { "epoch": 3.04, "grad_norm": 14.149765014648438, "learning_rate": 4.985754985754986e-05, "loss": 4.2241, "step": 200 }, { "epoch": 3.19, "grad_norm": 12.356833457946777, "learning_rate": 4.9572649572649575e-05, "loss": 3.6145, "step": 210 }, { "epoch": 3.35, "grad_norm": 13.141170501708984, "learning_rate": 4.928774928774929e-05, "loss": 3.3265, "step": 220 }, { "epoch": 3.5, "grad_norm": 11.219949722290039, "learning_rate": 4.9002849002849004e-05, "loss": 3.1353, "step": 230 }, { "epoch": 3.65, "grad_norm": 12.410125732421875, "learning_rate": 4.871794871794872e-05, "loss": 2.9184, "step": 240 }, { "epoch": 3.8, "grad_norm": 12.346807479858398, "learning_rate": 4.8433048433048433e-05, "loss": 2.6721, "step": 250 }, { "epoch": 3.95, "grad_norm": 11.234230995178223, "learning_rate": 4.814814814814815e-05, "loss": 2.7217, "step": 260 }, { "epoch": 4.0, "eval_accuracy": 0.5457142857142857, "eval_loss": 2.71696400642395, "eval_runtime": 8.017, "eval_samples_per_second": 43.657, "eval_steps_per_second": 3.742, "step": 263 }, { "epoch": 4.11, "grad_norm": 9.922853469848633, "learning_rate": 4.786324786324787e-05, "loss": 1.9846, "step": 270 }, { "epoch": 4.26, "grad_norm": 10.338376998901367, "learning_rate": 4.7578347578347584e-05, "loss": 1.7283, "step": 280 }, { "epoch": 4.41, "grad_norm": 10.503661155700684, "learning_rate": 4.72934472934473e-05, "loss": 1.7337, "step": 290 }, { "epoch": 4.56, "grad_norm": 9.319334983825684, "learning_rate": 4.700854700854701e-05, "loss": 1.6741, "step": 300 }, { "epoch": 4.71, "grad_norm": 9.088035583496094, "learning_rate": 4.672364672364672e-05, "loss": 1.5406, "step": 310 }, { "epoch": 4.87, "grad_norm": 8.811790466308594, "learning_rate": 4.643874643874644e-05, "loss": 1.6097, "step": 320 }, { "epoch": 4.99, "eval_accuracy": 0.6314285714285715, "eval_loss": 2.015495538711548, "eval_runtime": 7.9582, "eval_samples_per_second": 43.98, "eval_steps_per_second": 3.77, "step": 328 }, { "epoch": 5.02, "grad_norm": 9.573624610900879, "learning_rate": 4.615384615384616e-05, "loss": 1.4472, "step": 330 }, { "epoch": 5.17, "grad_norm": 9.028331756591797, "learning_rate": 4.586894586894587e-05, "loss": 0.9959, "step": 340 }, { "epoch": 5.32, "grad_norm": 8.725614547729492, "learning_rate": 4.558404558404559e-05, "loss": 0.945, "step": 350 }, { "epoch": 5.48, "grad_norm": 10.008910179138184, "learning_rate": 4.52991452991453e-05, "loss": 0.9564, "step": 360 }, { "epoch": 5.63, "grad_norm": 9.664880752563477, "learning_rate": 4.501424501424502e-05, "loss": 0.9423, "step": 370 }, { "epoch": 5.78, "grad_norm": 7.615637302398682, "learning_rate": 4.472934472934473e-05, "loss": 0.9333, "step": 380 }, { "epoch": 5.93, "grad_norm": 9.067399978637695, "learning_rate": 4.4444444444444447e-05, "loss": 0.8932, "step": 390 }, { "epoch": 5.99, "eval_accuracy": 0.6742857142857143, "eval_loss": 1.70182204246521, "eval_runtime": 8.0389, "eval_samples_per_second": 43.538, "eval_steps_per_second": 3.732, "step": 394 }, { "epoch": 6.08, "grad_norm": 5.935275554656982, "learning_rate": 4.415954415954416e-05, "loss": 0.7379, "step": 400 }, { "epoch": 6.24, "grad_norm": 7.257266521453857, "learning_rate": 4.3874643874643876e-05, "loss": 0.5602, "step": 410 }, { "epoch": 6.39, "grad_norm": 9.825379371643066, "learning_rate": 4.358974358974359e-05, "loss": 0.5144, "step": 420 }, { "epoch": 6.54, "grad_norm": 6.920632362365723, "learning_rate": 4.3304843304843306e-05, "loss": 0.5718, "step": 430 }, { "epoch": 6.69, "grad_norm": 7.798554420471191, "learning_rate": 4.301994301994302e-05, "loss": 0.515, "step": 440 }, { "epoch": 6.84, "grad_norm": 6.575021266937256, "learning_rate": 4.2735042735042735e-05, "loss": 0.5472, "step": 450 }, { "epoch": 7.0, "grad_norm": 5.641183853149414, "learning_rate": 4.2450142450142457e-05, "loss": 0.5734, "step": 460 }, { "epoch": 7.0, "eval_accuracy": 0.7057142857142857, "eval_loss": 1.5249170064926147, "eval_runtime": 8.0507, "eval_samples_per_second": 43.474, "eval_steps_per_second": 3.726, "step": 460 }, { "epoch": 7.15, "grad_norm": 3.5870327949523926, "learning_rate": 4.216524216524217e-05, "loss": 0.3342, "step": 470 }, { "epoch": 7.3, "grad_norm": 4.1048479080200195, "learning_rate": 4.1880341880341886e-05, "loss": 0.3382, "step": 480 }, { "epoch": 7.45, "grad_norm": 6.017439842224121, "learning_rate": 4.15954415954416e-05, "loss": 0.3804, "step": 490 }, { "epoch": 7.6, "grad_norm": 5.106074333190918, "learning_rate": 4.131054131054131e-05, "loss": 0.3606, "step": 500 }, { "epoch": 7.76, "grad_norm": 5.5891900062561035, "learning_rate": 4.1025641025641023e-05, "loss": 0.3295, "step": 510 }, { "epoch": 7.91, "grad_norm": 4.079031944274902, "learning_rate": 4.074074074074074e-05, "loss": 0.324, "step": 520 }, { "epoch": 8.0, "eval_accuracy": 0.7085714285714285, "eval_loss": 1.4846410751342773, "eval_runtime": 7.9798, "eval_samples_per_second": 43.861, "eval_steps_per_second": 3.76, "step": 526 }, { "epoch": 8.06, "grad_norm": 3.212510824203491, "learning_rate": 4.045584045584046e-05, "loss": 0.2964, "step": 530 }, { "epoch": 8.21, "grad_norm": 5.004084587097168, "learning_rate": 4.0170940170940174e-05, "loss": 0.2145, "step": 540 }, { "epoch": 8.37, "grad_norm": 4.74351167678833, "learning_rate": 3.988603988603989e-05, "loss": 0.2206, "step": 550 }, { "epoch": 8.52, "grad_norm": 5.272638320922852, "learning_rate": 3.9601139601139604e-05, "loss": 0.2131, "step": 560 }, { "epoch": 8.67, "grad_norm": 3.062843084335327, "learning_rate": 3.931623931623932e-05, "loss": 0.2447, "step": 570 }, { "epoch": 8.82, "grad_norm": 3.7355995178222656, "learning_rate": 3.903133903133903e-05, "loss": 0.213, "step": 580 }, { "epoch": 8.97, "grad_norm": 3.62921404838562, "learning_rate": 3.874643874643875e-05, "loss": 0.2195, "step": 590 }, { "epoch": 8.99, "eval_accuracy": 0.7114285714285714, "eval_loss": 1.4269201755523682, "eval_runtime": 8.004, "eval_samples_per_second": 43.728, "eval_steps_per_second": 3.748, "step": 591 }, { "epoch": 9.13, "grad_norm": 2.647521734237671, "learning_rate": 3.846153846153846e-05, "loss": 0.1677, "step": 600 }, { "epoch": 9.28, "grad_norm": 4.363504409790039, "learning_rate": 3.817663817663818e-05, "loss": 0.1513, "step": 610 }, { "epoch": 9.43, "grad_norm": 2.5766873359680176, "learning_rate": 3.789173789173789e-05, "loss": 0.1684, "step": 620 }, { "epoch": 9.58, "grad_norm": 3.8854830265045166, "learning_rate": 3.760683760683761e-05, "loss": 0.1552, "step": 630 }, { "epoch": 9.73, "grad_norm": 6.697465896606445, "learning_rate": 3.732193732193732e-05, "loss": 0.188, "step": 640 }, { "epoch": 9.89, "grad_norm": 3.860522985458374, "learning_rate": 3.7037037037037037e-05, "loss": 0.1679, "step": 650 }, { "epoch": 9.99, "eval_accuracy": 0.7171428571428572, "eval_loss": 1.4169081449508667, "eval_runtime": 8.0283, "eval_samples_per_second": 43.596, "eval_steps_per_second": 3.737, "step": 657 }, { "epoch": 10.04, "grad_norm": 4.154173374176025, "learning_rate": 3.675213675213676e-05, "loss": 0.1645, "step": 660 }, { "epoch": 10.19, "grad_norm": 1.8003276586532593, "learning_rate": 3.646723646723647e-05, "loss": 0.105, "step": 670 }, { "epoch": 10.34, "grad_norm": 4.1917619705200195, "learning_rate": 3.618233618233619e-05, "loss": 0.149, "step": 680 }, { "epoch": 10.49, "grad_norm": 3.338636636734009, "learning_rate": 3.58974358974359e-05, "loss": 0.1287, "step": 690 }, { "epoch": 10.65, "grad_norm": 1.6283141374588013, "learning_rate": 3.561253561253561e-05, "loss": 0.1458, "step": 700 }, { "epoch": 10.8, "grad_norm": 2.769218921661377, "learning_rate": 3.5327635327635325e-05, "loss": 0.1394, "step": 710 }, { "epoch": 10.95, "grad_norm": 3.2028868198394775, "learning_rate": 3.504273504273504e-05, "loss": 0.1277, "step": 720 }, { "epoch": 11.0, "eval_accuracy": 0.7057142857142857, "eval_loss": 1.404009222984314, "eval_runtime": 8.031, "eval_samples_per_second": 43.581, "eval_steps_per_second": 3.736, "step": 723 }, { "epoch": 11.1, "grad_norm": 1.2642875909805298, "learning_rate": 3.475783475783476e-05, "loss": 0.1187, "step": 730 }, { "epoch": 11.25, "grad_norm": 1.5215080976486206, "learning_rate": 3.4472934472934476e-05, "loss": 0.0854, "step": 740 }, { "epoch": 11.41, "grad_norm": 2.877058982849121, "learning_rate": 3.418803418803419e-05, "loss": 0.1105, "step": 750 }, { "epoch": 11.56, "grad_norm": 5.0010552406311035, "learning_rate": 3.3903133903133905e-05, "loss": 0.0912, "step": 760 }, { "epoch": 11.71, "grad_norm": 5.7503981590271, "learning_rate": 3.361823361823362e-05, "loss": 0.1264, "step": 770 }, { "epoch": 11.86, "grad_norm": 3.2310426235198975, "learning_rate": 3.3333333333333335e-05, "loss": 0.1238, "step": 780 }, { "epoch": 12.0, "eval_accuracy": 0.7285714285714285, "eval_loss": 1.4007512331008911, "eval_runtime": 8.0356, "eval_samples_per_second": 43.556, "eval_steps_per_second": 3.733, "step": 789 }, { "epoch": 12.02, "grad_norm": 1.719030737876892, "learning_rate": 3.304843304843305e-05, "loss": 0.0817, "step": 790 }, { "epoch": 12.17, "grad_norm": 3.475520610809326, "learning_rate": 3.2763532763532764e-05, "loss": 0.0765, "step": 800 }, { "epoch": 12.32, "grad_norm": 3.978292226791382, "learning_rate": 3.247863247863248e-05, "loss": 0.0874, "step": 810 }, { "epoch": 12.47, "grad_norm": 1.6397371292114258, "learning_rate": 3.2193732193732194e-05, "loss": 0.1348, "step": 820 }, { "epoch": 12.62, "grad_norm": 0.9705621600151062, "learning_rate": 3.190883190883191e-05, "loss": 0.057, "step": 830 }, { "epoch": 12.78, "grad_norm": 3.8919146060943604, "learning_rate": 3.162393162393162e-05, "loss": 0.085, "step": 840 }, { "epoch": 12.93, "grad_norm": 1.4797801971435547, "learning_rate": 3.133903133903134e-05, "loss": 0.088, "step": 850 }, { "epoch": 12.99, "eval_accuracy": 0.7114285714285714, "eval_loss": 1.3840457201004028, "eval_runtime": 7.9781, "eval_samples_per_second": 43.87, "eval_steps_per_second": 3.76, "step": 854 }, { "epoch": 13.08, "grad_norm": 2.244473695755005, "learning_rate": 3.105413105413106e-05, "loss": 0.0673, "step": 860 }, { "epoch": 13.23, "grad_norm": 1.467897653579712, "learning_rate": 3.0769230769230774e-05, "loss": 0.0523, "step": 870 }, { "epoch": 13.38, "grad_norm": 2.4079532623291016, "learning_rate": 3.0484330484330486e-05, "loss": 0.0752, "step": 880 }, { "epoch": 13.54, "grad_norm": 3.189384698867798, "learning_rate": 3.01994301994302e-05, "loss": 0.0559, "step": 890 }, { "epoch": 13.69, "grad_norm": 2.8496036529541016, "learning_rate": 2.9914529914529915e-05, "loss": 0.0688, "step": 900 }, { "epoch": 13.84, "grad_norm": 0.6937215328216553, "learning_rate": 2.962962962962963e-05, "loss": 0.073, "step": 910 }, { "epoch": 13.99, "grad_norm": 1.4593366384506226, "learning_rate": 2.9344729344729345e-05, "loss": 0.0834, "step": 920 }, { "epoch": 13.99, "eval_accuracy": 0.72, "eval_loss": 1.3873815536499023, "eval_runtime": 8.1063, "eval_samples_per_second": 43.176, "eval_steps_per_second": 3.701, "step": 920 }, { "epoch": 14.14, "grad_norm": 0.6792957186698914, "learning_rate": 2.9059829059829063e-05, "loss": 0.0434, "step": 930 }, { "epoch": 14.3, "grad_norm": 1.9660212993621826, "learning_rate": 2.8774928774928778e-05, "loss": 0.0457, "step": 940 }, { "epoch": 14.45, "grad_norm": 1.9186339378356934, "learning_rate": 2.8490028490028492e-05, "loss": 0.0485, "step": 950 }, { "epoch": 14.6, "grad_norm": 1.0086941719055176, "learning_rate": 2.8205128205128207e-05, "loss": 0.0472, "step": 960 }, { "epoch": 14.75, "grad_norm": 2.760943651199341, "learning_rate": 2.7920227920227922e-05, "loss": 0.0733, "step": 970 }, { "epoch": 14.9, "grad_norm": 0.8688881993293762, "learning_rate": 2.7635327635327633e-05, "loss": 0.0813, "step": 980 }, { "epoch": 15.0, "eval_accuracy": 0.7257142857142858, "eval_loss": 1.3705737590789795, "eval_runtime": 8.1151, "eval_samples_per_second": 43.13, "eval_steps_per_second": 3.697, "step": 986 }, { "epoch": 15.06, "grad_norm": 0.6380533576011658, "learning_rate": 2.7350427350427355e-05, "loss": 0.0466, "step": 990 }, { "epoch": 15.21, "grad_norm": 6.788400650024414, "learning_rate": 2.706552706552707e-05, "loss": 0.044, "step": 1000 }, { "epoch": 15.36, "grad_norm": 2.104766607284546, "learning_rate": 2.6780626780626784e-05, "loss": 0.0723, "step": 1010 }, { "epoch": 15.51, "grad_norm": 1.0589812994003296, "learning_rate": 2.64957264957265e-05, "loss": 0.0628, "step": 1020 }, { "epoch": 15.67, "grad_norm": 1.543593168258667, "learning_rate": 2.621082621082621e-05, "loss": 0.0485, "step": 1030 }, { "epoch": 15.82, "grad_norm": 2.2463526725769043, "learning_rate": 2.5925925925925925e-05, "loss": 0.0442, "step": 1040 }, { "epoch": 15.97, "grad_norm": 5.468172550201416, "learning_rate": 2.564102564102564e-05, "loss": 0.0423, "step": 1050 }, { "epoch": 16.0, "eval_accuracy": 0.7228571428571429, "eval_loss": 1.3519986867904663, "eval_runtime": 8.094, "eval_samples_per_second": 43.242, "eval_steps_per_second": 3.706, "step": 1052 }, { "epoch": 16.12, "grad_norm": 2.093841791152954, "learning_rate": 2.535612535612536e-05, "loss": 0.0532, "step": 1060 }, { "epoch": 16.27, "grad_norm": 0.7975372672080994, "learning_rate": 2.5071225071225073e-05, "loss": 0.0273, "step": 1070 }, { "epoch": 16.43, "grad_norm": 6.552361965179443, "learning_rate": 2.4786324786324787e-05, "loss": 0.0643, "step": 1080 }, { "epoch": 16.58, "grad_norm": 1.8863351345062256, "learning_rate": 2.4501424501424502e-05, "loss": 0.0345, "step": 1090 }, { "epoch": 16.73, "grad_norm": 0.8653244376182556, "learning_rate": 2.4216524216524217e-05, "loss": 0.0502, "step": 1100 }, { "epoch": 16.88, "grad_norm": 0.7265773415565491, "learning_rate": 2.3931623931623935e-05, "loss": 0.067, "step": 1110 }, { "epoch": 16.99, "eval_accuracy": 0.7228571428571429, "eval_loss": 1.3108690977096558, "eval_runtime": 8.0282, "eval_samples_per_second": 43.597, "eval_steps_per_second": 3.737, "step": 1117 }, { "epoch": 17.03, "grad_norm": 0.5706465244293213, "learning_rate": 2.364672364672365e-05, "loss": 0.0456, "step": 1120 }, { "epoch": 17.19, "grad_norm": 0.4868156313896179, "learning_rate": 2.336182336182336e-05, "loss": 0.0239, "step": 1130 }, { "epoch": 17.34, "grad_norm": 0.2969132661819458, "learning_rate": 2.307692307692308e-05, "loss": 0.0258, "step": 1140 }, { "epoch": 17.49, "grad_norm": 0.7196402549743652, "learning_rate": 2.2792022792022794e-05, "loss": 0.0307, "step": 1150 }, { "epoch": 17.64, "grad_norm": 0.6792505383491516, "learning_rate": 2.250712250712251e-05, "loss": 0.0357, "step": 1160 }, { "epoch": 17.79, "grad_norm": 1.3564707040786743, "learning_rate": 2.2222222222222223e-05, "loss": 0.0447, "step": 1170 }, { "epoch": 17.95, "grad_norm": 0.7506925463676453, "learning_rate": 2.1937321937321938e-05, "loss": 0.0438, "step": 1180 }, { "epoch": 17.99, "eval_accuracy": 0.7171428571428572, "eval_loss": 1.3395991325378418, "eval_runtime": 7.9804, "eval_samples_per_second": 43.857, "eval_steps_per_second": 3.759, "step": 1183 }, { "epoch": 18.1, "grad_norm": 0.2639639377593994, "learning_rate": 2.1652421652421653e-05, "loss": 0.0364, "step": 1190 }, { "epoch": 18.25, "grad_norm": 0.6512497067451477, "learning_rate": 2.1367521367521368e-05, "loss": 0.035, "step": 1200 }, { "epoch": 18.4, "grad_norm": 0.36454707384109497, "learning_rate": 2.1082621082621086e-05, "loss": 0.031, "step": 1210 }, { "epoch": 18.56, "grad_norm": 1.9671510457992554, "learning_rate": 2.07977207977208e-05, "loss": 0.0365, "step": 1220 }, { "epoch": 18.71, "grad_norm": 2.5179057121276855, "learning_rate": 2.0512820512820512e-05, "loss": 0.0343, "step": 1230 }, { "epoch": 18.86, "grad_norm": 0.5848199725151062, "learning_rate": 2.022792022792023e-05, "loss": 0.0399, "step": 1240 }, { "epoch": 19.0, "eval_accuracy": 0.7257142857142858, "eval_loss": 1.3867747783660889, "eval_runtime": 7.995, "eval_samples_per_second": 43.778, "eval_steps_per_second": 3.752, "step": 1249 }, { "epoch": 19.01, "grad_norm": 1.6354899406433105, "learning_rate": 1.9943019943019945e-05, "loss": 0.0488, "step": 1250 }, { "epoch": 19.16, "grad_norm": 4.593708038330078, "learning_rate": 1.965811965811966e-05, "loss": 0.0326, "step": 1260 }, { "epoch": 19.32, "grad_norm": 0.5004624128341675, "learning_rate": 1.9373219373219374e-05, "loss": 0.0312, "step": 1270 }, { "epoch": 19.47, "grad_norm": 3.982077121734619, "learning_rate": 1.908831908831909e-05, "loss": 0.0367, "step": 1280 }, { "epoch": 19.62, "grad_norm": 1.31514573097229, "learning_rate": 1.8803418803418804e-05, "loss": 0.0288, "step": 1290 }, { "epoch": 19.77, "grad_norm": 2.477193593978882, "learning_rate": 1.8518518518518518e-05, "loss": 0.0188, "step": 1300 }, { "epoch": 19.92, "grad_norm": 1.13873291015625, "learning_rate": 1.8233618233618236e-05, "loss": 0.022, "step": 1310 }, { "epoch": 20.0, "eval_accuracy": 0.7257142857142858, "eval_loss": 1.3571245670318604, "eval_runtime": 7.9825, "eval_samples_per_second": 43.846, "eval_steps_per_second": 3.758, "step": 1315 }, { "epoch": 20.08, "grad_norm": 0.1975400298833847, "learning_rate": 1.794871794871795e-05, "loss": 0.016, "step": 1320 }, { "epoch": 20.23, "grad_norm": 2.610684871673584, "learning_rate": 1.7663817663817662e-05, "loss": 0.0364, "step": 1330 }, { "epoch": 20.38, "grad_norm": 2.5552616119384766, "learning_rate": 1.737891737891738e-05, "loss": 0.0209, "step": 1340 }, { "epoch": 20.53, "grad_norm": 1.8163336515426636, "learning_rate": 1.7094017094017095e-05, "loss": 0.014, "step": 1350 }, { "epoch": 20.68, "grad_norm": 2.3455891609191895, "learning_rate": 1.680911680911681e-05, "loss": 0.015, "step": 1360 }, { "epoch": 20.84, "grad_norm": 1.0087167024612427, "learning_rate": 1.6524216524216525e-05, "loss": 0.021, "step": 1370 }, { "epoch": 20.99, "grad_norm": 4.435824394226074, "learning_rate": 1.623931623931624e-05, "loss": 0.0326, "step": 1380 }, { "epoch": 20.99, "eval_accuracy": 0.7342857142857143, "eval_loss": 1.316083550453186, "eval_runtime": 8.0694, "eval_samples_per_second": 43.374, "eval_steps_per_second": 3.718, "step": 1380 }, { "epoch": 21.14, "grad_norm": 2.11207914352417, "learning_rate": 1.5954415954415954e-05, "loss": 0.0249, "step": 1390 }, { "epoch": 21.29, "grad_norm": 0.3664344251155853, "learning_rate": 1.566951566951567e-05, "loss": 0.0168, "step": 1400 }, { "epoch": 21.44, "grad_norm": 2.1651501655578613, "learning_rate": 1.5384615384615387e-05, "loss": 0.0269, "step": 1410 }, { "epoch": 21.6, "grad_norm": 6.236063480377197, "learning_rate": 1.50997150997151e-05, "loss": 0.0266, "step": 1420 }, { "epoch": 21.75, "grad_norm": 0.4216400980949402, "learning_rate": 1.4814814814814815e-05, "loss": 0.0276, "step": 1430 }, { "epoch": 21.9, "grad_norm": 0.34464436769485474, "learning_rate": 1.4529914529914531e-05, "loss": 0.0217, "step": 1440 }, { "epoch": 21.99, "eval_accuracy": 0.74, "eval_loss": 1.3431659936904907, "eval_runtime": 8.2814, "eval_samples_per_second": 42.263, "eval_steps_per_second": 3.623, "step": 1446 }, { "epoch": 22.05, "grad_norm": 0.550115168094635, "learning_rate": 1.4245014245014246e-05, "loss": 0.0116, "step": 1450 }, { "epoch": 22.21, "grad_norm": 0.7523086071014404, "learning_rate": 1.3960113960113961e-05, "loss": 0.0167, "step": 1460 }, { "epoch": 22.36, "grad_norm": 0.4303203821182251, "learning_rate": 1.3675213675213677e-05, "loss": 0.0152, "step": 1470 }, { "epoch": 22.51, "grad_norm": 0.9599018096923828, "learning_rate": 1.3390313390313392e-05, "loss": 0.0129, "step": 1480 }, { "epoch": 22.66, "grad_norm": 0.6038946509361267, "learning_rate": 1.3105413105413105e-05, "loss": 0.0153, "step": 1490 }, { "epoch": 22.81, "grad_norm": 2.5680289268493652, "learning_rate": 1.282051282051282e-05, "loss": 0.0302, "step": 1500 }, { "epoch": 22.97, "grad_norm": 0.7856467366218567, "learning_rate": 1.2535612535612536e-05, "loss": 0.0185, "step": 1510 }, { "epoch": 23.0, "eval_accuracy": 0.7342857142857143, "eval_loss": 1.3489614725112915, "eval_runtime": 8.0906, "eval_samples_per_second": 43.26, "eval_steps_per_second": 3.708, "step": 1512 }, { "epoch": 23.12, "grad_norm": 0.6607487201690674, "learning_rate": 1.2250712250712251e-05, "loss": 0.014, "step": 1520 }, { "epoch": 23.27, "grad_norm": 0.14532317221164703, "learning_rate": 1.1965811965811967e-05, "loss": 0.02, "step": 1530 }, { "epoch": 23.42, "grad_norm": 0.3423649072647095, "learning_rate": 1.168091168091168e-05, "loss": 0.0156, "step": 1540 }, { "epoch": 23.57, "grad_norm": 0.15258215367794037, "learning_rate": 1.1396011396011397e-05, "loss": 0.0087, "step": 1550 }, { "epoch": 23.73, "grad_norm": 0.20266969501972198, "learning_rate": 1.1111111111111112e-05, "loss": 0.0257, "step": 1560 }, { "epoch": 23.88, "grad_norm": 0.46567222476005554, "learning_rate": 1.0826210826210826e-05, "loss": 0.0247, "step": 1570 }, { "epoch": 24.0, "eval_accuracy": 0.7285714285714285, "eval_loss": 1.3712286949157715, "eval_runtime": 8.0686, "eval_samples_per_second": 43.378, "eval_steps_per_second": 3.718, "step": 1578 }, { "epoch": 24.03, "grad_norm": 0.43167567253112793, "learning_rate": 1.0541310541310543e-05, "loss": 0.0151, "step": 1580 }, { "epoch": 24.18, "grad_norm": 0.3076987862586975, "learning_rate": 1.0256410256410256e-05, "loss": 0.0145, "step": 1590 }, { "epoch": 24.33, "grad_norm": 0.28051629662513733, "learning_rate": 9.971509971509972e-06, "loss": 0.0068, "step": 1600 }, { "epoch": 24.49, "grad_norm": 0.17808012664318085, "learning_rate": 9.686609686609687e-06, "loss": 0.015, "step": 1610 }, { "epoch": 24.64, "grad_norm": 0.46903499960899353, "learning_rate": 9.401709401709402e-06, "loss": 0.0111, "step": 1620 }, { "epoch": 24.79, "grad_norm": 3.1560771465301514, "learning_rate": 9.116809116809118e-06, "loss": 0.0198, "step": 1630 }, { "epoch": 24.94, "grad_norm": 1.1795072555541992, "learning_rate": 8.831908831908831e-06, "loss": 0.0147, "step": 1640 }, { "epoch": 24.99, "eval_accuracy": 0.7285714285714285, "eval_loss": 1.3384881019592285, "eval_runtime": 8.045, "eval_samples_per_second": 43.505, "eval_steps_per_second": 3.729, "step": 1643 }, { "epoch": 25.1, "grad_norm": 2.324568748474121, "learning_rate": 8.547008547008548e-06, "loss": 0.0147, "step": 1650 }, { "epoch": 25.25, "grad_norm": 0.6252849102020264, "learning_rate": 8.262108262108262e-06, "loss": 0.0141, "step": 1660 }, { "epoch": 25.4, "grad_norm": 2.523175001144409, "learning_rate": 7.977207977207977e-06, "loss": 0.0288, "step": 1670 }, { "epoch": 25.55, "grad_norm": 0.6321514844894409, "learning_rate": 7.692307692307694e-06, "loss": 0.0151, "step": 1680 }, { "epoch": 25.7, "grad_norm": 0.1425185650587082, "learning_rate": 7.4074074074074075e-06, "loss": 0.0093, "step": 1690 }, { "epoch": 25.86, "grad_norm": 0.6362813115119934, "learning_rate": 7.122507122507123e-06, "loss": 0.0164, "step": 1700 }, { "epoch": 25.99, "eval_accuracy": 0.7228571428571429, "eval_loss": 1.352995753288269, "eval_runtime": 7.9452, "eval_samples_per_second": 44.052, "eval_steps_per_second": 3.776, "step": 1709 }, { "epoch": 26.01, "grad_norm": 0.11444679647684097, "learning_rate": 6.837606837606839e-06, "loss": 0.0198, "step": 1710 }, { "epoch": 26.16, "grad_norm": 0.34033504128456116, "learning_rate": 6.5527065527065525e-06, "loss": 0.013, "step": 1720 }, { "epoch": 26.31, "grad_norm": 1.7793394327163696, "learning_rate": 6.267806267806268e-06, "loss": 0.0122, "step": 1730 }, { "epoch": 26.46, "grad_norm": 0.11746495217084885, "learning_rate": 5.982905982905984e-06, "loss": 0.0153, "step": 1740 }, { "epoch": 26.62, "grad_norm": 4.355152606964111, "learning_rate": 5.6980056980056985e-06, "loss": 0.0153, "step": 1750 }, { "epoch": 26.77, "grad_norm": 0.5570241808891296, "learning_rate": 5.413105413105413e-06, "loss": 0.013, "step": 1760 }, { "epoch": 26.92, "grad_norm": 0.22895778715610504, "learning_rate": 5.128205128205128e-06, "loss": 0.0148, "step": 1770 }, { "epoch": 27.0, "eval_accuracy": 0.7257142857142858, "eval_loss": 1.3564364910125732, "eval_runtime": 8.0323, "eval_samples_per_second": 43.574, "eval_steps_per_second": 3.735, "step": 1775 }, { "epoch": 27.07, "grad_norm": 1.6692248582839966, "learning_rate": 4.8433048433048435e-06, "loss": 0.0217, "step": 1780 }, { "epoch": 27.22, "grad_norm": 0.4036758542060852, "learning_rate": 4.558404558404559e-06, "loss": 0.0068, "step": 1790 }, { "epoch": 27.38, "grad_norm": 0.1422310322523117, "learning_rate": 4.273504273504274e-06, "loss": 0.0086, "step": 1800 }, { "epoch": 27.53, "grad_norm": 0.36455395817756653, "learning_rate": 3.988603988603989e-06, "loss": 0.0097, "step": 1810 }, { "epoch": 27.68, "grad_norm": 2.0207414627075195, "learning_rate": 3.7037037037037037e-06, "loss": 0.009, "step": 1820 }, { "epoch": 27.83, "grad_norm": 0.2137887328863144, "learning_rate": 3.4188034188034193e-06, "loss": 0.0073, "step": 1830 }, { "epoch": 27.98, "grad_norm": 1.0078092813491821, "learning_rate": 3.133903133903134e-06, "loss": 0.0095, "step": 1840 }, { "epoch": 28.0, "eval_accuracy": 0.7228571428571429, "eval_loss": 1.3562867641448975, "eval_runtime": 8.0528, "eval_samples_per_second": 43.463, "eval_steps_per_second": 3.725, "step": 1841 }, { "epoch": 28.14, "grad_norm": 0.11777978390455246, "learning_rate": 2.8490028490028492e-06, "loss": 0.0076, "step": 1850 }, { "epoch": 28.29, "grad_norm": 0.4021410644054413, "learning_rate": 2.564102564102564e-06, "loss": 0.0091, "step": 1860 }, { "epoch": 28.44, "grad_norm": 0.19985055923461914, "learning_rate": 2.2792022792022796e-06, "loss": 0.0091, "step": 1870 }, { "epoch": 28.59, "grad_norm": 0.30899757146835327, "learning_rate": 1.9943019943019943e-06, "loss": 0.0096, "step": 1880 }, { "epoch": 28.75, "grad_norm": 0.1285697966814041, "learning_rate": 1.7094017094017097e-06, "loss": 0.0108, "step": 1890 }, { "epoch": 28.9, "grad_norm": 1.3066548109054565, "learning_rate": 1.4245014245014246e-06, "loss": 0.0105, "step": 1900 }, { "epoch": 28.99, "eval_accuracy": 0.7171428571428572, "eval_loss": 1.3570489883422852, "eval_runtime": 8.0496, "eval_samples_per_second": 43.481, "eval_steps_per_second": 3.727, "step": 1906 }, { "epoch": 29.05, "grad_norm": 0.1782771348953247, "learning_rate": 1.1396011396011398e-06, "loss": 0.0109, "step": 1910 }, { "epoch": 29.2, "grad_norm": 0.0780392736196518, "learning_rate": 8.547008547008548e-07, "loss": 0.0058, "step": 1920 }, { "epoch": 29.35, "grad_norm": 1.1122561693191528, "learning_rate": 5.698005698005699e-07, "loss": 0.012, "step": 1930 }, { "epoch": 29.51, "grad_norm": 0.21714162826538086, "learning_rate": 2.8490028490028494e-07, "loss": 0.0088, "step": 1940 }, { "epoch": 29.66, "grad_norm": 0.504612922668457, "learning_rate": 0.0, "loss": 0.0105, "step": 1950 }, { "epoch": 29.66, "eval_accuracy": 0.7171428571428572, "eval_loss": 1.3564331531524658, "eval_runtime": 8.0797, "eval_samples_per_second": 43.319, "eval_steps_per_second": 3.713, "step": 1950 } ], "logging_steps": 10, "max_steps": 1950, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "total_flos": 5.87953618460352e+18, "train_batch_size": 12, "trial_name": null, "trial_params": null }