{ "best_metric": null, "best_model_checkpoint": null, "epoch": 15.0, "eval_steps": 500, "global_step": 78735, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1, "learning_rate": 0.0009936495840477551, "loss": 1.2096, "step": 500 }, { "epoch": 0.19, "learning_rate": 0.0009872991680955102, "loss": 1.2958, "step": 1000 }, { "epoch": 0.29, "learning_rate": 0.0009809487521432654, "loss": 1.3278, "step": 1500 }, { "epoch": 0.38, "learning_rate": 0.0009745983361910206, "loss": 1.3214, "step": 2000 }, { "epoch": 0.48, "learning_rate": 0.0009682479202387756, "loss": 1.3299, "step": 2500 }, { "epoch": 0.57, "learning_rate": 0.0009618975042865308, "loss": 1.3244, "step": 3000 }, { "epoch": 0.67, "learning_rate": 0.0009555470883342859, "loss": 1.3279, "step": 3500 }, { "epoch": 0.76, "learning_rate": 0.000949196672382041, "loss": 1.3269, "step": 4000 }, { "epoch": 0.86, "learning_rate": 0.0009428462564297961, "loss": 1.3179, "step": 4500 }, { "epoch": 0.95, "learning_rate": 0.0009364958404775514, "loss": 1.3057, "step": 5000 }, { "epoch": 1.0, "eval_accuracy": 0.58219165847995, "eval_loss": 2.046381950378418, "eval_runtime": 1873.4069, "eval_samples_per_second": 89.652, "eval_steps_per_second": 0.701, "step": 5249 }, { "epoch": 1.05, "learning_rate": 0.0009301454245253064, "loss": 1.1726, "step": 5500 }, { "epoch": 1.14, "learning_rate": 0.0009237950085730616, "loss": 1.0732, "step": 6000 }, { "epoch": 1.24, "learning_rate": 0.0009174445926208167, "loss": 1.0922, "step": 6500 }, { "epoch": 1.33, "learning_rate": 0.0009110941766685719, "loss": 1.1075, "step": 7000 }, { "epoch": 1.43, "learning_rate": 0.0009047437607163269, "loss": 1.112, "step": 7500 }, { "epoch": 1.52, "learning_rate": 0.0008983933447640821, "loss": 1.1417, "step": 8000 }, { "epoch": 1.62, "learning_rate": 0.0008920429288118372, "loss": 1.1343, "step": 8500 }, { "epoch": 1.71, "learning_rate": 0.0008856925128595922, "loss": 1.1319, "step": 9000 }, { "epoch": 1.81, "learning_rate": 0.0008793420969073475, "loss": 1.1326, "step": 9500 }, { "epoch": 1.91, "learning_rate": 0.0008729916809551026, "loss": 1.1387, "step": 10000 }, { "epoch": 2.0, "eval_accuracy": 0.6084486916138251, "eval_loss": 1.9456982612609863, "eval_runtime": 4861.9841, "eval_samples_per_second": 34.545, "eval_steps_per_second": 0.27, "step": 10498 }, { "epoch": 2.0, "learning_rate": 0.0008666412650028577, "loss": 1.124, "step": 10500 }, { "epoch": 2.1, "learning_rate": 0.0008602908490506128, "loss": 0.9, "step": 11000 }, { "epoch": 2.19, "learning_rate": 0.000853940433098368, "loss": 0.9401, "step": 11500 }, { "epoch": 2.29, "learning_rate": 0.000847590017146123, "loss": 0.967, "step": 12000 }, { "epoch": 2.38, "learning_rate": 0.0008412396011938782, "loss": 0.9734, "step": 12500 }, { "epoch": 2.48, "learning_rate": 0.0008348891852416333, "loss": 0.9657, "step": 13000 }, { "epoch": 2.57, "learning_rate": 0.0008285387692893885, "loss": 0.9883, "step": 13500 }, { "epoch": 2.67, "learning_rate": 0.0008221883533371436, "loss": 0.9781, "step": 14000 }, { "epoch": 2.76, "learning_rate": 0.0008158379373848988, "loss": 0.9822, "step": 14500 }, { "epoch": 2.86, "learning_rate": 0.0008094875214326538, "loss": 0.992, "step": 15000 }, { "epoch": 2.95, "learning_rate": 0.000803137105480409, "loss": 0.988, "step": 15500 }, { "epoch": 3.0, "eval_accuracy": 0.6124319014021613, "eval_loss": 1.965035319328308, "eval_runtime": 4587.2857, "eval_samples_per_second": 36.613, "eval_steps_per_second": 0.286, "step": 15747 }, { "epoch": 3.05, "learning_rate": 0.0007967866895281641, "loss": 0.8822, "step": 16000 }, { "epoch": 3.14, "learning_rate": 0.0007904362735759192, "loss": 0.8149, "step": 16500 }, { "epoch": 3.24, "learning_rate": 0.0007840858576236743, "loss": 0.8325, "step": 17000 }, { "epoch": 3.33, "learning_rate": 0.0007777354416714295, "loss": 0.8412, "step": 17500 }, { "epoch": 3.43, "learning_rate": 0.0007713850257191847, "loss": 0.8421, "step": 18000 }, { "epoch": 3.52, "learning_rate": 0.0007650346097669397, "loss": 0.8487, "step": 18500 }, { "epoch": 3.62, "learning_rate": 0.0007586841938146949, "loss": 0.855, "step": 19000 }, { "epoch": 3.71, "learning_rate": 0.00075233377786245, "loss": 0.8623, "step": 19500 }, { "epoch": 3.81, "learning_rate": 0.0007459833619102051, "loss": 0.8668, "step": 20000 }, { "epoch": 3.91, "learning_rate": 0.0007396329459579602, "loss": 0.8653, "step": 20500 }, { "epoch": 4.0, "eval_accuracy": 0.634402071983567, "eval_loss": 1.9380866289138794, "eval_runtime": 1885.7719, "eval_samples_per_second": 89.064, "eval_steps_per_second": 0.696, "step": 20996 }, { "epoch": 4.0, "learning_rate": 0.0007332825300057155, "loss": 0.864, "step": 21000 }, { "epoch": 4.1, "learning_rate": 0.0007269321140534705, "loss": 0.7073, "step": 21500 }, { "epoch": 4.19, "learning_rate": 0.0007205816981012257, "loss": 0.7257, "step": 22000 }, { "epoch": 4.29, "learning_rate": 0.0007142312821489808, "loss": 0.7415, "step": 22500 }, { "epoch": 4.38, "learning_rate": 0.0007078808661967359, "loss": 0.7397, "step": 23000 }, { "epoch": 4.48, "learning_rate": 0.000701530450244491, "loss": 0.7635, "step": 23500 }, { "epoch": 4.57, "learning_rate": 0.0006951800342922462, "loss": 0.7736, "step": 24000 }, { "epoch": 4.67, "learning_rate": 0.0006888296183400012, "loss": 0.7694, "step": 24500 }, { "epoch": 4.76, "learning_rate": 0.0006824792023877565, "loss": 0.759, "step": 25000 }, { "epoch": 4.86, "learning_rate": 0.0006761287864355116, "loss": 0.7728, "step": 25500 }, { "epoch": 4.95, "learning_rate": 0.0006697783704832666, "loss": 0.7662, "step": 26000 }, { "epoch": 5.0, "eval_accuracy": 0.6335327915215385, "eval_loss": 1.9391114711761475, "eval_runtime": 1717.5004, "eval_samples_per_second": 97.79, "eval_steps_per_second": 0.764, "step": 26245 }, { "epoch": 5.05, "learning_rate": 0.0006634279545310218, "loss": 0.7034, "step": 26500 }, { "epoch": 5.14, "learning_rate": 0.0006570775385787769, "loss": 0.6446, "step": 27000 }, { "epoch": 5.24, "learning_rate": 0.000650727122626532, "loss": 0.6511, "step": 27500 }, { "epoch": 5.33, "learning_rate": 0.0006443767066742871, "loss": 0.6562, "step": 28000 }, { "epoch": 5.43, "learning_rate": 0.0006380262907220423, "loss": 0.6673, "step": 28500 }, { "epoch": 5.52, "learning_rate": 0.0006316758747697975, "loss": 0.6719, "step": 29000 }, { "epoch": 5.62, "learning_rate": 0.0006253254588175526, "loss": 0.6818, "step": 29500 }, { "epoch": 5.72, "learning_rate": 0.0006189750428653077, "loss": 0.6745, "step": 30000 }, { "epoch": 5.81, "learning_rate": 0.0006126246269130629, "loss": 0.6778, "step": 30500 }, { "epoch": 5.91, "learning_rate": 0.0006062742109608179, "loss": 0.6882, "step": 31000 }, { "epoch": 6.0, "eval_accuracy": 0.6443749813938258, "eval_loss": 1.9590805768966675, "eval_runtime": 1673.9405, "eval_samples_per_second": 100.335, "eval_steps_per_second": 0.784, "step": 31494 }, { "epoch": 6.0, "learning_rate": 0.0005999237950085731, "loss": 0.6787, "step": 31500 }, { "epoch": 6.1, "learning_rate": 0.0005935733790563282, "loss": 0.5613, "step": 32000 }, { "epoch": 6.19, "learning_rate": 0.0005872229631040833, "loss": 0.5867, "step": 32500 }, { "epoch": 6.29, "learning_rate": 0.0005808725471518384, "loss": 0.5839, "step": 33000 }, { "epoch": 6.38, "learning_rate": 0.0005745221311995937, "loss": 0.588, "step": 33500 }, { "epoch": 6.48, "learning_rate": 0.0005681717152473487, "loss": 0.6022, "step": 34000 }, { "epoch": 6.57, "learning_rate": 0.0005618212992951039, "loss": 0.596, "step": 34500 }, { "epoch": 6.67, "learning_rate": 0.000555470883342859, "loss": 0.6072, "step": 35000 }, { "epoch": 6.76, "learning_rate": 0.000549120467390614, "loss": 0.615, "step": 35500 }, { "epoch": 6.86, "learning_rate": 0.0005427700514383692, "loss": 0.6103, "step": 36000 }, { "epoch": 6.95, "learning_rate": 0.0005364196354861243, "loss": 0.601, "step": 36500 }, { "epoch": 7.0, "eval_accuracy": 0.6510374802774552, "eval_loss": 1.9506298303604126, "eval_runtime": 4758.5372, "eval_samples_per_second": 35.296, "eval_steps_per_second": 0.276, "step": 36743 }, { "epoch": 7.05, "learning_rate": 0.0005300692195338794, "loss": 0.5518, "step": 37000 }, { "epoch": 7.14, "learning_rate": 0.0005237188035816346, "loss": 0.5144, "step": 37500 }, { "epoch": 7.24, "learning_rate": 0.0005173683876293898, "loss": 0.5214, "step": 38000 }, { "epoch": 7.33, "learning_rate": 0.0005110179716771448, "loss": 0.522, "step": 38500 }, { "epoch": 7.43, "learning_rate": 0.0005046675557249, "loss": 0.5331, "step": 39000 }, { "epoch": 7.53, "learning_rate": 0.0004983171397726551, "loss": 0.5357, "step": 39500 }, { "epoch": 7.62, "learning_rate": 0.0004919667238204102, "loss": 0.53, "step": 40000 }, { "epoch": 7.72, "learning_rate": 0.00048561630786816534, "loss": 0.5397, "step": 40500 }, { "epoch": 7.81, "learning_rate": 0.0004792658919159205, "loss": 0.5319, "step": 41000 }, { "epoch": 7.91, "learning_rate": 0.0004729154759636756, "loss": 0.5363, "step": 41500 }, { "epoch": 8.0, "eval_accuracy": 0.6616772349736536, "eval_loss": 1.9555561542510986, "eval_runtime": 4754.1095, "eval_samples_per_second": 35.328, "eval_steps_per_second": 0.276, "step": 41992 }, { "epoch": 8.0, "learning_rate": 0.0004665650600114307, "loss": 0.3811, "step": 42000 }, { "epoch": 8.1, "learning_rate": 0.0004602146440591859, "loss": 0.4593, "step": 42500 }, { "epoch": 8.19, "learning_rate": 0.000453864228106941, "loss": 0.4769, "step": 43000 }, { "epoch": 8.29, "learning_rate": 0.00044751381215469617, "loss": 0.4676, "step": 43500 }, { "epoch": 8.38, "learning_rate": 0.0004411633962024513, "loss": 0.4816, "step": 44000 }, { "epoch": 8.48, "learning_rate": 0.0004348129802502064, "loss": 0.4765, "step": 44500 }, { "epoch": 8.57, "learning_rate": 0.00042846256429796155, "loss": 0.4835, "step": 45000 }, { "epoch": 8.67, "learning_rate": 0.00042211214834571666, "loss": 0.4767, "step": 45500 }, { "epoch": 8.76, "learning_rate": 0.0004157617323934718, "loss": 0.4877, "step": 46000 }, { "epoch": 8.86, "learning_rate": 0.00040941131644122694, "loss": 0.4706, "step": 46500 }, { "epoch": 8.95, "learning_rate": 0.00040306090048898205, "loss": 0.4871, "step": 47000 }, { "epoch": 9.0, "eval_accuracy": 0.6740853204727457, "eval_loss": 1.9037331342697144, "eval_runtime": 5063.3946, "eval_samples_per_second": 33.17, "eval_steps_per_second": 0.259, "step": 47241 }, { "epoch": 9.05, "learning_rate": 0.00039671048453673716, "loss": 0.4473, "step": 47500 }, { "epoch": 9.14, "learning_rate": 0.0003903600685844923, "loss": 0.4062, "step": 48000 }, { "epoch": 9.24, "learning_rate": 0.0003840096526322474, "loss": 0.4208, "step": 48500 }, { "epoch": 9.34, "learning_rate": 0.00037765923668000255, "loss": 0.4233, "step": 49000 }, { "epoch": 9.43, "learning_rate": 0.00037130882072775766, "loss": 0.4211, "step": 49500 }, { "epoch": 9.53, "learning_rate": 0.0003649584047755128, "loss": 0.4273, "step": 50000 }, { "epoch": 9.62, "learning_rate": 0.00035860798882326794, "loss": 0.4259, "step": 50500 }, { "epoch": 9.72, "learning_rate": 0.00035225757287102305, "loss": 0.4157, "step": 51000 }, { "epoch": 9.81, "learning_rate": 0.00034590715691877816, "loss": 0.4294, "step": 51500 }, { "epoch": 9.91, "learning_rate": 0.00033955674096653333, "loss": 0.4338, "step": 52000 }, { "epoch": 10.0, "eval_accuracy": 0.6806287398410289, "eval_loss": 1.9794013500213623, "eval_runtime": 4548.1376, "eval_samples_per_second": 36.928, "eval_steps_per_second": 0.289, "step": 52490 }, { "epoch": 10.0, "learning_rate": 0.00033320632501428844, "loss": 0.4278, "step": 52500 }, { "epoch": 10.1, "learning_rate": 0.00032685590906204355, "loss": 0.3655, "step": 53000 }, { "epoch": 10.19, "learning_rate": 0.0003205054931097987, "loss": 0.3792, "step": 53500 }, { "epoch": 10.29, "learning_rate": 0.00031415507715755383, "loss": 0.3765, "step": 54000 }, { "epoch": 10.38, "learning_rate": 0.000307804661205309, "loss": 0.3777, "step": 54500 }, { "epoch": 10.48, "learning_rate": 0.0003014542452530641, "loss": 0.3766, "step": 55000 }, { "epoch": 10.57, "learning_rate": 0.0002951038293008192, "loss": 0.3847, "step": 55500 }, { "epoch": 10.67, "learning_rate": 0.0002887534133485744, "loss": 0.3802, "step": 56000 }, { "epoch": 10.76, "learning_rate": 0.00028240299739632944, "loss": 0.3836, "step": 56500 }, { "epoch": 10.86, "learning_rate": 0.00027605258144408455, "loss": 0.3696, "step": 57000 }, { "epoch": 10.95, "learning_rate": 0.0002697021654918397, "loss": 0.3738, "step": 57500 }, { "epoch": 11.0, "eval_accuracy": 0.6849275103450329, "eval_loss": 2.0053181648254395, "eval_runtime": 3187.3586, "eval_samples_per_second": 52.694, "eval_steps_per_second": 0.412, "step": 57739 }, { "epoch": 11.05, "learning_rate": 0.0002633517495395948, "loss": 0.3603, "step": 58000 }, { "epoch": 11.14, "learning_rate": 0.00025700133358734994, "loss": 0.3371, "step": 58500 }, { "epoch": 11.24, "learning_rate": 0.0002506509176351051, "loss": 0.3334, "step": 59000 }, { "epoch": 11.34, "learning_rate": 0.0002443005016828602, "loss": 0.3342, "step": 59500 }, { "epoch": 11.43, "learning_rate": 0.00023795008573061535, "loss": 0.3392, "step": 60000 }, { "epoch": 11.53, "learning_rate": 0.0002315996697783705, "loss": 0.3323, "step": 60500 }, { "epoch": 11.62, "learning_rate": 0.00022524925382612563, "loss": 0.3444, "step": 61000 }, { "epoch": 11.72, "learning_rate": 0.00021889883787388074, "loss": 0.3395, "step": 61500 }, { "epoch": 11.81, "learning_rate": 0.00021254842192163588, "loss": 0.3321, "step": 62000 }, { "epoch": 11.91, "learning_rate": 0.00020619800596939102, "loss": 0.3338, "step": 62500 }, { "epoch": 12.0, "eval_accuracy": 0.6955315411866274, "eval_loss": 2.0140492916107178, "eval_runtime": 1705.2241, "eval_samples_per_second": 98.494, "eval_steps_per_second": 0.77, "step": 62988 }, { "epoch": 12.0, "learning_rate": 0.0001998475900171461, "loss": 0.3326, "step": 63000 }, { "epoch": 12.1, "learning_rate": 0.00019349717406490124, "loss": 0.3104, "step": 63500 }, { "epoch": 12.19, "learning_rate": 0.00018714675811265638, "loss": 0.2943, "step": 64000 }, { "epoch": 12.29, "learning_rate": 0.00018079634216041152, "loss": 0.3063, "step": 64500 }, { "epoch": 12.38, "learning_rate": 0.00017444592620816663, "loss": 0.302, "step": 65000 }, { "epoch": 12.48, "learning_rate": 0.00016809551025592177, "loss": 0.295, "step": 65500 }, { "epoch": 12.57, "learning_rate": 0.0001617450943036769, "loss": 0.2945, "step": 66000 }, { "epoch": 12.67, "learning_rate": 0.00015539467835143204, "loss": 0.3072, "step": 66500 }, { "epoch": 12.76, "learning_rate": 0.00014904426239918715, "loss": 0.2913, "step": 67000 }, { "epoch": 12.86, "learning_rate": 0.00014269384644694227, "loss": 0.3055, "step": 67500 }, { "epoch": 12.95, "learning_rate": 0.0001363434304946974, "loss": 0.2989, "step": 68000 }, { "epoch": 13.0, "eval_accuracy": 0.6974427674079366, "eval_loss": 2.083035707473755, "eval_runtime": 1683.3073, "eval_samples_per_second": 99.777, "eval_steps_per_second": 0.78, "step": 68237 }, { "epoch": 13.05, "learning_rate": 0.00012999301454245252, "loss": 0.2808, "step": 68500 }, { "epoch": 13.15, "learning_rate": 0.00012364259859020765, "loss": 0.2789, "step": 69000 }, { "epoch": 13.24, "learning_rate": 0.00011729218263796279, "loss": 0.2743, "step": 69500 }, { "epoch": 13.34, "learning_rate": 0.00011094176668571792, "loss": 0.2767, "step": 70000 }, { "epoch": 13.43, "learning_rate": 0.00010459135073347304, "loss": 0.2648, "step": 70500 }, { "epoch": 13.53, "learning_rate": 9.824093478122817e-05, "loss": 0.2724, "step": 71000 }, { "epoch": 13.62, "learning_rate": 9.18905188289833e-05, "loss": 0.2753, "step": 71500 }, { "epoch": 13.72, "learning_rate": 8.554010287673843e-05, "loss": 0.2743, "step": 72000 }, { "epoch": 13.81, "learning_rate": 7.918968692449357e-05, "loss": 0.2658, "step": 72500 }, { "epoch": 13.91, "learning_rate": 7.283927097224868e-05, "loss": 0.267, "step": 73000 }, { "epoch": 14.0, "eval_accuracy": 0.7040516805096603, "eval_loss": 2.0823681354522705, "eval_runtime": 2701.2349, "eval_samples_per_second": 62.177, "eval_steps_per_second": 0.486, "step": 73486 }, { "epoch": 14.0, "learning_rate": 6.64888550200038e-05, "loss": 0.2592, "step": 73500 }, { "epoch": 14.1, "learning_rate": 6.013843906775894e-05, "loss": 0.2473, "step": 74000 }, { "epoch": 14.19, "learning_rate": 5.378802311551407e-05, "loss": 0.2471, "step": 74500 }, { "epoch": 14.29, "learning_rate": 4.74376071632692e-05, "loss": 0.2592, "step": 75000 }, { "epoch": 14.38, "learning_rate": 4.1087191211024324e-05, "loss": 0.2493, "step": 75500 }, { "epoch": 14.48, "learning_rate": 3.473677525877945e-05, "loss": 0.2484, "step": 76000 }, { "epoch": 14.57, "learning_rate": 2.838635930653458e-05, "loss": 0.2468, "step": 76500 }, { "epoch": 14.67, "learning_rate": 2.203594335428971e-05, "loss": 0.2432, "step": 77000 }, { "epoch": 14.76, "learning_rate": 1.5685527402044834e-05, "loss": 0.2466, "step": 77500 }, { "epoch": 14.86, "learning_rate": 9.335111449799962e-06, "loss": 0.2472, "step": 78000 }, { "epoch": 14.96, "learning_rate": 2.98469549755509e-06, "loss": 0.236, "step": 78500 }, { "epoch": 15.0, "eval_accuracy": 0.708475484504778, "eval_loss": 2.0700299739837646, "eval_runtime": 1790.1892, "eval_samples_per_second": 93.82, "eval_steps_per_second": 0.733, "step": 78735 }, { "epoch": 15.0, "step": 78735, "total_flos": 6.872831806674565e+20, "train_loss": 0.0, "train_runtime": 39.8476, "train_samples_per_second": 252894.596, "train_steps_per_second": 1975.901 } ], "logging_steps": 500, "max_steps": 78735, "num_train_epochs": 15, "save_steps": 500, "total_flos": 6.872831806674565e+20, "trial_name": null, "trial_params": null }