{ "best_metric": 0.9931387305259705, "best_model_checkpoint": "/kaggle/output/checkpoint-49000", "epoch": 1.996414602346806, "eval_steps": 1000, "global_step": 49000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.7777777777777777e-11, "loss": 1.1383, "step": 1 }, { "epoch": 0.04, "learning_rate": 2.7750000000000004e-08, "loss": 1.1424, "step": 1000 }, { "epoch": 0.04, "eval_accuracy": 0.32375249500998005, "eval_loss": 1.1077626943588257, "eval_runtime": 54.8633, "eval_samples_per_second": 91.318, "eval_steps_per_second": 11.428, "step": 1000 }, { "epoch": 0.08, "learning_rate": 5.5527777777777784e-08, "loss": 1.1244, "step": 2000 }, { "epoch": 0.08, "eval_accuracy": 0.33652694610778444, "eval_loss": 1.1080161333084106, "eval_runtime": 54.7384, "eval_samples_per_second": 91.526, "eval_steps_per_second": 11.454, "step": 2000 }, { "epoch": 0.12, "learning_rate": 8.327777777777778e-08, "loss": 1.1228, "step": 3000 }, { "epoch": 0.12, "eval_accuracy": 0.34331337325349304, "eval_loss": 1.1084064245224, "eval_runtime": 54.7948, "eval_samples_per_second": 91.432, "eval_steps_per_second": 11.443, "step": 3000 }, { "epoch": 0.16, "learning_rate": 1.1105555555555557e-07, "loss": 1.1216, "step": 4000 }, { "epoch": 0.16, "eval_accuracy": 0.3385229540918164, "eval_loss": 1.1014840602874756, "eval_runtime": 54.8508, "eval_samples_per_second": 91.339, "eval_steps_per_second": 11.431, "step": 4000 }, { "epoch": 0.2, "learning_rate": 1.3880555555555558e-07, "loss": 1.1181, "step": 5000 }, { "epoch": 0.2, "eval_accuracy": 0.33073852295409184, "eval_loss": 1.1008135080337524, "eval_runtime": 54.8304, "eval_samples_per_second": 91.373, "eval_steps_per_second": 11.435, "step": 5000 }, { "epoch": 0.24, "learning_rate": 1.6658333333333335e-07, "loss": 1.1132, "step": 6000 }, { "epoch": 0.24, "eval_accuracy": 0.3520958083832335, "eval_loss": 1.0993762016296387, "eval_runtime": 54.8804, "eval_samples_per_second": 91.289, "eval_steps_per_second": 11.425, "step": 6000 }, { "epoch": 0.29, "learning_rate": 1.9433333333333334e-07, "loss": 1.1113, "step": 7000 }, { "epoch": 0.29, "eval_accuracy": 0.3530938123752495, "eval_loss": 1.0965770483016968, "eval_runtime": 54.8881, "eval_samples_per_second": 91.277, "eval_steps_per_second": 11.423, "step": 7000 }, { "epoch": 0.33, "learning_rate": 2.2211111111111114e-07, "loss": 1.1111, "step": 8000 }, { "epoch": 0.33, "eval_accuracy": 0.35708582834331337, "eval_loss": 1.094658613204956, "eval_runtime": 54.8233, "eval_samples_per_second": 91.384, "eval_steps_per_second": 11.437, "step": 8000 }, { "epoch": 0.37, "learning_rate": 2.4986111111111113e-07, "loss": 1.109, "step": 9000 }, { "epoch": 0.37, "eval_accuracy": 0.34191616766467064, "eval_loss": 1.106990933418274, "eval_runtime": 54.9095, "eval_samples_per_second": 91.241, "eval_steps_per_second": 11.419, "step": 9000 }, { "epoch": 0.41, "learning_rate": 2.776388888888889e-07, "loss": 1.1036, "step": 10000 }, { "epoch": 0.41, "eval_accuracy": 0.37584830339321357, "eval_loss": 1.0930211544036865, "eval_runtime": 54.9067, "eval_samples_per_second": 91.246, "eval_steps_per_second": 11.419, "step": 10000 }, { "epoch": 0.45, "learning_rate": 3.0541666666666667e-07, "loss": 1.1045, "step": 11000 }, { "epoch": 0.45, "eval_accuracy": 0.3652694610778443, "eval_loss": 1.092846393585205, "eval_runtime": 54.8964, "eval_samples_per_second": 91.263, "eval_steps_per_second": 11.422, "step": 11000 }, { "epoch": 0.49, "learning_rate": 3.3319444444444444e-07, "loss": 1.1024, "step": 12000 }, { "epoch": 0.49, "eval_accuracy": 0.39261477045908183, "eval_loss": 1.089038372039795, "eval_runtime": 54.9763, "eval_samples_per_second": 91.13, "eval_steps_per_second": 11.405, "step": 12000 }, { "epoch": 0.53, "learning_rate": 3.6094444444444446e-07, "loss": 1.1007, "step": 13000 }, { "epoch": 0.53, "eval_accuracy": 0.34311377245508984, "eval_loss": 1.0933948755264282, "eval_runtime": 54.9285, "eval_samples_per_second": 91.209, "eval_steps_per_second": 11.415, "step": 13000 }, { "epoch": 0.57, "learning_rate": 3.8872222222222223e-07, "loss": 1.0985, "step": 14000 }, { "epoch": 0.57, "eval_accuracy": 0.36367265469061877, "eval_loss": 1.09434974193573, "eval_runtime": 54.8032, "eval_samples_per_second": 91.418, "eval_steps_per_second": 11.441, "step": 14000 }, { "epoch": 0.61, "learning_rate": 4.1650000000000006e-07, "loss": 1.0988, "step": 15000 }, { "epoch": 0.61, "eval_accuracy": 0.39481037924151696, "eval_loss": 1.0886671543121338, "eval_runtime": 54.9221, "eval_samples_per_second": 91.22, "eval_steps_per_second": 11.416, "step": 15000 }, { "epoch": 0.65, "learning_rate": 4.4425e-07, "loss": 1.0965, "step": 16000 }, { "epoch": 0.65, "eval_accuracy": 0.3916167664670659, "eval_loss": 1.0834949016571045, "eval_runtime": 54.5628, "eval_samples_per_second": 91.821, "eval_steps_per_second": 11.491, "step": 16000 }, { "epoch": 0.69, "learning_rate": 4.7202777777777785e-07, "loss": 1.0926, "step": 17000 }, { "epoch": 0.69, "eval_accuracy": 0.4239520958083832, "eval_loss": 1.079688310623169, "eval_runtime": 54.6989, "eval_samples_per_second": 91.592, "eval_steps_per_second": 11.463, "step": 17000 }, { "epoch": 0.73, "learning_rate": 4.998055555555556e-07, "loss": 1.0956, "step": 18000 }, { "epoch": 0.73, "eval_accuracy": 0.4219560878243513, "eval_loss": 1.080493688583374, "eval_runtime": 54.6863, "eval_samples_per_second": 91.613, "eval_steps_per_second": 11.465, "step": 18000 }, { "epoch": 0.77, "learning_rate": 5.275555555555556e-07, "loss": 1.0878, "step": 19000 }, { "epoch": 0.77, "eval_accuracy": 0.4343313373253493, "eval_loss": 1.0664235353469849, "eval_runtime": 54.7843, "eval_samples_per_second": 91.45, "eval_steps_per_second": 11.445, "step": 19000 }, { "epoch": 0.81, "learning_rate": 5.553333333333334e-07, "loss": 1.0793, "step": 20000 }, { "epoch": 0.81, "eval_accuracy": 0.4365269461077844, "eval_loss": 1.06390380859375, "eval_runtime": 54.7978, "eval_samples_per_second": 91.427, "eval_steps_per_second": 11.442, "step": 20000 }, { "epoch": 0.86, "learning_rate": 5.830833333333334e-07, "loss": 1.0746, "step": 21000 }, { "epoch": 0.86, "eval_accuracy": 0.4311377245508982, "eval_loss": 1.0611063241958618, "eval_runtime": 54.6084, "eval_samples_per_second": 91.744, "eval_steps_per_second": 11.482, "step": 21000 }, { "epoch": 0.9, "learning_rate": 6.108611111111111e-07, "loss": 1.0757, "step": 22000 }, { "epoch": 0.9, "eval_accuracy": 0.43253493013972055, "eval_loss": 1.0579031705856323, "eval_runtime": 54.7147, "eval_samples_per_second": 91.566, "eval_steps_per_second": 11.459, "step": 22000 }, { "epoch": 0.94, "learning_rate": 6.386111111111112e-07, "loss": 1.0712, "step": 23000 }, { "epoch": 0.94, "eval_accuracy": 0.43353293413173655, "eval_loss": 1.0545520782470703, "eval_runtime": 54.8205, "eval_samples_per_second": 91.389, "eval_steps_per_second": 11.437, "step": 23000 }, { "epoch": 0.98, "learning_rate": 6.663888888888889e-07, "loss": 1.0703, "step": 24000 }, { "epoch": 0.98, "eval_accuracy": 0.43872255489021955, "eval_loss": 1.0489881038665771, "eval_runtime": 54.6899, "eval_samples_per_second": 91.607, "eval_steps_per_second": 11.465, "step": 24000 }, { "epoch": 1.02, "learning_rate": 6.941666666666667e-07, "loss": 1.0673, "step": 25000 }, { "epoch": 1.02, "eval_accuracy": 0.4467065868263473, "eval_loss": 1.0496938228607178, "eval_runtime": 54.8292, "eval_samples_per_second": 91.375, "eval_steps_per_second": 11.436, "step": 25000 }, { "epoch": 1.06, "learning_rate": 7.219166666666666e-07, "loss": 1.0672, "step": 26000 }, { "epoch": 1.06, "eval_accuracy": 0.4297405189620758, "eval_loss": 1.0511012077331543, "eval_runtime": 54.8046, "eval_samples_per_second": 91.416, "eval_steps_per_second": 11.441, "step": 26000 }, { "epoch": 1.1, "learning_rate": 7.496944444444444e-07, "loss": 1.0658, "step": 27000 }, { "epoch": 1.1, "eval_accuracy": 0.43912175648702595, "eval_loss": 1.0465787649154663, "eval_runtime": 54.7507, "eval_samples_per_second": 91.506, "eval_steps_per_second": 11.452, "step": 27000 }, { "epoch": 1.14, "learning_rate": 7.774444444444445e-07, "loss": 1.0638, "step": 28000 }, { "epoch": 1.14, "eval_accuracy": 0.43333333333333335, "eval_loss": 1.0430774688720703, "eval_runtime": 54.6439, "eval_samples_per_second": 91.685, "eval_steps_per_second": 11.474, "step": 28000 }, { "epoch": 1.18, "learning_rate": 8.052222222222223e-07, "loss": 1.0602, "step": 29000 }, { "epoch": 1.18, "eval_accuracy": 0.4479041916167665, "eval_loss": 1.0387687683105469, "eval_runtime": 54.8009, "eval_samples_per_second": 91.422, "eval_steps_per_second": 11.441, "step": 29000 }, { "epoch": 1.22, "learning_rate": 8.329722222222223e-07, "loss": 1.0567, "step": 30000 }, { "epoch": 1.22, "eval_accuracy": 0.4485029940119761, "eval_loss": 1.0339411497116089, "eval_runtime": 54.6621, "eval_samples_per_second": 91.654, "eval_steps_per_second": 11.47, "step": 30000 }, { "epoch": 1.26, "learning_rate": 8.607500000000001e-07, "loss": 1.0611, "step": 31000 }, { "epoch": 1.26, "eval_accuracy": 0.4465069860279441, "eval_loss": 1.0385600328445435, "eval_runtime": 54.7106, "eval_samples_per_second": 91.573, "eval_steps_per_second": 11.46, "step": 31000 }, { "epoch": 1.3, "learning_rate": 8.885e-07, "loss": 1.0555, "step": 32000 }, { "epoch": 1.3, "eval_accuracy": 0.4377245508982036, "eval_loss": 1.0331332683563232, "eval_runtime": 54.7554, "eval_samples_per_second": 91.498, "eval_steps_per_second": 11.451, "step": 32000 }, { "epoch": 1.34, "learning_rate": 9.162777777777779e-07, "loss": 1.0512, "step": 33000 }, { "epoch": 1.34, "eval_accuracy": 0.44610778443113774, "eval_loss": 1.028577446937561, "eval_runtime": 54.7331, "eval_samples_per_second": 91.535, "eval_steps_per_second": 11.456, "step": 33000 }, { "epoch": 1.39, "learning_rate": 9.440277777777779e-07, "loss": 1.048, "step": 34000 }, { "epoch": 1.39, "eval_accuracy": 0.4532934131736527, "eval_loss": 1.0286486148834229, "eval_runtime": 54.7585, "eval_samples_per_second": 91.493, "eval_steps_per_second": 11.45, "step": 34000 }, { "epoch": 1.43, "learning_rate": 9.718055555555557e-07, "loss": 1.0524, "step": 35000 }, { "epoch": 1.43, "eval_accuracy": 0.449500998003992, "eval_loss": 1.0262507200241089, "eval_runtime": 54.1381, "eval_samples_per_second": 92.541, "eval_steps_per_second": 11.582, "step": 35000 }, { "epoch": 1.47, "learning_rate": 9.995555555555557e-07, "loss": 1.0472, "step": 36000 }, { "epoch": 1.47, "eval_accuracy": 0.4552894211576846, "eval_loss": 1.022876501083374, "eval_runtime": 54.2077, "eval_samples_per_second": 92.422, "eval_steps_per_second": 11.567, "step": 36000 }, { "epoch": 1.51, "learning_rate": 1.0273333333333335e-06, "loss": 1.0454, "step": 37000 }, { "epoch": 1.51, "eval_accuracy": 0.4477045908183633, "eval_loss": 1.0219467878341675, "eval_runtime": 54.4001, "eval_samples_per_second": 92.095, "eval_steps_per_second": 11.526, "step": 37000 }, { "epoch": 1.55, "learning_rate": 1.0550833333333334e-06, "loss": 1.0473, "step": 38000 }, { "epoch": 1.55, "eval_accuracy": 0.46107784431137727, "eval_loss": 1.0189749002456665, "eval_runtime": 54.291, "eval_samples_per_second": 92.28, "eval_steps_per_second": 11.549, "step": 38000 }, { "epoch": 1.59, "learning_rate": 1.0828611111111111e-06, "loss": 1.0465, "step": 39000 }, { "epoch": 1.59, "eval_accuracy": 0.4500998003992016, "eval_loss": 1.0226292610168457, "eval_runtime": 54.6634, "eval_samples_per_second": 91.652, "eval_steps_per_second": 11.47, "step": 39000 }, { "epoch": 1.63, "learning_rate": 1.1106111111111112e-06, "loss": 1.0408, "step": 40000 }, { "epoch": 1.63, "eval_accuracy": 0.45229540918163674, "eval_loss": 1.0191301107406616, "eval_runtime": 54.6371, "eval_samples_per_second": 91.696, "eval_steps_per_second": 11.476, "step": 40000 }, { "epoch": 1.67, "learning_rate": 1.138388888888889e-06, "loss": 1.0433, "step": 41000 }, { "epoch": 1.67, "eval_accuracy": 0.4620758483033932, "eval_loss": 1.0231131315231323, "eval_runtime": 54.2747, "eval_samples_per_second": 92.308, "eval_steps_per_second": 11.552, "step": 41000 }, { "epoch": 1.71, "learning_rate": 1.166138888888889e-06, "loss": 1.0392, "step": 42000 }, { "epoch": 1.71, "eval_accuracy": 0.46726546906187627, "eval_loss": 1.0097001791000366, "eval_runtime": 54.4967, "eval_samples_per_second": 91.932, "eval_steps_per_second": 11.505, "step": 42000 }, { "epoch": 1.75, "learning_rate": 1.1939166666666668e-06, "loss": 1.0381, "step": 43000 }, { "epoch": 1.75, "eval_accuracy": 0.4483033932135729, "eval_loss": 1.032842755317688, "eval_runtime": 54.2658, "eval_samples_per_second": 92.323, "eval_steps_per_second": 11.554, "step": 43000 }, { "epoch": 1.79, "learning_rate": 1.2216944444444446e-06, "loss": 1.0401, "step": 44000 }, { "epoch": 1.79, "eval_accuracy": 0.4806387225548902, "eval_loss": 1.0062618255615234, "eval_runtime": 54.1855, "eval_samples_per_second": 92.46, "eval_steps_per_second": 11.571, "step": 44000 }, { "epoch": 1.83, "learning_rate": 1.2494444444444447e-06, "loss": 1.0366, "step": 45000 }, { "epoch": 1.83, "eval_accuracy": 0.46966067864271455, "eval_loss": 1.0111383199691772, "eval_runtime": 54.4723, "eval_samples_per_second": 91.973, "eval_steps_per_second": 11.51, "step": 45000 }, { "epoch": 1.87, "learning_rate": 1.2772222222222224e-06, "loss": 1.0365, "step": 46000 }, { "epoch": 1.87, "eval_accuracy": 0.47584830339321355, "eval_loss": 1.0140398740768433, "eval_runtime": 54.4788, "eval_samples_per_second": 91.962, "eval_steps_per_second": 11.509, "step": 46000 }, { "epoch": 1.91, "learning_rate": 1.3049722222222223e-06, "loss": 1.0299, "step": 47000 }, { "epoch": 1.91, "eval_accuracy": 0.4784431137724551, "eval_loss": 1.0014474391937256, "eval_runtime": 54.2653, "eval_samples_per_second": 92.324, "eval_steps_per_second": 11.554, "step": 47000 }, { "epoch": 1.96, "learning_rate": 1.33275e-06, "loss": 1.0342, "step": 48000 }, { "epoch": 1.96, "eval_accuracy": 0.47744510978043914, "eval_loss": 1.0174448490142822, "eval_runtime": 54.2558, "eval_samples_per_second": 92.34, "eval_steps_per_second": 11.556, "step": 48000 }, { "epoch": 2.0, "learning_rate": 1.3605000000000001e-06, "loss": 1.028, "step": 49000 }, { "epoch": 2.0, "eval_accuracy": 0.49860279441117766, "eval_loss": 0.9931387305259705, "eval_runtime": 54.9876, "eval_samples_per_second": 91.111, "eval_steps_per_second": 11.403, "step": 49000 } ], "logging_steps": 1000, "max_steps": 10000000, "num_train_epochs": 408, "save_steps": 1000, "total_flos": 1.024271400936407e+17, "trial_name": null, "trial_params": null }