diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6577 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "global_step": 468696, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 4.967996313175278e-05, + "loss": 2.3652, + "step": 1000 + }, + { + "epoch": 0.01, + "eval_loss": 2.5228402614593506, + "eval_runtime": 19.0224, + "eval_samples_per_second": 105.139, + "eval_steps_per_second": 1.682, + "step": 1000 + }, + { + "epoch": 0.01, + "learning_rate": 4.935992626350556e-05, + "loss": 2.3257, + "step": 2000 + }, + { + "epoch": 0.01, + "eval_loss": 2.4702858924865723, + "eval_runtime": 19.2339, + "eval_samples_per_second": 103.983, + "eval_steps_per_second": 1.664, + "step": 2000 + }, + { + "epoch": 0.02, + "learning_rate": 4.903988939525834e-05, + "loss": 2.3123, + "step": 3000 + }, + { + "epoch": 0.02, + "eval_loss": 2.5149893760681152, + "eval_runtime": 18.8526, + "eval_samples_per_second": 106.086, + "eval_steps_per_second": 1.697, + "step": 3000 + }, + { + "epoch": 0.03, + "learning_rate": 4.871985252701111e-05, + "loss": 2.3068, + "step": 4000 + }, + { + "epoch": 0.03, + "eval_loss": 2.4867441654205322, + "eval_runtime": 18.7454, + "eval_samples_per_second": 106.693, + "eval_steps_per_second": 1.707, + "step": 4000 + }, + { + "epoch": 0.03, + "learning_rate": 4.8399815658763894e-05, + "loss": 2.2906, + "step": 5000 + }, + { + "epoch": 0.03, + "eval_loss": 2.4581546783447266, + "eval_runtime": 19.2143, + "eval_samples_per_second": 104.089, + "eval_steps_per_second": 1.665, + "step": 5000 + }, + { + "epoch": 0.04, + "learning_rate": 4.807977879051667e-05, + "loss": 2.2817, + "step": 6000 + }, + { + "epoch": 0.04, + "eval_loss": 2.477738380432129, + "eval_runtime": 18.9443, + "eval_samples_per_second": 105.573, + "eval_steps_per_second": 1.689, + "step": 6000 + }, + { + "epoch": 0.04, + "learning_rate": 4.7759741922269444e-05, + "loss": 2.2713, + "step": 7000 + }, + { + "epoch": 0.04, + "eval_loss": 2.490509271621704, + "eval_runtime": 19.0513, + "eval_samples_per_second": 104.98, + "eval_steps_per_second": 1.68, + "step": 7000 + }, + { + "epoch": 0.05, + "learning_rate": 4.743970505402223e-05, + "loss": 2.2653, + "step": 8000 + }, + { + "epoch": 0.05, + "eval_loss": 2.472813129425049, + "eval_runtime": 19.2806, + "eval_samples_per_second": 103.731, + "eval_steps_per_second": 1.66, + "step": 8000 + }, + { + "epoch": 0.06, + "learning_rate": 4.7119668185775e-05, + "loss": 2.2581, + "step": 9000 + }, + { + "epoch": 0.06, + "eval_loss": 2.4772017002105713, + "eval_runtime": 19.5407, + "eval_samples_per_second": 102.351, + "eval_steps_per_second": 1.638, + "step": 9000 + }, + { + "epoch": 0.06, + "learning_rate": 4.679963131752778e-05, + "loss": 2.2687, + "step": 10000 + }, + { + "epoch": 0.06, + "eval_loss": 2.4584801197052, + "eval_runtime": 18.8505, + "eval_samples_per_second": 106.098, + "eval_steps_per_second": 1.698, + "step": 10000 + }, + { + "epoch": 0.07, + "learning_rate": 4.6479594449280565e-05, + "loss": 2.2473, + "step": 11000 + }, + { + "epoch": 0.07, + "eval_loss": 2.450211524963379, + "eval_runtime": 19.0047, + "eval_samples_per_second": 105.237, + "eval_steps_per_second": 1.684, + "step": 11000 + }, + { + "epoch": 0.08, + "learning_rate": 4.6159557581033336e-05, + "loss": 2.2536, + "step": 12000 + }, + { + "epoch": 0.08, + "eval_loss": 2.402937650680542, + "eval_runtime": 19.1856, + "eval_samples_per_second": 104.245, + "eval_steps_per_second": 1.668, + "step": 12000 + }, + { + "epoch": 0.08, + "learning_rate": 4.5839520712786115e-05, + "loss": 2.2355, + "step": 13000 + }, + { + "epoch": 0.08, + "eval_loss": 2.5034797191619873, + "eval_runtime": 18.9351, + "eval_samples_per_second": 105.624, + "eval_steps_per_second": 1.69, + "step": 13000 + }, + { + "epoch": 0.09, + "learning_rate": 4.551948384453889e-05, + "loss": 2.2356, + "step": 14000 + }, + { + "epoch": 0.09, + "eval_loss": 2.443594217300415, + "eval_runtime": 19.1979, + "eval_samples_per_second": 104.178, + "eval_steps_per_second": 1.667, + "step": 14000 + }, + { + "epoch": 0.1, + "learning_rate": 4.519944697629167e-05, + "loss": 2.2385, + "step": 15000 + }, + { + "epoch": 0.1, + "eval_loss": 2.4230918884277344, + "eval_runtime": 19.1941, + "eval_samples_per_second": 104.199, + "eval_steps_per_second": 1.667, + "step": 15000 + }, + { + "epoch": 0.1, + "learning_rate": 4.487941010804445e-05, + "loss": 2.229, + "step": 16000 + }, + { + "epoch": 0.1, + "eval_loss": 2.435939073562622, + "eval_runtime": 18.7132, + "eval_samples_per_second": 106.876, + "eval_steps_per_second": 1.71, + "step": 16000 + }, + { + "epoch": 0.11, + "learning_rate": 4.455937323979723e-05, + "loss": 2.2308, + "step": 17000 + }, + { + "epoch": 0.11, + "eval_loss": 2.379002094268799, + "eval_runtime": 18.8323, + "eval_samples_per_second": 106.2, + "eval_steps_per_second": 1.699, + "step": 17000 + }, + { + "epoch": 0.12, + "learning_rate": 4.4239336371550006e-05, + "loss": 2.2247, + "step": 18000 + }, + { + "epoch": 0.12, + "eval_loss": 2.440680742263794, + "eval_runtime": 18.8124, + "eval_samples_per_second": 106.313, + "eval_steps_per_second": 1.701, + "step": 18000 + }, + { + "epoch": 0.12, + "learning_rate": 4.391929950330278e-05, + "loss": 2.2262, + "step": 19000 + }, + { + "epoch": 0.12, + "eval_loss": 2.401104211807251, + "eval_runtime": 18.8589, + "eval_samples_per_second": 106.051, + "eval_steps_per_second": 1.697, + "step": 19000 + }, + { + "epoch": 0.13, + "learning_rate": 4.359926263505556e-05, + "loss": 2.2074, + "step": 20000 + }, + { + "epoch": 0.13, + "eval_loss": 2.382688522338867, + "eval_runtime": 18.7139, + "eval_samples_per_second": 106.872, + "eval_steps_per_second": 1.71, + "step": 20000 + }, + { + "epoch": 0.13, + "learning_rate": 4.327922576680834e-05, + "loss": 2.2204, + "step": 21000 + }, + { + "epoch": 0.13, + "eval_loss": 2.421189308166504, + "eval_runtime": 18.9386, + "eval_samples_per_second": 105.604, + "eval_steps_per_second": 1.69, + "step": 21000 + }, + { + "epoch": 0.14, + "learning_rate": 4.295918889856111e-05, + "loss": 2.2123, + "step": 22000 + }, + { + "epoch": 0.14, + "eval_loss": 2.4362740516662598, + "eval_runtime": 18.9745, + "eval_samples_per_second": 105.405, + "eval_steps_per_second": 1.686, + "step": 22000 + }, + { + "epoch": 0.15, + "learning_rate": 4.263915203031389e-05, + "loss": 2.2225, + "step": 23000 + }, + { + "epoch": 0.15, + "eval_loss": 2.426682710647583, + "eval_runtime": 19.1794, + "eval_samples_per_second": 104.278, + "eval_steps_per_second": 1.668, + "step": 23000 + }, + { + "epoch": 0.15, + "learning_rate": 4.231911516206668e-05, + "loss": 2.2137, + "step": 24000 + }, + { + "epoch": 0.15, + "eval_loss": 2.4169669151306152, + "eval_runtime": 18.8197, + "eval_samples_per_second": 106.272, + "eval_steps_per_second": 1.7, + "step": 24000 + }, + { + "epoch": 0.16, + "learning_rate": 4.199907829381945e-05, + "loss": 2.2143, + "step": 25000 + }, + { + "epoch": 0.16, + "eval_loss": 2.4082441329956055, + "eval_runtime": 18.9737, + "eval_samples_per_second": 105.409, + "eval_steps_per_second": 1.687, + "step": 25000 + }, + { + "epoch": 0.17, + "learning_rate": 4.167904142557223e-05, + "loss": 2.2131, + "step": 26000 + }, + { + "epoch": 0.17, + "eval_loss": 2.4836766719818115, + "eval_runtime": 19.0574, + "eval_samples_per_second": 104.946, + "eval_steps_per_second": 1.679, + "step": 26000 + }, + { + "epoch": 0.17, + "learning_rate": 4.1359004557325005e-05, + "loss": 2.1954, + "step": 27000 + }, + { + "epoch": 0.17, + "eval_loss": 2.43381404876709, + "eval_runtime": 18.8859, + "eval_samples_per_second": 105.899, + "eval_steps_per_second": 1.694, + "step": 27000 + }, + { + "epoch": 0.18, + "learning_rate": 4.1038967689077783e-05, + "loss": 2.1934, + "step": 28000 + }, + { + "epoch": 0.18, + "eval_loss": 2.4075064659118652, + "eval_runtime": 18.689, + "eval_samples_per_second": 107.015, + "eval_steps_per_second": 1.712, + "step": 28000 + }, + { + "epoch": 0.19, + "learning_rate": 4.071893082083056e-05, + "loss": 2.1943, + "step": 29000 + }, + { + "epoch": 0.19, + "eval_loss": 2.383098602294922, + "eval_runtime": 18.8218, + "eval_samples_per_second": 106.26, + "eval_steps_per_second": 1.7, + "step": 29000 + }, + { + "epoch": 0.19, + "learning_rate": 4.039889395258334e-05, + "loss": 2.1944, + "step": 30000 + }, + { + "epoch": 0.19, + "eval_loss": 2.3953185081481934, + "eval_runtime": 18.9451, + "eval_samples_per_second": 105.568, + "eval_steps_per_second": 1.689, + "step": 30000 + }, + { + "epoch": 0.2, + "learning_rate": 4.007885708433612e-05, + "loss": 2.1914, + "step": 31000 + }, + { + "epoch": 0.2, + "eval_loss": 2.411050796508789, + "eval_runtime": 18.7128, + "eval_samples_per_second": 106.878, + "eval_steps_per_second": 1.71, + "step": 31000 + }, + { + "epoch": 0.2, + "learning_rate": 3.975882021608889e-05, + "loss": 2.1865, + "step": 32000 + }, + { + "epoch": 0.2, + "eval_loss": 2.390427827835083, + "eval_runtime": 18.9045, + "eval_samples_per_second": 105.795, + "eval_steps_per_second": 1.693, + "step": 32000 + }, + { + "epoch": 0.21, + "learning_rate": 3.9438783347841675e-05, + "loss": 2.1871, + "step": 33000 + }, + { + "epoch": 0.21, + "eval_loss": 2.401388168334961, + "eval_runtime": 18.7096, + "eval_samples_per_second": 106.897, + "eval_steps_per_second": 1.71, + "step": 33000 + }, + { + "epoch": 0.22, + "learning_rate": 3.9118746479594454e-05, + "loss": 2.1792, + "step": 34000 + }, + { + "epoch": 0.22, + "eval_loss": 2.4562745094299316, + "eval_runtime": 18.8567, + "eval_samples_per_second": 106.063, + "eval_steps_per_second": 1.697, + "step": 34000 + }, + { + "epoch": 0.22, + "learning_rate": 3.8798709611347225e-05, + "loss": 2.1921, + "step": 35000 + }, + { + "epoch": 0.22, + "eval_loss": 2.399921417236328, + "eval_runtime": 18.7883, + "eval_samples_per_second": 106.449, + "eval_steps_per_second": 1.703, + "step": 35000 + }, + { + "epoch": 0.23, + "learning_rate": 3.847867274310001e-05, + "loss": 2.1831, + "step": 36000 + }, + { + "epoch": 0.23, + "eval_loss": 2.3935768604278564, + "eval_runtime": 18.8237, + "eval_samples_per_second": 106.249, + "eval_steps_per_second": 1.7, + "step": 36000 + }, + { + "epoch": 0.24, + "learning_rate": 3.815863587485278e-05, + "loss": 2.169, + "step": 37000 + }, + { + "epoch": 0.24, + "eval_loss": 2.385082960128784, + "eval_runtime": 18.9677, + "eval_samples_per_second": 105.442, + "eval_steps_per_second": 1.687, + "step": 37000 + }, + { + "epoch": 0.24, + "learning_rate": 3.783859900660556e-05, + "loss": 2.1619, + "step": 38000 + }, + { + "epoch": 0.24, + "eval_loss": 2.3289620876312256, + "eval_runtime": 19.0182, + "eval_samples_per_second": 105.162, + "eval_steps_per_second": 1.683, + "step": 38000 + }, + { + "epoch": 0.25, + "learning_rate": 3.7518562138358346e-05, + "loss": 2.1651, + "step": 39000 + }, + { + "epoch": 0.25, + "eval_loss": 2.3818867206573486, + "eval_runtime": 18.9593, + "eval_samples_per_second": 105.489, + "eval_steps_per_second": 1.688, + "step": 39000 + }, + { + "epoch": 0.26, + "learning_rate": 3.719852527011112e-05, + "loss": 2.1704, + "step": 40000 + }, + { + "epoch": 0.26, + "eval_loss": 2.3583953380584717, + "eval_runtime": 18.8577, + "eval_samples_per_second": 106.057, + "eval_steps_per_second": 1.697, + "step": 40000 + }, + { + "epoch": 0.26, + "learning_rate": 3.6878488401863896e-05, + "loss": 2.1601, + "step": 41000 + }, + { + "epoch": 0.26, + "eval_loss": 2.3705227375030518, + "eval_runtime": 19.0038, + "eval_samples_per_second": 105.242, + "eval_steps_per_second": 1.684, + "step": 41000 + }, + { + "epoch": 0.27, + "learning_rate": 3.6558451533616674e-05, + "loss": 2.1819, + "step": 42000 + }, + { + "epoch": 0.27, + "eval_loss": 2.3806064128875732, + "eval_runtime": 19.09, + "eval_samples_per_second": 104.767, + "eval_steps_per_second": 1.676, + "step": 42000 + }, + { + "epoch": 0.28, + "learning_rate": 3.623841466536945e-05, + "loss": 2.1666, + "step": 43000 + }, + { + "epoch": 0.28, + "eval_loss": 2.3670589923858643, + "eval_runtime": 18.9485, + "eval_samples_per_second": 105.549, + "eval_steps_per_second": 1.689, + "step": 43000 + }, + { + "epoch": 0.28, + "learning_rate": 3.591837779712223e-05, + "loss": 2.1718, + "step": 44000 + }, + { + "epoch": 0.28, + "eval_loss": 2.364011764526367, + "eval_runtime": 18.7665, + "eval_samples_per_second": 106.573, + "eval_steps_per_second": 1.705, + "step": 44000 + }, + { + "epoch": 0.29, + "learning_rate": 3.559834092887501e-05, + "loss": 2.1521, + "step": 45000 + }, + { + "epoch": 0.29, + "eval_loss": 2.373670816421509, + "eval_runtime": 18.9014, + "eval_samples_per_second": 105.812, + "eval_steps_per_second": 1.693, + "step": 45000 + }, + { + "epoch": 0.29, + "learning_rate": 3.527830406062779e-05, + "loss": 2.148, + "step": 46000 + }, + { + "epoch": 0.29, + "eval_loss": 2.377063035964966, + "eval_runtime": 19.012, + "eval_samples_per_second": 105.197, + "eval_steps_per_second": 1.683, + "step": 46000 + }, + { + "epoch": 0.3, + "learning_rate": 3.495826719238056e-05, + "loss": 2.1438, + "step": 47000 + }, + { + "epoch": 0.3, + "eval_loss": 2.3637826442718506, + "eval_runtime": 18.982, + "eval_samples_per_second": 105.363, + "eval_steps_per_second": 1.686, + "step": 47000 + }, + { + "epoch": 0.31, + "learning_rate": 3.4638230324133344e-05, + "loss": 2.1536, + "step": 48000 + }, + { + "epoch": 0.31, + "eval_loss": 2.3571810722351074, + "eval_runtime": 18.8471, + "eval_samples_per_second": 106.117, + "eval_steps_per_second": 1.698, + "step": 48000 + }, + { + "epoch": 0.31, + "learning_rate": 3.431819345588612e-05, + "loss": 2.1505, + "step": 49000 + }, + { + "epoch": 0.31, + "eval_loss": 2.3516790866851807, + "eval_runtime": 18.8575, + "eval_samples_per_second": 106.059, + "eval_steps_per_second": 1.697, + "step": 49000 + }, + { + "epoch": 0.32, + "learning_rate": 3.3998156587638894e-05, + "loss": 2.1319, + "step": 50000 + }, + { + "epoch": 0.32, + "eval_loss": 2.3615307807922363, + "eval_runtime": 19.1166, + "eval_samples_per_second": 104.621, + "eval_steps_per_second": 1.674, + "step": 50000 + }, + { + "epoch": 0.33, + "learning_rate": 3.367811971939168e-05, + "loss": 2.123, + "step": 51000 + }, + { + "epoch": 0.33, + "eval_loss": 2.3522212505340576, + "eval_runtime": 19.1501, + "eval_samples_per_second": 104.438, + "eval_steps_per_second": 1.671, + "step": 51000 + }, + { + "epoch": 0.33, + "learning_rate": 3.335808285114445e-05, + "loss": 2.1513, + "step": 52000 + }, + { + "epoch": 0.33, + "eval_loss": 2.388401746749878, + "eval_runtime": 18.8344, + "eval_samples_per_second": 106.189, + "eval_steps_per_second": 1.699, + "step": 52000 + }, + { + "epoch": 0.34, + "learning_rate": 3.303804598289723e-05, + "loss": 2.1419, + "step": 53000 + }, + { + "epoch": 0.34, + "eval_loss": 2.32639479637146, + "eval_runtime": 18.8162, + "eval_samples_per_second": 106.292, + "eval_steps_per_second": 1.701, + "step": 53000 + }, + { + "epoch": 0.35, + "learning_rate": 3.271800911465001e-05, + "loss": 2.1404, + "step": 54000 + }, + { + "epoch": 0.35, + "eval_loss": 2.3595259189605713, + "eval_runtime": 18.8272, + "eval_samples_per_second": 106.229, + "eval_steps_per_second": 1.7, + "step": 54000 + }, + { + "epoch": 0.35, + "learning_rate": 3.2397972246402786e-05, + "loss": 2.128, + "step": 55000 + }, + { + "epoch": 0.35, + "eval_loss": 2.3471484184265137, + "eval_runtime": 18.9594, + "eval_samples_per_second": 105.489, + "eval_steps_per_second": 1.688, + "step": 55000 + }, + { + "epoch": 0.36, + "learning_rate": 3.2077935378155565e-05, + "loss": 2.1287, + "step": 56000 + }, + { + "epoch": 0.36, + "eval_loss": 2.347370147705078, + "eval_runtime": 18.9278, + "eval_samples_per_second": 105.665, + "eval_steps_per_second": 1.691, + "step": 56000 + }, + { + "epoch": 0.36, + "learning_rate": 3.175789850990834e-05, + "loss": 2.1372, + "step": 57000 + }, + { + "epoch": 0.36, + "eval_loss": 2.3139336109161377, + "eval_runtime": 19.0473, + "eval_samples_per_second": 105.002, + "eval_steps_per_second": 1.68, + "step": 57000 + }, + { + "epoch": 0.37, + "learning_rate": 3.143786164166112e-05, + "loss": 2.1301, + "step": 58000 + }, + { + "epoch": 0.37, + "eval_loss": 2.3145127296447754, + "eval_runtime": 18.831, + "eval_samples_per_second": 106.208, + "eval_steps_per_second": 1.699, + "step": 58000 + }, + { + "epoch": 0.38, + "learning_rate": 3.11178247734139e-05, + "loss": 2.128, + "step": 59000 + }, + { + "epoch": 0.38, + "eval_loss": 2.3634743690490723, + "eval_runtime": 19.0052, + "eval_samples_per_second": 105.234, + "eval_steps_per_second": 1.684, + "step": 59000 + }, + { + "epoch": 0.38, + "learning_rate": 3.079778790516668e-05, + "loss": 2.1088, + "step": 60000 + }, + { + "epoch": 0.38, + "eval_loss": 2.3068251609802246, + "eval_runtime": 18.9935, + "eval_samples_per_second": 105.299, + "eval_steps_per_second": 1.685, + "step": 60000 + }, + { + "epoch": 0.39, + "learning_rate": 3.0477751036919456e-05, + "loss": 2.122, + "step": 61000 + }, + { + "epoch": 0.39, + "eval_loss": 2.312502145767212, + "eval_runtime": 18.8963, + "eval_samples_per_second": 105.841, + "eval_steps_per_second": 1.693, + "step": 61000 + }, + { + "epoch": 0.4, + "learning_rate": 3.015771416867223e-05, + "loss": 2.1113, + "step": 62000 + }, + { + "epoch": 0.4, + "eval_loss": 2.3446314334869385, + "eval_runtime": 18.8671, + "eval_samples_per_second": 106.005, + "eval_steps_per_second": 1.696, + "step": 62000 + }, + { + "epoch": 0.4, + "learning_rate": 2.983767730042501e-05, + "loss": 2.1108, + "step": 63000 + }, + { + "epoch": 0.4, + "eval_loss": 2.3173420429229736, + "eval_runtime": 18.7418, + "eval_samples_per_second": 106.713, + "eval_steps_per_second": 1.707, + "step": 63000 + }, + { + "epoch": 0.41, + "learning_rate": 2.951764043217779e-05, + "loss": 2.125, + "step": 64000 + }, + { + "epoch": 0.41, + "eval_loss": 2.363111734390259, + "eval_runtime": 18.789, + "eval_samples_per_second": 106.445, + "eval_steps_per_second": 1.703, + "step": 64000 + }, + { + "epoch": 0.42, + "learning_rate": 2.9197603563930563e-05, + "loss": 2.1106, + "step": 65000 + }, + { + "epoch": 0.42, + "eval_loss": 2.331869602203369, + "eval_runtime": 18.9057, + "eval_samples_per_second": 105.788, + "eval_steps_per_second": 1.693, + "step": 65000 + }, + { + "epoch": 0.42, + "learning_rate": 2.8877566695683345e-05, + "loss": 2.1143, + "step": 66000 + }, + { + "epoch": 0.42, + "eval_loss": 2.300299882888794, + "eval_runtime": 18.7948, + "eval_samples_per_second": 106.413, + "eval_steps_per_second": 1.703, + "step": 66000 + }, + { + "epoch": 0.43, + "learning_rate": 2.8557529827436123e-05, + "loss": 2.0982, + "step": 67000 + }, + { + "epoch": 0.43, + "eval_loss": 2.3044443130493164, + "eval_runtime": 19.1803, + "eval_samples_per_second": 104.273, + "eval_steps_per_second": 1.668, + "step": 67000 + }, + { + "epoch": 0.44, + "learning_rate": 2.82374929591889e-05, + "loss": 2.1026, + "step": 68000 + }, + { + "epoch": 0.44, + "eval_loss": 2.305398464202881, + "eval_runtime": 18.9121, + "eval_samples_per_second": 105.752, + "eval_steps_per_second": 1.692, + "step": 68000 + }, + { + "epoch": 0.44, + "learning_rate": 2.791745609094168e-05, + "loss": 2.0995, + "step": 69000 + }, + { + "epoch": 0.44, + "eval_loss": 2.3068206310272217, + "eval_runtime": 18.8989, + "eval_samples_per_second": 105.826, + "eval_steps_per_second": 1.693, + "step": 69000 + }, + { + "epoch": 0.45, + "learning_rate": 2.7597419222694455e-05, + "loss": 2.0844, + "step": 70000 + }, + { + "epoch": 0.45, + "eval_loss": 2.3477184772491455, + "eval_runtime": 19.0274, + "eval_samples_per_second": 105.111, + "eval_steps_per_second": 1.682, + "step": 70000 + }, + { + "epoch": 0.45, + "learning_rate": 2.7277382354447233e-05, + "loss": 2.1008, + "step": 71000 + }, + { + "epoch": 0.45, + "eval_loss": 2.339860439300537, + "eval_runtime": 18.7939, + "eval_samples_per_second": 106.418, + "eval_steps_per_second": 1.703, + "step": 71000 + }, + { + "epoch": 0.46, + "learning_rate": 2.6957345486200015e-05, + "loss": 2.092, + "step": 72000 + }, + { + "epoch": 0.46, + "eval_loss": 2.3236074447631836, + "eval_runtime": 18.7746, + "eval_samples_per_second": 106.527, + "eval_steps_per_second": 1.704, + "step": 72000 + }, + { + "epoch": 0.47, + "learning_rate": 2.663730861795279e-05, + "loss": 2.09, + "step": 73000 + }, + { + "epoch": 0.47, + "eval_loss": 2.3070333003997803, + "eval_runtime": 19.3882, + "eval_samples_per_second": 103.155, + "eval_steps_per_second": 1.65, + "step": 73000 + }, + { + "epoch": 0.47, + "learning_rate": 2.631727174970557e-05, + "loss": 2.0984, + "step": 74000 + }, + { + "epoch": 0.47, + "eval_loss": 2.31845760345459, + "eval_runtime": 19.5362, + "eval_samples_per_second": 102.374, + "eval_steps_per_second": 1.638, + "step": 74000 + }, + { + "epoch": 0.48, + "learning_rate": 2.5997234881458344e-05, + "loss": 2.0965, + "step": 75000 + }, + { + "epoch": 0.48, + "eval_loss": 2.306812047958374, + "eval_runtime": 19.4702, + "eval_samples_per_second": 102.721, + "eval_steps_per_second": 1.644, + "step": 75000 + }, + { + "epoch": 0.49, + "learning_rate": 2.5677198013211122e-05, + "loss": 2.081, + "step": 76000 + }, + { + "epoch": 0.49, + "eval_loss": 2.274367570877075, + "eval_runtime": 19.5806, + "eval_samples_per_second": 102.142, + "eval_steps_per_second": 1.634, + "step": 76000 + }, + { + "epoch": 0.49, + "learning_rate": 2.5357161144963904e-05, + "loss": 2.0871, + "step": 77000 + }, + { + "epoch": 0.49, + "eval_loss": 2.254237651824951, + "eval_runtime": 19.7552, + "eval_samples_per_second": 101.239, + "eval_steps_per_second": 1.62, + "step": 77000 + }, + { + "epoch": 0.5, + "learning_rate": 2.503712427671668e-05, + "loss": 2.0751, + "step": 78000 + }, + { + "epoch": 0.5, + "eval_loss": 2.2817015647888184, + "eval_runtime": 19.5765, + "eval_samples_per_second": 102.163, + "eval_steps_per_second": 1.635, + "step": 78000 + }, + { + "epoch": 0.51, + "learning_rate": 2.4717087408469457e-05, + "loss": 2.0875, + "step": 79000 + }, + { + "epoch": 0.51, + "eval_loss": 2.288637161254883, + "eval_runtime": 19.6173, + "eval_samples_per_second": 101.951, + "eval_steps_per_second": 1.631, + "step": 79000 + }, + { + "epoch": 0.51, + "learning_rate": 2.4397050540222236e-05, + "loss": 2.0847, + "step": 80000 + }, + { + "epoch": 0.51, + "eval_loss": 2.3093936443328857, + "eval_runtime": 19.3962, + "eval_samples_per_second": 103.113, + "eval_steps_per_second": 1.65, + "step": 80000 + }, + { + "epoch": 0.52, + "learning_rate": 2.4077013671975014e-05, + "loss": 2.0861, + "step": 81000 + }, + { + "epoch": 0.52, + "eval_loss": 2.294950246810913, + "eval_runtime": 19.5483, + "eval_samples_per_second": 102.311, + "eval_steps_per_second": 1.637, + "step": 81000 + }, + { + "epoch": 0.52, + "learning_rate": 2.375697680372779e-05, + "loss": 2.0689, + "step": 82000 + }, + { + "epoch": 0.52, + "eval_loss": 2.293389320373535, + "eval_runtime": 19.51, + "eval_samples_per_second": 102.512, + "eval_steps_per_second": 1.64, + "step": 82000 + }, + { + "epoch": 0.53, + "learning_rate": 2.3436939935480567e-05, + "loss": 2.0767, + "step": 83000 + }, + { + "epoch": 0.53, + "eval_loss": 2.304983615875244, + "eval_runtime": 19.26, + "eval_samples_per_second": 103.842, + "eval_steps_per_second": 1.661, + "step": 83000 + }, + { + "epoch": 0.54, + "learning_rate": 2.311690306723335e-05, + "loss": 2.0711, + "step": 84000 + }, + { + "epoch": 0.54, + "eval_loss": 2.2823355197906494, + "eval_runtime": 20.4429, + "eval_samples_per_second": 97.834, + "eval_steps_per_second": 1.565, + "step": 84000 + }, + { + "epoch": 0.54, + "learning_rate": 2.2796866198986124e-05, + "loss": 2.0654, + "step": 85000 + }, + { + "epoch": 0.54, + "eval_loss": 2.280226469039917, + "eval_runtime": 19.5501, + "eval_samples_per_second": 102.301, + "eval_steps_per_second": 1.637, + "step": 85000 + }, + { + "epoch": 0.55, + "learning_rate": 2.2476829330738902e-05, + "loss": 2.0627, + "step": 86000 + }, + { + "epoch": 0.55, + "eval_loss": 2.2770543098449707, + "eval_runtime": 19.4549, + "eval_samples_per_second": 102.802, + "eval_steps_per_second": 1.645, + "step": 86000 + }, + { + "epoch": 0.56, + "learning_rate": 2.215679246249168e-05, + "loss": 2.0656, + "step": 87000 + }, + { + "epoch": 0.56, + "eval_loss": 2.2922134399414062, + "eval_runtime": 19.3407, + "eval_samples_per_second": 103.409, + "eval_steps_per_second": 1.655, + "step": 87000 + }, + { + "epoch": 0.56, + "learning_rate": 2.1836755594244456e-05, + "loss": 2.07, + "step": 88000 + }, + { + "epoch": 0.56, + "eval_loss": 2.268709897994995, + "eval_runtime": 19.4551, + "eval_samples_per_second": 102.801, + "eval_steps_per_second": 1.645, + "step": 88000 + }, + { + "epoch": 0.57, + "learning_rate": 2.1516718725997238e-05, + "loss": 2.0661, + "step": 89000 + }, + { + "epoch": 0.57, + "eval_loss": 2.247802972793579, + "eval_runtime": 19.273, + "eval_samples_per_second": 103.772, + "eval_steps_per_second": 1.66, + "step": 89000 + }, + { + "epoch": 0.58, + "learning_rate": 2.1196681857750016e-05, + "loss": 2.0511, + "step": 90000 + }, + { + "epoch": 0.58, + "eval_loss": 2.3074941635131836, + "eval_runtime": 19.2075, + "eval_samples_per_second": 104.126, + "eval_steps_per_second": 1.666, + "step": 90000 + }, + { + "epoch": 0.58, + "learning_rate": 2.087664498950279e-05, + "loss": 2.0582, + "step": 91000 + }, + { + "epoch": 0.58, + "eval_loss": 2.248690605163574, + "eval_runtime": 19.2432, + "eval_samples_per_second": 103.933, + "eval_steps_per_second": 1.663, + "step": 91000 + }, + { + "epoch": 0.59, + "learning_rate": 2.055660812125557e-05, + "loss": 2.0626, + "step": 92000 + }, + { + "epoch": 0.59, + "eval_loss": 2.2588484287261963, + "eval_runtime": 19.4441, + "eval_samples_per_second": 102.859, + "eval_steps_per_second": 1.646, + "step": 92000 + }, + { + "epoch": 0.6, + "learning_rate": 2.0236571253008348e-05, + "loss": 2.0562, + "step": 93000 + }, + { + "epoch": 0.6, + "eval_loss": 2.274319887161255, + "eval_runtime": 19.4979, + "eval_samples_per_second": 102.575, + "eval_steps_per_second": 1.641, + "step": 93000 + }, + { + "epoch": 0.6, + "learning_rate": 1.9916534384761126e-05, + "loss": 2.0511, + "step": 94000 + }, + { + "epoch": 0.6, + "eval_loss": 2.276171922683716, + "eval_runtime": 19.331, + "eval_samples_per_second": 103.461, + "eval_steps_per_second": 1.655, + "step": 94000 + }, + { + "epoch": 0.61, + "learning_rate": 1.9596497516513904e-05, + "loss": 2.0413, + "step": 95000 + }, + { + "epoch": 0.61, + "eval_loss": 2.2398881912231445, + "eval_runtime": 19.5099, + "eval_samples_per_second": 102.512, + "eval_steps_per_second": 1.64, + "step": 95000 + }, + { + "epoch": 0.61, + "learning_rate": 1.927646064826668e-05, + "loss": 2.0496, + "step": 96000 + }, + { + "epoch": 0.61, + "eval_loss": 2.271150588989258, + "eval_runtime": 19.317, + "eval_samples_per_second": 103.536, + "eval_steps_per_second": 1.657, + "step": 96000 + }, + { + "epoch": 0.62, + "learning_rate": 1.8956423780019458e-05, + "loss": 2.0564, + "step": 97000 + }, + { + "epoch": 0.62, + "eval_loss": 2.2770469188690186, + "eval_runtime": 19.1141, + "eval_samples_per_second": 104.635, + "eval_steps_per_second": 1.674, + "step": 97000 + }, + { + "epoch": 0.63, + "learning_rate": 1.863638691177224e-05, + "loss": 2.0505, + "step": 98000 + }, + { + "epoch": 0.63, + "eval_loss": 2.2885847091674805, + "eval_runtime": 19.6608, + "eval_samples_per_second": 101.725, + "eval_steps_per_second": 1.628, + "step": 98000 + }, + { + "epoch": 0.63, + "learning_rate": 1.8316350043525015e-05, + "loss": 2.0504, + "step": 99000 + }, + { + "epoch": 0.63, + "eval_loss": 2.3180038928985596, + "eval_runtime": 19.4021, + "eval_samples_per_second": 103.082, + "eval_steps_per_second": 1.649, + "step": 99000 + }, + { + "epoch": 0.64, + "learning_rate": 1.7996313175277793e-05, + "loss": 2.0439, + "step": 100000 + }, + { + "epoch": 0.64, + "eval_loss": 2.2651731967926025, + "eval_runtime": 19.3214, + "eval_samples_per_second": 103.512, + "eval_steps_per_second": 1.656, + "step": 100000 + }, + { + "epoch": 0.65, + "learning_rate": 1.767627630703057e-05, + "loss": 2.0461, + "step": 101000 + }, + { + "epoch": 0.65, + "eval_loss": 2.222968101501465, + "eval_runtime": 19.2774, + "eval_samples_per_second": 103.749, + "eval_steps_per_second": 1.66, + "step": 101000 + }, + { + "epoch": 0.65, + "learning_rate": 1.7356239438783346e-05, + "loss": 2.0405, + "step": 102000 + }, + { + "epoch": 0.65, + "eval_loss": 2.2448790073394775, + "eval_runtime": 21.2727, + "eval_samples_per_second": 94.017, + "eval_steps_per_second": 1.504, + "step": 102000 + }, + { + "epoch": 0.66, + "learning_rate": 1.7036202570536128e-05, + "loss": 2.038, + "step": 103000 + }, + { + "epoch": 0.66, + "eval_loss": 2.2096433639526367, + "eval_runtime": 19.292, + "eval_samples_per_second": 103.67, + "eval_steps_per_second": 1.659, + "step": 103000 + }, + { + "epoch": 0.67, + "learning_rate": 1.6716165702288906e-05, + "loss": 2.0205, + "step": 104000 + }, + { + "epoch": 0.67, + "eval_loss": 2.2131240367889404, + "eval_runtime": 19.1995, + "eval_samples_per_second": 104.169, + "eval_steps_per_second": 1.667, + "step": 104000 + }, + { + "epoch": 0.67, + "learning_rate": 1.639612883404168e-05, + "loss": 2.0196, + "step": 105000 + }, + { + "epoch": 0.67, + "eval_loss": 2.2505383491516113, + "eval_runtime": 19.4936, + "eval_samples_per_second": 102.598, + "eval_steps_per_second": 1.642, + "step": 105000 + }, + { + "epoch": 0.68, + "learning_rate": 1.607609196579446e-05, + "loss": 2.0272, + "step": 106000 + }, + { + "epoch": 0.68, + "eval_loss": 2.243058681488037, + "eval_runtime": 19.4712, + "eval_samples_per_second": 102.716, + "eval_steps_per_second": 1.643, + "step": 106000 + }, + { + "epoch": 0.68, + "learning_rate": 1.5756055097547238e-05, + "loss": 2.0276, + "step": 107000 + }, + { + "epoch": 0.68, + "eval_loss": 2.2137022018432617, + "eval_runtime": 18.6801, + "eval_samples_per_second": 107.066, + "eval_steps_per_second": 1.713, + "step": 107000 + }, + { + "epoch": 0.69, + "learning_rate": 1.5436018229300017e-05, + "loss": 2.0224, + "step": 108000 + }, + { + "epoch": 0.69, + "eval_loss": 2.2309203147888184, + "eval_runtime": 18.8357, + "eval_samples_per_second": 106.181, + "eval_steps_per_second": 1.699, + "step": 108000 + }, + { + "epoch": 0.7, + "learning_rate": 1.5115981361052795e-05, + "loss": 2.0253, + "step": 109000 + }, + { + "epoch": 0.7, + "eval_loss": 2.2213120460510254, + "eval_runtime": 19.2801, + "eval_samples_per_second": 103.734, + "eval_steps_per_second": 1.66, + "step": 109000 + }, + { + "epoch": 0.7, + "learning_rate": 1.4795944492805572e-05, + "loss": 2.0199, + "step": 110000 + }, + { + "epoch": 0.7, + "eval_loss": 2.2416763305664062, + "eval_runtime": 18.8526, + "eval_samples_per_second": 106.086, + "eval_steps_per_second": 1.697, + "step": 110000 + }, + { + "epoch": 0.71, + "learning_rate": 1.4475907624558348e-05, + "loss": 2.0216, + "step": 111000 + }, + { + "epoch": 0.71, + "eval_loss": 2.24078369140625, + "eval_runtime": 18.5093, + "eval_samples_per_second": 108.054, + "eval_steps_per_second": 1.729, + "step": 111000 + }, + { + "epoch": 0.72, + "learning_rate": 1.4155870756311127e-05, + "loss": 2.0236, + "step": 112000 + }, + { + "epoch": 0.72, + "eval_loss": 2.2598512172698975, + "eval_runtime": 19.0496, + "eval_samples_per_second": 104.989, + "eval_steps_per_second": 1.68, + "step": 112000 + }, + { + "epoch": 0.72, + "learning_rate": 1.3835833888063907e-05, + "loss": 2.0247, + "step": 113000 + }, + { + "epoch": 0.72, + "eval_loss": 2.2282919883728027, + "eval_runtime": 18.7751, + "eval_samples_per_second": 106.524, + "eval_steps_per_second": 1.704, + "step": 113000 + }, + { + "epoch": 0.73, + "learning_rate": 1.3515797019816683e-05, + "loss": 2.0263, + "step": 114000 + }, + { + "epoch": 0.73, + "eval_loss": 2.248234748840332, + "eval_runtime": 18.8567, + "eval_samples_per_second": 106.063, + "eval_steps_per_second": 1.697, + "step": 114000 + }, + { + "epoch": 0.74, + "learning_rate": 1.3195760151569462e-05, + "loss": 2.014, + "step": 115000 + }, + { + "epoch": 0.74, + "eval_loss": 2.194716691970825, + "eval_runtime": 18.7872, + "eval_samples_per_second": 106.455, + "eval_steps_per_second": 1.703, + "step": 115000 + }, + { + "epoch": 0.74, + "learning_rate": 1.2875723283322239e-05, + "loss": 2.0076, + "step": 116000 + }, + { + "epoch": 0.74, + "eval_loss": 2.233458995819092, + "eval_runtime": 18.8711, + "eval_samples_per_second": 105.982, + "eval_steps_per_second": 1.696, + "step": 116000 + }, + { + "epoch": 0.75, + "learning_rate": 1.2555686415075015e-05, + "loss": 2.011, + "step": 117000 + }, + { + "epoch": 0.75, + "eval_loss": 2.213284492492676, + "eval_runtime": 19.5167, + "eval_samples_per_second": 102.477, + "eval_steps_per_second": 1.64, + "step": 117000 + }, + { + "epoch": 0.76, + "learning_rate": 1.2235649546827795e-05, + "loss": 2.0216, + "step": 118000 + }, + { + "epoch": 0.76, + "eval_loss": 2.210317373275757, + "eval_runtime": 18.6333, + "eval_samples_per_second": 107.334, + "eval_steps_per_second": 1.717, + "step": 118000 + }, + { + "epoch": 0.76, + "learning_rate": 1.1915612678580574e-05, + "loss": 2.0097, + "step": 119000 + }, + { + "epoch": 0.76, + "eval_loss": 2.241175651550293, + "eval_runtime": 18.694, + "eval_samples_per_second": 106.986, + "eval_steps_per_second": 1.712, + "step": 119000 + }, + { + "epoch": 0.77, + "learning_rate": 1.159557581033335e-05, + "loss": 2.0076, + "step": 120000 + }, + { + "epoch": 0.77, + "eval_loss": 2.2543113231658936, + "eval_runtime": 18.8082, + "eval_samples_per_second": 106.336, + "eval_steps_per_second": 1.701, + "step": 120000 + }, + { + "epoch": 0.77, + "learning_rate": 1.1275538942086129e-05, + "loss": 2.01, + "step": 121000 + }, + { + "epoch": 0.77, + "eval_loss": 2.2642598152160645, + "eval_runtime": 19.4522, + "eval_samples_per_second": 102.816, + "eval_steps_per_second": 1.645, + "step": 121000 + }, + { + "epoch": 0.78, + "learning_rate": 1.0955502073838907e-05, + "loss": 2.0074, + "step": 122000 + }, + { + "epoch": 0.78, + "eval_loss": 2.2413113117218018, + "eval_runtime": 18.8078, + "eval_samples_per_second": 106.339, + "eval_steps_per_second": 1.701, + "step": 122000 + }, + { + "epoch": 0.79, + "learning_rate": 1.0635465205591686e-05, + "loss": 1.9898, + "step": 123000 + }, + { + "epoch": 0.79, + "eval_loss": 2.2442147731781006, + "eval_runtime": 18.8179, + "eval_samples_per_second": 106.282, + "eval_steps_per_second": 1.701, + "step": 123000 + }, + { + "epoch": 0.79, + "learning_rate": 1.0315428337344462e-05, + "loss": 2.0119, + "step": 124000 + }, + { + "epoch": 0.79, + "eval_loss": 2.227520704269409, + "eval_runtime": 18.9241, + "eval_samples_per_second": 105.686, + "eval_steps_per_second": 1.691, + "step": 124000 + }, + { + "epoch": 0.8, + "learning_rate": 9.99539146909724e-06, + "loss": 1.993, + "step": 125000 + }, + { + "epoch": 0.8, + "eval_loss": 2.2116286754608154, + "eval_runtime": 19.6066, + "eval_samples_per_second": 102.007, + "eval_steps_per_second": 1.632, + "step": 125000 + }, + { + "epoch": 0.81, + "learning_rate": 9.675354600850019e-06, + "loss": 2.0092, + "step": 126000 + }, + { + "epoch": 0.81, + "eval_loss": 2.2108232975006104, + "eval_runtime": 18.7069, + "eval_samples_per_second": 106.912, + "eval_steps_per_second": 1.711, + "step": 126000 + }, + { + "epoch": 0.81, + "learning_rate": 9.355317732602796e-06, + "loss": 2.0019, + "step": 127000 + }, + { + "epoch": 0.81, + "eval_loss": 2.2236363887786865, + "eval_runtime": 18.8801, + "eval_samples_per_second": 105.931, + "eval_steps_per_second": 1.695, + "step": 127000 + }, + { + "epoch": 0.82, + "learning_rate": 9.035280864355574e-06, + "loss": 1.9931, + "step": 128000 + }, + { + "epoch": 0.82, + "eval_loss": 2.2105228900909424, + "eval_runtime": 21.3819, + "eval_samples_per_second": 93.537, + "eval_steps_per_second": 1.497, + "step": 128000 + }, + { + "epoch": 0.83, + "learning_rate": 8.715243996108352e-06, + "loss": 1.9851, + "step": 129000 + }, + { + "epoch": 0.83, + "eval_loss": 2.2179064750671387, + "eval_runtime": 19.3741, + "eval_samples_per_second": 103.231, + "eval_steps_per_second": 1.652, + "step": 129000 + }, + { + "epoch": 0.83, + "learning_rate": 8.395207127861129e-06, + "loss": 1.9882, + "step": 130000 + }, + { + "epoch": 0.83, + "eval_loss": 2.2303926944732666, + "eval_runtime": 18.846, + "eval_samples_per_second": 106.123, + "eval_steps_per_second": 1.698, + "step": 130000 + }, + { + "epoch": 0.84, + "learning_rate": 8.075170259613907e-06, + "loss": 1.999, + "step": 131000 + }, + { + "epoch": 0.84, + "eval_loss": 2.202813148498535, + "eval_runtime": 19.3498, + "eval_samples_per_second": 103.36, + "eval_steps_per_second": 1.654, + "step": 131000 + }, + { + "epoch": 0.84, + "learning_rate": 7.755133391366686e-06, + "loss": 1.9848, + "step": 132000 + }, + { + "epoch": 0.84, + "eval_loss": 2.1549251079559326, + "eval_runtime": 20.1588, + "eval_samples_per_second": 99.212, + "eval_steps_per_second": 1.587, + "step": 132000 + }, + { + "epoch": 0.85, + "learning_rate": 7.435096523119464e-06, + "loss": 1.9962, + "step": 133000 + }, + { + "epoch": 0.85, + "eval_loss": 2.2457568645477295, + "eval_runtime": 19.1213, + "eval_samples_per_second": 104.595, + "eval_steps_per_second": 1.674, + "step": 133000 + }, + { + "epoch": 0.86, + "learning_rate": 7.115059654872242e-06, + "loss": 1.991, + "step": 134000 + }, + { + "epoch": 0.86, + "eval_loss": 2.1861023902893066, + "eval_runtime": 19.2023, + "eval_samples_per_second": 104.154, + "eval_steps_per_second": 1.666, + "step": 134000 + }, + { + "epoch": 0.86, + "learning_rate": 6.79502278662502e-06, + "loss": 1.9901, + "step": 135000 + }, + { + "epoch": 0.86, + "eval_loss": 2.2025179862976074, + "eval_runtime": 19.2167, + "eval_samples_per_second": 104.076, + "eval_steps_per_second": 1.665, + "step": 135000 + }, + { + "epoch": 0.87, + "learning_rate": 6.474985918377798e-06, + "loss": 1.9698, + "step": 136000 + }, + { + "epoch": 0.87, + "eval_loss": 2.2299790382385254, + "eval_runtime": 20.1153, + "eval_samples_per_second": 99.427, + "eval_steps_per_second": 1.591, + "step": 136000 + }, + { + "epoch": 0.88, + "learning_rate": 6.154949050130575e-06, + "loss": 1.9772, + "step": 137000 + }, + { + "epoch": 0.88, + "eval_loss": 2.1934893131256104, + "eval_runtime": 19.0706, + "eval_samples_per_second": 104.874, + "eval_steps_per_second": 1.678, + "step": 137000 + }, + { + "epoch": 0.88, + "learning_rate": 5.8349121818833536e-06, + "loss": 1.974, + "step": 138000 + }, + { + "epoch": 0.88, + "eval_loss": 2.201178789138794, + "eval_runtime": 18.9851, + "eval_samples_per_second": 105.346, + "eval_steps_per_second": 1.686, + "step": 138000 + }, + { + "epoch": 0.89, + "learning_rate": 5.514875313636131e-06, + "loss": 1.9906, + "step": 139000 + }, + { + "epoch": 0.89, + "eval_loss": 2.2042794227600098, + "eval_runtime": 19.1406, + "eval_samples_per_second": 104.49, + "eval_steps_per_second": 1.672, + "step": 139000 + }, + { + "epoch": 0.9, + "learning_rate": 5.194838445388909e-06, + "loss": 1.9899, + "step": 140000 + }, + { + "epoch": 0.9, + "eval_loss": 2.187676191329956, + "eval_runtime": 19.4746, + "eval_samples_per_second": 102.698, + "eval_steps_per_second": 1.643, + "step": 140000 + }, + { + "epoch": 0.9, + "learning_rate": 4.874801577141687e-06, + "loss": 1.9785, + "step": 141000 + }, + { + "epoch": 0.9, + "eval_loss": 2.2104039192199707, + "eval_runtime": 19.2016, + "eval_samples_per_second": 104.158, + "eval_steps_per_second": 1.667, + "step": 141000 + }, + { + "epoch": 0.91, + "learning_rate": 4.5547647088944646e-06, + "loss": 1.9682, + "step": 142000 + }, + { + "epoch": 0.91, + "eval_loss": 2.1898605823516846, + "eval_runtime": 19.2296, + "eval_samples_per_second": 104.006, + "eval_steps_per_second": 1.664, + "step": 142000 + }, + { + "epoch": 0.92, + "learning_rate": 4.234727840647243e-06, + "loss": 1.9785, + "step": 143000 + }, + { + "epoch": 0.92, + "eval_loss": 2.183152437210083, + "eval_runtime": 19.1118, + "eval_samples_per_second": 104.647, + "eval_steps_per_second": 1.674, + "step": 143000 + }, + { + "epoch": 0.92, + "learning_rate": 3.914690972400021e-06, + "loss": 1.9795, + "step": 144000 + }, + { + "epoch": 0.92, + "eval_loss": 2.199709415435791, + "eval_runtime": 19.352, + "eval_samples_per_second": 103.348, + "eval_steps_per_second": 1.654, + "step": 144000 + }, + { + "epoch": 0.93, + "learning_rate": 3.5946541041527984e-06, + "loss": 1.9656, + "step": 145000 + }, + { + "epoch": 0.93, + "eval_loss": 2.200268507003784, + "eval_runtime": 19.3103, + "eval_samples_per_second": 103.572, + "eval_steps_per_second": 1.657, + "step": 145000 + }, + { + "epoch": 0.93, + "learning_rate": 3.2746172359055764e-06, + "loss": 1.9813, + "step": 146000 + }, + { + "epoch": 0.93, + "eval_loss": 2.1825687885284424, + "eval_runtime": 19.0952, + "eval_samples_per_second": 104.739, + "eval_steps_per_second": 1.676, + "step": 146000 + }, + { + "epoch": 0.94, + "learning_rate": 2.9545803676583543e-06, + "loss": 1.9719, + "step": 147000 + }, + { + "epoch": 0.94, + "eval_loss": 2.1915125846862793, + "eval_runtime": 19.3108, + "eval_samples_per_second": 103.569, + "eval_steps_per_second": 1.657, + "step": 147000 + }, + { + "epoch": 0.95, + "learning_rate": 2.6345434994111323e-06, + "loss": 1.962, + "step": 148000 + }, + { + "epoch": 0.95, + "eval_loss": 2.196523904800415, + "eval_runtime": 19.1234, + "eval_samples_per_second": 104.584, + "eval_steps_per_second": 1.673, + "step": 148000 + }, + { + "epoch": 0.95, + "learning_rate": 2.3145066311639102e-06, + "loss": 1.9657, + "step": 149000 + }, + { + "epoch": 0.95, + "eval_loss": 2.1772007942199707, + "eval_runtime": 19.0921, + "eval_samples_per_second": 104.756, + "eval_steps_per_second": 1.676, + "step": 149000 + }, + { + "epoch": 0.96, + "learning_rate": 1.994469762916688e-06, + "loss": 1.9662, + "step": 150000 + }, + { + "epoch": 0.96, + "eval_loss": 2.151597261428833, + "eval_runtime": 19.1055, + "eval_samples_per_second": 104.682, + "eval_steps_per_second": 1.675, + "step": 150000 + }, + { + "epoch": 0.97, + "learning_rate": 1.674432894669466e-06, + "loss": 1.9631, + "step": 151000 + }, + { + "epoch": 0.97, + "eval_loss": 2.1692702770233154, + "eval_runtime": 19.4031, + "eval_samples_per_second": 103.077, + "eval_steps_per_second": 1.649, + "step": 151000 + }, + { + "epoch": 0.97, + "learning_rate": 1.354396026422244e-06, + "loss": 1.9651, + "step": 152000 + }, + { + "epoch": 0.97, + "eval_loss": 2.174436330795288, + "eval_runtime": 19.3133, + "eval_samples_per_second": 103.555, + "eval_steps_per_second": 1.657, + "step": 152000 + }, + { + "epoch": 0.98, + "learning_rate": 1.0343591581750219e-06, + "loss": 1.9761, + "step": 153000 + }, + { + "epoch": 0.98, + "eval_loss": 2.1922249794006348, + "eval_runtime": 19.2415, + "eval_samples_per_second": 103.942, + "eval_steps_per_second": 1.663, + "step": 153000 + }, + { + "epoch": 0.99, + "learning_rate": 7.143222899277997e-07, + "loss": 1.9602, + "step": 154000 + }, + { + "epoch": 0.99, + "eval_loss": 2.177457571029663, + "eval_runtime": 19.1279, + "eval_samples_per_second": 104.559, + "eval_steps_per_second": 1.673, + "step": 154000 + }, + { + "epoch": 0.99, + "learning_rate": 3.9428542168057766e-07, + "loss": 1.9429, + "step": 155000 + }, + { + "epoch": 0.99, + "eval_loss": 2.167567491531372, + "eval_runtime": 19.5087, + "eval_samples_per_second": 102.518, + "eval_steps_per_second": 1.64, + "step": 155000 + }, + { + "epoch": 1.0, + "learning_rate": 7.424855343335553e-08, + "loss": 1.9662, + "step": 156000 + }, + { + "epoch": 1.0, + "eval_loss": 2.179702043533325, + "eval_runtime": 19.3655, + "eval_samples_per_second": 103.276, + "eval_steps_per_second": 1.652, + "step": 156000 + }, + { + "epoch": 1.0, + "learning_rate": 2.4877105842593068e-05, + "loss": 1.9883, + "step": 157000 + }, + { + "epoch": 1.0, + "eval_loss": 2.1984949111938477, + "eval_runtime": 19.5918, + "eval_samples_per_second": 102.084, + "eval_steps_per_second": 1.633, + "step": 157000 + }, + { + "epoch": 1.01, + "learning_rate": 2.4717087408469457e-05, + "loss": 2.0127, + "step": 158000 + }, + { + "epoch": 1.01, + "eval_loss": 2.264371633529663, + "eval_runtime": 19.1742, + "eval_samples_per_second": 104.307, + "eval_steps_per_second": 1.669, + "step": 158000 + }, + { + "epoch": 1.02, + "learning_rate": 2.4557068974345846e-05, + "loss": 2.013, + "step": 159000 + }, + { + "epoch": 1.02, + "eval_loss": 2.263242721557617, + "eval_runtime": 19.0844, + "eval_samples_per_second": 104.798, + "eval_steps_per_second": 1.677, + "step": 159000 + }, + { + "epoch": 1.02, + "learning_rate": 2.4397050540222236e-05, + "loss": 2.0243, + "step": 160000 + }, + { + "epoch": 1.02, + "eval_loss": 2.267091751098633, + "eval_runtime": 19.3957, + "eval_samples_per_second": 103.116, + "eval_steps_per_second": 1.65, + "step": 160000 + }, + { + "epoch": 1.03, + "learning_rate": 2.423703210609862e-05, + "loss": 2.021, + "step": 161000 + }, + { + "epoch": 1.03, + "eval_loss": 2.2471094131469727, + "eval_runtime": 19.2438, + "eval_samples_per_second": 103.929, + "eval_steps_per_second": 1.663, + "step": 161000 + }, + { + "epoch": 1.04, + "learning_rate": 2.4077013671975014e-05, + "loss": 2.0278, + "step": 162000 + }, + { + "epoch": 1.04, + "eval_loss": 2.2140402793884277, + "eval_runtime": 19.0312, + "eval_samples_per_second": 105.091, + "eval_steps_per_second": 1.681, + "step": 162000 + }, + { + "epoch": 1.04, + "learning_rate": 2.3916995237851403e-05, + "loss": 2.0109, + "step": 163000 + }, + { + "epoch": 1.04, + "eval_loss": 2.2622554302215576, + "eval_runtime": 19.0334, + "eval_samples_per_second": 105.078, + "eval_steps_per_second": 1.681, + "step": 163000 + }, + { + "epoch": 1.05, + "learning_rate": 2.375697680372779e-05, + "loss": 2.023, + "step": 164000 + }, + { + "epoch": 1.05, + "eval_loss": 2.245877981185913, + "eval_runtime": 19.4264, + "eval_samples_per_second": 102.953, + "eval_steps_per_second": 1.647, + "step": 164000 + }, + { + "epoch": 1.06, + "learning_rate": 2.359695836960418e-05, + "loss": 2.0187, + "step": 165000 + }, + { + "epoch": 1.06, + "eval_loss": 2.25624942779541, + "eval_runtime": 19.303, + "eval_samples_per_second": 103.611, + "eval_steps_per_second": 1.658, + "step": 165000 + }, + { + "epoch": 1.06, + "learning_rate": 2.3436939935480567e-05, + "loss": 2.019, + "step": 166000 + }, + { + "epoch": 1.06, + "eval_loss": 2.2587056159973145, + "eval_runtime": 18.8102, + "eval_samples_per_second": 106.325, + "eval_steps_per_second": 1.701, + "step": 166000 + }, + { + "epoch": 1.07, + "learning_rate": 2.3276921501356956e-05, + "loss": 2.0208, + "step": 167000 + }, + { + "epoch": 1.07, + "eval_loss": 2.2842631340026855, + "eval_runtime": 19.22, + "eval_samples_per_second": 104.058, + "eval_steps_per_second": 1.665, + "step": 167000 + }, + { + "epoch": 1.08, + "learning_rate": 2.311690306723335e-05, + "loss": 2.0043, + "step": 168000 + }, + { + "epoch": 1.08, + "eval_loss": 2.2638208866119385, + "eval_runtime": 19.6646, + "eval_samples_per_second": 101.706, + "eval_steps_per_second": 1.627, + "step": 168000 + }, + { + "epoch": 1.08, + "learning_rate": 2.2956884633109735e-05, + "loss": 2.0171, + "step": 169000 + }, + { + "epoch": 1.08, + "eval_loss": 2.2604892253875732, + "eval_runtime": 19.2438, + "eval_samples_per_second": 103.93, + "eval_steps_per_second": 1.663, + "step": 169000 + }, + { + "epoch": 1.09, + "learning_rate": 2.2796866198986124e-05, + "loss": 2.0351, + "step": 170000 + }, + { + "epoch": 1.09, + "eval_loss": 2.2608911991119385, + "eval_runtime": 19.3036, + "eval_samples_per_second": 103.607, + "eval_steps_per_second": 1.658, + "step": 170000 + }, + { + "epoch": 1.09, + "learning_rate": 2.2636847764862513e-05, + "loss": 2.0166, + "step": 171000 + }, + { + "epoch": 1.09, + "eval_loss": 2.2317748069763184, + "eval_runtime": 19.0555, + "eval_samples_per_second": 104.957, + "eval_steps_per_second": 1.679, + "step": 171000 + }, + { + "epoch": 1.1, + "learning_rate": 2.2476829330738902e-05, + "loss": 2.0102, + "step": 172000 + }, + { + "epoch": 1.1, + "eval_loss": 2.2210681438446045, + "eval_runtime": 19.4253, + "eval_samples_per_second": 102.958, + "eval_steps_per_second": 1.647, + "step": 172000 + }, + { + "epoch": 1.11, + "learning_rate": 2.231681089661529e-05, + "loss": 2.0226, + "step": 173000 + }, + { + "epoch": 1.11, + "eval_loss": 2.2446329593658447, + "eval_runtime": 19.1758, + "eval_samples_per_second": 104.298, + "eval_steps_per_second": 1.669, + "step": 173000 + }, + { + "epoch": 1.11, + "learning_rate": 2.215679246249168e-05, + "loss": 2.0293, + "step": 174000 + }, + { + "epoch": 1.11, + "eval_loss": 2.2327494621276855, + "eval_runtime": 19.0577, + "eval_samples_per_second": 104.945, + "eval_steps_per_second": 1.679, + "step": 174000 + }, + { + "epoch": 1.12, + "learning_rate": 2.199677402836807e-05, + "loss": 2.0269, + "step": 175000 + }, + { + "epoch": 1.12, + "eval_loss": 2.223355293273926, + "eval_runtime": 19.372, + "eval_samples_per_second": 103.242, + "eval_steps_per_second": 1.652, + "step": 175000 + }, + { + "epoch": 1.13, + "learning_rate": 2.1836755594244456e-05, + "loss": 2.0232, + "step": 176000 + }, + { + "epoch": 1.13, + "eval_loss": 2.2283060550689697, + "eval_runtime": 19.4986, + "eval_samples_per_second": 102.572, + "eval_steps_per_second": 1.641, + "step": 176000 + }, + { + "epoch": 1.13, + "learning_rate": 2.167673716012085e-05, + "loss": 2.0155, + "step": 177000 + }, + { + "epoch": 1.13, + "eval_loss": 2.241269588470459, + "eval_runtime": 19.5594, + "eval_samples_per_second": 102.253, + "eval_steps_per_second": 1.636, + "step": 177000 + }, + { + "epoch": 1.14, + "learning_rate": 2.1516718725997238e-05, + "loss": 2.0148, + "step": 178000 + }, + { + "epoch": 1.14, + "eval_loss": 2.2584030628204346, + "eval_runtime": 18.9767, + "eval_samples_per_second": 105.392, + "eval_steps_per_second": 1.686, + "step": 178000 + }, + { + "epoch": 1.15, + "learning_rate": 2.1356700291873623e-05, + "loss": 2.0167, + "step": 179000 + }, + { + "epoch": 1.15, + "eval_loss": 2.2308297157287598, + "eval_runtime": 19.318, + "eval_samples_per_second": 103.531, + "eval_steps_per_second": 1.656, + "step": 179000 + }, + { + "epoch": 1.15, + "learning_rate": 2.1196681857750016e-05, + "loss": 2.0204, + "step": 180000 + }, + { + "epoch": 1.15, + "eval_loss": 2.2320470809936523, + "eval_runtime": 19.5088, + "eval_samples_per_second": 102.518, + "eval_steps_per_second": 1.64, + "step": 180000 + }, + { + "epoch": 1.16, + "learning_rate": 2.1036663423626402e-05, + "loss": 2.014, + "step": 181000 + }, + { + "epoch": 1.16, + "eval_loss": 2.25752854347229, + "eval_runtime": 19.1454, + "eval_samples_per_second": 104.464, + "eval_steps_per_second": 1.671, + "step": 181000 + }, + { + "epoch": 1.16, + "learning_rate": 2.087664498950279e-05, + "loss": 2.0149, + "step": 182000 + }, + { + "epoch": 1.16, + "eval_loss": 2.2161190509796143, + "eval_runtime": 19.0879, + "eval_samples_per_second": 104.779, + "eval_steps_per_second": 1.676, + "step": 182000 + }, + { + "epoch": 1.17, + "learning_rate": 2.071662655537918e-05, + "loss": 2.0082, + "step": 183000 + }, + { + "epoch": 1.17, + "eval_loss": 2.2062742710113525, + "eval_runtime": 19.2713, + "eval_samples_per_second": 103.781, + "eval_steps_per_second": 1.66, + "step": 183000 + }, + { + "epoch": 1.18, + "learning_rate": 2.055660812125557e-05, + "loss": 2.0017, + "step": 184000 + }, + { + "epoch": 1.18, + "eval_loss": 2.2289586067199707, + "eval_runtime": 19.4499, + "eval_samples_per_second": 102.828, + "eval_steps_per_second": 1.645, + "step": 184000 + }, + { + "epoch": 1.18, + "learning_rate": 2.039658968713196e-05, + "loss": 2.0146, + "step": 185000 + }, + { + "epoch": 1.18, + "eval_loss": 2.2288384437561035, + "eval_runtime": 19.335, + "eval_samples_per_second": 103.439, + "eval_steps_per_second": 1.655, + "step": 185000 + }, + { + "epoch": 1.19, + "learning_rate": 2.0236571253008348e-05, + "loss": 2.024, + "step": 186000 + }, + { + "epoch": 1.19, + "eval_loss": 2.194934606552124, + "eval_runtime": 19.5009, + "eval_samples_per_second": 102.559, + "eval_steps_per_second": 1.641, + "step": 186000 + }, + { + "epoch": 1.2, + "learning_rate": 2.0076552818884737e-05, + "loss": 2.0016, + "step": 187000 + }, + { + "epoch": 1.2, + "eval_loss": 2.197631597518921, + "eval_runtime": 19.2128, + "eval_samples_per_second": 104.097, + "eval_steps_per_second": 1.666, + "step": 187000 + }, + { + "epoch": 1.2, + "learning_rate": 1.9916534384761126e-05, + "loss": 2.0066, + "step": 188000 + }, + { + "epoch": 1.2, + "eval_loss": 2.238746166229248, + "eval_runtime": 19.4524, + "eval_samples_per_second": 102.815, + "eval_steps_per_second": 1.645, + "step": 188000 + }, + { + "epoch": 1.21, + "learning_rate": 1.9756515950637515e-05, + "loss": 2.0168, + "step": 189000 + }, + { + "epoch": 1.21, + "eval_loss": 2.2261757850646973, + "eval_runtime": 19.645, + "eval_samples_per_second": 101.807, + "eval_steps_per_second": 1.629, + "step": 189000 + }, + { + "epoch": 1.22, + "learning_rate": 1.9596497516513904e-05, + "loss": 2.0023, + "step": 190000 + }, + { + "epoch": 1.22, + "eval_loss": 2.2070722579956055, + "eval_runtime": 19.0874, + "eval_samples_per_second": 104.781, + "eval_steps_per_second": 1.676, + "step": 190000 + }, + { + "epoch": 1.22, + "learning_rate": 1.943647908239029e-05, + "loss": 1.9917, + "step": 191000 + }, + { + "epoch": 1.22, + "eval_loss": 2.2613461017608643, + "eval_runtime": 19.1099, + "eval_samples_per_second": 104.658, + "eval_steps_per_second": 1.675, + "step": 191000 + }, + { + "epoch": 1.23, + "learning_rate": 1.927646064826668e-05, + "loss": 2.01, + "step": 192000 + }, + { + "epoch": 1.23, + "eval_loss": 2.2324349880218506, + "eval_runtime": 20.8611, + "eval_samples_per_second": 95.872, + "eval_steps_per_second": 1.534, + "step": 192000 + }, + { + "epoch": 1.24, + "learning_rate": 1.9116442214143072e-05, + "loss": 2.0023, + "step": 193000 + }, + { + "epoch": 1.24, + "eval_loss": 2.2707834243774414, + "eval_runtime": 19.7356, + "eval_samples_per_second": 101.34, + "eval_steps_per_second": 1.621, + "step": 193000 + }, + { + "epoch": 1.24, + "learning_rate": 1.8956423780019458e-05, + "loss": 2.0037, + "step": 194000 + }, + { + "epoch": 1.24, + "eval_loss": 2.2384769916534424, + "eval_runtime": 19.0414, + "eval_samples_per_second": 105.034, + "eval_steps_per_second": 1.681, + "step": 194000 + }, + { + "epoch": 1.25, + "learning_rate": 1.8796405345895847e-05, + "loss": 1.9994, + "step": 195000 + }, + { + "epoch": 1.25, + "eval_loss": 2.192796230316162, + "eval_runtime": 19.0496, + "eval_samples_per_second": 104.989, + "eval_steps_per_second": 1.68, + "step": 195000 + }, + { + "epoch": 1.25, + "learning_rate": 1.863638691177224e-05, + "loss": 1.994, + "step": 196000 + }, + { + "epoch": 1.25, + "eval_loss": 2.170961618423462, + "eval_runtime": 19.6903, + "eval_samples_per_second": 101.573, + "eval_steps_per_second": 1.625, + "step": 196000 + }, + { + "epoch": 1.26, + "learning_rate": 1.8476368477648625e-05, + "loss": 2.0016, + "step": 197000 + }, + { + "epoch": 1.26, + "eval_loss": 2.2660317420959473, + "eval_runtime": 19.6654, + "eval_samples_per_second": 101.702, + "eval_steps_per_second": 1.627, + "step": 197000 + }, + { + "epoch": 1.27, + "learning_rate": 1.8316350043525015e-05, + "loss": 2.0044, + "step": 198000 + }, + { + "epoch": 1.27, + "eval_loss": 2.204163074493408, + "eval_runtime": 18.9759, + "eval_samples_per_second": 105.397, + "eval_steps_per_second": 1.686, + "step": 198000 + }, + { + "epoch": 1.27, + "learning_rate": 1.8156331609401404e-05, + "loss": 1.9962, + "step": 199000 + }, + { + "epoch": 1.27, + "eval_loss": 2.214494228363037, + "eval_runtime": 19.1044, + "eval_samples_per_second": 104.688, + "eval_steps_per_second": 1.675, + "step": 199000 + }, + { + "epoch": 1.28, + "learning_rate": 1.7996313175277793e-05, + "loss": 2.002, + "step": 200000 + }, + { + "epoch": 1.28, + "eval_loss": 2.231771230697632, + "eval_runtime": 19.3683, + "eval_samples_per_second": 103.262, + "eval_steps_per_second": 1.652, + "step": 200000 + }, + { + "epoch": 1.29, + "learning_rate": 1.7836294741154182e-05, + "loss": 1.9933, + "step": 201000 + }, + { + "epoch": 1.29, + "eval_loss": 2.2037816047668457, + "eval_runtime": 19.3894, + "eval_samples_per_second": 103.149, + "eval_steps_per_second": 1.65, + "step": 201000 + }, + { + "epoch": 1.29, + "learning_rate": 1.767627630703057e-05, + "loss": 2.01, + "step": 202000 + }, + { + "epoch": 1.29, + "eval_loss": 2.1932146549224854, + "eval_runtime": 19.0804, + "eval_samples_per_second": 104.819, + "eval_steps_per_second": 1.677, + "step": 202000 + }, + { + "epoch": 1.3, + "learning_rate": 1.751625787290696e-05, + "loss": 1.9876, + "step": 203000 + }, + { + "epoch": 1.3, + "eval_loss": 2.1909868717193604, + "eval_runtime": 19.2334, + "eval_samples_per_second": 103.986, + "eval_steps_per_second": 1.664, + "step": 203000 + }, + { + "epoch": 1.31, + "learning_rate": 1.7356239438783346e-05, + "loss": 1.9959, + "step": 204000 + }, + { + "epoch": 1.31, + "eval_loss": 2.226149559020996, + "eval_runtime": 19.403, + "eval_samples_per_second": 103.077, + "eval_steps_per_second": 1.649, + "step": 204000 + }, + { + "epoch": 1.31, + "learning_rate": 1.719622100465974e-05, + "loss": 1.9966, + "step": 205000 + }, + { + "epoch": 1.31, + "eval_loss": 2.250934600830078, + "eval_runtime": 19.4964, + "eval_samples_per_second": 102.583, + "eval_steps_per_second": 1.641, + "step": 205000 + }, + { + "epoch": 1.32, + "learning_rate": 1.7036202570536128e-05, + "loss": 2.001, + "step": 206000 + }, + { + "epoch": 1.32, + "eval_loss": 2.1994211673736572, + "eval_runtime": 19.1839, + "eval_samples_per_second": 104.254, + "eval_steps_per_second": 1.668, + "step": 206000 + }, + { + "epoch": 1.32, + "learning_rate": 1.6876184136412514e-05, + "loss": 1.9883, + "step": 207000 + }, + { + "epoch": 1.32, + "eval_loss": 2.196751356124878, + "eval_runtime": 19.6979, + "eval_samples_per_second": 101.534, + "eval_steps_per_second": 1.625, + "step": 207000 + }, + { + "epoch": 1.33, + "learning_rate": 1.6716165702288906e-05, + "loss": 1.9968, + "step": 208000 + }, + { + "epoch": 1.33, + "eval_loss": 2.248135805130005, + "eval_runtime": 19.2411, + "eval_samples_per_second": 103.944, + "eval_steps_per_second": 1.663, + "step": 208000 + }, + { + "epoch": 1.34, + "learning_rate": 1.6556147268165292e-05, + "loss": 1.9951, + "step": 209000 + }, + { + "epoch": 1.34, + "eval_loss": 2.213362216949463, + "eval_runtime": 19.146, + "eval_samples_per_second": 104.46, + "eval_steps_per_second": 1.671, + "step": 209000 + }, + { + "epoch": 1.34, + "learning_rate": 1.639612883404168e-05, + "loss": 1.9941, + "step": 210000 + }, + { + "epoch": 1.34, + "eval_loss": 2.219302177429199, + "eval_runtime": 19.0054, + "eval_samples_per_second": 105.233, + "eval_steps_per_second": 1.684, + "step": 210000 + }, + { + "epoch": 1.35, + "learning_rate": 1.6236110399918074e-05, + "loss": 1.9875, + "step": 211000 + }, + { + "epoch": 1.35, + "eval_loss": 2.2148916721343994, + "eval_runtime": 19.4732, + "eval_samples_per_second": 102.705, + "eval_steps_per_second": 1.643, + "step": 211000 + }, + { + "epoch": 1.36, + "learning_rate": 1.607609196579446e-05, + "loss": 2.0026, + "step": 212000 + }, + { + "epoch": 1.36, + "eval_loss": 2.197999954223633, + "eval_runtime": 19.3649, + "eval_samples_per_second": 103.28, + "eval_steps_per_second": 1.652, + "step": 212000 + }, + { + "epoch": 1.36, + "learning_rate": 1.591607353167085e-05, + "loss": 1.9908, + "step": 213000 + }, + { + "epoch": 1.36, + "eval_loss": 2.2245354652404785, + "eval_runtime": 19.4688, + "eval_samples_per_second": 102.728, + "eval_steps_per_second": 1.644, + "step": 213000 + }, + { + "epoch": 1.37, + "learning_rate": 1.5756055097547238e-05, + "loss": 1.979, + "step": 214000 + }, + { + "epoch": 1.37, + "eval_loss": 2.186586856842041, + "eval_runtime": 19.6234, + "eval_samples_per_second": 101.919, + "eval_steps_per_second": 1.631, + "step": 214000 + }, + { + "epoch": 1.38, + "learning_rate": 1.5596036663423627e-05, + "loss": 1.99, + "step": 215000 + }, + { + "epoch": 1.38, + "eval_loss": 2.182631015777588, + "eval_runtime": 19.4018, + "eval_samples_per_second": 103.083, + "eval_steps_per_second": 1.649, + "step": 215000 + }, + { + "epoch": 1.38, + "learning_rate": 1.5436018229300017e-05, + "loss": 1.9816, + "step": 216000 + }, + { + "epoch": 1.38, + "eval_loss": 2.187858819961548, + "eval_runtime": 19.4098, + "eval_samples_per_second": 103.041, + "eval_steps_per_second": 1.649, + "step": 216000 + }, + { + "epoch": 1.39, + "learning_rate": 1.5275999795176406e-05, + "loss": 1.989, + "step": 217000 + }, + { + "epoch": 1.39, + "eval_loss": 2.232002019882202, + "eval_runtime": 19.4529, + "eval_samples_per_second": 102.813, + "eval_steps_per_second": 1.645, + "step": 217000 + }, + { + "epoch": 1.4, + "learning_rate": 1.5115981361052795e-05, + "loss": 1.9931, + "step": 218000 + }, + { + "epoch": 1.4, + "eval_loss": 2.1929688453674316, + "eval_runtime": 19.3402, + "eval_samples_per_second": 103.411, + "eval_steps_per_second": 1.655, + "step": 218000 + }, + { + "epoch": 1.4, + "learning_rate": 1.4955962926929182e-05, + "loss": 1.9804, + "step": 219000 + }, + { + "epoch": 1.4, + "eval_loss": 2.2313404083251953, + "eval_runtime": 19.6691, + "eval_samples_per_second": 101.682, + "eval_steps_per_second": 1.627, + "step": 219000 + }, + { + "epoch": 1.41, + "learning_rate": 1.4795944492805572e-05, + "loss": 1.9902, + "step": 220000 + }, + { + "epoch": 1.41, + "eval_loss": 2.1808815002441406, + "eval_runtime": 19.8875, + "eval_samples_per_second": 100.566, + "eval_steps_per_second": 1.609, + "step": 220000 + }, + { + "epoch": 1.41, + "learning_rate": 1.4635926058681963e-05, + "loss": 1.9791, + "step": 221000 + }, + { + "epoch": 1.41, + "eval_loss": 2.1454262733459473, + "eval_runtime": 19.9595, + "eval_samples_per_second": 100.203, + "eval_steps_per_second": 1.603, + "step": 221000 + }, + { + "epoch": 1.42, + "learning_rate": 1.4475907624558348e-05, + "loss": 1.9702, + "step": 222000 + }, + { + "epoch": 1.42, + "eval_loss": 2.220078468322754, + "eval_runtime": 19.5477, + "eval_samples_per_second": 102.314, + "eval_steps_per_second": 1.637, + "step": 222000 + }, + { + "epoch": 1.43, + "learning_rate": 1.431588919043474e-05, + "loss": 1.9848, + "step": 223000 + }, + { + "epoch": 1.43, + "eval_loss": 2.198873281478882, + "eval_runtime": 19.8165, + "eval_samples_per_second": 100.926, + "eval_steps_per_second": 1.615, + "step": 223000 + }, + { + "epoch": 1.43, + "learning_rate": 1.4155870756311127e-05, + "loss": 1.9813, + "step": 224000 + }, + { + "epoch": 1.43, + "eval_loss": 2.197327136993408, + "eval_runtime": 21.9598, + "eval_samples_per_second": 91.076, + "eval_steps_per_second": 1.457, + "step": 224000 + }, + { + "epoch": 1.44, + "learning_rate": 1.3995852322187516e-05, + "loss": 1.9784, + "step": 225000 + }, + { + "epoch": 1.44, + "eval_loss": 2.189138889312744, + "eval_runtime": 19.3319, + "eval_samples_per_second": 103.456, + "eval_steps_per_second": 1.655, + "step": 225000 + }, + { + "epoch": 1.45, + "learning_rate": 1.3835833888063907e-05, + "loss": 1.9766, + "step": 226000 + }, + { + "epoch": 1.45, + "eval_loss": 2.20912504196167, + "eval_runtime": 19.5253, + "eval_samples_per_second": 102.431, + "eval_steps_per_second": 1.639, + "step": 226000 + }, + { + "epoch": 1.45, + "learning_rate": 1.3675815453940294e-05, + "loss": 1.9732, + "step": 227000 + }, + { + "epoch": 1.45, + "eval_loss": 2.140838384628296, + "eval_runtime": 19.1497, + "eval_samples_per_second": 104.44, + "eval_steps_per_second": 1.671, + "step": 227000 + }, + { + "epoch": 1.46, + "learning_rate": 1.3515797019816683e-05, + "loss": 1.9621, + "step": 228000 + }, + { + "epoch": 1.46, + "eval_loss": 2.226170063018799, + "eval_runtime": 19.0166, + "eval_samples_per_second": 105.171, + "eval_steps_per_second": 1.683, + "step": 228000 + }, + { + "epoch": 1.47, + "learning_rate": 1.3355778585693071e-05, + "loss": 1.9739, + "step": 229000 + }, + { + "epoch": 1.47, + "eval_loss": 2.2281548976898193, + "eval_runtime": 19.3581, + "eval_samples_per_second": 103.316, + "eval_steps_per_second": 1.653, + "step": 229000 + }, + { + "epoch": 1.47, + "learning_rate": 1.3195760151569462e-05, + "loss": 1.968, + "step": 230000 + }, + { + "epoch": 1.47, + "eval_loss": 2.205911636352539, + "eval_runtime": 19.2592, + "eval_samples_per_second": 103.846, + "eval_steps_per_second": 1.662, + "step": 230000 + }, + { + "epoch": 1.48, + "learning_rate": 1.3035741717445851e-05, + "loss": 1.9656, + "step": 231000 + }, + { + "epoch": 1.48, + "eval_loss": 2.2183620929718018, + "eval_runtime": 19.2973, + "eval_samples_per_second": 103.641, + "eval_steps_per_second": 1.658, + "step": 231000 + }, + { + "epoch": 1.48, + "learning_rate": 1.2875723283322239e-05, + "loss": 1.9728, + "step": 232000 + }, + { + "epoch": 1.48, + "eval_loss": 2.1920948028564453, + "eval_runtime": 19.4211, + "eval_samples_per_second": 102.981, + "eval_steps_per_second": 1.648, + "step": 232000 + }, + { + "epoch": 1.49, + "learning_rate": 1.271570484919863e-05, + "loss": 1.9577, + "step": 233000 + }, + { + "epoch": 1.49, + "eval_loss": 2.191782236099243, + "eval_runtime": 19.3617, + "eval_samples_per_second": 103.296, + "eval_steps_per_second": 1.653, + "step": 233000 + }, + { + "epoch": 1.5, + "learning_rate": 1.2555686415075015e-05, + "loss": 1.9777, + "step": 234000 + }, + { + "epoch": 1.5, + "eval_loss": 2.209336042404175, + "eval_runtime": 19.3939, + "eval_samples_per_second": 103.125, + "eval_steps_per_second": 1.65, + "step": 234000 + }, + { + "epoch": 1.5, + "learning_rate": 1.2395667980951406e-05, + "loss": 1.9662, + "step": 235000 + }, + { + "epoch": 1.5, + "eval_loss": 2.152353048324585, + "eval_runtime": 19.7245, + "eval_samples_per_second": 101.397, + "eval_steps_per_second": 1.622, + "step": 235000 + }, + { + "epoch": 1.51, + "learning_rate": 1.2235649546827795e-05, + "loss": 1.9681, + "step": 236000 + }, + { + "epoch": 1.51, + "eval_loss": 2.1999175548553467, + "eval_runtime": 18.9532, + "eval_samples_per_second": 105.523, + "eval_steps_per_second": 1.688, + "step": 236000 + }, + { + "epoch": 1.52, + "learning_rate": 1.2075631112704184e-05, + "loss": 1.9543, + "step": 237000 + }, + { + "epoch": 1.52, + "eval_loss": 2.1981661319732666, + "eval_runtime": 19.2785, + "eval_samples_per_second": 103.742, + "eval_steps_per_second": 1.66, + "step": 237000 + }, + { + "epoch": 1.52, + "learning_rate": 1.1915612678580574e-05, + "loss": 1.9636, + "step": 238000 + }, + { + "epoch": 1.52, + "eval_loss": 2.197685956954956, + "eval_runtime": 19.3506, + "eval_samples_per_second": 103.356, + "eval_steps_per_second": 1.654, + "step": 238000 + }, + { + "epoch": 1.53, + "learning_rate": 1.1755594244456961e-05, + "loss": 1.9623, + "step": 239000 + }, + { + "epoch": 1.53, + "eval_loss": 2.207620620727539, + "eval_runtime": 19.1912, + "eval_samples_per_second": 104.214, + "eval_steps_per_second": 1.667, + "step": 239000 + }, + { + "epoch": 1.54, + "learning_rate": 1.159557581033335e-05, + "loss": 1.9645, + "step": 240000 + }, + { + "epoch": 1.54, + "eval_loss": 2.1756386756896973, + "eval_runtime": 19.1978, + "eval_samples_per_second": 104.178, + "eval_steps_per_second": 1.667, + "step": 240000 + }, + { + "epoch": 1.54, + "learning_rate": 1.143555737620974e-05, + "loss": 1.9676, + "step": 241000 + }, + { + "epoch": 1.54, + "eval_loss": 2.1699678897857666, + "eval_runtime": 19.2027, + "eval_samples_per_second": 104.152, + "eval_steps_per_second": 1.666, + "step": 241000 + }, + { + "epoch": 1.55, + "learning_rate": 1.1275538942086129e-05, + "loss": 1.9552, + "step": 242000 + }, + { + "epoch": 1.55, + "eval_loss": 2.1813385486602783, + "eval_runtime": 19.1939, + "eval_samples_per_second": 104.2, + "eval_steps_per_second": 1.667, + "step": 242000 + }, + { + "epoch": 1.56, + "learning_rate": 1.1115520507962518e-05, + "loss": 1.9675, + "step": 243000 + }, + { + "epoch": 1.56, + "eval_loss": 2.1804428100585938, + "eval_runtime": 19.3246, + "eval_samples_per_second": 103.495, + "eval_steps_per_second": 1.656, + "step": 243000 + }, + { + "epoch": 1.56, + "learning_rate": 1.0955502073838907e-05, + "loss": 1.9707, + "step": 244000 + }, + { + "epoch": 1.56, + "eval_loss": 2.1776347160339355, + "eval_runtime": 19.4613, + "eval_samples_per_second": 102.768, + "eval_steps_per_second": 1.644, + "step": 244000 + }, + { + "epoch": 1.57, + "learning_rate": 1.0795483639715295e-05, + "loss": 1.9609, + "step": 245000 + }, + { + "epoch": 1.57, + "eval_loss": 2.2101809978485107, + "eval_runtime": 19.232, + "eval_samples_per_second": 103.993, + "eval_steps_per_second": 1.664, + "step": 245000 + }, + { + "epoch": 1.57, + "learning_rate": 1.0635465205591686e-05, + "loss": 1.9584, + "step": 246000 + }, + { + "epoch": 1.57, + "eval_loss": 2.18208384513855, + "eval_runtime": 19.1408, + "eval_samples_per_second": 104.489, + "eval_steps_per_second": 1.672, + "step": 246000 + }, + { + "epoch": 1.58, + "learning_rate": 1.0475446771468075e-05, + "loss": 1.9568, + "step": 247000 + }, + { + "epoch": 1.58, + "eval_loss": 2.164984941482544, + "eval_runtime": 19.2986, + "eval_samples_per_second": 103.634, + "eval_steps_per_second": 1.658, + "step": 247000 + }, + { + "epoch": 1.59, + "learning_rate": 1.0315428337344462e-05, + "loss": 1.9514, + "step": 248000 + }, + { + "epoch": 1.59, + "eval_loss": 2.218735456466675, + "eval_runtime": 19.5707, + "eval_samples_per_second": 102.193, + "eval_steps_per_second": 1.635, + "step": 248000 + }, + { + "epoch": 1.59, + "learning_rate": 1.0155409903220851e-05, + "loss": 1.9567, + "step": 249000 + }, + { + "epoch": 1.59, + "eval_loss": 2.1572988033294678, + "eval_runtime": 19.0634, + "eval_samples_per_second": 104.913, + "eval_steps_per_second": 1.679, + "step": 249000 + }, + { + "epoch": 1.6, + "learning_rate": 9.99539146909724e-06, + "loss": 1.9555, + "step": 250000 + }, + { + "epoch": 1.6, + "eval_loss": 2.1475002765655518, + "eval_runtime": 19.0267, + "eval_samples_per_second": 105.115, + "eval_steps_per_second": 1.682, + "step": 250000 + }, + { + "epoch": 1.61, + "learning_rate": 9.83537303497363e-06, + "loss": 1.965, + "step": 251000 + }, + { + "epoch": 1.61, + "eval_loss": 2.1785731315612793, + "eval_runtime": 19.7697, + "eval_samples_per_second": 101.165, + "eval_steps_per_second": 1.619, + "step": 251000 + }, + { + "epoch": 1.61, + "learning_rate": 9.675354600850019e-06, + "loss": 1.9508, + "step": 252000 + }, + { + "epoch": 1.61, + "eval_loss": 2.1723153591156006, + "eval_runtime": 19.1786, + "eval_samples_per_second": 104.283, + "eval_steps_per_second": 1.669, + "step": 252000 + }, + { + "epoch": 1.62, + "learning_rate": 9.515336166726408e-06, + "loss": 1.9522, + "step": 253000 + }, + { + "epoch": 1.62, + "eval_loss": 2.180307626724243, + "eval_runtime": 18.9009, + "eval_samples_per_second": 105.815, + "eval_steps_per_second": 1.693, + "step": 253000 + }, + { + "epoch": 1.63, + "learning_rate": 9.355317732602796e-06, + "loss": 1.9637, + "step": 254000 + }, + { + "epoch": 1.63, + "eval_loss": 2.179806709289551, + "eval_runtime": 19.3455, + "eval_samples_per_second": 103.383, + "eval_steps_per_second": 1.654, + "step": 254000 + }, + { + "epoch": 1.63, + "learning_rate": 9.195299298479185e-06, + "loss": 1.9588, + "step": 255000 + }, + { + "epoch": 1.63, + "eval_loss": 2.200853109359741, + "eval_runtime": 19.4782, + "eval_samples_per_second": 102.679, + "eval_steps_per_second": 1.643, + "step": 255000 + }, + { + "epoch": 1.64, + "learning_rate": 9.035280864355574e-06, + "loss": 1.9553, + "step": 256000 + }, + { + "epoch": 1.64, + "eval_loss": 2.1626343727111816, + "eval_runtime": 19.24, + "eval_samples_per_second": 103.95, + "eval_steps_per_second": 1.663, + "step": 256000 + }, + { + "epoch": 1.64, + "learning_rate": 8.875262430231963e-06, + "loss": 1.946, + "step": 257000 + }, + { + "epoch": 1.64, + "eval_loss": 2.1843950748443604, + "eval_runtime": 19.1181, + "eval_samples_per_second": 104.613, + "eval_steps_per_second": 1.674, + "step": 257000 + }, + { + "epoch": 1.65, + "learning_rate": 8.715243996108352e-06, + "loss": 1.9493, + "step": 258000 + }, + { + "epoch": 1.65, + "eval_loss": 2.150207757949829, + "eval_runtime": 19.2502, + "eval_samples_per_second": 103.895, + "eval_steps_per_second": 1.662, + "step": 258000 + }, + { + "epoch": 1.66, + "learning_rate": 8.55522556198474e-06, + "loss": 1.9442, + "step": 259000 + }, + { + "epoch": 1.66, + "eval_loss": 2.1614534854888916, + "eval_runtime": 19.2393, + "eval_samples_per_second": 103.954, + "eval_steps_per_second": 1.663, + "step": 259000 + }, + { + "epoch": 1.66, + "learning_rate": 8.395207127861129e-06, + "loss": 1.945, + "step": 260000 + }, + { + "epoch": 1.66, + "eval_loss": 2.178889751434326, + "eval_runtime": 19.4657, + "eval_samples_per_second": 102.745, + "eval_steps_per_second": 1.644, + "step": 260000 + }, + { + "epoch": 1.67, + "learning_rate": 8.23518869373752e-06, + "loss": 1.9368, + "step": 261000 + }, + { + "epoch": 1.67, + "eval_loss": 2.172461986541748, + "eval_runtime": 19.2788, + "eval_samples_per_second": 103.741, + "eval_steps_per_second": 1.66, + "step": 261000 + }, + { + "epoch": 1.68, + "learning_rate": 8.075170259613907e-06, + "loss": 1.9393, + "step": 262000 + }, + { + "epoch": 1.68, + "eval_loss": 2.169734001159668, + "eval_runtime": 19.3666, + "eval_samples_per_second": 103.27, + "eval_steps_per_second": 1.652, + "step": 262000 + }, + { + "epoch": 1.68, + "learning_rate": 7.915151825490297e-06, + "loss": 1.9525, + "step": 263000 + }, + { + "epoch": 1.68, + "eval_loss": 2.1597206592559814, + "eval_runtime": 19.3459, + "eval_samples_per_second": 103.381, + "eval_steps_per_second": 1.654, + "step": 263000 + }, + { + "epoch": 1.69, + "learning_rate": 7.755133391366686e-06, + "loss": 1.9444, + "step": 264000 + }, + { + "epoch": 1.69, + "eval_loss": 2.1798765659332275, + "eval_runtime": 19.0083, + "eval_samples_per_second": 105.217, + "eval_steps_per_second": 1.683, + "step": 264000 + }, + { + "epoch": 1.7, + "learning_rate": 7.595114957243074e-06, + "loss": 1.9352, + "step": 265000 + }, + { + "epoch": 1.7, + "eval_loss": 2.164872169494629, + "eval_runtime": 19.1384, + "eval_samples_per_second": 104.502, + "eval_steps_per_second": 1.672, + "step": 265000 + }, + { + "epoch": 1.7, + "learning_rate": 7.435096523119464e-06, + "loss": 1.9537, + "step": 266000 + }, + { + "epoch": 1.7, + "eval_loss": 2.1663596630096436, + "eval_runtime": 19.6791, + "eval_samples_per_second": 101.63, + "eval_steps_per_second": 1.626, + "step": 266000 + }, + { + "epoch": 1.71, + "learning_rate": 7.2750780889958526e-06, + "loss": 1.9399, + "step": 267000 + }, + { + "epoch": 1.71, + "eval_loss": 2.1855850219726562, + "eval_runtime": 19.3954, + "eval_samples_per_second": 103.117, + "eval_steps_per_second": 1.65, + "step": 267000 + }, + { + "epoch": 1.72, + "learning_rate": 7.115059654872242e-06, + "loss": 1.9325, + "step": 268000 + }, + { + "epoch": 1.72, + "eval_loss": 2.1838717460632324, + "eval_runtime": 19.1074, + "eval_samples_per_second": 104.671, + "eval_steps_per_second": 1.675, + "step": 268000 + }, + { + "epoch": 1.72, + "learning_rate": 6.95504122074863e-06, + "loss": 1.9466, + "step": 269000 + }, + { + "epoch": 1.72, + "eval_loss": 2.1524887084960938, + "eval_runtime": 19.375, + "eval_samples_per_second": 103.226, + "eval_steps_per_second": 1.652, + "step": 269000 + }, + { + "epoch": 1.73, + "learning_rate": 6.79502278662502e-06, + "loss": 1.9403, + "step": 270000 + }, + { + "epoch": 1.73, + "eval_loss": 2.1773369312286377, + "eval_runtime": 19.1103, + "eval_samples_per_second": 104.656, + "eval_steps_per_second": 1.674, + "step": 270000 + }, + { + "epoch": 1.73, + "learning_rate": 6.6350043525014085e-06, + "loss": 1.9391, + "step": 271000 + }, + { + "epoch": 1.73, + "eval_loss": 2.212693452835083, + "eval_runtime": 19.2143, + "eval_samples_per_second": 104.089, + "eval_steps_per_second": 1.665, + "step": 271000 + }, + { + "epoch": 1.74, + "learning_rate": 6.474985918377798e-06, + "loss": 1.9419, + "step": 272000 + }, + { + "epoch": 1.74, + "eval_loss": 2.1781909465789795, + "eval_runtime": 19.4708, + "eval_samples_per_second": 102.718, + "eval_steps_per_second": 1.643, + "step": 272000 + }, + { + "epoch": 1.75, + "learning_rate": 6.314967484254186e-06, + "loss": 1.9454, + "step": 273000 + }, + { + "epoch": 1.75, + "eval_loss": 2.1962130069732666, + "eval_runtime": 18.6565, + "eval_samples_per_second": 107.201, + "eval_steps_per_second": 1.715, + "step": 273000 + }, + { + "epoch": 1.75, + "learning_rate": 6.154949050130575e-06, + "loss": 1.946, + "step": 274000 + }, + { + "epoch": 1.75, + "eval_loss": 2.157792091369629, + "eval_runtime": 19.0429, + "eval_samples_per_second": 105.026, + "eval_steps_per_second": 1.68, + "step": 274000 + }, + { + "epoch": 1.76, + "learning_rate": 5.994930616006964e-06, + "loss": 1.9339, + "step": 275000 + }, + { + "epoch": 1.76, + "eval_loss": 2.190920829772949, + "eval_runtime": 18.7174, + "eval_samples_per_second": 106.853, + "eval_steps_per_second": 1.71, + "step": 275000 + }, + { + "epoch": 1.77, + "learning_rate": 5.8349121818833536e-06, + "loss": 1.9289, + "step": 276000 + }, + { + "epoch": 1.77, + "eval_loss": 2.169802665710449, + "eval_runtime": 19.7624, + "eval_samples_per_second": 101.202, + "eval_steps_per_second": 1.619, + "step": 276000 + }, + { + "epoch": 1.77, + "learning_rate": 5.674893747759742e-06, + "loss": 1.9284, + "step": 277000 + }, + { + "epoch": 1.77, + "eval_loss": 2.149372100830078, + "eval_runtime": 18.847, + "eval_samples_per_second": 106.118, + "eval_steps_per_second": 1.698, + "step": 277000 + }, + { + "epoch": 1.78, + "learning_rate": 5.514875313636131e-06, + "loss": 1.9423, + "step": 278000 + }, + { + "epoch": 1.78, + "eval_loss": 2.163377046585083, + "eval_runtime": 19.097, + "eval_samples_per_second": 104.728, + "eval_steps_per_second": 1.676, + "step": 278000 + }, + { + "epoch": 1.79, + "learning_rate": 5.35485687951252e-06, + "loss": 1.9317, + "step": 279000 + }, + { + "epoch": 1.79, + "eval_loss": 2.129027843475342, + "eval_runtime": 18.715, + "eval_samples_per_second": 106.866, + "eval_steps_per_second": 1.71, + "step": 279000 + }, + { + "epoch": 1.79, + "learning_rate": 5.194838445388909e-06, + "loss": 1.9216, + "step": 280000 + }, + { + "epoch": 1.79, + "eval_loss": 2.171983480453491, + "eval_runtime": 18.8986, + "eval_samples_per_second": 105.828, + "eval_steps_per_second": 1.693, + "step": 280000 + }, + { + "epoch": 1.8, + "learning_rate": 5.034820011265298e-06, + "loss": 1.9176, + "step": 281000 + }, + { + "epoch": 1.8, + "eval_loss": 2.1561877727508545, + "eval_runtime": 18.6229, + "eval_samples_per_second": 107.395, + "eval_steps_per_second": 1.718, + "step": 281000 + }, + { + "epoch": 1.81, + "learning_rate": 4.874801577141687e-06, + "loss": 1.9345, + "step": 282000 + }, + { + "epoch": 1.81, + "eval_loss": 2.1655592918395996, + "eval_runtime": 18.6917, + "eval_samples_per_second": 106.999, + "eval_steps_per_second": 1.712, + "step": 282000 + }, + { + "epoch": 1.81, + "learning_rate": 4.714783143018076e-06, + "loss": 1.9431, + "step": 283000 + }, + { + "epoch": 1.81, + "eval_loss": 2.1130497455596924, + "eval_runtime": 18.7533, + "eval_samples_per_second": 106.648, + "eval_steps_per_second": 1.706, + "step": 283000 + }, + { + "epoch": 1.82, + "learning_rate": 4.5547647088944646e-06, + "loss": 1.936, + "step": 284000 + }, + { + "epoch": 1.82, + "eval_loss": 2.1281943321228027, + "eval_runtime": 18.4643, + "eval_samples_per_second": 108.317, + "eval_steps_per_second": 1.733, + "step": 284000 + }, + { + "epoch": 1.82, + "learning_rate": 4.394746274770854e-06, + "loss": 1.9344, + "step": 285000 + }, + { + "epoch": 1.82, + "eval_loss": 2.142157554626465, + "eval_runtime": 18.6731, + "eval_samples_per_second": 107.106, + "eval_steps_per_second": 1.714, + "step": 285000 + }, + { + "epoch": 1.83, + "learning_rate": 4.234727840647243e-06, + "loss": 1.9237, + "step": 286000 + }, + { + "epoch": 1.83, + "eval_loss": 2.1462085247039795, + "eval_runtime": 18.787, + "eval_samples_per_second": 106.457, + "eval_steps_per_second": 1.703, + "step": 286000 + }, + { + "epoch": 1.84, + "learning_rate": 4.074709406523631e-06, + "loss": 1.9309, + "step": 287000 + }, + { + "epoch": 1.84, + "eval_loss": 2.1435041427612305, + "eval_runtime": 18.7845, + "eval_samples_per_second": 106.471, + "eval_steps_per_second": 1.704, + "step": 287000 + }, + { + "epoch": 1.84, + "learning_rate": 3.914690972400021e-06, + "loss": 1.9239, + "step": 288000 + }, + { + "epoch": 1.84, + "eval_loss": 2.152646064758301, + "eval_runtime": 18.6983, + "eval_samples_per_second": 106.961, + "eval_steps_per_second": 1.711, + "step": 288000 + }, + { + "epoch": 1.85, + "learning_rate": 3.7546725382764097e-06, + "loss": 1.9168, + "step": 289000 + }, + { + "epoch": 1.85, + "eval_loss": 2.1280956268310547, + "eval_runtime": 18.8639, + "eval_samples_per_second": 106.023, + "eval_steps_per_second": 1.696, + "step": 289000 + }, + { + "epoch": 1.86, + "learning_rate": 3.5946541041527984e-06, + "loss": 1.9232, + "step": 290000 + }, + { + "epoch": 1.86, + "eval_loss": 2.143430471420288, + "eval_runtime": 18.873, + "eval_samples_per_second": 105.971, + "eval_steps_per_second": 1.696, + "step": 290000 + }, + { + "epoch": 1.86, + "learning_rate": 3.4346356700291876e-06, + "loss": 1.9338, + "step": 291000 + }, + { + "epoch": 1.86, + "eval_loss": 2.1642520427703857, + "eval_runtime": 18.6105, + "eval_samples_per_second": 107.466, + "eval_steps_per_second": 1.719, + "step": 291000 + }, + { + "epoch": 1.87, + "learning_rate": 3.2746172359055764e-06, + "loss": 1.9241, + "step": 292000 + }, + { + "epoch": 1.87, + "eval_loss": 2.120400905609131, + "eval_runtime": 18.6654, + "eval_samples_per_second": 107.15, + "eval_steps_per_second": 1.714, + "step": 292000 + }, + { + "epoch": 1.88, + "learning_rate": 3.114598801781965e-06, + "loss": 1.9209, + "step": 293000 + }, + { + "epoch": 1.88, + "eval_loss": 2.1418490409851074, + "eval_runtime": 18.986, + "eval_samples_per_second": 105.341, + "eval_steps_per_second": 1.685, + "step": 293000 + }, + { + "epoch": 1.88, + "learning_rate": 2.9545803676583543e-06, + "loss": 1.928, + "step": 294000 + }, + { + "epoch": 1.88, + "eval_loss": 2.1255481243133545, + "eval_runtime": 18.6322, + "eval_samples_per_second": 107.341, + "eval_steps_per_second": 1.717, + "step": 294000 + }, + { + "epoch": 1.89, + "learning_rate": 2.7945619335347435e-06, + "loss": 1.9482, + "step": 295000 + }, + { + "epoch": 1.89, + "eval_loss": 2.185188055038452, + "eval_runtime": 18.6065, + "eval_samples_per_second": 107.489, + "eval_steps_per_second": 1.72, + "step": 295000 + }, + { + "epoch": 1.89, + "learning_rate": 2.6345434994111323e-06, + "loss": 1.9276, + "step": 296000 + }, + { + "epoch": 1.89, + "eval_loss": 2.1754209995269775, + "eval_runtime": 18.6892, + "eval_samples_per_second": 107.014, + "eval_steps_per_second": 1.712, + "step": 296000 + }, + { + "epoch": 1.9, + "learning_rate": 2.4745250652875215e-06, + "loss": 1.9214, + "step": 297000 + }, + { + "epoch": 1.9, + "eval_loss": 2.124568462371826, + "eval_runtime": 18.6607, + "eval_samples_per_second": 107.177, + "eval_steps_per_second": 1.715, + "step": 297000 + }, + { + "epoch": 1.91, + "learning_rate": 2.3145066311639102e-06, + "loss": 1.9296, + "step": 298000 + }, + { + "epoch": 1.91, + "eval_loss": 2.1418752670288086, + "eval_runtime": 18.8993, + "eval_samples_per_second": 105.824, + "eval_steps_per_second": 1.693, + "step": 298000 + }, + { + "epoch": 1.91, + "learning_rate": 2.154488197040299e-06, + "loss": 1.9182, + "step": 299000 + }, + { + "epoch": 1.91, + "eval_loss": 2.1427695751190186, + "eval_runtime": 18.6439, + "eval_samples_per_second": 107.273, + "eval_steps_per_second": 1.716, + "step": 299000 + }, + { + "epoch": 1.92, + "learning_rate": 1.994469762916688e-06, + "loss": 1.9172, + "step": 300000 + }, + { + "epoch": 1.92, + "eval_loss": 2.17488956451416, + "eval_runtime": 20.0248, + "eval_samples_per_second": 99.876, + "eval_steps_per_second": 1.598, + "step": 300000 + }, + { + "epoch": 1.93, + "learning_rate": 1.834451328793077e-06, + "loss": 1.9054, + "step": 301000 + }, + { + "epoch": 1.93, + "eval_loss": 2.1516401767730713, + "eval_runtime": 19.1509, + "eval_samples_per_second": 104.434, + "eval_steps_per_second": 1.671, + "step": 301000 + }, + { + "epoch": 1.93, + "learning_rate": 1.674432894669466e-06, + "loss": 1.9209, + "step": 302000 + }, + { + "epoch": 1.93, + "eval_loss": 2.1247944831848145, + "eval_runtime": 19.0766, + "eval_samples_per_second": 104.84, + "eval_steps_per_second": 1.677, + "step": 302000 + }, + { + "epoch": 1.94, + "learning_rate": 1.5144144605458551e-06, + "loss": 1.9191, + "step": 303000 + }, + { + "epoch": 1.94, + "eval_loss": 2.1422977447509766, + "eval_runtime": 19.0887, + "eval_samples_per_second": 104.774, + "eval_steps_per_second": 1.676, + "step": 303000 + }, + { + "epoch": 1.95, + "learning_rate": 1.354396026422244e-06, + "loss": 1.9143, + "step": 304000 + }, + { + "epoch": 1.95, + "eval_loss": 2.1302106380462646, + "eval_runtime": 19.5033, + "eval_samples_per_second": 102.547, + "eval_steps_per_second": 1.641, + "step": 304000 + }, + { + "epoch": 1.95, + "learning_rate": 1.1943775922986329e-06, + "loss": 1.9163, + "step": 305000 + }, + { + "epoch": 1.95, + "eval_loss": 2.16552472114563, + "eval_runtime": 18.815, + "eval_samples_per_second": 106.298, + "eval_steps_per_second": 1.701, + "step": 305000 + }, + { + "epoch": 1.96, + "learning_rate": 1.0343591581750219e-06, + "loss": 1.915, + "step": 306000 + }, + { + "epoch": 1.96, + "eval_loss": 2.1272425651550293, + "eval_runtime": 19.1159, + "eval_samples_per_second": 104.625, + "eval_steps_per_second": 1.674, + "step": 306000 + }, + { + "epoch": 1.97, + "learning_rate": 8.743407240514107e-07, + "loss": 1.9193, + "step": 307000 + }, + { + "epoch": 1.97, + "eval_loss": 2.151264190673828, + "eval_runtime": 18.961, + "eval_samples_per_second": 105.48, + "eval_steps_per_second": 1.688, + "step": 307000 + }, + { + "epoch": 1.97, + "learning_rate": 7.143222899277997e-07, + "loss": 1.9238, + "step": 308000 + }, + { + "epoch": 1.97, + "eval_loss": 2.145237922668457, + "eval_runtime": 19.4596, + "eval_samples_per_second": 102.777, + "eval_steps_per_second": 1.644, + "step": 308000 + }, + { + "epoch": 1.98, + "learning_rate": 5.543038558041887e-07, + "loss": 1.9129, + "step": 309000 + }, + { + "epoch": 1.98, + "eval_loss": 2.132681369781494, + "eval_runtime": 18.9129, + "eval_samples_per_second": 105.748, + "eval_steps_per_second": 1.692, + "step": 309000 + }, + { + "epoch": 1.98, + "learning_rate": 3.9428542168057766e-07, + "loss": 1.92, + "step": 310000 + }, + { + "epoch": 1.98, + "eval_loss": 2.1479594707489014, + "eval_runtime": 18.8663, + "eval_samples_per_second": 106.009, + "eval_steps_per_second": 1.696, + "step": 310000 + }, + { + "epoch": 1.99, + "learning_rate": 2.342669875569666e-07, + "loss": 1.9098, + "step": 311000 + }, + { + "epoch": 1.99, + "eval_loss": 2.171926736831665, + "eval_runtime": 19.0151, + "eval_samples_per_second": 105.179, + "eval_steps_per_second": 1.683, + "step": 311000 + }, + { + "epoch": 2.0, + "learning_rate": 7.424855343335553e-08, + "loss": 1.9105, + "step": 312000 + }, + { + "epoch": 2.0, + "eval_loss": 2.1461212635040283, + "eval_runtime": 19.4871, + "eval_samples_per_second": 102.632, + "eval_steps_per_second": 1.642, + "step": 312000 + }, + { + "epoch": 2.0, + "learning_rate": 1.6609486746206498e-05, + "loss": 1.9453, + "step": 313000 + }, + { + "epoch": 2.0, + "eval_loss": 2.190183162689209, + "eval_runtime": 16.3864, + "eval_samples_per_second": 122.052, + "eval_steps_per_second": 1.953, + "step": 313000 + }, + { + "epoch": 2.01, + "learning_rate": 1.650280779012409e-05, + "loss": 1.9458, + "step": 314000 + }, + { + "epoch": 2.01, + "eval_loss": 2.1692402362823486, + "eval_runtime": 15.9646, + "eval_samples_per_second": 125.277, + "eval_steps_per_second": 2.004, + "step": 314000 + }, + { + "epoch": 2.02, + "learning_rate": 1.639612883404168e-05, + "loss": 1.9428, + "step": 315000 + }, + { + "epoch": 2.02, + "eval_loss": 2.1538236141204834, + "eval_runtime": 16.0153, + "eval_samples_per_second": 124.881, + "eval_steps_per_second": 1.998, + "step": 315000 + }, + { + "epoch": 2.02, + "learning_rate": 1.6289449877959276e-05, + "loss": 1.9488, + "step": 316000 + }, + { + "epoch": 2.02, + "eval_loss": 2.153665542602539, + "eval_runtime": 16.1132, + "eval_samples_per_second": 124.122, + "eval_steps_per_second": 1.986, + "step": 316000 + }, + { + "epoch": 2.03, + "learning_rate": 1.6182770921876865e-05, + "loss": 1.9437, + "step": 317000 + }, + { + "epoch": 2.03, + "eval_loss": 2.1973447799682617, + "eval_runtime": 17.5461, + "eval_samples_per_second": 113.986, + "eval_steps_per_second": 1.824, + "step": 317000 + }, + { + "epoch": 2.04, + "learning_rate": 1.607609196579446e-05, + "loss": 1.9487, + "step": 318000 + }, + { + "epoch": 2.04, + "eval_loss": 2.1677041053771973, + "eval_runtime": 15.9539, + "eval_samples_per_second": 125.361, + "eval_steps_per_second": 2.006, + "step": 318000 + }, + { + "epoch": 2.04, + "learning_rate": 1.596941300971205e-05, + "loss": 1.9559, + "step": 319000 + }, + { + "epoch": 2.04, + "eval_loss": 2.155820369720459, + "eval_runtime": 16.1853, + "eval_samples_per_second": 123.569, + "eval_steps_per_second": 1.977, + "step": 319000 + }, + { + "epoch": 2.05, + "learning_rate": 1.5862734053629647e-05, + "loss": 1.9662, + "step": 320000 + }, + { + "epoch": 2.05, + "eval_loss": 2.1629369258880615, + "eval_runtime": 16.6642, + "eval_samples_per_second": 120.018, + "eval_steps_per_second": 1.92, + "step": 320000 + }, + { + "epoch": 2.05, + "learning_rate": 1.5756055097547238e-05, + "loss": 1.9556, + "step": 321000 + }, + { + "epoch": 2.05, + "eval_loss": 2.1815614700317383, + "eval_runtime": 16.5814, + "eval_samples_per_second": 120.617, + "eval_steps_per_second": 1.93, + "step": 321000 + }, + { + "epoch": 2.06, + "learning_rate": 1.564937614146483e-05, + "loss": 1.9512, + "step": 322000 + }, + { + "epoch": 2.06, + "eval_loss": 2.1164066791534424, + "eval_runtime": 15.9298, + "eval_samples_per_second": 125.551, + "eval_steps_per_second": 2.009, + "step": 322000 + }, + { + "epoch": 2.07, + "learning_rate": 1.5542697185382425e-05, + "loss": 1.9544, + "step": 323000 + }, + { + "epoch": 2.07, + "eval_loss": 2.165865659713745, + "eval_runtime": 15.883, + "eval_samples_per_second": 125.921, + "eval_steps_per_second": 2.015, + "step": 323000 + }, + { + "epoch": 2.07, + "learning_rate": 1.5436018229300017e-05, + "loss": 1.9568, + "step": 324000 + }, + { + "epoch": 2.07, + "eval_loss": 2.1747090816497803, + "eval_runtime": 15.9331, + "eval_samples_per_second": 125.525, + "eval_steps_per_second": 2.008, + "step": 324000 + }, + { + "epoch": 2.08, + "learning_rate": 1.5329339273217608e-05, + "loss": 1.9449, + "step": 325000 + }, + { + "epoch": 2.08, + "eval_loss": 2.1862990856170654, + "eval_runtime": 16.6952, + "eval_samples_per_second": 119.795, + "eval_steps_per_second": 1.917, + "step": 325000 + }, + { + "epoch": 2.09, + "learning_rate": 1.5222660317135202e-05, + "loss": 1.9491, + "step": 326000 + }, + { + "epoch": 2.09, + "eval_loss": 2.1621673107147217, + "eval_runtime": 15.9325, + "eval_samples_per_second": 125.529, + "eval_steps_per_second": 2.008, + "step": 326000 + }, + { + "epoch": 2.09, + "learning_rate": 1.5115981361052795e-05, + "loss": 1.9526, + "step": 327000 + }, + { + "epoch": 2.09, + "eval_loss": 2.1826863288879395, + "eval_runtime": 15.8228, + "eval_samples_per_second": 126.4, + "eval_steps_per_second": 2.022, + "step": 327000 + }, + { + "epoch": 2.1, + "learning_rate": 1.5009302404970388e-05, + "loss": 1.952, + "step": 328000 + }, + { + "epoch": 2.1, + "eval_loss": 2.1913392543792725, + "eval_runtime": 16.1458, + "eval_samples_per_second": 123.871, + "eval_steps_per_second": 1.982, + "step": 328000 + }, + { + "epoch": 2.11, + "learning_rate": 1.4902623448887978e-05, + "loss": 1.9545, + "step": 329000 + }, + { + "epoch": 2.11, + "eval_loss": 2.18023681640625, + "eval_runtime": 16.3141, + "eval_samples_per_second": 122.593, + "eval_steps_per_second": 1.961, + "step": 329000 + }, + { + "epoch": 2.11, + "learning_rate": 1.4795944492805572e-05, + "loss": 1.9616, + "step": 330000 + }, + { + "epoch": 2.11, + "eval_loss": 2.178854465484619, + "eval_runtime": 15.9442, + "eval_samples_per_second": 125.438, + "eval_steps_per_second": 2.007, + "step": 330000 + }, + { + "epoch": 2.12, + "learning_rate": 1.4689265536723165e-05, + "loss": 1.9515, + "step": 331000 + }, + { + "epoch": 2.12, + "eval_loss": 2.1725854873657227, + "eval_runtime": 15.939, + "eval_samples_per_second": 125.478, + "eval_steps_per_second": 2.008, + "step": 331000 + }, + { + "epoch": 2.13, + "learning_rate": 1.4582586580640758e-05, + "loss": 1.9484, + "step": 332000 + }, + { + "epoch": 2.13, + "eval_loss": 2.1632540225982666, + "eval_runtime": 16.7042, + "eval_samples_per_second": 119.731, + "eval_steps_per_second": 1.916, + "step": 332000 + }, + { + "epoch": 2.13, + "learning_rate": 1.4475907624558348e-05, + "loss": 1.962, + "step": 333000 + }, + { + "epoch": 2.13, + "eval_loss": 2.1514594554901123, + "eval_runtime": 15.9872, + "eval_samples_per_second": 125.1, + "eval_steps_per_second": 2.002, + "step": 333000 + }, + { + "epoch": 2.14, + "learning_rate": 1.4369228668475942e-05, + "loss": 1.9563, + "step": 334000 + }, + { + "epoch": 2.14, + "eval_loss": 2.18198299407959, + "eval_runtime": 16.1962, + "eval_samples_per_second": 123.486, + "eval_steps_per_second": 1.976, + "step": 334000 + }, + { + "epoch": 2.14, + "learning_rate": 1.4262549712393535e-05, + "loss": 1.9544, + "step": 335000 + }, + { + "epoch": 2.14, + "eval_loss": 2.168269634246826, + "eval_runtime": 16.3411, + "eval_samples_per_second": 122.391, + "eval_steps_per_second": 1.958, + "step": 335000 + }, + { + "epoch": 2.15, + "learning_rate": 1.4155870756311127e-05, + "loss": 1.9509, + "step": 336000 + }, + { + "epoch": 2.15, + "eval_loss": 2.157496690750122, + "eval_runtime": 16.7663, + "eval_samples_per_second": 119.287, + "eval_steps_per_second": 1.909, + "step": 336000 + }, + { + "epoch": 2.16, + "learning_rate": 1.404919180022872e-05, + "loss": 1.9527, + "step": 337000 + }, + { + "epoch": 2.16, + "eval_loss": 2.162778377532959, + "eval_runtime": 16.4574, + "eval_samples_per_second": 121.526, + "eval_steps_per_second": 1.944, + "step": 337000 + }, + { + "epoch": 2.16, + "learning_rate": 1.3942512844146313e-05, + "loss": 1.9455, + "step": 338000 + }, + { + "epoch": 2.16, + "eval_loss": 2.2115304470062256, + "eval_runtime": 15.9425, + "eval_samples_per_second": 125.451, + "eval_steps_per_second": 2.007, + "step": 338000 + }, + { + "epoch": 2.17, + "learning_rate": 1.3835833888063907e-05, + "loss": 1.9443, + "step": 339000 + }, + { + "epoch": 2.17, + "eval_loss": 2.1575698852539062, + "eval_runtime": 16.1638, + "eval_samples_per_second": 123.734, + "eval_steps_per_second": 1.98, + "step": 339000 + }, + { + "epoch": 2.18, + "learning_rate": 1.3729154931981497e-05, + "loss": 1.9471, + "step": 340000 + }, + { + "epoch": 2.18, + "eval_loss": 2.163440465927124, + "eval_runtime": 16.5887, + "eval_samples_per_second": 120.564, + "eval_steps_per_second": 1.929, + "step": 340000 + }, + { + "epoch": 2.18, + "learning_rate": 1.362247597589909e-05, + "loss": 1.9385, + "step": 341000 + }, + { + "epoch": 2.18, + "eval_loss": 2.1808547973632812, + "eval_runtime": 16.0292, + "eval_samples_per_second": 124.773, + "eval_steps_per_second": 1.996, + "step": 341000 + }, + { + "epoch": 2.19, + "learning_rate": 1.3515797019816683e-05, + "loss": 1.9472, + "step": 342000 + }, + { + "epoch": 2.19, + "eval_loss": 2.1804370880126953, + "eval_runtime": 16.1599, + "eval_samples_per_second": 123.763, + "eval_steps_per_second": 1.98, + "step": 342000 + }, + { + "epoch": 2.2, + "learning_rate": 1.3409118063734277e-05, + "loss": 1.9578, + "step": 343000 + }, + { + "epoch": 2.2, + "eval_loss": 2.172938346862793, + "eval_runtime": 16.4066, + "eval_samples_per_second": 121.902, + "eval_steps_per_second": 1.95, + "step": 343000 + }, + { + "epoch": 2.2, + "learning_rate": 1.3302439107651868e-05, + "loss": 1.9501, + "step": 344000 + }, + { + "epoch": 2.2, + "eval_loss": 2.1206016540527344, + "eval_runtime": 16.7126, + "eval_samples_per_second": 119.67, + "eval_steps_per_second": 1.915, + "step": 344000 + }, + { + "epoch": 2.21, + "learning_rate": 1.3195760151569462e-05, + "loss": 1.9363, + "step": 345000 + }, + { + "epoch": 2.21, + "eval_loss": 2.1700916290283203, + "eval_runtime": 15.9452, + "eval_samples_per_second": 125.43, + "eval_steps_per_second": 2.007, + "step": 345000 + }, + { + "epoch": 2.21, + "learning_rate": 1.3089081195487055e-05, + "loss": 1.9452, + "step": 346000 + }, + { + "epoch": 2.21, + "eval_loss": 2.1466197967529297, + "eval_runtime": 16.0688, + "eval_samples_per_second": 124.465, + "eval_steps_per_second": 1.991, + "step": 346000 + }, + { + "epoch": 2.22, + "learning_rate": 1.2982402239404649e-05, + "loss": 1.9544, + "step": 347000 + }, + { + "epoch": 2.22, + "eval_loss": 2.118955135345459, + "eval_runtime": 16.4559, + "eval_samples_per_second": 121.537, + "eval_steps_per_second": 1.945, + "step": 347000 + }, + { + "epoch": 2.23, + "learning_rate": 1.2875723283322239e-05, + "loss": 1.9442, + "step": 348000 + }, + { + "epoch": 2.23, + "eval_loss": 2.2223548889160156, + "eval_runtime": 15.9342, + "eval_samples_per_second": 125.516, + "eval_steps_per_second": 2.008, + "step": 348000 + }, + { + "epoch": 2.23, + "learning_rate": 1.2769044327239832e-05, + "loss": 1.949, + "step": 349000 + }, + { + "epoch": 2.23, + "eval_loss": 2.1240313053131104, + "eval_runtime": 16.1322, + "eval_samples_per_second": 123.975, + "eval_steps_per_second": 1.984, + "step": 349000 + }, + { + "epoch": 2.24, + "learning_rate": 1.2662365371157425e-05, + "loss": 1.9524, + "step": 350000 + }, + { + "epoch": 2.24, + "eval_loss": 2.2078564167022705, + "eval_runtime": 15.9714, + "eval_samples_per_second": 125.224, + "eval_steps_per_second": 2.004, + "step": 350000 + }, + { + "epoch": 2.25, + "learning_rate": 1.2555686415075015e-05, + "loss": 1.9371, + "step": 351000 + }, + { + "epoch": 2.25, + "eval_loss": 2.1884605884552, + "eval_runtime": 17.7436, + "eval_samples_per_second": 112.717, + "eval_steps_per_second": 1.803, + "step": 351000 + }, + { + "epoch": 2.25, + "learning_rate": 1.2449007458992609e-05, + "loss": 1.9474, + "step": 352000 + }, + { + "epoch": 2.25, + "eval_loss": 2.165747880935669, + "eval_runtime": 15.9774, + "eval_samples_per_second": 125.177, + "eval_steps_per_second": 2.003, + "step": 352000 + }, + { + "epoch": 2.26, + "learning_rate": 1.2342328502910202e-05, + "loss": 1.9444, + "step": 353000 + }, + { + "epoch": 2.26, + "eval_loss": 2.180070161819458, + "eval_runtime": 15.9059, + "eval_samples_per_second": 125.74, + "eval_steps_per_second": 2.012, + "step": 353000 + }, + { + "epoch": 2.27, + "learning_rate": 1.2235649546827795e-05, + "loss": 1.9381, + "step": 354000 + }, + { + "epoch": 2.27, + "eval_loss": 2.195138931274414, + "eval_runtime": 15.4982, + "eval_samples_per_second": 129.047, + "eval_steps_per_second": 2.065, + "step": 354000 + }, + { + "epoch": 2.27, + "learning_rate": 1.2128970590745389e-05, + "loss": 1.9462, + "step": 355000 + }, + { + "epoch": 2.27, + "eval_loss": 2.197645902633667, + "eval_runtime": 16.2722, + "eval_samples_per_second": 122.909, + "eval_steps_per_second": 1.967, + "step": 355000 + }, + { + "epoch": 2.28, + "learning_rate": 1.202229163466298e-05, + "loss": 1.9312, + "step": 356000 + }, + { + "epoch": 2.28, + "eval_loss": 2.1800289154052734, + "eval_runtime": 15.5962, + "eval_samples_per_second": 128.236, + "eval_steps_per_second": 2.052, + "step": 356000 + }, + { + "epoch": 2.29, + "learning_rate": 1.1915612678580574e-05, + "loss": 1.9379, + "step": 357000 + }, + { + "epoch": 2.29, + "eval_loss": 2.175736427307129, + "eval_runtime": 15.9665, + "eval_samples_per_second": 125.262, + "eval_steps_per_second": 2.004, + "step": 357000 + }, + { + "epoch": 2.29, + "learning_rate": 1.1808933722498165e-05, + "loss": 1.9435, + "step": 358000 + }, + { + "epoch": 2.29, + "eval_loss": 2.205449104309082, + "eval_runtime": 15.7121, + "eval_samples_per_second": 127.291, + "eval_steps_per_second": 2.037, + "step": 358000 + }, + { + "epoch": 2.3, + "learning_rate": 1.1702254766415759e-05, + "loss": 1.9448, + "step": 359000 + }, + { + "epoch": 2.3, + "eval_loss": 2.173300266265869, + "eval_runtime": 16.397, + "eval_samples_per_second": 121.974, + "eval_steps_per_second": 1.952, + "step": 359000 + }, + { + "epoch": 2.3, + "learning_rate": 1.159557581033335e-05, + "loss": 1.9529, + "step": 360000 + }, + { + "epoch": 2.3, + "eval_loss": 2.145735263824463, + "eval_runtime": 15.5694, + "eval_samples_per_second": 128.457, + "eval_steps_per_second": 2.055, + "step": 360000 + }, + { + "epoch": 2.31, + "learning_rate": 1.1488896854250944e-05, + "loss": 1.9444, + "step": 361000 + }, + { + "epoch": 2.31, + "eval_loss": 2.1839778423309326, + "eval_runtime": 15.6504, + "eval_samples_per_second": 127.792, + "eval_steps_per_second": 2.045, + "step": 361000 + }, + { + "epoch": 2.32, + "learning_rate": 1.1382217898168535e-05, + "loss": 1.9439, + "step": 362000 + }, + { + "epoch": 2.32, + "eval_loss": 2.128485918045044, + "eval_runtime": 15.7866, + "eval_samples_per_second": 126.69, + "eval_steps_per_second": 2.027, + "step": 362000 + }, + { + "epoch": 2.32, + "learning_rate": 1.1275538942086129e-05, + "loss": 1.9345, + "step": 363000 + }, + { + "epoch": 2.32, + "eval_loss": 2.16981840133667, + "eval_runtime": 16.0509, + "eval_samples_per_second": 124.604, + "eval_steps_per_second": 1.994, + "step": 363000 + }, + { + "epoch": 2.33, + "learning_rate": 1.1168859986003722e-05, + "loss": 1.9355, + "step": 364000 + }, + { + "epoch": 2.33, + "eval_loss": 2.1235830783843994, + "eval_runtime": 15.5068, + "eval_samples_per_second": 128.975, + "eval_steps_per_second": 2.064, + "step": 364000 + }, + { + "epoch": 2.34, + "learning_rate": 1.1062181029921315e-05, + "loss": 1.9385, + "step": 365000 + }, + { + "epoch": 2.34, + "eval_loss": 2.1465463638305664, + "eval_runtime": 15.3143, + "eval_samples_per_second": 130.597, + "eval_steps_per_second": 2.09, + "step": 365000 + }, + { + "epoch": 2.34, + "learning_rate": 1.0955502073838907e-05, + "loss": 1.9425, + "step": 366000 + }, + { + "epoch": 2.34, + "eval_loss": 2.1613283157348633, + "eval_runtime": 15.466, + "eval_samples_per_second": 129.316, + "eval_steps_per_second": 2.069, + "step": 366000 + }, + { + "epoch": 2.35, + "learning_rate": 1.08488231177565e-05, + "loss": 1.9304, + "step": 367000 + }, + { + "epoch": 2.35, + "eval_loss": 2.172750949859619, + "eval_runtime": 15.5842, + "eval_samples_per_second": 128.335, + "eval_steps_per_second": 2.053, + "step": 367000 + }, + { + "epoch": 2.36, + "learning_rate": 1.0742144161674092e-05, + "loss": 1.9339, + "step": 368000 + }, + { + "epoch": 2.36, + "eval_loss": 2.148078680038452, + "eval_runtime": 15.9481, + "eval_samples_per_second": 125.407, + "eval_steps_per_second": 2.007, + "step": 368000 + }, + { + "epoch": 2.36, + "learning_rate": 1.0635465205591686e-05, + "loss": 1.9463, + "step": 369000 + }, + { + "epoch": 2.36, + "eval_loss": 2.1650550365448, + "eval_runtime": 15.3617, + "eval_samples_per_second": 130.194, + "eval_steps_per_second": 2.083, + "step": 369000 + }, + { + "epoch": 2.37, + "learning_rate": 1.0528786249509277e-05, + "loss": 1.9407, + "step": 370000 + }, + { + "epoch": 2.37, + "eval_loss": 2.1432077884674072, + "eval_runtime": 15.1001, + "eval_samples_per_second": 132.45, + "eval_steps_per_second": 2.119, + "step": 370000 + }, + { + "epoch": 2.37, + "learning_rate": 1.0422107293426869e-05, + "loss": 1.9453, + "step": 371000 + }, + { + "epoch": 2.37, + "eval_loss": 2.147706985473633, + "eval_runtime": 15.9626, + "eval_samples_per_second": 125.293, + "eval_steps_per_second": 2.005, + "step": 371000 + }, + { + "epoch": 2.38, + "learning_rate": 1.0315428337344462e-05, + "loss": 1.9368, + "step": 372000 + }, + { + "epoch": 2.38, + "eval_loss": 2.184664249420166, + "eval_runtime": 15.5454, + "eval_samples_per_second": 128.656, + "eval_steps_per_second": 2.058, + "step": 372000 + }, + { + "epoch": 2.39, + "learning_rate": 1.0208749381262054e-05, + "loss": 1.9407, + "step": 373000 + }, + { + "epoch": 2.39, + "eval_loss": 2.1857311725616455, + "eval_runtime": 15.3498, + "eval_samples_per_second": 130.295, + "eval_steps_per_second": 2.085, + "step": 373000 + }, + { + "epoch": 2.39, + "learning_rate": 1.0102070425179647e-05, + "loss": 1.934, + "step": 374000 + }, + { + "epoch": 2.39, + "eval_loss": 2.119173765182495, + "eval_runtime": 15.4006, + "eval_samples_per_second": 129.865, + "eval_steps_per_second": 2.078, + "step": 374000 + }, + { + "epoch": 2.4, + "learning_rate": 9.99539146909724e-06, + "loss": 1.9297, + "step": 375000 + }, + { + "epoch": 2.4, + "eval_loss": 2.1658694744110107, + "eval_runtime": 15.796, + "eval_samples_per_second": 126.615, + "eval_steps_per_second": 2.026, + "step": 375000 + }, + { + "epoch": 2.41, + "learning_rate": 9.888712513014834e-06, + "loss": 1.9298, + "step": 376000 + }, + { + "epoch": 2.41, + "eval_loss": 2.171632766723633, + "eval_runtime": 15.3482, + "eval_samples_per_second": 130.308, + "eval_steps_per_second": 2.085, + "step": 376000 + }, + { + "epoch": 2.41, + "learning_rate": 9.782033556932426e-06, + "loss": 1.9267, + "step": 377000 + }, + { + "epoch": 2.41, + "eval_loss": 2.1282413005828857, + "eval_runtime": 15.2611, + "eval_samples_per_second": 131.052, + "eval_steps_per_second": 2.097, + "step": 377000 + }, + { + "epoch": 2.42, + "learning_rate": 9.675354600850019e-06, + "loss": 1.9387, + "step": 378000 + }, + { + "epoch": 2.42, + "eval_loss": 2.175699472427368, + "eval_runtime": 15.3352, + "eval_samples_per_second": 130.419, + "eval_steps_per_second": 2.087, + "step": 378000 + }, + { + "epoch": 2.43, + "learning_rate": 9.56867564476761e-06, + "loss": 1.9235, + "step": 379000 + }, + { + "epoch": 2.43, + "eval_loss": 2.1758999824523926, + "eval_runtime": 16.089, + "eval_samples_per_second": 124.309, + "eval_steps_per_second": 1.989, + "step": 379000 + }, + { + "epoch": 2.43, + "learning_rate": 9.461996688685204e-06, + "loss": 1.9265, + "step": 380000 + }, + { + "epoch": 2.43, + "eval_loss": 2.163534164428711, + "eval_runtime": 15.2326, + "eval_samples_per_second": 131.297, + "eval_steps_per_second": 2.101, + "step": 380000 + }, + { + "epoch": 2.44, + "learning_rate": 9.355317732602796e-06, + "loss": 1.9151, + "step": 381000 + }, + { + "epoch": 2.44, + "eval_loss": 2.1671011447906494, + "eval_runtime": 15.2621, + "eval_samples_per_second": 131.044, + "eval_steps_per_second": 2.097, + "step": 381000 + }, + { + "epoch": 2.45, + "learning_rate": 9.248638776520389e-06, + "loss": 1.9262, + "step": 382000 + }, + { + "epoch": 2.45, + "eval_loss": 2.144550323486328, + "eval_runtime": 15.6946, + "eval_samples_per_second": 127.432, + "eval_steps_per_second": 2.039, + "step": 382000 + }, + { + "epoch": 2.45, + "learning_rate": 9.14195982043798e-06, + "loss": 1.9311, + "step": 383000 + }, + { + "epoch": 2.45, + "eval_loss": 2.1890273094177246, + "eval_runtime": 15.377, + "eval_samples_per_second": 130.065, + "eval_steps_per_second": 2.081, + "step": 383000 + }, + { + "epoch": 2.46, + "learning_rate": 9.035280864355574e-06, + "loss": 1.9305, + "step": 384000 + }, + { + "epoch": 2.46, + "eval_loss": 2.166837692260742, + "eval_runtime": 15.3262, + "eval_samples_per_second": 130.496, + "eval_steps_per_second": 2.088, + "step": 384000 + }, + { + "epoch": 2.46, + "learning_rate": 8.928601908273167e-06, + "loss": 1.9237, + "step": 385000 + }, + { + "epoch": 2.46, + "eval_loss": 2.0922629833221436, + "eval_runtime": 15.1049, + "eval_samples_per_second": 132.408, + "eval_steps_per_second": 2.119, + "step": 385000 + }, + { + "epoch": 2.47, + "learning_rate": 8.82192295219076e-06, + "loss": 1.9256, + "step": 386000 + }, + { + "epoch": 2.47, + "eval_loss": 2.1387295722961426, + "eval_runtime": 15.8611, + "eval_samples_per_second": 126.095, + "eval_steps_per_second": 2.018, + "step": 386000 + }, + { + "epoch": 2.48, + "learning_rate": 8.715243996108352e-06, + "loss": 1.9339, + "step": 387000 + }, + { + "epoch": 2.48, + "eval_loss": 2.160367250442505, + "eval_runtime": 15.4895, + "eval_samples_per_second": 129.12, + "eval_steps_per_second": 2.066, + "step": 387000 + }, + { + "epoch": 2.48, + "learning_rate": 8.608565040025944e-06, + "loss": 1.925, + "step": 388000 + }, + { + "epoch": 2.48, + "eval_loss": 2.1711387634277344, + "eval_runtime": 15.39, + "eval_samples_per_second": 129.955, + "eval_steps_per_second": 2.079, + "step": 388000 + }, + { + "epoch": 2.49, + "learning_rate": 8.501886083943537e-06, + "loss": 1.9185, + "step": 389000 + }, + { + "epoch": 2.49, + "eval_loss": 2.1491212844848633, + "eval_runtime": 15.607, + "eval_samples_per_second": 128.147, + "eval_steps_per_second": 2.05, + "step": 389000 + }, + { + "epoch": 2.5, + "learning_rate": 8.395207127861129e-06, + "loss": 1.9214, + "step": 390000 + }, + { + "epoch": 2.5, + "eval_loss": 2.1444971561431885, + "eval_runtime": 15.4605, + "eval_samples_per_second": 129.362, + "eval_steps_per_second": 2.07, + "step": 390000 + }, + { + "epoch": 2.5, + "learning_rate": 8.288528171778722e-06, + "loss": 1.928, + "step": 391000 + }, + { + "epoch": 2.5, + "eval_loss": 2.1359145641326904, + "eval_runtime": 15.6126, + "eval_samples_per_second": 128.102, + "eval_steps_per_second": 2.05, + "step": 391000 + }, + { + "epoch": 2.51, + "learning_rate": 8.181849215696314e-06, + "loss": 1.9243, + "step": 392000 + }, + { + "epoch": 2.51, + "eval_loss": 2.156005620956421, + "eval_runtime": 15.7238, + "eval_samples_per_second": 127.196, + "eval_steps_per_second": 2.035, + "step": 392000 + }, + { + "epoch": 2.52, + "learning_rate": 8.075170259613907e-06, + "loss": 1.9096, + "step": 393000 + }, + { + "epoch": 2.52, + "eval_loss": 2.1110196113586426, + "eval_runtime": 15.176, + "eval_samples_per_second": 131.787, + "eval_steps_per_second": 2.109, + "step": 393000 + }, + { + "epoch": 2.52, + "learning_rate": 7.9684913035315e-06, + "loss": 1.9254, + "step": 394000 + }, + { + "epoch": 2.52, + "eval_loss": 2.135141611099243, + "eval_runtime": 15.4036, + "eval_samples_per_second": 129.84, + "eval_steps_per_second": 2.077, + "step": 394000 + }, + { + "epoch": 2.53, + "learning_rate": 7.861812347449094e-06, + "loss": 1.9214, + "step": 395000 + }, + { + "epoch": 2.53, + "eval_loss": 2.1366610527038574, + "eval_runtime": 15.7136, + "eval_samples_per_second": 127.279, + "eval_steps_per_second": 2.036, + "step": 395000 + }, + { + "epoch": 2.53, + "learning_rate": 7.755133391366686e-06, + "loss": 1.9229, + "step": 396000 + }, + { + "epoch": 2.53, + "eval_loss": 2.1293559074401855, + "eval_runtime": 15.3708, + "eval_samples_per_second": 130.117, + "eval_steps_per_second": 2.082, + "step": 396000 + }, + { + "epoch": 2.54, + "learning_rate": 7.64845443528428e-06, + "loss": 1.9166, + "step": 397000 + }, + { + "epoch": 2.54, + "eval_loss": 2.1272215843200684, + "eval_runtime": 15.7644, + "eval_samples_per_second": 126.868, + "eval_steps_per_second": 2.03, + "step": 397000 + }, + { + "epoch": 2.55, + "learning_rate": 7.541775479201871e-06, + "loss": 1.9152, + "step": 398000 + }, + { + "epoch": 2.55, + "eval_loss": 2.1080117225646973, + "eval_runtime": 15.3816, + "eval_samples_per_second": 130.026, + "eval_steps_per_second": 2.08, + "step": 398000 + }, + { + "epoch": 2.55, + "learning_rate": 7.435096523119464e-06, + "loss": 1.9138, + "step": 399000 + }, + { + "epoch": 2.55, + "eval_loss": 2.156583309173584, + "eval_runtime": 15.5093, + "eval_samples_per_second": 128.955, + "eval_steps_per_second": 2.063, + "step": 399000 + }, + { + "epoch": 2.56, + "learning_rate": 7.328417567037056e-06, + "loss": 1.9193, + "step": 400000 + }, + { + "epoch": 2.56, + "eval_loss": 2.1462528705596924, + "eval_runtime": 15.6345, + "eval_samples_per_second": 127.923, + "eval_steps_per_second": 2.047, + "step": 400000 + }, + { + "epoch": 2.57, + "learning_rate": 7.221738610954649e-06, + "loss": 1.9216, + "step": 401000 + }, + { + "epoch": 2.57, + "eval_loss": 2.1311724185943604, + "eval_runtime": 15.5304, + "eval_samples_per_second": 128.78, + "eval_steps_per_second": 2.06, + "step": 401000 + }, + { + "epoch": 2.57, + "learning_rate": 7.115059654872242e-06, + "loss": 1.9171, + "step": 402000 + }, + { + "epoch": 2.57, + "eval_loss": 2.1334073543548584, + "eval_runtime": 15.6034, + "eval_samples_per_second": 128.177, + "eval_steps_per_second": 2.051, + "step": 402000 + }, + { + "epoch": 2.58, + "learning_rate": 7.008380698789835e-06, + "loss": 1.9148, + "step": 403000 + }, + { + "epoch": 2.58, + "eval_loss": 2.1480307579040527, + "eval_runtime": 15.4786, + "eval_samples_per_second": 129.211, + "eval_steps_per_second": 2.067, + "step": 403000 + }, + { + "epoch": 2.59, + "learning_rate": 6.901701742707427e-06, + "loss": 1.9204, + "step": 404000 + }, + { + "epoch": 2.59, + "eval_loss": 2.1620922088623047, + "eval_runtime": 17.9933, + "eval_samples_per_second": 111.152, + "eval_steps_per_second": 1.778, + "step": 404000 + }, + { + "epoch": 2.59, + "learning_rate": 6.79502278662502e-06, + "loss": 1.9163, + "step": 405000 + }, + { + "epoch": 2.59, + "eval_loss": 2.1261579990386963, + "eval_runtime": 15.7916, + "eval_samples_per_second": 126.65, + "eval_steps_per_second": 2.026, + "step": 405000 + }, + { + "epoch": 2.6, + "learning_rate": 6.688343830542612e-06, + "loss": 1.9147, + "step": 406000 + }, + { + "epoch": 2.6, + "eval_loss": 2.134714365005493, + "eval_runtime": 15.563, + "eval_samples_per_second": 128.51, + "eval_steps_per_second": 2.056, + "step": 406000 + }, + { + "epoch": 2.61, + "learning_rate": 6.581664874460204e-06, + "loss": 1.9107, + "step": 407000 + }, + { + "epoch": 2.61, + "eval_loss": 2.094939947128296, + "eval_runtime": 15.3395, + "eval_samples_per_second": 130.383, + "eval_steps_per_second": 2.086, + "step": 407000 + }, + { + "epoch": 2.61, + "learning_rate": 6.474985918377798e-06, + "loss": 1.9185, + "step": 408000 + }, + { + "epoch": 2.61, + "eval_loss": 2.1135287284851074, + "eval_runtime": 15.2587, + "eval_samples_per_second": 131.072, + "eval_steps_per_second": 2.097, + "step": 408000 + }, + { + "epoch": 2.62, + "learning_rate": 6.368306962295389e-06, + "loss": 1.9134, + "step": 409000 + }, + { + "epoch": 2.62, + "eval_loss": 2.1412642002105713, + "eval_runtime": 15.702, + "eval_samples_per_second": 127.372, + "eval_steps_per_second": 2.038, + "step": 409000 + }, + { + "epoch": 2.62, + "learning_rate": 6.261628006212983e-06, + "loss": 1.9144, + "step": 410000 + }, + { + "epoch": 2.62, + "eval_loss": 2.1682534217834473, + "eval_runtime": 15.4072, + "eval_samples_per_second": 129.81, + "eval_steps_per_second": 2.077, + "step": 410000 + }, + { + "epoch": 2.63, + "learning_rate": 6.154949050130575e-06, + "loss": 1.9086, + "step": 411000 + }, + { + "epoch": 2.63, + "eval_loss": 2.141894578933716, + "eval_runtime": 15.208, + "eval_samples_per_second": 131.51, + "eval_steps_per_second": 2.104, + "step": 411000 + }, + { + "epoch": 2.64, + "learning_rate": 6.0482700940481686e-06, + "loss": 1.9101, + "step": 412000 + }, + { + "epoch": 2.64, + "eval_loss": 2.1342506408691406, + "eval_runtime": 15.2405, + "eval_samples_per_second": 131.229, + "eval_steps_per_second": 2.1, + "step": 412000 + }, + { + "epoch": 2.64, + "learning_rate": 5.941591137965761e-06, + "loss": 1.9086, + "step": 413000 + }, + { + "epoch": 2.64, + "eval_loss": 2.097320318222046, + "eval_runtime": 15.5657, + "eval_samples_per_second": 128.488, + "eval_steps_per_second": 2.056, + "step": 413000 + }, + { + "epoch": 2.65, + "learning_rate": 5.8349121818833536e-06, + "loss": 1.9089, + "step": 414000 + }, + { + "epoch": 2.65, + "eval_loss": 2.1229472160339355, + "eval_runtime": 15.1808, + "eval_samples_per_second": 131.746, + "eval_steps_per_second": 2.108, + "step": 414000 + }, + { + "epoch": 2.66, + "learning_rate": 5.728233225800946e-06, + "loss": 1.915, + "step": 415000 + }, + { + "epoch": 2.66, + "eval_loss": 2.1642491817474365, + "eval_runtime": 15.6522, + "eval_samples_per_second": 127.777, + "eval_steps_per_second": 2.044, + "step": 415000 + }, + { + "epoch": 2.66, + "learning_rate": 5.621554269718539e-06, + "loss": 1.914, + "step": 416000 + }, + { + "epoch": 2.66, + "eval_loss": 2.1208455562591553, + "eval_runtime": 15.453, + "eval_samples_per_second": 129.425, + "eval_steps_per_second": 2.071, + "step": 416000 + }, + { + "epoch": 2.67, + "learning_rate": 5.514875313636131e-06, + "loss": 1.9031, + "step": 417000 + }, + { + "epoch": 2.67, + "eval_loss": 2.103487253189087, + "eval_runtime": 15.4394, + "eval_samples_per_second": 129.539, + "eval_steps_per_second": 2.073, + "step": 417000 + }, + { + "epoch": 2.68, + "learning_rate": 5.408196357553724e-06, + "loss": 1.9015, + "step": 418000 + }, + { + "epoch": 2.68, + "eval_loss": 2.1312220096588135, + "eval_runtime": 15.3068, + "eval_samples_per_second": 130.661, + "eval_steps_per_second": 2.091, + "step": 418000 + }, + { + "epoch": 2.68, + "learning_rate": 5.301517401471316e-06, + "loss": 1.9069, + "step": 419000 + }, + { + "epoch": 2.68, + "eval_loss": 2.1444790363311768, + "eval_runtime": 15.4574, + "eval_samples_per_second": 129.388, + "eval_steps_per_second": 2.07, + "step": 419000 + }, + { + "epoch": 2.69, + "learning_rate": 5.194838445388909e-06, + "loss": 1.9016, + "step": 420000 + }, + { + "epoch": 2.69, + "eval_loss": 2.1105127334594727, + "eval_runtime": 15.3042, + "eval_samples_per_second": 130.683, + "eval_steps_per_second": 2.091, + "step": 420000 + }, + { + "epoch": 2.69, + "learning_rate": 5.088159489306501e-06, + "loss": 1.8882, + "step": 421000 + }, + { + "epoch": 2.69, + "eval_loss": 2.151632785797119, + "eval_runtime": 15.8977, + "eval_samples_per_second": 125.805, + "eval_steps_per_second": 2.013, + "step": 421000 + }, + { + "epoch": 2.7, + "learning_rate": 4.9814805332240945e-06, + "loss": 1.9158, + "step": 422000 + }, + { + "epoch": 2.7, + "eval_loss": 2.1242105960845947, + "eval_runtime": 15.298, + "eval_samples_per_second": 130.736, + "eval_steps_per_second": 2.092, + "step": 422000 + }, + { + "epoch": 2.71, + "learning_rate": 4.874801577141687e-06, + "loss": 1.9136, + "step": 423000 + }, + { + "epoch": 2.71, + "eval_loss": 2.1192123889923096, + "eval_runtime": 15.1175, + "eval_samples_per_second": 132.297, + "eval_steps_per_second": 2.117, + "step": 423000 + }, + { + "epoch": 2.71, + "learning_rate": 4.7681226210592795e-06, + "loss": 1.916, + "step": 424000 + }, + { + "epoch": 2.71, + "eval_loss": 2.1400868892669678, + "eval_runtime": 15.3165, + "eval_samples_per_second": 130.578, + "eval_steps_per_second": 2.089, + "step": 424000 + }, + { + "epoch": 2.72, + "learning_rate": 4.661443664976872e-06, + "loss": 1.8986, + "step": 425000 + }, + { + "epoch": 2.72, + "eval_loss": 2.158984899520874, + "eval_runtime": 15.2786, + "eval_samples_per_second": 130.902, + "eval_steps_per_second": 2.094, + "step": 425000 + }, + { + "epoch": 2.73, + "learning_rate": 4.5547647088944646e-06, + "loss": 1.9046, + "step": 426000 + }, + { + "epoch": 2.73, + "eval_loss": 2.1008715629577637, + "eval_runtime": 15.3482, + "eval_samples_per_second": 130.309, + "eval_steps_per_second": 2.085, + "step": 426000 + }, + { + "epoch": 2.73, + "learning_rate": 4.448085752812058e-06, + "loss": 1.9019, + "step": 427000 + }, + { + "epoch": 2.73, + "eval_loss": 2.1234779357910156, + "eval_runtime": 15.3947, + "eval_samples_per_second": 129.915, + "eval_steps_per_second": 2.079, + "step": 427000 + }, + { + "epoch": 2.74, + "learning_rate": 4.34140679672965e-06, + "loss": 1.9075, + "step": 428000 + }, + { + "epoch": 2.74, + "eval_loss": 2.1445555686950684, + "eval_runtime": 15.263, + "eval_samples_per_second": 131.036, + "eval_steps_per_second": 2.097, + "step": 428000 + }, + { + "epoch": 2.75, + "learning_rate": 4.234727840647243e-06, + "loss": 1.9023, + "step": 429000 + }, + { + "epoch": 2.75, + "eval_loss": 2.1059927940368652, + "eval_runtime": 15.6241, + "eval_samples_per_second": 128.007, + "eval_steps_per_second": 2.048, + "step": 429000 + }, + { + "epoch": 2.75, + "learning_rate": 4.1280488845648354e-06, + "loss": 1.9096, + "step": 430000 + }, + { + "epoch": 2.75, + "eval_loss": 2.124612331390381, + "eval_runtime": 15.4182, + "eval_samples_per_second": 129.717, + "eval_steps_per_second": 2.075, + "step": 430000 + }, + { + "epoch": 2.76, + "learning_rate": 4.021369928482428e-06, + "loss": 1.9021, + "step": 431000 + }, + { + "epoch": 2.76, + "eval_loss": 2.1339197158813477, + "eval_runtime": 15.3184, + "eval_samples_per_second": 130.562, + "eval_steps_per_second": 2.089, + "step": 431000 + }, + { + "epoch": 2.77, + "learning_rate": 3.914690972400021e-06, + "loss": 1.9051, + "step": 432000 + }, + { + "epoch": 2.77, + "eval_loss": 2.150739908218384, + "eval_runtime": 15.3685, + "eval_samples_per_second": 130.137, + "eval_steps_per_second": 2.082, + "step": 432000 + }, + { + "epoch": 2.77, + "learning_rate": 3.808012016317614e-06, + "loss": 1.8959, + "step": 433000 + }, + { + "epoch": 2.77, + "eval_loss": 2.1340439319610596, + "eval_runtime": 15.9351, + "eval_samples_per_second": 125.509, + "eval_steps_per_second": 2.008, + "step": 433000 + }, + { + "epoch": 2.78, + "learning_rate": 3.7013330602352055e-06, + "loss": 1.8924, + "step": 434000 + }, + { + "epoch": 2.78, + "eval_loss": 2.1609554290771484, + "eval_runtime": 15.2114, + "eval_samples_per_second": 131.48, + "eval_steps_per_second": 2.104, + "step": 434000 + }, + { + "epoch": 2.78, + "learning_rate": 3.5946541041527984e-06, + "loss": 1.9091, + "step": 435000 + }, + { + "epoch": 2.78, + "eval_loss": 2.147794008255005, + "eval_runtime": 15.5411, + "eval_samples_per_second": 128.691, + "eval_steps_per_second": 2.059, + "step": 435000 + }, + { + "epoch": 2.79, + "learning_rate": 3.487975148070391e-06, + "loss": 1.8908, + "step": 436000 + }, + { + "epoch": 2.79, + "eval_loss": 2.100537061691284, + "eval_runtime": 15.6967, + "eval_samples_per_second": 127.415, + "eval_steps_per_second": 2.039, + "step": 436000 + }, + { + "epoch": 2.8, + "learning_rate": 3.3812961919879834e-06, + "loss": 1.8946, + "step": 437000 + }, + { + "epoch": 2.8, + "eval_loss": 2.111453056335449, + "eval_runtime": 15.3824, + "eval_samples_per_second": 130.019, + "eval_steps_per_second": 2.08, + "step": 437000 + }, + { + "epoch": 2.8, + "learning_rate": 3.2746172359055764e-06, + "loss": 1.8977, + "step": 438000 + }, + { + "epoch": 2.8, + "eval_loss": 2.130976676940918, + "eval_runtime": 15.1954, + "eval_samples_per_second": 131.618, + "eval_steps_per_second": 2.106, + "step": 438000 + }, + { + "epoch": 2.81, + "learning_rate": 3.167938279823169e-06, + "loss": 1.9021, + "step": 439000 + }, + { + "epoch": 2.81, + "eval_loss": 2.1252684593200684, + "eval_runtime": 15.3946, + "eval_samples_per_second": 129.916, + "eval_steps_per_second": 2.079, + "step": 439000 + }, + { + "epoch": 2.82, + "learning_rate": 3.061259323740762e-06, + "loss": 1.9019, + "step": 440000 + }, + { + "epoch": 2.82, + "eval_loss": 2.1282765865325928, + "eval_runtime": 15.6319, + "eval_samples_per_second": 127.943, + "eval_steps_per_second": 2.047, + "step": 440000 + }, + { + "epoch": 2.82, + "learning_rate": 2.9545803676583543e-06, + "loss": 1.8947, + "step": 441000 + }, + { + "epoch": 2.82, + "eval_loss": 2.1524507999420166, + "eval_runtime": 15.3337, + "eval_samples_per_second": 130.432, + "eval_steps_per_second": 2.087, + "step": 441000 + }, + { + "epoch": 2.83, + "learning_rate": 2.847901411575947e-06, + "loss": 1.8854, + "step": 442000 + }, + { + "epoch": 2.83, + "eval_loss": 2.1064517498016357, + "eval_runtime": 15.2656, + "eval_samples_per_second": 131.013, + "eval_steps_per_second": 2.096, + "step": 442000 + }, + { + "epoch": 2.84, + "learning_rate": 2.7412224554935398e-06, + "loss": 1.9007, + "step": 443000 + }, + { + "epoch": 2.84, + "eval_loss": 2.0694828033447266, + "eval_runtime": 15.8869, + "eval_samples_per_second": 125.89, + "eval_steps_per_second": 2.014, + "step": 443000 + }, + { + "epoch": 2.84, + "learning_rate": 2.6345434994111323e-06, + "loss": 1.8981, + "step": 444000 + }, + { + "epoch": 2.84, + "eval_loss": 2.1273715496063232, + "eval_runtime": 15.1985, + "eval_samples_per_second": 131.592, + "eval_steps_per_second": 2.105, + "step": 444000 + }, + { + "epoch": 2.85, + "learning_rate": 2.527864543328725e-06, + "loss": 1.8872, + "step": 445000 + }, + { + "epoch": 2.85, + "eval_loss": 2.1042518615722656, + "eval_runtime": 15.3793, + "eval_samples_per_second": 130.045, + "eval_steps_per_second": 2.081, + "step": 445000 + }, + { + "epoch": 2.85, + "learning_rate": 2.4211855872463177e-06, + "loss": 1.8957, + "step": 446000 + }, + { + "epoch": 2.85, + "eval_loss": 2.0750997066497803, + "eval_runtime": 15.5989, + "eval_samples_per_second": 128.214, + "eval_steps_per_second": 2.051, + "step": 446000 + }, + { + "epoch": 2.86, + "learning_rate": 2.3145066311639102e-06, + "loss": 1.9031, + "step": 447000 + }, + { + "epoch": 2.86, + "eval_loss": 2.127918004989624, + "eval_runtime": 15.6223, + "eval_samples_per_second": 128.022, + "eval_steps_per_second": 2.048, + "step": 447000 + }, + { + "epoch": 2.87, + "learning_rate": 2.2078276750815028e-06, + "loss": 1.9001, + "step": 448000 + }, + { + "epoch": 2.87, + "eval_loss": 2.1019787788391113, + "eval_runtime": 15.4843, + "eval_samples_per_second": 129.163, + "eval_steps_per_second": 2.067, + "step": 448000 + }, + { + "epoch": 2.87, + "learning_rate": 2.1011487189990953e-06, + "loss": 1.8964, + "step": 449000 + }, + { + "epoch": 2.87, + "eval_loss": 2.0935049057006836, + "eval_runtime": 15.5998, + "eval_samples_per_second": 128.206, + "eval_steps_per_second": 2.051, + "step": 449000 + }, + { + "epoch": 2.88, + "learning_rate": 1.994469762916688e-06, + "loss": 1.9003, + "step": 450000 + }, + { + "epoch": 2.88, + "eval_loss": 2.1466352939605713, + "eval_runtime": 15.1432, + "eval_samples_per_second": 132.072, + "eval_steps_per_second": 2.113, + "step": 450000 + }, + { + "epoch": 2.89, + "learning_rate": 1.8877908068342807e-06, + "loss": 1.9041, + "step": 451000 + }, + { + "epoch": 2.89, + "eval_loss": 2.1213934421539307, + "eval_runtime": 15.5486, + "eval_samples_per_second": 128.629, + "eval_steps_per_second": 2.058, + "step": 451000 + }, + { + "epoch": 2.89, + "learning_rate": 1.7811118507518734e-06, + "loss": 1.8972, + "step": 452000 + }, + { + "epoch": 2.89, + "eval_loss": 2.139911651611328, + "eval_runtime": 17.4254, + "eval_samples_per_second": 114.775, + "eval_steps_per_second": 1.836, + "step": 452000 + }, + { + "epoch": 2.9, + "learning_rate": 1.674432894669466e-06, + "loss": 1.9001, + "step": 453000 + }, + { + "epoch": 2.9, + "eval_loss": 2.1135449409484863, + "eval_runtime": 15.3928, + "eval_samples_per_second": 129.931, + "eval_steps_per_second": 2.079, + "step": 453000 + }, + { + "epoch": 2.91, + "learning_rate": 1.5677539385870587e-06, + "loss": 1.9034, + "step": 454000 + }, + { + "epoch": 2.91, + "eval_loss": 2.0974974632263184, + "eval_runtime": 15.5392, + "eval_samples_per_second": 128.707, + "eval_steps_per_second": 2.059, + "step": 454000 + }, + { + "epoch": 2.91, + "learning_rate": 1.4610749825046512e-06, + "loss": 1.88, + "step": 455000 + }, + { + "epoch": 2.91, + "eval_loss": 2.086946725845337, + "eval_runtime": 15.3909, + "eval_samples_per_second": 129.947, + "eval_steps_per_second": 2.079, + "step": 455000 + }, + { + "epoch": 2.92, + "learning_rate": 1.354396026422244e-06, + "loss": 1.894, + "step": 456000 + }, + { + "epoch": 2.92, + "eval_loss": 2.0814855098724365, + "eval_runtime": 16.0281, + "eval_samples_per_second": 124.781, + "eval_steps_per_second": 1.996, + "step": 456000 + }, + { + "epoch": 2.93, + "learning_rate": 1.2477170703398366e-06, + "loss": 1.8956, + "step": 457000 + }, + { + "epoch": 2.93, + "eval_loss": 2.1207478046417236, + "eval_runtime": 16.265, + "eval_samples_per_second": 122.964, + "eval_steps_per_second": 1.967, + "step": 457000 + }, + { + "epoch": 2.93, + "learning_rate": 1.1410381142574291e-06, + "loss": 1.8882, + "step": 458000 + }, + { + "epoch": 2.93, + "eval_loss": 2.1136324405670166, + "eval_runtime": 15.2771, + "eval_samples_per_second": 130.915, + "eval_steps_per_second": 2.095, + "step": 458000 + }, + { + "epoch": 2.94, + "learning_rate": 1.0343591581750219e-06, + "loss": 1.8924, + "step": 459000 + }, + { + "epoch": 2.94, + "eval_loss": 2.137352466583252, + "eval_runtime": 15.981, + "eval_samples_per_second": 125.149, + "eval_steps_per_second": 2.002, + "step": 459000 + }, + { + "epoch": 2.94, + "learning_rate": 9.276802020926144e-07, + "loss": 1.8953, + "step": 460000 + }, + { + "epoch": 2.94, + "eval_loss": 2.1012661457061768, + "eval_runtime": 15.3369, + "eval_samples_per_second": 130.404, + "eval_steps_per_second": 2.086, + "step": 460000 + }, + { + "epoch": 2.95, + "learning_rate": 8.210012460102071e-07, + "loss": 1.893, + "step": 461000 + }, + { + "epoch": 2.95, + "eval_loss": 2.135178804397583, + "eval_runtime": 15.8046, + "eval_samples_per_second": 126.546, + "eval_steps_per_second": 2.025, + "step": 461000 + }, + { + "epoch": 2.96, + "learning_rate": 7.143222899277997e-07, + "loss": 1.8903, + "step": 462000 + }, + { + "epoch": 2.96, + "eval_loss": 2.1333072185516357, + "eval_runtime": 15.5282, + "eval_samples_per_second": 128.798, + "eval_steps_per_second": 2.061, + "step": 462000 + }, + { + "epoch": 2.96, + "learning_rate": 6.076433338453923e-07, + "loss": 1.8895, + "step": 463000 + }, + { + "epoch": 2.96, + "eval_loss": 2.1294093132019043, + "eval_runtime": 15.3716, + "eval_samples_per_second": 130.11, + "eval_steps_per_second": 2.082, + "step": 463000 + }, + { + "epoch": 2.97, + "learning_rate": 5.009643777629849e-07, + "loss": 1.8939, + "step": 464000 + }, + { + "epoch": 2.97, + "eval_loss": 2.1235413551330566, + "eval_runtime": 15.3293, + "eval_samples_per_second": 130.469, + "eval_steps_per_second": 2.088, + "step": 464000 + }, + { + "epoch": 2.98, + "learning_rate": 3.9428542168057766e-07, + "loss": 1.8915, + "step": 465000 + }, + { + "epoch": 2.98, + "eval_loss": 2.0933895111083984, + "eval_runtime": 15.9617, + "eval_samples_per_second": 125.3, + "eval_steps_per_second": 2.005, + "step": 465000 + }, + { + "epoch": 2.98, + "learning_rate": 2.8760646559817023e-07, + "loss": 1.8884, + "step": 466000 + }, + { + "epoch": 2.98, + "eval_loss": 2.1353940963745117, + "eval_runtime": 15.6819, + "eval_samples_per_second": 127.536, + "eval_steps_per_second": 2.041, + "step": 466000 + }, + { + "epoch": 2.99, + "learning_rate": 1.809275095157629e-07, + "loss": 1.8932, + "step": 467000 + }, + { + "epoch": 2.99, + "eval_loss": 2.1101338863372803, + "eval_runtime": 15.545, + "eval_samples_per_second": 128.659, + "eval_steps_per_second": 2.059, + "step": 467000 + }, + { + "epoch": 3.0, + "learning_rate": 7.424855343335553e-08, + "loss": 1.9, + "step": 468000 + }, + { + "epoch": 3.0, + "eval_loss": 2.130716562271118, + "eval_runtime": 15.4114, + "eval_samples_per_second": 129.774, + "eval_steps_per_second": 2.076, + "step": 468000 + }, + { + "epoch": 3.0, + "step": 468696, + "total_flos": 6.219491681834838e+18, + "train_loss": 0.6429893864398178, + "train_runtime": 172266.1403, + "train_samples_per_second": 174.128, + "train_steps_per_second": 2.721 + } + ], + "max_steps": 468696, + "num_train_epochs": 3, + "total_flos": 6.219491681834838e+18, + "trial_name": null, + "trial_params": null +}