diff --git "a/checkpoint-5500/trainer_state.json" "b/checkpoint-5500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-5500/trainer_state.json" @@ -0,0 +1,3803 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 285.7142857142857, + "eval_steps": 500, + "global_step": 5500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.99, + "eval_gen_len": 14.487, + "eval_loss": 21.441953659057617, + "eval_rouge1": 0.0832, + "eval_rouge2": 0.0126, + "eval_rougeL": 0.063, + "eval_rougeLsum": 0.0631, + "eval_runtime": 16.3828, + "eval_samples_per_second": 7.02, + "eval_steps_per_second": 1.221, + "step": 19 + }, + { + "epoch": 1.97, + "eval_gen_len": 14.6261, + "eval_loss": 21.211212158203125, + "eval_rouge1": 0.0858, + "eval_rouge2": 0.014, + "eval_rougeL": 0.0648, + "eval_rougeLsum": 0.0652, + "eval_runtime": 15.2254, + "eval_samples_per_second": 7.553, + "eval_steps_per_second": 1.314, + "step": 38 + }, + { + "epoch": 2.96, + "eval_gen_len": 14.6783, + "eval_loss": 20.936405181884766, + "eval_rouge1": 0.0866, + "eval_rouge2": 0.0147, + "eval_rougeL": 0.0655, + "eval_rougeLsum": 0.066, + "eval_runtime": 18.5865, + "eval_samples_per_second": 6.187, + "eval_steps_per_second": 1.076, + "step": 57 + }, + { + "epoch": 4.0, + "eval_gen_len": 14.8522, + "eval_loss": 20.667041778564453, + "eval_rouge1": 0.088, + "eval_rouge2": 0.0145, + "eval_rougeL": 0.0659, + "eval_rougeLsum": 0.0663, + "eval_runtime": 12.6199, + "eval_samples_per_second": 9.113, + "eval_steps_per_second": 1.585, + "step": 77 + }, + { + "epoch": 4.99, + "eval_gen_len": 15.113, + "eval_loss": 20.46696662902832, + "eval_rouge1": 0.0912, + "eval_rouge2": 0.0145, + "eval_rougeL": 0.0677, + "eval_rougeLsum": 0.0677, + "eval_runtime": 15.7216, + "eval_samples_per_second": 7.315, + "eval_steps_per_second": 1.272, + "step": 96 + }, + { + "epoch": 5.97, + "eval_gen_len": 15.2087, + "eval_loss": 20.282737731933594, + "eval_rouge1": 0.0913, + "eval_rouge2": 0.0145, + "eval_rougeL": 0.0679, + "eval_rougeLsum": 0.068, + "eval_runtime": 15.2977, + "eval_samples_per_second": 7.517, + "eval_steps_per_second": 1.307, + "step": 115 + }, + { + "epoch": 6.96, + "eval_gen_len": 15.4087, + "eval_loss": 20.09146499633789, + "eval_rouge1": 0.0918, + "eval_rouge2": 0.0137, + "eval_rougeL": 0.0686, + "eval_rougeLsum": 0.0687, + "eval_runtime": 16.9649, + "eval_samples_per_second": 6.779, + "eval_steps_per_second": 1.179, + "step": 134 + }, + { + "epoch": 8.0, + "eval_gen_len": 16.0435, + "eval_loss": 19.872163772583008, + "eval_rouge1": 0.0969, + "eval_rouge2": 0.0164, + "eval_rougeL": 0.0736, + "eval_rougeLsum": 0.0737, + "eval_runtime": 15.3373, + "eval_samples_per_second": 7.498, + "eval_steps_per_second": 1.304, + "step": 154 + }, + { + "epoch": 8.99, + "eval_gen_len": 16.5739, + "eval_loss": 19.655122756958008, + "eval_rouge1": 0.1052, + "eval_rouge2": 0.0198, + "eval_rougeL": 0.0799, + "eval_rougeLsum": 0.0796, + "eval_runtime": 13.8235, + "eval_samples_per_second": 8.319, + "eval_steps_per_second": 1.447, + "step": 173 + }, + { + "epoch": 9.97, + "eval_gen_len": 17.0435, + "eval_loss": 19.420446395874023, + "eval_rouge1": 0.1071, + "eval_rouge2": 0.0188, + "eval_rougeL": 0.0809, + "eval_rougeLsum": 0.0808, + "eval_runtime": 14.7139, + "eval_samples_per_second": 7.816, + "eval_steps_per_second": 1.359, + "step": 192 + }, + { + "epoch": 10.96, + "eval_gen_len": 17.1913, + "eval_loss": 19.156597137451172, + "eval_rouge1": 0.1061, + "eval_rouge2": 0.0185, + "eval_rougeL": 0.0815, + "eval_rougeLsum": 0.0819, + "eval_runtime": 14.1553, + "eval_samples_per_second": 8.124, + "eval_steps_per_second": 1.413, + "step": 211 + }, + { + "epoch": 12.0, + "eval_gen_len": 17.2522, + "eval_loss": 18.833667755126953, + "eval_rouge1": 0.1069, + "eval_rouge2": 0.0213, + "eval_rougeL": 0.0826, + "eval_rougeLsum": 0.0828, + "eval_runtime": 15.232, + "eval_samples_per_second": 7.55, + "eval_steps_per_second": 1.313, + "step": 231 + }, + { + "epoch": 12.99, + "eval_gen_len": 17.287, + "eval_loss": 18.463964462280273, + "eval_rouge1": 0.1105, + "eval_rouge2": 0.0234, + "eval_rougeL": 0.0858, + "eval_rougeLsum": 0.0852, + "eval_runtime": 15.0679, + "eval_samples_per_second": 7.632, + "eval_steps_per_second": 1.327, + "step": 250 + }, + { + "epoch": 13.97, + "eval_gen_len": 17.4696, + "eval_loss": 18.000520706176758, + "eval_rouge1": 0.1101, + "eval_rouge2": 0.0232, + "eval_rougeL": 0.0873, + "eval_rougeLsum": 0.0872, + "eval_runtime": 14.073, + "eval_samples_per_second": 8.172, + "eval_steps_per_second": 1.421, + "step": 269 + }, + { + "epoch": 14.96, + "eval_gen_len": 17.2261, + "eval_loss": 17.395872116088867, + "eval_rouge1": 0.103, + "eval_rouge2": 0.023, + "eval_rougeL": 0.0821, + "eval_rougeLsum": 0.0819, + "eval_runtime": 17.5824, + "eval_samples_per_second": 6.541, + "eval_steps_per_second": 1.138, + "step": 288 + }, + { + "epoch": 16.0, + "eval_gen_len": 17.6783, + "eval_loss": 16.634456634521484, + "eval_rouge1": 0.1034, + "eval_rouge2": 0.0209, + "eval_rougeL": 0.0804, + "eval_rougeLsum": 0.0802, + "eval_runtime": 14.5519, + "eval_samples_per_second": 7.903, + "eval_steps_per_second": 1.374, + "step": 308 + }, + { + "epoch": 16.99, + "eval_gen_len": 16.3565, + "eval_loss": 15.872416496276855, + "eval_rouge1": 0.0841, + "eval_rouge2": 0.0149, + "eval_rougeL": 0.0674, + "eval_rougeLsum": 0.0674, + "eval_runtime": 15.4052, + "eval_samples_per_second": 7.465, + "eval_steps_per_second": 1.298, + "step": 327 + }, + { + "epoch": 17.97, + "eval_gen_len": 15.2609, + "eval_loss": 15.058935165405273, + "eval_rouge1": 0.0697, + "eval_rouge2": 0.0097, + "eval_rougeL": 0.0554, + "eval_rougeLsum": 0.0556, + "eval_runtime": 22.9079, + "eval_samples_per_second": 5.02, + "eval_steps_per_second": 0.873, + "step": 346 + }, + { + "epoch": 18.96, + "eval_gen_len": 14.7304, + "eval_loss": 14.074901580810547, + "eval_rouge1": 0.0584, + "eval_rouge2": 0.0065, + "eval_rougeL": 0.047, + "eval_rougeLsum": 0.0472, + "eval_runtime": 13.8432, + "eval_samples_per_second": 8.307, + "eval_steps_per_second": 1.445, + "step": 365 + }, + { + "epoch": 20.0, + "eval_gen_len": 12.0783, + "eval_loss": 12.981775283813477, + "eval_rouge1": 0.037, + "eval_rouge2": 0.004, + "eval_rougeL": 0.0314, + "eval_rougeLsum": 0.0312, + "eval_runtime": 17.4992, + "eval_samples_per_second": 6.572, + "eval_steps_per_second": 1.143, + "step": 385 + }, + { + "epoch": 20.99, + "eval_gen_len": 13.3043, + "eval_loss": 12.14104175567627, + "eval_rouge1": 0.0327, + "eval_rouge2": 0.0027, + "eval_rougeL": 0.0287, + "eval_rougeLsum": 0.0288, + "eval_runtime": 14.6695, + "eval_samples_per_second": 7.839, + "eval_steps_per_second": 1.363, + "step": 404 + }, + { + "epoch": 21.97, + "eval_gen_len": 14.1565, + "eval_loss": 11.347674369812012, + "eval_rouge1": 0.0206, + "eval_rouge2": 0.0006, + "eval_rougeL": 0.0188, + "eval_rougeLsum": 0.019, + "eval_runtime": 14.3559, + "eval_samples_per_second": 8.011, + "eval_steps_per_second": 1.393, + "step": 423 + }, + { + "epoch": 22.96, + "eval_gen_len": 14.5652, + "eval_loss": 10.547377586364746, + "eval_rouge1": 0.0136, + "eval_rouge2": 0.0008, + "eval_rougeL": 0.0121, + "eval_rougeLsum": 0.0123, + "eval_runtime": 16.36, + "eval_samples_per_second": 7.029, + "eval_steps_per_second": 1.222, + "step": 442 + }, + { + "epoch": 24.0, + "eval_gen_len": 15.9391, + "eval_loss": 9.721901893615723, + "eval_rouge1": 0.0056, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0051, + "eval_rougeLsum": 0.0051, + "eval_runtime": 17.3804, + "eval_samples_per_second": 6.617, + "eval_steps_per_second": 1.151, + "step": 462 + }, + { + "epoch": 24.99, + "eval_gen_len": 17.0522, + "eval_loss": 8.976031303405762, + "eval_rouge1": 0.0029, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0026, + "eval_rougeLsum": 0.0026, + "eval_runtime": 15.7718, + "eval_samples_per_second": 7.291, + "eval_steps_per_second": 1.268, + "step": 481 + }, + { + "epoch": 25.97, + "grad_norm": 6.211065292358398, + "learning_rate": 1.8252631578947372e-05, + "loss": 16.8471, + "step": 500 + }, + { + "epoch": 25.97, + "eval_gen_len": 18.0261, + "eval_loss": 8.254261016845703, + "eval_rouge1": 0.001, + "eval_rouge2": 0.0, + "eval_rougeL": 0.001, + "eval_rougeLsum": 0.001, + "eval_runtime": 18.4696, + "eval_samples_per_second": 6.226, + "eval_steps_per_second": 1.083, + "step": 500 + }, + { + "epoch": 26.96, + "eval_gen_len": 18.8609, + "eval_loss": 7.542705059051514, + "eval_rouge1": 0.0008, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0009, + "eval_rougeLsum": 0.0008, + "eval_runtime": 14.9383, + "eval_samples_per_second": 7.698, + "eval_steps_per_second": 1.339, + "step": 519 + }, + { + "epoch": 28.0, + "eval_gen_len": 19.0, + "eval_loss": 6.831495761871338, + "eval_rouge1": 0.0007, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0007, + "eval_rougeLsum": 0.0007, + "eval_runtime": 15.5617, + "eval_samples_per_second": 7.39, + "eval_steps_per_second": 1.285, + "step": 539 + }, + { + "epoch": 28.99, + "eval_gen_len": 19.0, + "eval_loss": 6.190303325653076, + "eval_rouge1": 0.0002, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0002, + "eval_rougeLsum": 0.0002, + "eval_runtime": 15.014, + "eval_samples_per_second": 7.66, + "eval_steps_per_second": 1.332, + "step": 558 + }, + { + "epoch": 29.97, + "eval_gen_len": 19.0, + "eval_loss": 5.610296726226807, + "eval_rouge1": 0.0018, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0016, + "eval_rougeLsum": 0.0016, + "eval_runtime": 15.3165, + "eval_samples_per_second": 7.508, + "eval_steps_per_second": 1.306, + "step": 577 + }, + { + "epoch": 30.96, + "eval_gen_len": 19.0, + "eval_loss": 5.068519592285156, + "eval_rouge1": 0.0011, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0011, + "eval_rougeLsum": 0.0011, + "eval_runtime": 16.2303, + "eval_samples_per_second": 7.086, + "eval_steps_per_second": 1.232, + "step": 596 + }, + { + "epoch": 32.0, + "eval_gen_len": 19.0, + "eval_loss": 4.54244327545166, + "eval_rouge1": 0.0009, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0009, + "eval_rougeLsum": 0.0009, + "eval_runtime": 14.4723, + "eval_samples_per_second": 7.946, + "eval_steps_per_second": 1.382, + "step": 616 + }, + { + "epoch": 32.99, + "eval_gen_len": 19.0, + "eval_loss": 4.084940433502197, + "eval_rouge1": 0.0008, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0008, + "eval_rougeLsum": 0.0008, + "eval_runtime": 17.3788, + "eval_samples_per_second": 6.617, + "eval_steps_per_second": 1.151, + "step": 635 + }, + { + "epoch": 33.97, + "eval_gen_len": 19.0, + "eval_loss": 3.7023561000823975, + "eval_rouge1": 0.0014, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0015, + "eval_rougeLsum": 0.0015, + "eval_runtime": 17.4976, + "eval_samples_per_second": 6.572, + "eval_steps_per_second": 1.143, + "step": 654 + }, + { + "epoch": 34.96, + "eval_gen_len": 19.0, + "eval_loss": 3.3644134998321533, + "eval_rouge1": 0.0035, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0035, + "eval_rougeLsum": 0.0035, + "eval_runtime": 15.0993, + "eval_samples_per_second": 7.616, + "eval_steps_per_second": 1.325, + "step": 673 + }, + { + "epoch": 36.0, + "eval_gen_len": 19.0, + "eval_loss": 3.0496110916137695, + "eval_rouge1": 0.0064, + "eval_rouge2": 0.0002, + "eval_rougeL": 0.0063, + "eval_rougeLsum": 0.0064, + "eval_runtime": 26.2608, + "eval_samples_per_second": 4.379, + "eval_steps_per_second": 0.762, + "step": 693 + }, + { + "epoch": 36.99, + "eval_gen_len": 18.9913, + "eval_loss": 2.7962286472320557, + "eval_rouge1": 0.0073, + "eval_rouge2": 0.0002, + "eval_rougeL": 0.0073, + "eval_rougeLsum": 0.0074, + "eval_runtime": 16.0565, + "eval_samples_per_second": 7.162, + "eval_steps_per_second": 1.246, + "step": 712 + }, + { + "epoch": 37.97, + "eval_gen_len": 18.8435, + "eval_loss": 2.5821166038513184, + "eval_rouge1": 0.0078, + "eval_rouge2": 0.0002, + "eval_rougeL": 0.0076, + "eval_rougeLsum": 0.0078, + "eval_runtime": 24.2703, + "eval_samples_per_second": 4.738, + "eval_steps_per_second": 0.824, + "step": 731 + }, + { + "epoch": 38.96, + "eval_gen_len": 16.9043, + "eval_loss": 2.4025700092315674, + "eval_rouge1": 0.0063, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0063, + "eval_rougeLsum": 0.0063, + "eval_runtime": 12.9295, + "eval_samples_per_second": 8.894, + "eval_steps_per_second": 1.547, + "step": 750 + }, + { + "epoch": 40.0, + "eval_gen_len": 9.6696, + "eval_loss": 2.2464537620544434, + "eval_rouge1": 0.0008, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0008, + "eval_rougeLsum": 0.0008, + "eval_runtime": 14.656, + "eval_samples_per_second": 7.847, + "eval_steps_per_second": 1.365, + "step": 770 + }, + { + "epoch": 40.99, + "eval_gen_len": 7.4435, + "eval_loss": 2.124486207962036, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 21.0326, + "eval_samples_per_second": 5.468, + "eval_steps_per_second": 0.951, + "step": 789 + }, + { + "epoch": 41.97, + "eval_gen_len": 6.9478, + "eval_loss": 2.022434949874878, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 12.9888, + "eval_samples_per_second": 8.854, + "eval_steps_per_second": 1.54, + "step": 808 + }, + { + "epoch": 42.96, + "eval_gen_len": 6.4696, + "eval_loss": 1.9459978342056274, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 10.4716, + "eval_samples_per_second": 10.982, + "eval_steps_per_second": 1.91, + "step": 827 + }, + { + "epoch": 44.0, + "eval_gen_len": 6.1304, + "eval_loss": 1.8852447271347046, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 10.5001, + "eval_samples_per_second": 10.952, + "eval_steps_per_second": 1.905, + "step": 847 + }, + { + "epoch": 44.99, + "eval_gen_len": 5.9391, + "eval_loss": 1.838249921798706, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 10.0487, + "eval_samples_per_second": 11.444, + "eval_steps_per_second": 1.99, + "step": 866 + }, + { + "epoch": 45.97, + "eval_gen_len": 6.087, + "eval_loss": 1.7976738214492798, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 10.2357, + "eval_samples_per_second": 11.235, + "eval_steps_per_second": 1.954, + "step": 885 + }, + { + "epoch": 46.96, + "eval_gen_len": 6.2609, + "eval_loss": 1.7594256401062012, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 10.0331, + "eval_samples_per_second": 11.462, + "eval_steps_per_second": 1.993, + "step": 904 + }, + { + "epoch": 48.0, + "eval_gen_len": 6.3565, + "eval_loss": 1.7259361743927002, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 10.0928, + "eval_samples_per_second": 11.394, + "eval_steps_per_second": 1.982, + "step": 924 + }, + { + "epoch": 48.99, + "eval_gen_len": 6.0348, + "eval_loss": 1.7035044431686401, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 10.501, + "eval_samples_per_second": 10.951, + "eval_steps_per_second": 1.905, + "step": 943 + }, + { + "epoch": 49.97, + "eval_gen_len": 6.113, + "eval_loss": 1.681233525276184, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 19.043, + "eval_samples_per_second": 6.039, + "eval_steps_per_second": 1.05, + "step": 962 + }, + { + "epoch": 50.96, + "eval_gen_len": 5.8696, + "eval_loss": 1.6589038372039795, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 22.3499, + "eval_samples_per_second": 5.145, + "eval_steps_per_second": 0.895, + "step": 981 + }, + { + "epoch": 51.95, + "grad_norm": 2.3630588054656982, + "learning_rate": 1.650526315789474e-05, + "loss": 4.012, + "step": 1000 + }, + { + "epoch": 52.0, + "eval_gen_len": 5.713, + "eval_loss": 1.639954924583435, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 22.1264, + "eval_samples_per_second": 5.197, + "eval_steps_per_second": 0.904, + "step": 1001 + }, + { + "epoch": 52.99, + "eval_gen_len": 5.6957, + "eval_loss": 1.6223595142364502, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 19.2896, + "eval_samples_per_second": 5.962, + "eval_steps_per_second": 1.037, + "step": 1020 + }, + { + "epoch": 53.97, + "eval_gen_len": 5.887, + "eval_loss": 1.6063199043273926, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 17.9683, + "eval_samples_per_second": 6.4, + "eval_steps_per_second": 1.113, + "step": 1039 + }, + { + "epoch": 54.96, + "eval_gen_len": 5.9826, + "eval_loss": 1.5919499397277832, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 20.6085, + "eval_samples_per_second": 5.58, + "eval_steps_per_second": 0.97, + "step": 1058 + }, + { + "epoch": 56.0, + "eval_gen_len": 6.0087, + "eval_loss": 1.5780121088027954, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 22.8466, + "eval_samples_per_second": 5.034, + "eval_steps_per_second": 0.875, + "step": 1078 + }, + { + "epoch": 56.99, + "eval_gen_len": 5.9652, + "eval_loss": 1.5654348134994507, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 16.8326, + "eval_samples_per_second": 6.832, + "eval_steps_per_second": 1.188, + "step": 1097 + }, + { + "epoch": 57.97, + "eval_gen_len": 6.3304, + "eval_loss": 1.5537272691726685, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 14.7124, + "eval_samples_per_second": 7.817, + "eval_steps_per_second": 1.359, + "step": 1116 + }, + { + "epoch": 58.96, + "eval_gen_len": 6.8609, + "eval_loss": 1.5426743030548096, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 14.3263, + "eval_samples_per_second": 8.027, + "eval_steps_per_second": 1.396, + "step": 1135 + }, + { + "epoch": 60.0, + "eval_gen_len": 7.2, + "eval_loss": 1.5310094356536865, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 17.746, + "eval_samples_per_second": 6.48, + "eval_steps_per_second": 1.127, + "step": 1155 + }, + { + "epoch": 60.99, + "eval_gen_len": 7.4261, + "eval_loss": 1.519776701927185, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 16.3177, + "eval_samples_per_second": 7.048, + "eval_steps_per_second": 1.226, + "step": 1174 + }, + { + "epoch": 61.97, + "eval_gen_len": 6.9826, + "eval_loss": 1.5120151042938232, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 15.0793, + "eval_samples_per_second": 7.626, + "eval_steps_per_second": 1.326, + "step": 1193 + }, + { + "epoch": 62.96, + "eval_gen_len": 6.6957, + "eval_loss": 1.500430941581726, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 16.015, + "eval_samples_per_second": 7.181, + "eval_steps_per_second": 1.249, + "step": 1212 + }, + { + "epoch": 64.0, + "eval_gen_len": 6.9565, + "eval_loss": 1.489511489868164, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 10.2674, + "eval_samples_per_second": 11.2, + "eval_steps_per_second": 1.948, + "step": 1232 + }, + { + "epoch": 64.99, + "eval_gen_len": 7.2348, + "eval_loss": 1.4760735034942627, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 14.5412, + "eval_samples_per_second": 7.909, + "eval_steps_per_second": 1.375, + "step": 1251 + }, + { + "epoch": 65.97, + "eval_gen_len": 7.5043, + "eval_loss": 1.4650626182556152, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 19.5952, + "eval_samples_per_second": 5.869, + "eval_steps_per_second": 1.021, + "step": 1270 + }, + { + "epoch": 66.96, + "eval_gen_len": 7.4174, + "eval_loss": 1.4578195810317993, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 15.4051, + "eval_samples_per_second": 7.465, + "eval_steps_per_second": 1.298, + "step": 1289 + }, + { + "epoch": 68.0, + "eval_gen_len": 7.5304, + "eval_loss": 1.449414610862732, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 21.6421, + "eval_samples_per_second": 5.314, + "eval_steps_per_second": 0.924, + "step": 1309 + }, + { + "epoch": 68.99, + "eval_gen_len": 7.4261, + "eval_loss": 1.4453145265579224, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 15.2733, + "eval_samples_per_second": 7.529, + "eval_steps_per_second": 1.309, + "step": 1328 + }, + { + "epoch": 69.97, + "eval_gen_len": 7.5217, + "eval_loss": 1.4360324144363403, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 17.557, + "eval_samples_per_second": 6.55, + "eval_steps_per_second": 1.139, + "step": 1347 + }, + { + "epoch": 70.96, + "eval_gen_len": 7.513, + "eval_loss": 1.4272183179855347, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 15.8995, + "eval_samples_per_second": 7.233, + "eval_steps_per_second": 1.258, + "step": 1366 + }, + { + "epoch": 72.0, + "eval_gen_len": 7.5391, + "eval_loss": 1.420629620552063, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 18.7912, + "eval_samples_per_second": 6.12, + "eval_steps_per_second": 1.064, + "step": 1386 + }, + { + "epoch": 72.99, + "eval_gen_len": 7.6261, + "eval_loss": 1.4113017320632935, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 17.3516, + "eval_samples_per_second": 6.628, + "eval_steps_per_second": 1.153, + "step": 1405 + }, + { + "epoch": 73.97, + "eval_gen_len": 7.9478, + "eval_loss": 1.4024852514266968, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 15.9512, + "eval_samples_per_second": 7.21, + "eval_steps_per_second": 1.254, + "step": 1424 + }, + { + "epoch": 74.96, + "eval_gen_len": 7.687, + "eval_loss": 1.3967483043670654, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 13.9958, + "eval_samples_per_second": 8.217, + "eval_steps_per_second": 1.429, + "step": 1443 + }, + { + "epoch": 76.0, + "eval_gen_len": 7.5391, + "eval_loss": 1.390748143196106, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 21.6525, + "eval_samples_per_second": 5.311, + "eval_steps_per_second": 0.924, + "step": 1463 + }, + { + "epoch": 76.99, + "eval_gen_len": 7.687, + "eval_loss": 1.3812955617904663, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 14.5594, + "eval_samples_per_second": 7.899, + "eval_steps_per_second": 1.374, + "step": 1482 + }, + { + "epoch": 77.92, + "grad_norm": 4.4105072021484375, + "learning_rate": 1.475438596491228e-05, + "loss": 1.7845, + "step": 1500 + }, + { + "epoch": 77.97, + "eval_gen_len": 7.8174, + "eval_loss": 1.373058557510376, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 13.5779, + "eval_samples_per_second": 8.47, + "eval_steps_per_second": 1.473, + "step": 1501 + }, + { + "epoch": 78.96, + "eval_gen_len": 8.0435, + "eval_loss": 1.364722728729248, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 13.5052, + "eval_samples_per_second": 8.515, + "eval_steps_per_second": 1.481, + "step": 1520 + }, + { + "epoch": 80.0, + "eval_gen_len": 8.4087, + "eval_loss": 1.3542518615722656, + "eval_rouge1": 0.001, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0007, + "eval_rougeLsum": 0.0007, + "eval_runtime": 16.4169, + "eval_samples_per_second": 7.005, + "eval_steps_per_second": 1.218, + "step": 1540 + }, + { + "epoch": 80.99, + "eval_gen_len": 8.2, + "eval_loss": 1.3473597764968872, + "eval_rouge1": 0.0006, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 16.0157, + "eval_samples_per_second": 7.18, + "eval_steps_per_second": 1.249, + "step": 1559 + }, + { + "epoch": 81.97, + "eval_gen_len": 7.7739, + "eval_loss": 1.3397005796432495, + "eval_rouge1": 0.0006, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0006, + "eval_rougeLsum": 0.0006, + "eval_runtime": 15.1007, + "eval_samples_per_second": 7.616, + "eval_steps_per_second": 1.324, + "step": 1578 + }, + { + "epoch": 82.96, + "eval_gen_len": 7.4783, + "eval_loss": 1.3318209648132324, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 15.6414, + "eval_samples_per_second": 7.352, + "eval_steps_per_second": 1.279, + "step": 1597 + }, + { + "epoch": 84.0, + "eval_gen_len": 7.7478, + "eval_loss": 1.3251750469207764, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 15.8191, + "eval_samples_per_second": 7.27, + "eval_steps_per_second": 1.264, + "step": 1617 + }, + { + "epoch": 84.99, + "eval_gen_len": 7.8609, + "eval_loss": 1.3169076442718506, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 14.9535, + "eval_samples_per_second": 7.691, + "eval_steps_per_second": 1.337, + "step": 1636 + }, + { + "epoch": 85.97, + "eval_gen_len": 8.0609, + "eval_loss": 1.308994174003601, + "eval_rouge1": 0.0011, + "eval_rouge2": 0.0004, + "eval_rougeL": 0.0009, + "eval_rougeLsum": 0.0009, + "eval_runtime": 17.1311, + "eval_samples_per_second": 6.713, + "eval_steps_per_second": 1.167, + "step": 1655 + }, + { + "epoch": 86.96, + "eval_gen_len": 8.4174, + "eval_loss": 1.3022288084030151, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 18.0371, + "eval_samples_per_second": 6.376, + "eval_steps_per_second": 1.109, + "step": 1674 + }, + { + "epoch": 88.0, + "eval_gen_len": 8.6696, + "eval_loss": 1.2966970205307007, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 17.2294, + "eval_samples_per_second": 6.675, + "eval_steps_per_second": 1.161, + "step": 1694 + }, + { + "epoch": 88.99, + "eval_gen_len": 8.5913, + "eval_loss": 1.2914807796478271, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 16.2714, + "eval_samples_per_second": 7.068, + "eval_steps_per_second": 1.229, + "step": 1713 + }, + { + "epoch": 89.97, + "eval_gen_len": 8.4609, + "eval_loss": 1.285845398902893, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 16.2006, + "eval_samples_per_second": 7.099, + "eval_steps_per_second": 1.235, + "step": 1732 + }, + { + "epoch": 90.96, + "eval_gen_len": 8.3304, + "eval_loss": 1.2773631811141968, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 14.3032, + "eval_samples_per_second": 8.04, + "eval_steps_per_second": 1.398, + "step": 1751 + }, + { + "epoch": 92.0, + "eval_gen_len": 8.4087, + "eval_loss": 1.2694664001464844, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 14.8525, + "eval_samples_per_second": 7.743, + "eval_steps_per_second": 1.347, + "step": 1771 + }, + { + "epoch": 92.99, + "eval_gen_len": 8.5217, + "eval_loss": 1.2651293277740479, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 14.3351, + "eval_samples_per_second": 8.022, + "eval_steps_per_second": 1.395, + "step": 1790 + }, + { + "epoch": 93.97, + "eval_gen_len": 8.5217, + "eval_loss": 1.2624008655548096, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 14.2422, + "eval_samples_per_second": 8.075, + "eval_steps_per_second": 1.404, + "step": 1809 + }, + { + "epoch": 94.96, + "eval_gen_len": 8.4783, + "eval_loss": 1.2562423944473267, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 15.5045, + "eval_samples_per_second": 7.417, + "eval_steps_per_second": 1.29, + "step": 1828 + }, + { + "epoch": 96.0, + "eval_gen_len": 8.287, + "eval_loss": 1.2521991729736328, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 18.0292, + "eval_samples_per_second": 6.379, + "eval_steps_per_second": 1.109, + "step": 1848 + }, + { + "epoch": 96.99, + "eval_gen_len": 8.2522, + "eval_loss": 1.2463409900665283, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 16.1387, + "eval_samples_per_second": 7.126, + "eval_steps_per_second": 1.239, + "step": 1867 + }, + { + "epoch": 97.97, + "eval_gen_len": 8.5217, + "eval_loss": 1.2417724132537842, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 24.8398, + "eval_samples_per_second": 4.63, + "eval_steps_per_second": 0.805, + "step": 1886 + }, + { + "epoch": 98.96, + "eval_gen_len": 8.6609, + "eval_loss": 1.2342702150344849, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 13.8803, + "eval_samples_per_second": 8.285, + "eval_steps_per_second": 1.441, + "step": 1905 + }, + { + "epoch": 100.0, + "eval_gen_len": 8.687, + "eval_loss": 1.2301725149154663, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 15.943, + "eval_samples_per_second": 7.213, + "eval_steps_per_second": 1.254, + "step": 1925 + }, + { + "epoch": 100.99, + "eval_gen_len": 8.4609, + "eval_loss": 1.226989507675171, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 16.4066, + "eval_samples_per_second": 7.009, + "eval_steps_per_second": 1.219, + "step": 1944 + }, + { + "epoch": 101.97, + "eval_gen_len": 8.2957, + "eval_loss": 1.220055103302002, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 17.02, + "eval_samples_per_second": 6.757, + "eval_steps_per_second": 1.175, + "step": 1963 + }, + { + "epoch": 102.96, + "eval_gen_len": 8.1826, + "eval_loss": 1.215019702911377, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 17.3994, + "eval_samples_per_second": 6.609, + "eval_steps_per_second": 1.149, + "step": 1982 + }, + { + "epoch": 103.9, + "grad_norm": 4.967583656311035, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.5128, + "step": 2000 + }, + { + "epoch": 104.0, + "eval_gen_len": 8.2087, + "eval_loss": 1.2050235271453857, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 14.9921, + "eval_samples_per_second": 7.671, + "eval_steps_per_second": 1.334, + "step": 2002 + }, + { + "epoch": 104.99, + "eval_gen_len": 8.4696, + "eval_loss": 1.1983749866485596, + "eval_rouge1": 0.0006, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 16.8064, + "eval_samples_per_second": 6.843, + "eval_steps_per_second": 1.19, + "step": 2021 + }, + { + "epoch": 105.97, + "eval_gen_len": 8.8435, + "eval_loss": 1.1935399770736694, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 15.6582, + "eval_samples_per_second": 7.344, + "eval_steps_per_second": 1.277, + "step": 2040 + }, + { + "epoch": 106.96, + "eval_gen_len": 8.7739, + "eval_loss": 1.1894173622131348, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 16.7955, + "eval_samples_per_second": 6.847, + "eval_steps_per_second": 1.191, + "step": 2059 + }, + { + "epoch": 108.0, + "eval_gen_len": 8.5565, + "eval_loss": 1.1841349601745605, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 15.4737, + "eval_samples_per_second": 7.432, + "eval_steps_per_second": 1.293, + "step": 2079 + }, + { + "epoch": 108.99, + "eval_gen_len": 8.6435, + "eval_loss": 1.1762468814849854, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 15.4231, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 1.297, + "step": 2098 + }, + { + "epoch": 109.97, + "eval_gen_len": 8.513, + "eval_loss": 1.1688281297683716, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 23.2171, + "eval_samples_per_second": 4.953, + "eval_steps_per_second": 0.861, + "step": 2117 + }, + { + "epoch": 110.96, + "eval_gen_len": 8.4522, + "eval_loss": 1.163394570350647, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 14.9862, + "eval_samples_per_second": 7.674, + "eval_steps_per_second": 1.335, + "step": 2136 + }, + { + "epoch": 112.0, + "eval_gen_len": 8.4261, + "eval_loss": 1.1577537059783936, + "eval_rouge1": 0.001, + "eval_rouge2": 0.0, + "eval_rougeL": 0.001, + "eval_rougeLsum": 0.0009, + "eval_runtime": 18.0754, + "eval_samples_per_second": 6.362, + "eval_steps_per_second": 1.106, + "step": 2156 + }, + { + "epoch": 112.99, + "eval_gen_len": 8.4087, + "eval_loss": 1.1507985591888428, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 17.1754, + "eval_samples_per_second": 6.696, + "eval_steps_per_second": 1.164, + "step": 2175 + }, + { + "epoch": 113.97, + "eval_gen_len": 8.6696, + "eval_loss": 1.1435272693634033, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 15.4446, + "eval_samples_per_second": 7.446, + "eval_steps_per_second": 1.295, + "step": 2194 + }, + { + "epoch": 114.96, + "eval_gen_len": 8.8087, + "eval_loss": 1.1399484872817993, + "eval_rouge1": 0.0006, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 17.4221, + "eval_samples_per_second": 6.601, + "eval_steps_per_second": 1.148, + "step": 2213 + }, + { + "epoch": 116.0, + "eval_gen_len": 8.7565, + "eval_loss": 1.1332604885101318, + "eval_rouge1": 0.0006, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 16.1131, + "eval_samples_per_second": 7.137, + "eval_steps_per_second": 1.241, + "step": 2233 + }, + { + "epoch": 116.99, + "eval_gen_len": 8.7478, + "eval_loss": 1.1271406412124634, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 16.9778, + "eval_samples_per_second": 6.774, + "eval_steps_per_second": 1.178, + "step": 2252 + }, + { + "epoch": 117.97, + "eval_gen_len": 8.8609, + "eval_loss": 1.1240047216415405, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 14.2803, + "eval_samples_per_second": 8.053, + "eval_steps_per_second": 1.401, + "step": 2271 + }, + { + "epoch": 118.96, + "eval_gen_len": 8.7391, + "eval_loss": 1.1195180416107178, + "eval_rouge1": 0.0005, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0005, + "eval_rougeLsum": 0.0005, + "eval_runtime": 14.6134, + "eval_samples_per_second": 7.869, + "eval_steps_per_second": 1.369, + "step": 2290 + }, + { + "epoch": 120.0, + "eval_gen_len": 8.7043, + "eval_loss": 1.113542079925537, + "eval_rouge1": 0.0005, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0005, + "eval_rougeLsum": 0.0005, + "eval_runtime": 21.0862, + "eval_samples_per_second": 5.454, + "eval_steps_per_second": 0.948, + "step": 2310 + }, + { + "epoch": 120.99, + "eval_gen_len": 8.5043, + "eval_loss": 1.1078674793243408, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 18.0247, + "eval_samples_per_second": 6.38, + "eval_steps_per_second": 1.11, + "step": 2329 + }, + { + "epoch": 121.97, + "eval_gen_len": 8.4696, + "eval_loss": 1.0989575386047363, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 16.1592, + "eval_samples_per_second": 7.117, + "eval_steps_per_second": 1.238, + "step": 2348 + }, + { + "epoch": 122.96, + "eval_gen_len": 8.8261, + "eval_loss": 1.0940810441970825, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 29.3501, + "eval_samples_per_second": 3.918, + "eval_steps_per_second": 0.681, + "step": 2367 + }, + { + "epoch": 124.0, + "eval_gen_len": 8.7826, + "eval_loss": 1.0875351428985596, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 17.9868, + "eval_samples_per_second": 6.394, + "eval_steps_per_second": 1.112, + "step": 2387 + }, + { + "epoch": 124.99, + "eval_gen_len": 8.5913, + "eval_loss": 1.083350658416748, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 17.325, + "eval_samples_per_second": 6.638, + "eval_steps_per_second": 1.154, + "step": 2406 + }, + { + "epoch": 125.97, + "eval_gen_len": 8.9652, + "eval_loss": 1.0746002197265625, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 15.8215, + "eval_samples_per_second": 7.269, + "eval_steps_per_second": 1.264, + "step": 2425 + }, + { + "epoch": 126.96, + "eval_gen_len": 9.0696, + "eval_loss": 1.0692858695983887, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 15.6722, + "eval_samples_per_second": 7.338, + "eval_steps_per_second": 1.276, + "step": 2444 + }, + { + "epoch": 128.0, + "eval_gen_len": 9.0261, + "eval_loss": 1.0652384757995605, + "eval_rouge1": 0.0, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0, + "eval_rougeLsum": 0.0, + "eval_runtime": 24.7207, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 0.809, + "step": 2464 + }, + { + "epoch": 128.99, + "eval_gen_len": 9.0348, + "eval_loss": 1.0582802295684814, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 17.9675, + "eval_samples_per_second": 6.4, + "eval_steps_per_second": 1.113, + "step": 2483 + }, + { + "epoch": 129.87, + "grad_norm": 1.7797880172729492, + "learning_rate": 1.124561403508772e-05, + "loss": 1.3193, + "step": 2500 + }, + { + "epoch": 129.97, + "eval_gen_len": 9.1217, + "eval_loss": 1.0517534017562866, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 15.6173, + "eval_samples_per_second": 7.364, + "eval_steps_per_second": 1.281, + "step": 2502 + }, + { + "epoch": 130.96, + "eval_gen_len": 8.887, + "eval_loss": 1.0467168092727661, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 15.1436, + "eval_samples_per_second": 7.594, + "eval_steps_per_second": 1.321, + "step": 2521 + }, + { + "epoch": 132.0, + "eval_gen_len": 8.8348, + "eval_loss": 1.0417622327804565, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 14.5373, + "eval_samples_per_second": 7.911, + "eval_steps_per_second": 1.376, + "step": 2541 + }, + { + "epoch": 132.99, + "eval_gen_len": 8.7826, + "eval_loss": 1.0359249114990234, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 15.5453, + "eval_samples_per_second": 7.398, + "eval_steps_per_second": 1.287, + "step": 2560 + }, + { + "epoch": 133.97, + "eval_gen_len": 8.7217, + "eval_loss": 1.0301254987716675, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 17.6306, + "eval_samples_per_second": 6.523, + "eval_steps_per_second": 1.134, + "step": 2579 + }, + { + "epoch": 134.96, + "eval_gen_len": 8.7739, + "eval_loss": 1.0256870985031128, + "eval_rouge1": 0.0003, + "eval_rouge2": 0.0, + "eval_rougeL": 0.0003, + "eval_rougeLsum": 0.0003, + "eval_runtime": 15.6433, + "eval_samples_per_second": 7.351, + "eval_steps_per_second": 1.279, + "step": 2598 + }, + { + "epoch": 136.0, + "eval_gen_len": 9.2348, + "eval_loss": 1.0207563638687134, + "eval_rouge1": 0.0018, + "eval_rouge2": 0.0009, + "eval_rougeL": 0.0021, + "eval_rougeLsum": 0.0018, + "eval_runtime": 16.9915, + "eval_samples_per_second": 6.768, + "eval_steps_per_second": 1.177, + "step": 2618 + }, + { + "epoch": 136.99, + "eval_gen_len": 9.4783, + "eval_loss": 1.0155842304229736, + "eval_rouge1": 0.0028, + "eval_rouge2": 0.0011, + "eval_rougeL": 0.0028, + "eval_rougeLsum": 0.0028, + "eval_runtime": 15.1617, + "eval_samples_per_second": 7.585, + "eval_steps_per_second": 1.319, + "step": 2637 + }, + { + "epoch": 137.97, + "eval_gen_len": 9.4609, + "eval_loss": 1.010608434677124, + "eval_rouge1": 0.0018, + "eval_rouge2": 0.0009, + "eval_rougeL": 0.0018, + "eval_rougeLsum": 0.0018, + "eval_runtime": 17.3586, + "eval_samples_per_second": 6.625, + "eval_steps_per_second": 1.152, + "step": 2656 + }, + { + "epoch": 138.96, + "eval_gen_len": 9.4522, + "eval_loss": 1.006165862083435, + "eval_rouge1": 0.0018, + "eval_rouge2": 0.0009, + "eval_rougeL": 0.0018, + "eval_rougeLsum": 0.0018, + "eval_runtime": 15.6312, + "eval_samples_per_second": 7.357, + "eval_steps_per_second": 1.279, + "step": 2675 + }, + { + "epoch": 140.0, + "eval_gen_len": 9.4435, + "eval_loss": 1.0015084743499756, + "eval_rouge1": 0.0018, + "eval_rouge2": 0.0009, + "eval_rougeL": 0.0018, + "eval_rougeLsum": 0.0018, + "eval_runtime": 15.1443, + "eval_samples_per_second": 7.594, + "eval_steps_per_second": 1.321, + "step": 2695 + }, + { + "epoch": 140.99, + "eval_gen_len": 9.5913, + "eval_loss": 0.9966647028923035, + "eval_rouge1": 0.0015, + "eval_rouge2": 0.0003, + "eval_rougeL": 0.0018, + "eval_rougeLsum": 0.0015, + "eval_runtime": 16.3381, + "eval_samples_per_second": 7.039, + "eval_steps_per_second": 1.224, + "step": 2714 + }, + { + "epoch": 141.97, + "eval_gen_len": 9.6783, + "eval_loss": 0.9923425912857056, + "eval_rouge1": 0.0015, + "eval_rouge2": 0.0003, + "eval_rougeL": 0.0018, + "eval_rougeLsum": 0.0015, + "eval_runtime": 14.5033, + "eval_samples_per_second": 7.929, + "eval_steps_per_second": 1.379, + "step": 2733 + }, + { + "epoch": 142.96, + "eval_gen_len": 9.8, + "eval_loss": 0.9881101250648499, + "eval_rouge1": 0.0015, + "eval_rouge2": 0.0003, + "eval_rougeL": 0.0015, + "eval_rougeLsum": 0.0015, + "eval_runtime": 17.6181, + "eval_samples_per_second": 6.527, + "eval_steps_per_second": 1.135, + "step": 2752 + }, + { + "epoch": 144.0, + "eval_gen_len": 9.8435, + "eval_loss": 0.9837466478347778, + "eval_rouge1": 0.0015, + "eval_rouge2": 0.0003, + "eval_rougeL": 0.0015, + "eval_rougeLsum": 0.0015, + "eval_runtime": 15.937, + "eval_samples_per_second": 7.216, + "eval_steps_per_second": 1.255, + "step": 2772 + }, + { + "epoch": 144.99, + "eval_gen_len": 9.9304, + "eval_loss": 0.9798020720481873, + "eval_rouge1": 0.0015, + "eval_rouge2": 0.0003, + "eval_rougeL": 0.0015, + "eval_rougeLsum": 0.0015, + "eval_runtime": 16.407, + "eval_samples_per_second": 7.009, + "eval_steps_per_second": 1.219, + "step": 2791 + }, + { + "epoch": 145.97, + "eval_gen_len": 9.9826, + "eval_loss": 0.975723922252655, + "eval_rouge1": 0.0015, + "eval_rouge2": 0.0003, + "eval_rougeL": 0.0015, + "eval_rougeLsum": 0.0015, + "eval_runtime": 15.1268, + "eval_samples_per_second": 7.602, + "eval_steps_per_second": 1.322, + "step": 2810 + }, + { + "epoch": 146.96, + "eval_gen_len": 10.0261, + "eval_loss": 0.9714429378509521, + "eval_rouge1": 0.0015, + "eval_rouge2": 0.0003, + "eval_rougeL": 0.0015, + "eval_rougeLsum": 0.0015, + "eval_runtime": 16.0606, + "eval_samples_per_second": 7.16, + "eval_steps_per_second": 1.245, + "step": 2829 + }, + { + "epoch": 148.0, + "eval_gen_len": 9.9739, + "eval_loss": 0.9681385159492493, + "eval_rouge1": 0.0015, + "eval_rouge2": 0.0003, + "eval_rougeL": 0.0015, + "eval_rougeLsum": 0.0015, + "eval_runtime": 15.6543, + "eval_samples_per_second": 7.346, + "eval_steps_per_second": 1.278, + "step": 2849 + }, + { + "epoch": 148.99, + "eval_gen_len": 9.9739, + "eval_loss": 0.9637375473976135, + "eval_rouge1": 0.0015, + "eval_rouge2": 0.0003, + "eval_rougeL": 0.0015, + "eval_rougeLsum": 0.0015, + "eval_runtime": 15.1696, + "eval_samples_per_second": 7.581, + "eval_steps_per_second": 1.318, + "step": 2868 + }, + { + "epoch": 149.97, + "eval_gen_len": 10.0348, + "eval_loss": 0.9596477746963501, + "eval_rouge1": 0.0015, + "eval_rouge2": 0.0009, + "eval_rougeL": 0.0015, + "eval_rougeLsum": 0.0015, + "eval_runtime": 15.2187, + "eval_samples_per_second": 7.557, + "eval_steps_per_second": 1.314, + "step": 2887 + }, + { + "epoch": 150.96, + "eval_gen_len": 10.0174, + "eval_loss": 0.9558045268058777, + "eval_rouge1": 0.0017, + "eval_rouge2": 0.0009, + "eval_rougeL": 0.0017, + "eval_rougeLsum": 0.0017, + "eval_runtime": 16.4212, + "eval_samples_per_second": 7.003, + "eval_steps_per_second": 1.218, + "step": 2906 + }, + { + "epoch": 152.0, + "eval_gen_len": 10.1304, + "eval_loss": 0.9513251185417175, + "eval_rouge1": 0.0021, + "eval_rouge2": 0.0005, + "eval_rougeL": 0.0021, + "eval_rougeLsum": 0.0021, + "eval_runtime": 17.1931, + "eval_samples_per_second": 6.689, + "eval_steps_per_second": 1.163, + "step": 2926 + }, + { + "epoch": 152.99, + "eval_gen_len": 10.1217, + "eval_loss": 0.947124719619751, + "eval_rouge1": 0.0021, + "eval_rouge2": 0.0014, + "eval_rougeL": 0.0021, + "eval_rougeLsum": 0.0021, + "eval_runtime": 16.7224, + "eval_samples_per_second": 6.877, + "eval_steps_per_second": 1.196, + "step": 2945 + }, + { + "epoch": 153.97, + "eval_gen_len": 10.2696, + "eval_loss": 0.9428749084472656, + "eval_rouge1": 0.0043, + "eval_rouge2": 0.0014, + "eval_rougeL": 0.0037, + "eval_rougeLsum": 0.0036, + "eval_runtime": 15.3406, + "eval_samples_per_second": 7.496, + "eval_steps_per_second": 1.304, + "step": 2964 + }, + { + "epoch": 154.96, + "eval_gen_len": 10.1217, + "eval_loss": 0.939849853515625, + "eval_rouge1": 0.0021, + "eval_rouge2": 0.0014, + "eval_rougeL": 0.0021, + "eval_rougeLsum": 0.0021, + "eval_runtime": 15.8431, + "eval_samples_per_second": 7.259, + "eval_steps_per_second": 1.262, + "step": 2983 + }, + { + "epoch": 155.84, + "grad_norm": 0.8866944313049316, + "learning_rate": 9.49122807017544e-06, + "loss": 1.1379, + "step": 3000 + }, + { + "epoch": 156.0, + "eval_gen_len": 10.0522, + "eval_loss": 0.9357353448867798, + "eval_rouge1": 0.0017, + "eval_rouge2": 0.0011, + "eval_rougeL": 0.0017, + "eval_rougeLsum": 0.0017, + "eval_runtime": 16.7405, + "eval_samples_per_second": 6.87, + "eval_steps_per_second": 1.195, + "step": 3003 + }, + { + "epoch": 156.99, + "eval_gen_len": 10.1217, + "eval_loss": 0.9312177300453186, + "eval_rouge1": 0.0017, + "eval_rouge2": 0.0011, + "eval_rougeL": 0.0017, + "eval_rougeLsum": 0.0017, + "eval_runtime": 12.164, + "eval_samples_per_second": 9.454, + "eval_steps_per_second": 1.644, + "step": 3022 + }, + { + "epoch": 157.97, + "eval_gen_len": 10.2609, + "eval_loss": 0.9275165796279907, + "eval_rouge1": 0.0027, + "eval_rouge2": 0.0006, + "eval_rougeL": 0.0022, + "eval_rougeLsum": 0.0022, + "eval_runtime": 15.2713, + "eval_samples_per_second": 7.53, + "eval_steps_per_second": 1.31, + "step": 3041 + }, + { + "epoch": 158.96, + "eval_gen_len": 10.4435, + "eval_loss": 0.9236345887184143, + "eval_rouge1": 0.0036, + "eval_rouge2": 0.0006, + "eval_rougeL": 0.0028, + "eval_rougeLsum": 0.0029, + "eval_runtime": 17.5627, + "eval_samples_per_second": 6.548, + "eval_steps_per_second": 1.139, + "step": 3060 + }, + { + "epoch": 160.0, + "eval_gen_len": 10.513, + "eval_loss": 0.9195658564567566, + "eval_rouge1": 0.0049, + "eval_rouge2": 0.0012, + "eval_rougeL": 0.0044, + "eval_rougeLsum": 0.0044, + "eval_runtime": 16.5853, + "eval_samples_per_second": 6.934, + "eval_steps_per_second": 1.206, + "step": 3080 + }, + { + "epoch": 160.99, + "eval_gen_len": 10.487, + "eval_loss": 0.9164186120033264, + "eval_rouge1": 0.0046, + "eval_rouge2": 0.0007, + "eval_rougeL": 0.0038, + "eval_rougeLsum": 0.0038, + "eval_runtime": 15.397, + "eval_samples_per_second": 7.469, + "eval_steps_per_second": 1.299, + "step": 3099 + }, + { + "epoch": 161.97, + "eval_gen_len": 10.4783, + "eval_loss": 0.9130675196647644, + "eval_rouge1": 0.0039, + "eval_rouge2": 0.0007, + "eval_rougeL": 0.0034, + "eval_rougeLsum": 0.0034, + "eval_runtime": 16.4247, + "eval_samples_per_second": 7.002, + "eval_steps_per_second": 1.218, + "step": 3118 + }, + { + "epoch": 162.96, + "eval_gen_len": 10.6522, + "eval_loss": 0.9092690944671631, + "eval_rouge1": 0.007, + "eval_rouge2": 0.0023, + "eval_rougeL": 0.0066, + "eval_rougeLsum": 0.0065, + "eval_runtime": 14.8696, + "eval_samples_per_second": 7.734, + "eval_steps_per_second": 1.345, + "step": 3137 + }, + { + "epoch": 164.0, + "eval_gen_len": 10.5739, + "eval_loss": 0.9059688448905945, + "eval_rouge1": 0.005, + "eval_rouge2": 0.001, + "eval_rougeL": 0.0044, + "eval_rougeLsum": 0.0043, + "eval_runtime": 15.7264, + "eval_samples_per_second": 7.313, + "eval_steps_per_second": 1.272, + "step": 3157 + }, + { + "epoch": 164.99, + "eval_gen_len": 10.7391, + "eval_loss": 0.9024509191513062, + "eval_rouge1": 0.0074, + "eval_rouge2": 0.0023, + "eval_rougeL": 0.0068, + "eval_rougeLsum": 0.0066, + "eval_runtime": 15.813, + "eval_samples_per_second": 7.272, + "eval_steps_per_second": 1.265, + "step": 3176 + }, + { + "epoch": 165.97, + "eval_gen_len": 10.5652, + "eval_loss": 0.8994614481925964, + "eval_rouge1": 0.0054, + "eval_rouge2": 0.001, + "eval_rougeL": 0.0048, + "eval_rougeLsum": 0.0048, + "eval_runtime": 15.1711, + "eval_samples_per_second": 7.58, + "eval_steps_per_second": 1.318, + "step": 3195 + }, + { + "epoch": 166.96, + "eval_gen_len": 10.5913, + "eval_loss": 0.8970102667808533, + "eval_rouge1": 0.0061, + "eval_rouge2": 0.0014, + "eval_rougeL": 0.0053, + "eval_rougeLsum": 0.0053, + "eval_runtime": 14.0622, + "eval_samples_per_second": 8.178, + "eval_steps_per_second": 1.422, + "step": 3214 + }, + { + "epoch": 168.0, + "eval_gen_len": 10.6174, + "eval_loss": 0.894256055355072, + "eval_rouge1": 0.0082, + "eval_rouge2": 0.003, + "eval_rougeL": 0.0077, + "eval_rougeLsum": 0.0075, + "eval_runtime": 16.7533, + "eval_samples_per_second": 6.864, + "eval_steps_per_second": 1.194, + "step": 3234 + }, + { + "epoch": 168.99, + "eval_gen_len": 10.6348, + "eval_loss": 0.891488790512085, + "eval_rouge1": 0.0092, + "eval_rouge2": 0.0029, + "eval_rougeL": 0.0083, + "eval_rougeLsum": 0.0081, + "eval_runtime": 13.6019, + "eval_samples_per_second": 8.455, + "eval_steps_per_second": 1.47, + "step": 3253 + }, + { + "epoch": 169.97, + "eval_gen_len": 10.5913, + "eval_loss": 0.8882649540901184, + "eval_rouge1": 0.0073, + "eval_rouge2": 0.0022, + "eval_rougeL": 0.0068, + "eval_rougeLsum": 0.0067, + "eval_runtime": 16.0681, + "eval_samples_per_second": 7.157, + "eval_steps_per_second": 1.245, + "step": 3272 + }, + { + "epoch": 170.96, + "eval_gen_len": 10.6522, + "eval_loss": 0.8857714533805847, + "eval_rouge1": 0.009, + "eval_rouge2": 0.0025, + "eval_rougeL": 0.0081, + "eval_rougeLsum": 0.008, + "eval_runtime": 19.0222, + "eval_samples_per_second": 6.046, + "eval_steps_per_second": 1.051, + "step": 3291 + }, + { + "epoch": 172.0, + "eval_gen_len": 10.5826, + "eval_loss": 0.8824735283851624, + "eval_rouge1": 0.0073, + "eval_rouge2": 0.0018, + "eval_rougeL": 0.0068, + "eval_rougeLsum": 0.0068, + "eval_runtime": 17.2556, + "eval_samples_per_second": 6.665, + "eval_steps_per_second": 1.159, + "step": 3311 + }, + { + "epoch": 172.99, + "eval_gen_len": 10.5913, + "eval_loss": 0.8791074156761169, + "eval_rouge1": 0.0077, + "eval_rouge2": 0.0016, + "eval_rougeL": 0.0066, + "eval_rougeLsum": 0.0066, + "eval_runtime": 15.3622, + "eval_samples_per_second": 7.486, + "eval_steps_per_second": 1.302, + "step": 3330 + }, + { + "epoch": 173.97, + "eval_gen_len": 10.6174, + "eval_loss": 0.8760549426078796, + "eval_rouge1": 0.0078, + "eval_rouge2": 0.0017, + "eval_rougeL": 0.0069, + "eval_rougeLsum": 0.007, + "eval_runtime": 13.7617, + "eval_samples_per_second": 8.357, + "eval_steps_per_second": 1.453, + "step": 3349 + }, + { + "epoch": 174.96, + "eval_gen_len": 10.8348, + "eval_loss": 0.8735494017601013, + "eval_rouge1": 0.0099, + "eval_rouge2": 0.0031, + "eval_rougeL": 0.0093, + "eval_rougeLsum": 0.0093, + "eval_runtime": 16.3628, + "eval_samples_per_second": 7.028, + "eval_steps_per_second": 1.222, + "step": 3368 + }, + { + "epoch": 176.0, + "eval_gen_len": 10.8174, + "eval_loss": 0.8713410496711731, + "eval_rouge1": 0.0103, + "eval_rouge2": 0.0031, + "eval_rougeL": 0.0097, + "eval_rougeLsum": 0.0098, + "eval_runtime": 15.0408, + "eval_samples_per_second": 7.646, + "eval_steps_per_second": 1.33, + "step": 3388 + }, + { + "epoch": 176.99, + "eval_gen_len": 10.687, + "eval_loss": 0.8688496947288513, + "eval_rouge1": 0.0104, + "eval_rouge2": 0.0027, + "eval_rougeL": 0.0087, + "eval_rougeLsum": 0.0087, + "eval_runtime": 13.3269, + "eval_samples_per_second": 8.629, + "eval_steps_per_second": 1.501, + "step": 3407 + }, + { + "epoch": 177.97, + "eval_gen_len": 10.7304, + "eval_loss": 0.8659321069717407, + "eval_rouge1": 0.0102, + "eval_rouge2": 0.0022, + "eval_rougeL": 0.0085, + "eval_rougeLsum": 0.0083, + "eval_runtime": 15.8407, + "eval_samples_per_second": 7.26, + "eval_steps_per_second": 1.263, + "step": 3426 + }, + { + "epoch": 178.96, + "eval_gen_len": 10.9217, + "eval_loss": 0.8626890778541565, + "eval_rouge1": 0.0109, + "eval_rouge2": 0.0025, + "eval_rougeL": 0.0086, + "eval_rougeLsum": 0.0085, + "eval_runtime": 15.1338, + "eval_samples_per_second": 7.599, + "eval_steps_per_second": 1.322, + "step": 3445 + }, + { + "epoch": 180.0, + "eval_gen_len": 11.087, + "eval_loss": 0.8599569201469421, + "eval_rouge1": 0.0124, + "eval_rouge2": 0.0025, + "eval_rougeL": 0.0101, + "eval_rougeLsum": 0.0101, + "eval_runtime": 21.7846, + "eval_samples_per_second": 5.279, + "eval_steps_per_second": 0.918, + "step": 3465 + }, + { + "epoch": 180.99, + "eval_gen_len": 11.1478, + "eval_loss": 0.8579829931259155, + "eval_rouge1": 0.0132, + "eval_rouge2": 0.0026, + "eval_rougeL": 0.0111, + "eval_rougeLsum": 0.0109, + "eval_runtime": 14.4812, + "eval_samples_per_second": 7.941, + "eval_steps_per_second": 1.381, + "step": 3484 + }, + { + "epoch": 181.82, + "grad_norm": 0.5403133034706116, + "learning_rate": 7.736842105263158e-06, + "loss": 1.0168, + "step": 3500 + }, + { + "epoch": 181.97, + "eval_gen_len": 10.9739, + "eval_loss": 0.8559067845344543, + "eval_rouge1": 0.011, + "eval_rouge2": 0.0027, + "eval_rougeL": 0.0095, + "eval_rougeLsum": 0.0093, + "eval_runtime": 13.8018, + "eval_samples_per_second": 8.332, + "eval_steps_per_second": 1.449, + "step": 3503 + }, + { + "epoch": 182.96, + "eval_gen_len": 10.9652, + "eval_loss": 0.8531643152236938, + "eval_rouge1": 0.0122, + "eval_rouge2": 0.0033, + "eval_rougeL": 0.0101, + "eval_rougeLsum": 0.01, + "eval_runtime": 15.9407, + "eval_samples_per_second": 7.214, + "eval_steps_per_second": 1.255, + "step": 3522 + }, + { + "epoch": 184.0, + "eval_gen_len": 11.0609, + "eval_loss": 0.8499117493629456, + "eval_rouge1": 0.0141, + "eval_rouge2": 0.0034, + "eval_rougeL": 0.0121, + "eval_rougeLsum": 0.012, + "eval_runtime": 13.074, + "eval_samples_per_second": 8.796, + "eval_steps_per_second": 1.53, + "step": 3542 + }, + { + "epoch": 184.99, + "eval_gen_len": 11.3913, + "eval_loss": 0.8471864461898804, + "eval_rouge1": 0.0178, + "eval_rouge2": 0.0037, + "eval_rougeL": 0.0152, + "eval_rougeLsum": 0.0149, + "eval_runtime": 13.6132, + "eval_samples_per_second": 8.448, + "eval_steps_per_second": 1.469, + "step": 3561 + }, + { + "epoch": 185.97, + "eval_gen_len": 11.287, + "eval_loss": 0.8454113602638245, + "eval_rouge1": 0.0173, + "eval_rouge2": 0.0036, + "eval_rougeL": 0.0145, + "eval_rougeLsum": 0.0141, + "eval_runtime": 12.8847, + "eval_samples_per_second": 8.925, + "eval_steps_per_second": 1.552, + "step": 3580 + }, + { + "epoch": 186.96, + "eval_gen_len": 11.2261, + "eval_loss": 0.8434880375862122, + "eval_rouge1": 0.017, + "eval_rouge2": 0.0027, + "eval_rougeL": 0.0143, + "eval_rougeLsum": 0.0141, + "eval_runtime": 11.4202, + "eval_samples_per_second": 10.07, + "eval_steps_per_second": 1.751, + "step": 3599 + }, + { + "epoch": 188.0, + "eval_gen_len": 11.3913, + "eval_loss": 0.840716540813446, + "eval_rouge1": 0.0188, + "eval_rouge2": 0.0032, + "eval_rougeL": 0.0161, + "eval_rougeLsum": 0.0159, + "eval_runtime": 10.5922, + "eval_samples_per_second": 10.857, + "eval_steps_per_second": 1.888, + "step": 3619 + }, + { + "epoch": 188.99, + "eval_gen_len": 11.2087, + "eval_loss": 0.8385959267616272, + "eval_rouge1": 0.0166, + "eval_rouge2": 0.0033, + "eval_rougeL": 0.0144, + "eval_rougeLsum": 0.0141, + "eval_runtime": 11.9064, + "eval_samples_per_second": 9.659, + "eval_steps_per_second": 1.68, + "step": 3638 + }, + { + "epoch": 189.97, + "eval_gen_len": 11.2609, + "eval_loss": 0.836624026298523, + "eval_rouge1": 0.0169, + "eval_rouge2": 0.0031, + "eval_rougeL": 0.0147, + "eval_rougeLsum": 0.0144, + "eval_runtime": 10.125, + "eval_samples_per_second": 11.358, + "eval_steps_per_second": 1.975, + "step": 3657 + }, + { + "epoch": 190.96, + "eval_gen_len": 11.2522, + "eval_loss": 0.834960401058197, + "eval_rouge1": 0.0181, + "eval_rouge2": 0.0038, + "eval_rougeL": 0.0159, + "eval_rougeLsum": 0.0158, + "eval_runtime": 10.4265, + "eval_samples_per_second": 11.03, + "eval_steps_per_second": 1.918, + "step": 3676 + }, + { + "epoch": 192.0, + "eval_gen_len": 11.6174, + "eval_loss": 0.832145631313324, + "eval_rouge1": 0.0223, + "eval_rouge2": 0.0048, + "eval_rougeL": 0.0198, + "eval_rougeLsum": 0.0197, + "eval_runtime": 14.2788, + "eval_samples_per_second": 8.054, + "eval_steps_per_second": 1.401, + "step": 3696 + }, + { + "epoch": 192.99, + "eval_gen_len": 11.5913, + "eval_loss": 0.8298683166503906, + "eval_rouge1": 0.0238, + "eval_rouge2": 0.0054, + "eval_rougeL": 0.021, + "eval_rougeLsum": 0.0208, + "eval_runtime": 13.7725, + "eval_samples_per_second": 8.35, + "eval_steps_per_second": 1.452, + "step": 3715 + }, + { + "epoch": 193.97, + "eval_gen_len": 11.513, + "eval_loss": 0.8281151056289673, + "eval_rouge1": 0.0238, + "eval_rouge2": 0.0057, + "eval_rougeL": 0.0208, + "eval_rougeLsum": 0.0206, + "eval_runtime": 16.5357, + "eval_samples_per_second": 6.955, + "eval_steps_per_second": 1.21, + "step": 3734 + }, + { + "epoch": 194.96, + "eval_gen_len": 11.4696, + "eval_loss": 0.8263967633247375, + "eval_rouge1": 0.0242, + "eval_rouge2": 0.0054, + "eval_rougeL": 0.0212, + "eval_rougeLsum": 0.0209, + "eval_runtime": 14.9165, + "eval_samples_per_second": 7.71, + "eval_steps_per_second": 1.341, + "step": 3753 + }, + { + "epoch": 196.0, + "eval_gen_len": 11.513, + "eval_loss": 0.8241834044456482, + "eval_rouge1": 0.0242, + "eval_rouge2": 0.0054, + "eval_rougeL": 0.0203, + "eval_rougeLsum": 0.0202, + "eval_runtime": 13.3344, + "eval_samples_per_second": 8.624, + "eval_steps_per_second": 1.5, + "step": 3773 + }, + { + "epoch": 196.99, + "eval_gen_len": 11.8348, + "eval_loss": 0.8214186429977417, + "eval_rouge1": 0.0277, + "eval_rouge2": 0.0058, + "eval_rougeL": 0.0246, + "eval_rougeLsum": 0.0242, + "eval_runtime": 15.0895, + "eval_samples_per_second": 7.621, + "eval_steps_per_second": 1.325, + "step": 3792 + }, + { + "epoch": 197.97, + "eval_gen_len": 11.6435, + "eval_loss": 0.8196175694465637, + "eval_rouge1": 0.0262, + "eval_rouge2": 0.0056, + "eval_rougeL": 0.0226, + "eval_rougeLsum": 0.0227, + "eval_runtime": 14.814, + "eval_samples_per_second": 7.763, + "eval_steps_per_second": 1.35, + "step": 3811 + }, + { + "epoch": 198.96, + "eval_gen_len": 11.7043, + "eval_loss": 0.8178415894508362, + "eval_rouge1": 0.0293, + "eval_rouge2": 0.006, + "eval_rougeL": 0.025, + "eval_rougeLsum": 0.0248, + "eval_runtime": 15.1097, + "eval_samples_per_second": 7.611, + "eval_steps_per_second": 1.324, + "step": 3830 + }, + { + "epoch": 200.0, + "eval_gen_len": 11.8783, + "eval_loss": 0.815380334854126, + "eval_rouge1": 0.0318, + "eval_rouge2": 0.006, + "eval_rougeL": 0.0273, + "eval_rougeLsum": 0.027, + "eval_runtime": 14.6624, + "eval_samples_per_second": 7.843, + "eval_steps_per_second": 1.364, + "step": 3850 + }, + { + "epoch": 200.99, + "eval_gen_len": 11.7913, + "eval_loss": 0.8136902451515198, + "eval_rouge1": 0.0307, + "eval_rouge2": 0.0058, + "eval_rougeL": 0.0265, + "eval_rougeLsum": 0.0262, + "eval_runtime": 14.2757, + "eval_samples_per_second": 8.056, + "eval_steps_per_second": 1.401, + "step": 3869 + }, + { + "epoch": 201.97, + "eval_gen_len": 11.8, + "eval_loss": 0.8119075894355774, + "eval_rouge1": 0.032, + "eval_rouge2": 0.0061, + "eval_rougeL": 0.0279, + "eval_rougeLsum": 0.0277, + "eval_runtime": 14.1634, + "eval_samples_per_second": 8.12, + "eval_steps_per_second": 1.412, + "step": 3888 + }, + { + "epoch": 202.96, + "eval_gen_len": 11.9652, + "eval_loss": 0.8098872900009155, + "eval_rouge1": 0.0334, + "eval_rouge2": 0.0062, + "eval_rougeL": 0.0289, + "eval_rougeLsum": 0.0285, + "eval_runtime": 17.6617, + "eval_samples_per_second": 6.511, + "eval_steps_per_second": 1.132, + "step": 3907 + }, + { + "epoch": 204.0, + "eval_gen_len": 12.0522, + "eval_loss": 0.8077185750007629, + "eval_rouge1": 0.0339, + "eval_rouge2": 0.0068, + "eval_rougeL": 0.0293, + "eval_rougeLsum": 0.0291, + "eval_runtime": 18.6428, + "eval_samples_per_second": 6.169, + "eval_steps_per_second": 1.073, + "step": 3927 + }, + { + "epoch": 204.99, + "eval_gen_len": 11.9478, + "eval_loss": 0.8060031533241272, + "eval_rouge1": 0.0331, + "eval_rouge2": 0.0065, + "eval_rougeL": 0.0286, + "eval_rougeLsum": 0.0284, + "eval_runtime": 28.209, + "eval_samples_per_second": 4.077, + "eval_steps_per_second": 0.709, + "step": 3946 + }, + { + "epoch": 205.97, + "eval_gen_len": 12.2087, + "eval_loss": 0.8041767477989197, + "eval_rouge1": 0.038, + "eval_rouge2": 0.0083, + "eval_rougeL": 0.0331, + "eval_rougeLsum": 0.0329, + "eval_runtime": 13.9196, + "eval_samples_per_second": 8.262, + "eval_steps_per_second": 1.437, + "step": 3965 + }, + { + "epoch": 206.96, + "eval_gen_len": 12.2348, + "eval_loss": 0.8022732138633728, + "eval_rouge1": 0.04, + "eval_rouge2": 0.0093, + "eval_rougeL": 0.0351, + "eval_rougeLsum": 0.0348, + "eval_runtime": 18.1652, + "eval_samples_per_second": 6.331, + "eval_steps_per_second": 1.101, + "step": 3984 + }, + { + "epoch": 207.79, + "grad_norm": 0.6825528740882874, + "learning_rate": 5.982456140350877e-06, + "loss": 0.9396, + "step": 4000 + }, + { + "epoch": 208.0, + "eval_gen_len": 11.9913, + "eval_loss": 0.8004079461097717, + "eval_rouge1": 0.0377, + "eval_rouge2": 0.0083, + "eval_rougeL": 0.0326, + "eval_rougeLsum": 0.0324, + "eval_runtime": 15.4261, + "eval_samples_per_second": 7.455, + "eval_steps_per_second": 1.297, + "step": 4004 + }, + { + "epoch": 208.99, + "eval_gen_len": 12.2435, + "eval_loss": 0.7987371683120728, + "eval_rouge1": 0.0394, + "eval_rouge2": 0.0081, + "eval_rougeL": 0.0329, + "eval_rougeLsum": 0.0326, + "eval_runtime": 15.4914, + "eval_samples_per_second": 7.423, + "eval_steps_per_second": 1.291, + "step": 4023 + }, + { + "epoch": 209.97, + "eval_gen_len": 12.2174, + "eval_loss": 0.7974857687950134, + "eval_rouge1": 0.0398, + "eval_rouge2": 0.0088, + "eval_rougeL": 0.0348, + "eval_rougeLsum": 0.0344, + "eval_runtime": 15.8068, + "eval_samples_per_second": 7.275, + "eval_steps_per_second": 1.265, + "step": 4042 + }, + { + "epoch": 210.96, + "eval_gen_len": 12.2696, + "eval_loss": 0.7953728437423706, + "eval_rouge1": 0.0415, + "eval_rouge2": 0.009, + "eval_rougeL": 0.0365, + "eval_rougeLsum": 0.036, + "eval_runtime": 15.5151, + "eval_samples_per_second": 7.412, + "eval_steps_per_second": 1.289, + "step": 4061 + }, + { + "epoch": 212.0, + "eval_gen_len": 12.1304, + "eval_loss": 0.7937628626823425, + "eval_rouge1": 0.0418, + "eval_rouge2": 0.009, + "eval_rougeL": 0.037, + "eval_rougeLsum": 0.0366, + "eval_runtime": 15.6988, + "eval_samples_per_second": 7.325, + "eval_steps_per_second": 1.274, + "step": 4081 + }, + { + "epoch": 212.99, + "eval_gen_len": 12.1043, + "eval_loss": 0.7920788526535034, + "eval_rouge1": 0.0416, + "eval_rouge2": 0.009, + "eval_rougeL": 0.0369, + "eval_rougeLsum": 0.0367, + "eval_runtime": 14.6071, + "eval_samples_per_second": 7.873, + "eval_steps_per_second": 1.369, + "step": 4100 + }, + { + "epoch": 213.97, + "eval_gen_len": 11.9652, + "eval_loss": 0.7905020117759705, + "eval_rouge1": 0.041, + "eval_rouge2": 0.0078, + "eval_rougeL": 0.036, + "eval_rougeLsum": 0.0357, + "eval_runtime": 16.4932, + "eval_samples_per_second": 6.973, + "eval_steps_per_second": 1.213, + "step": 4119 + }, + { + "epoch": 214.96, + "eval_gen_len": 11.9391, + "eval_loss": 0.7891045212745667, + "eval_rouge1": 0.0411, + "eval_rouge2": 0.0078, + "eval_rougeL": 0.0361, + "eval_rougeLsum": 0.0358, + "eval_runtime": 15.0709, + "eval_samples_per_second": 7.631, + "eval_steps_per_second": 1.327, + "step": 4138 + }, + { + "epoch": 216.0, + "eval_gen_len": 12.1739, + "eval_loss": 0.7874982953071594, + "eval_rouge1": 0.0426, + "eval_rouge2": 0.0081, + "eval_rougeL": 0.0366, + "eval_rougeLsum": 0.0363, + "eval_runtime": 15.3399, + "eval_samples_per_second": 7.497, + "eval_steps_per_second": 1.304, + "step": 4158 + }, + { + "epoch": 216.99, + "eval_gen_len": 12.3043, + "eval_loss": 0.7856701016426086, + "eval_rouge1": 0.0444, + "eval_rouge2": 0.0092, + "eval_rougeL": 0.0384, + "eval_rougeLsum": 0.0383, + "eval_runtime": 16.5308, + "eval_samples_per_second": 6.957, + "eval_steps_per_second": 1.21, + "step": 4177 + }, + { + "epoch": 217.97, + "eval_gen_len": 12.2957, + "eval_loss": 0.7841366529464722, + "eval_rouge1": 0.0445, + "eval_rouge2": 0.0092, + "eval_rougeL": 0.039, + "eval_rougeLsum": 0.0388, + "eval_runtime": 13.9248, + "eval_samples_per_second": 8.259, + "eval_steps_per_second": 1.436, + "step": 4196 + }, + { + "epoch": 218.96, + "eval_gen_len": 12.313, + "eval_loss": 0.7825812101364136, + "eval_rouge1": 0.0443, + "eval_rouge2": 0.0087, + "eval_rougeL": 0.0382, + "eval_rougeLsum": 0.038, + "eval_runtime": 14.6481, + "eval_samples_per_second": 7.851, + "eval_steps_per_second": 1.365, + "step": 4215 + }, + { + "epoch": 220.0, + "eval_gen_len": 12.1217, + "eval_loss": 0.7813829779624939, + "eval_rouge1": 0.0438, + "eval_rouge2": 0.0085, + "eval_rougeL": 0.0379, + "eval_rougeLsum": 0.0375, + "eval_runtime": 16.4777, + "eval_samples_per_second": 6.979, + "eval_steps_per_second": 1.214, + "step": 4235 + }, + { + "epoch": 220.99, + "eval_gen_len": 12.0348, + "eval_loss": 0.7796338796615601, + "eval_rouge1": 0.0431, + "eval_rouge2": 0.0085, + "eval_rougeL": 0.0372, + "eval_rougeLsum": 0.0369, + "eval_runtime": 15.2958, + "eval_samples_per_second": 7.518, + "eval_steps_per_second": 1.308, + "step": 4254 + }, + { + "epoch": 221.97, + "eval_gen_len": 12.1043, + "eval_loss": 0.77826988697052, + "eval_rouge1": 0.0423, + "eval_rouge2": 0.009, + "eval_rougeL": 0.0365, + "eval_rougeLsum": 0.0362, + "eval_runtime": 16.2627, + "eval_samples_per_second": 7.071, + "eval_steps_per_second": 1.23, + "step": 4273 + }, + { + "epoch": 222.96, + "eval_gen_len": 12.0435, + "eval_loss": 0.7768360376358032, + "eval_rouge1": 0.0426, + "eval_rouge2": 0.009, + "eval_rougeL": 0.0365, + "eval_rougeLsum": 0.0363, + "eval_runtime": 24.5776, + "eval_samples_per_second": 4.679, + "eval_steps_per_second": 0.814, + "step": 4292 + }, + { + "epoch": 224.0, + "eval_gen_len": 12.0, + "eval_loss": 0.7752098441123962, + "eval_rouge1": 0.0425, + "eval_rouge2": 0.009, + "eval_rougeL": 0.0363, + "eval_rougeLsum": 0.0361, + "eval_runtime": 13.756, + "eval_samples_per_second": 8.36, + "eval_steps_per_second": 1.454, + "step": 4312 + }, + { + "epoch": 224.99, + "eval_gen_len": 11.9391, + "eval_loss": 0.7739911675453186, + "eval_rouge1": 0.043, + "eval_rouge2": 0.009, + "eval_rougeL": 0.0371, + "eval_rougeLsum": 0.0367, + "eval_runtime": 15.726, + "eval_samples_per_second": 7.313, + "eval_steps_per_second": 1.272, + "step": 4331 + }, + { + "epoch": 225.97, + "eval_gen_len": 11.8609, + "eval_loss": 0.7723690271377563, + "eval_rouge1": 0.0414, + "eval_rouge2": 0.009, + "eval_rougeL": 0.0357, + "eval_rougeLsum": 0.0355, + "eval_runtime": 16.5662, + "eval_samples_per_second": 6.942, + "eval_steps_per_second": 1.207, + "step": 4350 + }, + { + "epoch": 226.96, + "eval_gen_len": 11.7652, + "eval_loss": 0.7711983919143677, + "eval_rouge1": 0.0429, + "eval_rouge2": 0.0093, + "eval_rougeL": 0.0363, + "eval_rougeLsum": 0.0359, + "eval_runtime": 14.4971, + "eval_samples_per_second": 7.933, + "eval_steps_per_second": 1.38, + "step": 4369 + }, + { + "epoch": 228.0, + "eval_gen_len": 11.913, + "eval_loss": 0.7694764733314514, + "eval_rouge1": 0.0416, + "eval_rouge2": 0.0093, + "eval_rougeL": 0.0357, + "eval_rougeLsum": 0.0354, + "eval_runtime": 13.7683, + "eval_samples_per_second": 8.353, + "eval_steps_per_second": 1.453, + "step": 4389 + }, + { + "epoch": 228.99, + "eval_gen_len": 12.0087, + "eval_loss": 0.7683370113372803, + "eval_rouge1": 0.0426, + "eval_rouge2": 0.01, + "eval_rougeL": 0.0369, + "eval_rougeLsum": 0.0364, + "eval_runtime": 14.7051, + "eval_samples_per_second": 7.82, + "eval_steps_per_second": 1.36, + "step": 4408 + }, + { + "epoch": 229.97, + "eval_gen_len": 12.0696, + "eval_loss": 0.7668902277946472, + "eval_rouge1": 0.0422, + "eval_rouge2": 0.0095, + "eval_rougeL": 0.0364, + "eval_rougeLsum": 0.036, + "eval_runtime": 12.5967, + "eval_samples_per_second": 9.129, + "eval_steps_per_second": 1.588, + "step": 4427 + }, + { + "epoch": 230.96, + "eval_gen_len": 11.7217, + "eval_loss": 0.7656229734420776, + "eval_rouge1": 0.0396, + "eval_rouge2": 0.0094, + "eval_rougeL": 0.0342, + "eval_rougeLsum": 0.0339, + "eval_runtime": 10.0582, + "eval_samples_per_second": 11.433, + "eval_steps_per_second": 1.988, + "step": 4446 + }, + { + "epoch": 232.0, + "eval_gen_len": 11.5652, + "eval_loss": 0.7644599676132202, + "eval_rouge1": 0.0411, + "eval_rouge2": 0.0091, + "eval_rougeL": 0.0352, + "eval_rougeLsum": 0.0349, + "eval_runtime": 9.9608, + "eval_samples_per_second": 11.545, + "eval_steps_per_second": 2.008, + "step": 4466 + }, + { + "epoch": 232.99, + "eval_gen_len": 11.7826, + "eval_loss": 0.7628152370452881, + "eval_rouge1": 0.0421, + "eval_rouge2": 0.0095, + "eval_rougeL": 0.0371, + "eval_rougeLsum": 0.0371, + "eval_runtime": 10.6119, + "eval_samples_per_second": 10.837, + "eval_steps_per_second": 1.885, + "step": 4485 + }, + { + "epoch": 233.77, + "grad_norm": 0.5715782642364502, + "learning_rate": 4.228070175438596e-06, + "loss": 0.8871, + "step": 4500 + }, + { + "epoch": 233.97, + "eval_gen_len": 11.8957, + "eval_loss": 0.761337161064148, + "eval_rouge1": 0.0436, + "eval_rouge2": 0.0101, + "eval_rougeL": 0.0382, + "eval_rougeLsum": 0.0381, + "eval_runtime": 17.7977, + "eval_samples_per_second": 6.462, + "eval_steps_per_second": 1.124, + "step": 4504 + }, + { + "epoch": 234.96, + "eval_gen_len": 11.7652, + "eval_loss": 0.7602398991584778, + "eval_rouge1": 0.0424, + "eval_rouge2": 0.0099, + "eval_rougeL": 0.0372, + "eval_rougeLsum": 0.0369, + "eval_runtime": 10.3754, + "eval_samples_per_second": 11.084, + "eval_steps_per_second": 1.928, + "step": 4523 + }, + { + "epoch": 236.0, + "eval_gen_len": 11.5652, + "eval_loss": 0.7591829895973206, + "eval_rouge1": 0.0419, + "eval_rouge2": 0.0098, + "eval_rougeL": 0.037, + "eval_rougeLsum": 0.0367, + "eval_runtime": 13.3388, + "eval_samples_per_second": 8.621, + "eval_steps_per_second": 1.499, + "step": 4543 + }, + { + "epoch": 236.99, + "eval_gen_len": 11.6957, + "eval_loss": 0.7578958868980408, + "eval_rouge1": 0.0434, + "eval_rouge2": 0.0102, + "eval_rougeL": 0.0381, + "eval_rougeLsum": 0.0378, + "eval_runtime": 10.114, + "eval_samples_per_second": 11.37, + "eval_steps_per_second": 1.977, + "step": 4562 + }, + { + "epoch": 237.97, + "eval_gen_len": 11.7652, + "eval_loss": 0.7568346858024597, + "eval_rouge1": 0.0448, + "eval_rouge2": 0.0108, + "eval_rougeL": 0.0383, + "eval_rougeLsum": 0.0381, + "eval_runtime": 10.3192, + "eval_samples_per_second": 11.144, + "eval_steps_per_second": 1.938, + "step": 4581 + }, + { + "epoch": 238.96, + "eval_gen_len": 11.7739, + "eval_loss": 0.7555378079414368, + "eval_rouge1": 0.0455, + "eval_rouge2": 0.0105, + "eval_rougeL": 0.038, + "eval_rougeLsum": 0.0379, + "eval_runtime": 10.8876, + "eval_samples_per_second": 10.562, + "eval_steps_per_second": 1.837, + "step": 4600 + }, + { + "epoch": 240.0, + "eval_gen_len": 11.8957, + "eval_loss": 0.7544582486152649, + "eval_rouge1": 0.0445, + "eval_rouge2": 0.0105, + "eval_rougeL": 0.0372, + "eval_rougeLsum": 0.0371, + "eval_runtime": 17.7147, + "eval_samples_per_second": 6.492, + "eval_steps_per_second": 1.129, + "step": 4620 + }, + { + "epoch": 240.99, + "eval_gen_len": 12.0174, + "eval_loss": 0.7532872557640076, + "eval_rouge1": 0.0473, + "eval_rouge2": 0.0105, + "eval_rougeL": 0.0389, + "eval_rougeLsum": 0.0388, + "eval_runtime": 20.567, + "eval_samples_per_second": 5.591, + "eval_steps_per_second": 0.972, + "step": 4639 + }, + { + "epoch": 241.97, + "eval_gen_len": 11.9913, + "eval_loss": 0.7523981928825378, + "eval_rouge1": 0.0482, + "eval_rouge2": 0.0105, + "eval_rougeL": 0.0393, + "eval_rougeLsum": 0.0392, + "eval_runtime": 10.1389, + "eval_samples_per_second": 11.342, + "eval_steps_per_second": 1.973, + "step": 4658 + }, + { + "epoch": 242.96, + "eval_gen_len": 11.6609, + "eval_loss": 0.7515619993209839, + "eval_rouge1": 0.0454, + "eval_rouge2": 0.0098, + "eval_rougeL": 0.0379, + "eval_rougeLsum": 0.0378, + "eval_runtime": 10.4917, + "eval_samples_per_second": 10.961, + "eval_steps_per_second": 1.906, + "step": 4677 + }, + { + "epoch": 244.0, + "eval_gen_len": 11.6696, + "eval_loss": 0.7501043081283569, + "eval_rouge1": 0.0447, + "eval_rouge2": 0.0094, + "eval_rougeL": 0.0372, + "eval_rougeLsum": 0.0371, + "eval_runtime": 10.4455, + "eval_samples_per_second": 11.009, + "eval_steps_per_second": 1.915, + "step": 4697 + }, + { + "epoch": 244.99, + "eval_gen_len": 11.9826, + "eval_loss": 0.7491604089736938, + "eval_rouge1": 0.0469, + "eval_rouge2": 0.0097, + "eval_rougeL": 0.0389, + "eval_rougeLsum": 0.0389, + "eval_runtime": 14.1736, + "eval_samples_per_second": 8.114, + "eval_steps_per_second": 1.411, + "step": 4716 + }, + { + "epoch": 245.97, + "eval_gen_len": 11.9913, + "eval_loss": 0.7484715580940247, + "eval_rouge1": 0.0479, + "eval_rouge2": 0.0095, + "eval_rougeL": 0.0397, + "eval_rougeLsum": 0.0397, + "eval_runtime": 13.6957, + "eval_samples_per_second": 8.397, + "eval_steps_per_second": 1.46, + "step": 4735 + }, + { + "epoch": 246.96, + "eval_gen_len": 12.0522, + "eval_loss": 0.747407853603363, + "eval_rouge1": 0.0491, + "eval_rouge2": 0.01, + "eval_rougeL": 0.0403, + "eval_rougeLsum": 0.0404, + "eval_runtime": 10.4948, + "eval_samples_per_second": 10.958, + "eval_steps_per_second": 1.906, + "step": 4754 + }, + { + "epoch": 248.0, + "eval_gen_len": 11.9826, + "eval_loss": 0.746651291847229, + "eval_rouge1": 0.0482, + "eval_rouge2": 0.0092, + "eval_rougeL": 0.0394, + "eval_rougeLsum": 0.0395, + "eval_runtime": 10.1337, + "eval_samples_per_second": 11.348, + "eval_steps_per_second": 1.974, + "step": 4774 + }, + { + "epoch": 248.99, + "eval_gen_len": 12.1391, + "eval_loss": 0.7458359003067017, + "eval_rouge1": 0.0483, + "eval_rouge2": 0.0084, + "eval_rougeL": 0.0402, + "eval_rougeLsum": 0.0403, + "eval_runtime": 10.6355, + "eval_samples_per_second": 10.813, + "eval_steps_per_second": 1.88, + "step": 4793 + }, + { + "epoch": 249.97, + "eval_gen_len": 12.2087, + "eval_loss": 0.7449273467063904, + "eval_rouge1": 0.0487, + "eval_rouge2": 0.0083, + "eval_rougeL": 0.0402, + "eval_rougeLsum": 0.0404, + "eval_runtime": 10.4912, + "eval_samples_per_second": 10.962, + "eval_steps_per_second": 1.906, + "step": 4812 + }, + { + "epoch": 250.96, + "eval_gen_len": 11.9391, + "eval_loss": 0.7444418668746948, + "eval_rouge1": 0.0483, + "eval_rouge2": 0.0083, + "eval_rougeL": 0.0402, + "eval_rougeLsum": 0.0403, + "eval_runtime": 10.0739, + "eval_samples_per_second": 11.416, + "eval_steps_per_second": 1.985, + "step": 4831 + }, + { + "epoch": 252.0, + "eval_gen_len": 11.913, + "eval_loss": 0.7435948848724365, + "eval_rouge1": 0.0479, + "eval_rouge2": 0.0083, + "eval_rougeL": 0.0396, + "eval_rougeLsum": 0.0397, + "eval_runtime": 10.3551, + "eval_samples_per_second": 11.106, + "eval_steps_per_second": 1.931, + "step": 4851 + }, + { + "epoch": 252.99, + "eval_gen_len": 11.8783, + "eval_loss": 0.7429930567741394, + "eval_rouge1": 0.048, + "eval_rouge2": 0.0083, + "eval_rougeL": 0.0398, + "eval_rougeLsum": 0.0399, + "eval_runtime": 10.191, + "eval_samples_per_second": 11.285, + "eval_steps_per_second": 1.963, + "step": 4870 + }, + { + "epoch": 253.97, + "eval_gen_len": 11.9652, + "eval_loss": 0.7424508333206177, + "eval_rouge1": 0.0481, + "eval_rouge2": 0.0083, + "eval_rougeL": 0.04, + "eval_rougeLsum": 0.04, + "eval_runtime": 10.1791, + "eval_samples_per_second": 11.298, + "eval_steps_per_second": 1.965, + "step": 4889 + }, + { + "epoch": 254.96, + "eval_gen_len": 12.0174, + "eval_loss": 0.7415958642959595, + "eval_rouge1": 0.0486, + "eval_rouge2": 0.0083, + "eval_rougeL": 0.0398, + "eval_rougeLsum": 0.0398, + "eval_runtime": 16.411, + "eval_samples_per_second": 7.007, + "eval_steps_per_second": 1.219, + "step": 4908 + }, + { + "epoch": 256.0, + "eval_gen_len": 11.7478, + "eval_loss": 0.7406365871429443, + "eval_rouge1": 0.0475, + "eval_rouge2": 0.0083, + "eval_rougeL": 0.0386, + "eval_rougeLsum": 0.0387, + "eval_runtime": 10.1485, + "eval_samples_per_second": 11.332, + "eval_steps_per_second": 1.971, + "step": 4928 + }, + { + "epoch": 256.99, + "eval_gen_len": 11.8696, + "eval_loss": 0.7399746179580688, + "eval_rouge1": 0.0483, + "eval_rouge2": 0.0079, + "eval_rougeL": 0.039, + "eval_rougeLsum": 0.0393, + "eval_runtime": 16.5037, + "eval_samples_per_second": 6.968, + "eval_steps_per_second": 1.212, + "step": 4947 + }, + { + "epoch": 257.97, + "eval_gen_len": 11.6609, + "eval_loss": 0.7393442392349243, + "eval_rouge1": 0.0467, + "eval_rouge2": 0.0075, + "eval_rougeL": 0.0377, + "eval_rougeLsum": 0.0378, + "eval_runtime": 12.2582, + "eval_samples_per_second": 9.381, + "eval_steps_per_second": 1.632, + "step": 4966 + }, + { + "epoch": 258.96, + "eval_gen_len": 11.4087, + "eval_loss": 0.7388782501220703, + "eval_rouge1": 0.0455, + "eval_rouge2": 0.0072, + "eval_rougeL": 0.037, + "eval_rougeLsum": 0.0372, + "eval_runtime": 10.1512, + "eval_samples_per_second": 11.329, + "eval_steps_per_second": 1.97, + "step": 4985 + }, + { + "epoch": 259.74, + "grad_norm": 0.4241856038570404, + "learning_rate": 2.473684210526316e-06, + "loss": 0.8499, + "step": 5000 + }, + { + "epoch": 260.0, + "eval_gen_len": 11.5913, + "eval_loss": 0.7382517457008362, + "eval_rouge1": 0.0464, + "eval_rouge2": 0.007, + "eval_rougeL": 0.0372, + "eval_rougeLsum": 0.0376, + "eval_runtime": 10.3008, + "eval_samples_per_second": 11.164, + "eval_steps_per_second": 1.942, + "step": 5005 + }, + { + "epoch": 260.99, + "eval_gen_len": 11.6348, + "eval_loss": 0.7377699613571167, + "eval_rouge1": 0.0482, + "eval_rouge2": 0.0077, + "eval_rougeL": 0.0385, + "eval_rougeLsum": 0.0388, + "eval_runtime": 10.0612, + "eval_samples_per_second": 11.43, + "eval_steps_per_second": 1.988, + "step": 5024 + }, + { + "epoch": 261.97, + "eval_gen_len": 11.4522, + "eval_loss": 0.7373109459877014, + "eval_rouge1": 0.0483, + "eval_rouge2": 0.008, + "eval_rougeL": 0.0384, + "eval_rougeLsum": 0.0385, + "eval_runtime": 11.974, + "eval_samples_per_second": 9.604, + "eval_steps_per_second": 1.67, + "step": 5043 + }, + { + "epoch": 262.96, + "eval_gen_len": 11.3913, + "eval_loss": 0.7369760870933533, + "eval_rouge1": 0.0474, + "eval_rouge2": 0.0076, + "eval_rougeL": 0.0375, + "eval_rougeLsum": 0.0377, + "eval_runtime": 10.4532, + "eval_samples_per_second": 11.001, + "eval_steps_per_second": 1.913, + "step": 5062 + }, + { + "epoch": 264.0, + "eval_gen_len": 11.6696, + "eval_loss": 0.7363179326057434, + "eval_rouge1": 0.0486, + "eval_rouge2": 0.0077, + "eval_rougeL": 0.0385, + "eval_rougeLsum": 0.0385, + "eval_runtime": 10.1961, + "eval_samples_per_second": 11.279, + "eval_steps_per_second": 1.962, + "step": 5082 + }, + { + "epoch": 264.99, + "eval_gen_len": 11.7826, + "eval_loss": 0.7355965971946716, + "eval_rouge1": 0.0493, + "eval_rouge2": 0.0084, + "eval_rougeL": 0.039, + "eval_rougeLsum": 0.039, + "eval_runtime": 10.1789, + "eval_samples_per_second": 11.298, + "eval_steps_per_second": 1.965, + "step": 5101 + }, + { + "epoch": 265.97, + "eval_gen_len": 11.4609, + "eval_loss": 0.735298752784729, + "eval_rouge1": 0.047, + "eval_rouge2": 0.0076, + "eval_rougeL": 0.0371, + "eval_rougeLsum": 0.0372, + "eval_runtime": 10.1771, + "eval_samples_per_second": 11.3, + "eval_steps_per_second": 1.965, + "step": 5120 + }, + { + "epoch": 266.96, + "eval_gen_len": 11.3217, + "eval_loss": 0.734704852104187, + "eval_rouge1": 0.0461, + "eval_rouge2": 0.0076, + "eval_rougeL": 0.0364, + "eval_rougeLsum": 0.0366, + "eval_runtime": 10.0146, + "eval_samples_per_second": 11.483, + "eval_steps_per_second": 1.997, + "step": 5139 + }, + { + "epoch": 268.0, + "eval_gen_len": 11.3478, + "eval_loss": 0.7341461777687073, + "eval_rouge1": 0.0461, + "eval_rouge2": 0.0078, + "eval_rougeL": 0.0368, + "eval_rougeLsum": 0.0371, + "eval_runtime": 10.1605, + "eval_samples_per_second": 11.318, + "eval_steps_per_second": 1.968, + "step": 5159 + }, + { + "epoch": 268.99, + "eval_gen_len": 11.3304, + "eval_loss": 0.7336880564689636, + "eval_rouge1": 0.0461, + "eval_rouge2": 0.0078, + "eval_rougeL": 0.0368, + "eval_rougeLsum": 0.0371, + "eval_runtime": 10.0336, + "eval_samples_per_second": 11.462, + "eval_steps_per_second": 1.993, + "step": 5178 + }, + { + "epoch": 269.97, + "eval_gen_len": 11.3565, + "eval_loss": 0.7333458662033081, + "eval_rouge1": 0.0466, + "eval_rouge2": 0.0084, + "eval_rougeL": 0.0374, + "eval_rougeLsum": 0.0378, + "eval_runtime": 10.1088, + "eval_samples_per_second": 11.376, + "eval_steps_per_second": 1.978, + "step": 5197 + }, + { + "epoch": 270.96, + "eval_gen_len": 11.4696, + "eval_loss": 0.7329635620117188, + "eval_rouge1": 0.0484, + "eval_rouge2": 0.009, + "eval_rougeL": 0.0383, + "eval_rougeLsum": 0.0387, + "eval_runtime": 10.0683, + "eval_samples_per_second": 11.422, + "eval_steps_per_second": 1.986, + "step": 5216 + }, + { + "epoch": 272.0, + "eval_gen_len": 11.1826, + "eval_loss": 0.7325230836868286, + "eval_rouge1": 0.0471, + "eval_rouge2": 0.0086, + "eval_rougeL": 0.0373, + "eval_rougeLsum": 0.0376, + "eval_runtime": 10.2034, + "eval_samples_per_second": 11.271, + "eval_steps_per_second": 1.96, + "step": 5236 + }, + { + "epoch": 272.99, + "eval_gen_len": 11.113, + "eval_loss": 0.7321166396141052, + "eval_rouge1": 0.0467, + "eval_rouge2": 0.0085, + "eval_rougeL": 0.0372, + "eval_rougeLsum": 0.0377, + "eval_runtime": 12.209, + "eval_samples_per_second": 9.419, + "eval_steps_per_second": 1.638, + "step": 5255 + }, + { + "epoch": 273.97, + "eval_gen_len": 10.9304, + "eval_loss": 0.731701135635376, + "eval_rouge1": 0.0465, + "eval_rouge2": 0.0085, + "eval_rougeL": 0.037, + "eval_rougeLsum": 0.0374, + "eval_runtime": 10.4807, + "eval_samples_per_second": 10.973, + "eval_steps_per_second": 1.908, + "step": 5274 + }, + { + "epoch": 274.96, + "eval_gen_len": 10.8609, + "eval_loss": 0.7313553094863892, + "eval_rouge1": 0.047, + "eval_rouge2": 0.0089, + "eval_rougeL": 0.0374, + "eval_rougeLsum": 0.0379, + "eval_runtime": 10.1881, + "eval_samples_per_second": 11.288, + "eval_steps_per_second": 1.963, + "step": 5293 + }, + { + "epoch": 276.0, + "eval_gen_len": 10.8261, + "eval_loss": 0.7310741543769836, + "eval_rouge1": 0.0468, + "eval_rouge2": 0.0086, + "eval_rougeL": 0.0374, + "eval_rougeLsum": 0.038, + "eval_runtime": 10.4254, + "eval_samples_per_second": 11.031, + "eval_steps_per_second": 1.918, + "step": 5313 + }, + { + "epoch": 276.99, + "eval_gen_len": 11.0348, + "eval_loss": 0.7306625247001648, + "eval_rouge1": 0.0473, + "eval_rouge2": 0.0086, + "eval_rougeL": 0.0379, + "eval_rougeLsum": 0.0384, + "eval_runtime": 17.0869, + "eval_samples_per_second": 6.73, + "eval_steps_per_second": 1.17, + "step": 5332 + }, + { + "epoch": 277.97, + "eval_gen_len": 11.0609, + "eval_loss": 0.7303984761238098, + "eval_rouge1": 0.0482, + "eval_rouge2": 0.0089, + "eval_rougeL": 0.0388, + "eval_rougeLsum": 0.0393, + "eval_runtime": 10.2327, + "eval_samples_per_second": 11.238, + "eval_steps_per_second": 1.955, + "step": 5351 + }, + { + "epoch": 278.96, + "eval_gen_len": 11.1304, + "eval_loss": 0.7300783395767212, + "eval_rouge1": 0.0482, + "eval_rouge2": 0.0089, + "eval_rougeL": 0.0388, + "eval_rougeLsum": 0.0393, + "eval_runtime": 10.0601, + "eval_samples_per_second": 11.431, + "eval_steps_per_second": 1.988, + "step": 5370 + }, + { + "epoch": 280.0, + "eval_gen_len": 10.9826, + "eval_loss": 0.729738175868988, + "eval_rouge1": 0.0477, + "eval_rouge2": 0.0089, + "eval_rougeL": 0.0384, + "eval_rougeLsum": 0.039, + "eval_runtime": 10.0013, + "eval_samples_per_second": 11.498, + "eval_steps_per_second": 2.0, + "step": 5390 + }, + { + "epoch": 280.99, + "eval_gen_len": 10.9652, + "eval_loss": 0.7294939756393433, + "eval_rouge1": 0.048, + "eval_rouge2": 0.0089, + "eval_rougeL": 0.0386, + "eval_rougeLsum": 0.0392, + "eval_runtime": 9.9502, + "eval_samples_per_second": 11.558, + "eval_steps_per_second": 2.01, + "step": 5409 + }, + { + "epoch": 281.97, + "eval_gen_len": 11.0435, + "eval_loss": 0.7292339205741882, + "eval_rouge1": 0.0486, + "eval_rouge2": 0.0093, + "eval_rougeL": 0.039, + "eval_rougeLsum": 0.0394, + "eval_runtime": 10.0288, + "eval_samples_per_second": 11.467, + "eval_steps_per_second": 1.994, + "step": 5428 + }, + { + "epoch": 282.96, + "eval_gen_len": 10.9478, + "eval_loss": 0.7289875149726868, + "eval_rouge1": 0.0488, + "eval_rouge2": 0.0095, + "eval_rougeL": 0.0393, + "eval_rougeLsum": 0.0398, + "eval_runtime": 9.9992, + "eval_samples_per_second": 11.501, + "eval_steps_per_second": 2.0, + "step": 5447 + }, + { + "epoch": 284.0, + "eval_gen_len": 10.887, + "eval_loss": 0.7288010120391846, + "eval_rouge1": 0.0483, + "eval_rouge2": 0.0093, + "eval_rougeL": 0.0388, + "eval_rougeLsum": 0.0392, + "eval_runtime": 11.7568, + "eval_samples_per_second": 9.782, + "eval_steps_per_second": 1.701, + "step": 5467 + }, + { + "epoch": 284.99, + "eval_gen_len": 10.7391, + "eval_loss": 0.7286383509635925, + "eval_rouge1": 0.0472, + "eval_rouge2": 0.0091, + "eval_rougeL": 0.038, + "eval_rougeLsum": 0.0383, + "eval_runtime": 10.4604, + "eval_samples_per_second": 10.994, + "eval_steps_per_second": 1.912, + "step": 5486 + }, + { + "epoch": 285.71, + "grad_norm": 0.46214622259140015, + "learning_rate": 7.192982456140352e-07, + "loss": 0.8305, + "step": 5500 + } + ], + "logging_steps": 500, + "max_steps": 5700, + "num_input_tokens_seen": 0, + "num_train_epochs": 300, + "save_steps": 500, + "total_flos": 1.6008000528973824e+17, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}