german-jeopardy-mt5-base-256 / trainer_state.json
Marvin
Initial commit
bfe0903 unverified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 19.78531558608845,
"eval_steps": 500,
"global_step": 720,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.99,
"learning_rate": 0.0001,
"loss": 8.9608,
"step": 36
},
{
"epoch": 0.99,
"eval_bleu": 0.2352,
"eval_bp": 0.828,
"eval_counts_1": 2306,
"eval_counts_2": 50,
"eval_counts_3": 12,
"eval_counts_4": 2,
"eval_exact_match": 0.0,
"eval_f1": 0.0092,
"eval_gen_len": 3.1969,
"eval_loss": 2.8882896900177,
"eval_precisions_1": 12.9,
"eval_precisions_2": 0.319,
"eval_precisions_3": 0.0891,
"eval_precisions_4": 0.0178,
"eval_ref_len": 21250,
"eval_rouge1": 0.0081,
"eval_rouge2": 0.0022,
"eval_rougeL": 0.0078,
"eval_rougeLsum": 0.0078,
"eval_runtime": 386.3015,
"eval_samples_per_second": 5.705,
"eval_steps_per_second": 1.426,
"eval_sys_len": 17876,
"eval_totals_1": 17876,
"eval_totals_2": 15672,
"eval_totals_3": 13468,
"eval_totals_4": 11264,
"step": 36
},
{
"epoch": 1.98,
"learning_rate": 0.0001,
"loss": 3.2364,
"step": 72
},
{
"epoch": 1.98,
"eval_bleu": 6.7083,
"eval_bp": 0.9954,
"eval_counts_1": 6125,
"eval_counts_2": 1727,
"eval_counts_3": 687,
"eval_counts_4": 277,
"eval_exact_match": 0.0018,
"eval_f1": 0.2514,
"eval_gen_len": 11.8072,
"eval_loss": 1.9241770505905151,
"eval_precisions_1": 28.9571,
"eval_precisions_2": 9.1144,
"eval_precisions_3": 4.103,
"eval_precisions_4": 1.9051,
"eval_ref_len": 21250,
"eval_rouge1": 0.2457,
"eval_rouge2": 0.1026,
"eval_rougeL": 0.2345,
"eval_rougeLsum": 0.2346,
"eval_runtime": 440.0537,
"eval_samples_per_second": 5.008,
"eval_steps_per_second": 1.252,
"eval_sys_len": 21152,
"eval_totals_1": 21152,
"eval_totals_2": 18948,
"eval_totals_3": 16744,
"eval_totals_4": 14540,
"step": 72
},
{
"epoch": 3.0,
"learning_rate": 0.0001,
"loss": 2.4963,
"step": 109
},
{
"epoch": 3.0,
"eval_bleu": 9.1493,
"eval_bp": 0.752,
"eval_counts_1": 6903,
"eval_counts_2": 2271,
"eval_counts_3": 975,
"eval_counts_4": 409,
"eval_exact_match": 0.01,
"eval_f1": 0.2909,
"eval_gen_len": 12.176,
"eval_loss": 1.6558014154434204,
"eval_precisions_1": 41.7428,
"eval_precisions_2": 15.8446,
"eval_precisions_3": 8.0386,
"eval_precisions_4": 4.1209,
"eval_ref_len": 21250,
"eval_rouge1": 0.2966,
"eval_rouge2": 0.1415,
"eval_rougeL": 0.2854,
"eval_rougeLsum": 0.2852,
"eval_runtime": 434.1741,
"eval_samples_per_second": 5.076,
"eval_steps_per_second": 1.269,
"eval_sys_len": 16537,
"eval_totals_1": 16537,
"eval_totals_2": 14333,
"eval_totals_3": 12129,
"eval_totals_4": 9925,
"step": 109
},
{
"epoch": 3.98,
"learning_rate": 0.0001,
"loss": 2.2314,
"step": 145
},
{
"epoch": 3.98,
"eval_bleu": 10.187,
"eval_bp": 0.7573,
"eval_counts_1": 7160,
"eval_counts_2": 2440,
"eval_counts_3": 1098,
"eval_counts_4": 501,
"eval_exact_match": 0.0136,
"eval_f1": 0.3069,
"eval_gen_len": 12.157,
"eval_loss": 1.5771422386169434,
"eval_precisions_1": 43.0625,
"eval_precisions_2": 16.9174,
"eval_precisions_3": 8.986,
"eval_precisions_4": 5.0025,
"eval_ref_len": 21250,
"eval_rouge1": 0.314,
"eval_rouge2": 0.1535,
"eval_rougeL": 0.3028,
"eval_rougeLsum": 0.3028,
"eval_runtime": 436.5308,
"eval_samples_per_second": 5.049,
"eval_steps_per_second": 1.262,
"eval_sys_len": 16627,
"eval_totals_1": 16627,
"eval_totals_2": 14423,
"eval_totals_3": 12219,
"eval_totals_4": 10015,
"step": 145
},
{
"epoch": 4.97,
"learning_rate": 0.0001,
"loss": 2.0578,
"step": 181
},
{
"epoch": 4.97,
"eval_bleu": 11.0621,
"eval_bp": 0.7961,
"eval_counts_1": 7447,
"eval_counts_2": 2625,
"eval_counts_3": 1214,
"eval_counts_4": 566,
"eval_exact_match": 0.0163,
"eval_f1": 0.32,
"eval_gen_len": 12.5585,
"eval_loss": 1.5346813201904297,
"eval_precisions_1": 43.0338,
"eval_precisions_2": 17.383,
"eval_precisions_3": 9.413,
"eval_precisions_4": 5.2932,
"eval_ref_len": 21250,
"eval_rouge1": 0.3286,
"eval_rouge2": 0.1628,
"eval_rougeL": 0.3146,
"eval_rougeLsum": 0.3146,
"eval_runtime": 444.2911,
"eval_samples_per_second": 4.961,
"eval_steps_per_second": 1.24,
"eval_sys_len": 17305,
"eval_totals_1": 17305,
"eval_totals_2": 15101,
"eval_totals_3": 12897,
"eval_totals_4": 10693,
"step": 181
},
{
"epoch": 5.99,
"learning_rate": 0.0001,
"loss": 1.8928,
"step": 218
},
{
"epoch": 5.99,
"eval_bleu": 11.4063,
"eval_bp": 0.7556,
"eval_counts_1": 7396,
"eval_counts_2": 2659,
"eval_counts_3": 1257,
"eval_counts_4": 611,
"eval_exact_match": 0.0177,
"eval_f1": 0.3234,
"eval_gen_len": 12.1692,
"eval_loss": 1.512817144393921,
"eval_precisions_1": 44.5596,
"eval_precisions_2": 18.473,
"eval_precisions_3": 10.3117,
"eval_precisions_4": 6.1186,
"eval_ref_len": 21250,
"eval_rouge1": 0.3326,
"eval_rouge2": 0.1684,
"eval_rougeL": 0.3198,
"eval_rougeLsum": 0.3198,
"eval_runtime": 441.07,
"eval_samples_per_second": 4.997,
"eval_steps_per_second": 1.249,
"eval_sys_len": 16598,
"eval_totals_1": 16598,
"eval_totals_2": 14394,
"eval_totals_3": 12190,
"eval_totals_4": 9986,
"step": 218
},
{
"epoch": 6.98,
"learning_rate": 0.0001,
"loss": 1.8573,
"step": 254
},
{
"epoch": 6.98,
"eval_bleu": 11.8292,
"eval_bp": 0.7631,
"eval_counts_1": 7531,
"eval_counts_2": 2758,
"eval_counts_3": 1313,
"eval_counts_4": 641,
"eval_exact_match": 0.0163,
"eval_f1": 0.327,
"eval_gen_len": 12.3035,
"eval_loss": 1.4735780954360962,
"eval_precisions_1": 45.0203,
"eval_precisions_2": 18.9893,
"eval_precisions_3": 10.6575,
"eval_precisions_4": 6.3365,
"eval_ref_len": 21250,
"eval_rouge1": 0.3349,
"eval_rouge2": 0.1717,
"eval_rougeL": 0.3216,
"eval_rougeLsum": 0.3216,
"eval_runtime": 442.6304,
"eval_samples_per_second": 4.979,
"eval_steps_per_second": 1.245,
"eval_sys_len": 16728,
"eval_totals_1": 16728,
"eval_totals_2": 14524,
"eval_totals_3": 12320,
"eval_totals_4": 10116,
"step": 254
},
{
"epoch": 8.0,
"learning_rate": 0.0001,
"loss": 1.7361,
"step": 291
},
{
"epoch": 8.0,
"eval_bleu": 12.2208,
"eval_bp": 0.7747,
"eval_counts_1": 7658,
"eval_counts_2": 2849,
"eval_counts_3": 1368,
"eval_counts_4": 668,
"eval_exact_match": 0.0181,
"eval_f1": 0.3334,
"eval_gen_len": 12.4628,
"eval_loss": 1.4544174671173096,
"eval_precisions_1": 45.2387,
"eval_precisions_2": 19.3494,
"eval_precisions_3": 10.9265,
"eval_precisions_4": 6.4754,
"eval_ref_len": 21250,
"eval_rouge1": 0.3414,
"eval_rouge2": 0.1762,
"eval_rougeL": 0.3283,
"eval_rougeLsum": 0.3284,
"eval_runtime": 442.3648,
"eval_samples_per_second": 4.982,
"eval_steps_per_second": 1.246,
"eval_sys_len": 16928,
"eval_totals_1": 16928,
"eval_totals_2": 14724,
"eval_totals_3": 12520,
"eval_totals_4": 10316,
"step": 291
},
{
"epoch": 8.99,
"learning_rate": 0.0001,
"loss": 1.7162,
"step": 327
},
{
"epoch": 8.99,
"eval_bleu": 12.4536,
"eval_bp": 0.767,
"eval_counts_1": 7703,
"eval_counts_2": 2891,
"eval_counts_3": 1390,
"eval_counts_4": 694,
"eval_exact_match": 0.0159,
"eval_f1": 0.3374,
"eval_gen_len": 12.4174,
"eval_loss": 1.4459445476531982,
"eval_precisions_1": 45.8648,
"eval_precisions_2": 19.8136,
"eval_precisions_3": 11.2214,
"eval_precisions_4": 6.8153,
"eval_ref_len": 21250,
"eval_rouge1": 0.3454,
"eval_rouge2": 0.1785,
"eval_rougeL": 0.3325,
"eval_rougeLsum": 0.3323,
"eval_runtime": 436.4836,
"eval_samples_per_second": 5.049,
"eval_steps_per_second": 1.262,
"eval_sys_len": 16795,
"eval_totals_1": 16795,
"eval_totals_2": 14591,
"eval_totals_3": 12387,
"eval_totals_4": 10183,
"step": 327
},
{
"epoch": 9.98,
"learning_rate": 0.0001,
"loss": 1.6589,
"step": 363
},
{
"epoch": 9.98,
"eval_bleu": 12.8553,
"eval_bp": 0.8002,
"eval_counts_1": 7889,
"eval_counts_2": 2983,
"eval_counts_3": 1449,
"eval_counts_4": 719,
"eval_exact_match": 0.0172,
"eval_f1": 0.3435,
"eval_gen_len": 12.7101,
"eval_loss": 1.438312292098999,
"eval_precisions_1": 45.4017,
"eval_precisions_2": 19.6612,
"eval_precisions_3": 11.1737,
"eval_precisions_4": 6.6797,
"eval_ref_len": 21250,
"eval_rouge1": 0.3519,
"eval_rouge2": 0.1816,
"eval_rougeL": 0.3375,
"eval_rougeLsum": 0.3372,
"eval_runtime": 449.3427,
"eval_samples_per_second": 4.905,
"eval_steps_per_second": 1.226,
"eval_sys_len": 17376,
"eval_totals_1": 17376,
"eval_totals_2": 15172,
"eval_totals_3": 12968,
"eval_totals_4": 10764,
"step": 363
},
{
"epoch": 10.99,
"learning_rate": 0.0001,
"loss": 1.5571,
"step": 400
},
{
"epoch": 10.99,
"eval_bleu": 12.9671,
"eval_bp": 0.7894,
"eval_counts_1": 7889,
"eval_counts_2": 2994,
"eval_counts_3": 1457,
"eval_counts_4": 736,
"eval_exact_match": 0.02,
"eval_f1": 0.3457,
"eval_gen_len": 12.6466,
"eval_loss": 1.4213731288909912,
"eval_precisions_1": 45.9063,
"eval_precisions_2": 19.9853,
"eval_precisions_3": 11.4033,
"eval_precisions_4": 6.9611,
"eval_ref_len": 21250,
"eval_rouge1": 0.3529,
"eval_rouge2": 0.1845,
"eval_rougeL": 0.3392,
"eval_rougeLsum": 0.3393,
"eval_runtime": 440.5687,
"eval_samples_per_second": 5.003,
"eval_steps_per_second": 1.251,
"eval_sys_len": 17185,
"eval_totals_1": 17185,
"eval_totals_2": 14981,
"eval_totals_3": 12777,
"eval_totals_4": 10573,
"step": 400
},
{
"epoch": 11.98,
"learning_rate": 0.0001,
"loss": 1.5502,
"step": 436
},
{
"epoch": 11.98,
"eval_bleu": 13.0741,
"eval_bp": 0.7712,
"eval_counts_1": 7930,
"eval_counts_2": 3008,
"eval_counts_3": 1477,
"eval_counts_4": 741,
"eval_exact_match": 0.0213,
"eval_f1": 0.3541,
"eval_gen_len": 12.4483,
"eval_loss": 1.4135174751281738,
"eval_precisions_1": 47.0121,
"eval_precisions_2": 20.5128,
"eval_precisions_3": 11.8539,
"eval_precisions_4": 7.225,
"eval_ref_len": 21250,
"eval_rouge1": 0.3619,
"eval_rouge2": 0.189,
"eval_rougeL": 0.3492,
"eval_rougeLsum": 0.3491,
"eval_runtime": 443.1145,
"eval_samples_per_second": 4.974,
"eval_steps_per_second": 1.243,
"eval_sys_len": 16868,
"eval_totals_1": 16868,
"eval_totals_2": 14664,
"eval_totals_3": 12460,
"eval_totals_4": 10256,
"step": 436
},
{
"epoch": 13.0,
"learning_rate": 0.0001,
"loss": 1.4564,
"step": 473
},
{
"epoch": 13.0,
"eval_bleu": 14.1014,
"eval_bp": 0.8309,
"eval_counts_1": 8268,
"eval_counts_2": 3200,
"eval_counts_3": 1616,
"eval_counts_4": 837,
"eval_exact_match": 0.0218,
"eval_f1": 0.3647,
"eval_gen_len": 13.2441,
"eval_loss": 1.3942722082138062,
"eval_precisions_1": 46.1152,
"eval_precisions_2": 20.3498,
"eval_precisions_3": 11.9518,
"eval_precisions_4": 7.396,
"eval_ref_len": 21250,
"eval_rouge1": 0.3729,
"eval_rouge2": 0.1974,
"eval_rougeL": 0.3578,
"eval_rougeLsum": 0.3576,
"eval_runtime": 460.2282,
"eval_samples_per_second": 4.789,
"eval_steps_per_second": 1.197,
"eval_sys_len": 17929,
"eval_totals_1": 17929,
"eval_totals_2": 15725,
"eval_totals_3": 13521,
"eval_totals_4": 11317,
"step": 473
},
{
"epoch": 13.99,
"learning_rate": 0.0001,
"loss": 1.4522,
"step": 509
},
{
"epoch": 13.99,
"eval_bleu": 13.7526,
"eval_bp": 0.7667,
"eval_counts_1": 8047,
"eval_counts_2": 3130,
"eval_counts_3": 1564,
"eval_counts_4": 811,
"eval_exact_match": 0.0227,
"eval_f1": 0.3627,
"eval_gen_len": 12.515,
"eval_loss": 1.3952871561050415,
"eval_precisions_1": 47.9302,
"eval_precisions_2": 21.4604,
"eval_precisions_3": 12.6323,
"eval_precisions_4": 7.9689,
"eval_ref_len": 21250,
"eval_rouge1": 0.3712,
"eval_rouge2": 0.197,
"eval_rougeL": 0.3582,
"eval_rougeLsum": 0.3581,
"eval_runtime": 437.5396,
"eval_samples_per_second": 5.037,
"eval_steps_per_second": 1.259,
"eval_sys_len": 16789,
"eval_totals_1": 16789,
"eval_totals_2": 14585,
"eval_totals_3": 12381,
"eval_totals_4": 10177,
"step": 509
},
{
"epoch": 14.98,
"learning_rate": 0.0001,
"loss": 1.407,
"step": 545
},
{
"epoch": 14.98,
"eval_bleu": 14.7315,
"eval_bp": 0.8306,
"eval_counts_1": 8498,
"eval_counts_2": 3358,
"eval_counts_3": 1703,
"eval_counts_4": 877,
"eval_exact_match": 0.0213,
"eval_f1": 0.3772,
"eval_gen_len": 13.2849,
"eval_loss": 1.3759350776672363,
"eval_precisions_1": 47.4139,
"eval_precisions_2": 21.3627,
"eval_precisions_3": 12.6008,
"eval_precisions_4": 7.7535,
"eval_ref_len": 21250,
"eval_rouge1": 0.3856,
"eval_rouge2": 0.2063,
"eval_rougeL": 0.3709,
"eval_rougeLsum": 0.3706,
"eval_runtime": 453.6157,
"eval_samples_per_second": 4.859,
"eval_steps_per_second": 1.215,
"eval_sys_len": 17923,
"eval_totals_1": 17923,
"eval_totals_2": 15719,
"eval_totals_3": 13515,
"eval_totals_4": 11311,
"step": 545
},
{
"epoch": 15.99,
"learning_rate": 0.0001,
"loss": 1.3294,
"step": 582
},
{
"epoch": 15.99,
"eval_bleu": 14.868,
"eval_bp": 0.8044,
"eval_counts_1": 8481,
"eval_counts_2": 3407,
"eval_counts_3": 1721,
"eval_counts_4": 883,
"eval_exact_match": 0.024,
"eval_f1": 0.3822,
"eval_gen_len": 12.9142,
"eval_loss": 1.3775662183761597,
"eval_precisions_1": 48.5989,
"eval_precisions_2": 22.3454,
"eval_precisions_3": 13.1948,
"eval_precisions_4": 8.1465,
"eval_ref_len": 21250,
"eval_rouge1": 0.3907,
"eval_rouge2": 0.211,
"eval_rougeL": 0.3766,
"eval_rougeLsum": 0.3766,
"eval_runtime": 448.6685,
"eval_samples_per_second": 4.912,
"eval_steps_per_second": 1.228,
"eval_sys_len": 17451,
"eval_totals_1": 17451,
"eval_totals_2": 15247,
"eval_totals_3": 13043,
"eval_totals_4": 10839,
"step": 582
},
{
"epoch": 16.98,
"learning_rate": 0.0001,
"loss": 1.3294,
"step": 618
},
{
"epoch": 16.98,
"eval_bleu": 15.2312,
"eval_bp": 0.835,
"eval_counts_1": 8633,
"eval_counts_2": 3464,
"eval_counts_3": 1767,
"eval_counts_4": 923,
"eval_exact_match": 0.0263,
"eval_f1": 0.3868,
"eval_gen_len": 13.3103,
"eval_loss": 1.380259394645691,
"eval_precisions_1": 47.9505,
"eval_precisions_2": 21.9241,
"eval_precisions_3": 12.9965,
"eval_precisions_4": 8.1022,
"eval_ref_len": 21250,
"eval_rouge1": 0.3946,
"eval_rouge2": 0.2133,
"eval_rougeL": 0.3801,
"eval_rougeLsum": 0.3798,
"eval_runtime": 456.612,
"eval_samples_per_second": 4.827,
"eval_steps_per_second": 1.207,
"eval_sys_len": 18004,
"eval_totals_1": 18004,
"eval_totals_2": 15800,
"eval_totals_3": 13596,
"eval_totals_4": 11392,
"step": 618
},
{
"epoch": 18.0,
"learning_rate": 0.0001,
"loss": 1.2605,
"step": 655
},
{
"epoch": 18.0,
"eval_bleu": 14.779,
"eval_bp": 0.8255,
"eval_counts_1": 8560,
"eval_counts_2": 3376,
"eval_counts_3": 1695,
"eval_counts_4": 880,
"eval_exact_match": 0.0231,
"eval_f1": 0.3846,
"eval_gen_len": 13.1665,
"eval_loss": 1.3709588050842285,
"eval_precisions_1": 48.009,
"eval_precisions_2": 21.605,
"eval_precisions_3": 12.6285,
"eval_precisions_4": 7.8445,
"eval_ref_len": 21250,
"eval_rouge1": 0.3922,
"eval_rouge2": 0.2092,
"eval_rougeL": 0.3778,
"eval_rougeLsum": 0.3775,
"eval_runtime": 456.164,
"eval_samples_per_second": 4.832,
"eval_steps_per_second": 1.208,
"eval_sys_len": 17830,
"eval_totals_1": 17830,
"eval_totals_2": 15626,
"eval_totals_3": 13422,
"eval_totals_4": 11218,
"step": 655
},
{
"epoch": 18.99,
"learning_rate": 0.0001,
"loss": 1.2667,
"step": 691
},
{
"epoch": 18.99,
"eval_bleu": 15.0008,
"eval_bp": 0.8257,
"eval_counts_1": 8664,
"eval_counts_2": 3455,
"eval_counts_3": 1733,
"eval_counts_4": 882,
"eval_exact_match": 0.0227,
"eval_f1": 0.3906,
"eval_gen_len": 13.2232,
"eval_loss": 1.3694192171096802,
"eval_precisions_1": 48.5814,
"eval_precisions_2": 22.1049,
"eval_precisions_3": 12.9078,
"eval_precisions_4": 7.8596,
"eval_ref_len": 21250,
"eval_rouge1": 0.3987,
"eval_rouge2": 0.2138,
"eval_rougeL": 0.3853,
"eval_rougeLsum": 0.3851,
"eval_runtime": 454.2362,
"eval_samples_per_second": 4.852,
"eval_steps_per_second": 1.213,
"eval_sys_len": 17834,
"eval_totals_1": 17834,
"eval_totals_2": 15630,
"eval_totals_3": 13426,
"eval_totals_4": 11222,
"step": 691
},
{
"epoch": 19.79,
"learning_rate": 0.0001,
"loss": 1.2074,
"step": 720
},
{
"epoch": 19.79,
"eval_bleu": 15.0442,
"eval_bp": 0.8369,
"eval_counts_1": 8770,
"eval_counts_2": 3465,
"eval_counts_3": 1737,
"eval_counts_4": 880,
"eval_exact_match": 0.0227,
"eval_f1": 0.3941,
"eval_gen_len": 13.4424,
"eval_loss": 1.365785837173462,
"eval_precisions_1": 48.6169,
"eval_precisions_2": 21.8819,
"eval_precisions_3": 12.743,
"eval_precisions_4": 7.7011,
"eval_ref_len": 21250,
"eval_rouge1": 0.4025,
"eval_rouge2": 0.215,
"eval_rougeL": 0.3883,
"eval_rougeLsum": 0.3879,
"eval_runtime": 459.1457,
"eval_samples_per_second": 4.8,
"eval_steps_per_second": 1.2,
"eval_sys_len": 18039,
"eval_totals_1": 18039,
"eval_totals_2": 15835,
"eval_totals_3": 13631,
"eval_totals_4": 11427,
"step": 720
},
{
"epoch": 19.79,
"step": 720,
"total_flos": 4.419252384883016e+17,
"train_loss": 2.0875697082943385,
"train_runtime": 23544.6757,
"train_samples_per_second": 7.912,
"train_steps_per_second": 0.031
}
],
"logging_steps": 500,
"max_steps": 720,
"num_train_epochs": 20,
"save_steps": 500,
"total_flos": 4.419252384883016e+17,
"trial_name": null,
"trial_params": null
}