iwslt_aligned_smallT5_cont0 / trainer_state.json
paulh27's picture
End of training
8933820 verified
raw
history blame
87.5 kB
{
"best_metric": 65.6358,
"best_model_checkpoint": "/tmp/finetuned_models/iwslt_aligned_smallT5_cont0/checkpoint-490000",
"epoch": 38.81384878124515,
"eval_steps": 10000,
"global_step": 500000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.08,
"grad_norm": 3.0081684589385986,
"learning_rate": 0.0002,
"loss": 3.3371,
"step": 1000
},
{
"epoch": 0.16,
"grad_norm": 2.347439765930176,
"learning_rate": 0.0002,
"loss": 1.9825,
"step": 2000
},
{
"epoch": 0.23,
"grad_norm": 3.2636098861694336,
"learning_rate": 0.0002,
"loss": 1.7503,
"step": 3000
},
{
"epoch": 0.31,
"grad_norm": 2.1704177856445312,
"learning_rate": 0.0002,
"loss": 1.6063,
"step": 4000
},
{
"epoch": 0.39,
"grad_norm": 1.8257893323898315,
"learning_rate": 0.0002,
"loss": 1.5193,
"step": 5000
},
{
"epoch": 0.47,
"grad_norm": 2.1923136711120605,
"learning_rate": 0.0002,
"loss": 1.4439,
"step": 6000
},
{
"epoch": 0.54,
"grad_norm": 2.198930501937866,
"learning_rate": 0.0002,
"loss": 1.3742,
"step": 7000
},
{
"epoch": 0.62,
"grad_norm": 1.8519206047058105,
"learning_rate": 0.0002,
"loss": 1.3327,
"step": 8000
},
{
"epoch": 0.7,
"grad_norm": 2.371457576751709,
"learning_rate": 0.0002,
"loss": 1.2872,
"step": 9000
},
{
"epoch": 0.78,
"grad_norm": 2.2528538703918457,
"learning_rate": 0.0002,
"loss": 1.2426,
"step": 10000
},
{
"epoch": 0.78,
"eval_bleu": 46.2793,
"eval_gen_len": 28.6532,
"eval_loss": 0.8300400972366333,
"eval_runtime": 100.4495,
"eval_samples_per_second": 8.84,
"eval_steps_per_second": 1.105,
"step": 10000
},
{
"epoch": 0.85,
"grad_norm": 1.8507146835327148,
"learning_rate": 0.0002,
"loss": 1.2154,
"step": 11000
},
{
"epoch": 0.93,
"grad_norm": 1.629703164100647,
"learning_rate": 0.0002,
"loss": 1.1822,
"step": 12000
},
{
"epoch": 1.01,
"grad_norm": 1.9470340013504028,
"learning_rate": 0.0002,
"loss": 1.1505,
"step": 13000
},
{
"epoch": 1.09,
"grad_norm": 1.733299970626831,
"learning_rate": 0.0002,
"loss": 1.0877,
"step": 14000
},
{
"epoch": 1.16,
"grad_norm": 1.5334522724151611,
"learning_rate": 0.0002,
"loss": 1.0647,
"step": 15000
},
{
"epoch": 1.24,
"grad_norm": 1.8881198167800903,
"learning_rate": 0.0002,
"loss": 1.0595,
"step": 16000
},
{
"epoch": 1.32,
"grad_norm": 2.1781082153320312,
"learning_rate": 0.0002,
"loss": 1.0441,
"step": 17000
},
{
"epoch": 1.4,
"grad_norm": 1.509994387626648,
"learning_rate": 0.0002,
"loss": 1.0307,
"step": 18000
},
{
"epoch": 1.47,
"grad_norm": 2.5609610080718994,
"learning_rate": 0.0002,
"loss": 1.0165,
"step": 19000
},
{
"epoch": 1.55,
"grad_norm": 1.512005090713501,
"learning_rate": 0.0002,
"loss": 0.9931,
"step": 20000
},
{
"epoch": 1.55,
"eval_bleu": 52.2709,
"eval_gen_len": 28.6441,
"eval_loss": 0.6755661368370056,
"eval_runtime": 99.4907,
"eval_samples_per_second": 8.925,
"eval_steps_per_second": 1.116,
"step": 20000
},
{
"epoch": 1.63,
"grad_norm": 1.4830211400985718,
"learning_rate": 0.0002,
"loss": 0.9854,
"step": 21000
},
{
"epoch": 1.71,
"grad_norm": 1.8581557273864746,
"learning_rate": 0.0002,
"loss": 0.9736,
"step": 22000
},
{
"epoch": 1.79,
"grad_norm": 1.589917778968811,
"learning_rate": 0.0002,
"loss": 0.9642,
"step": 23000
},
{
"epoch": 1.86,
"grad_norm": 2.332538604736328,
"learning_rate": 0.0002,
"loss": 0.9476,
"step": 24000
},
{
"epoch": 1.94,
"grad_norm": 1.61520516872406,
"learning_rate": 0.0002,
"loss": 0.943,
"step": 25000
},
{
"epoch": 2.02,
"grad_norm": 1.2320265769958496,
"learning_rate": 0.0002,
"loss": 0.9135,
"step": 26000
},
{
"epoch": 2.1,
"grad_norm": 1.543626308441162,
"learning_rate": 0.0002,
"loss": 0.8763,
"step": 27000
},
{
"epoch": 2.17,
"grad_norm": 1.7634472846984863,
"learning_rate": 0.0002,
"loss": 0.8664,
"step": 28000
},
{
"epoch": 2.25,
"grad_norm": 1.1254847049713135,
"learning_rate": 0.0002,
"loss": 0.8625,
"step": 29000
},
{
"epoch": 2.33,
"grad_norm": 1.390243649482727,
"learning_rate": 0.0002,
"loss": 0.8573,
"step": 30000
},
{
"epoch": 2.33,
"eval_bleu": 55.8294,
"eval_gen_len": 28.5405,
"eval_loss": 0.6142958998680115,
"eval_runtime": 100.5081,
"eval_samples_per_second": 8.835,
"eval_steps_per_second": 1.104,
"step": 30000
},
{
"epoch": 2.41,
"grad_norm": 1.592313289642334,
"learning_rate": 0.0002,
"loss": 0.855,
"step": 31000
},
{
"epoch": 2.48,
"grad_norm": 1.1361020803451538,
"learning_rate": 0.0002,
"loss": 0.8487,
"step": 32000
},
{
"epoch": 2.56,
"grad_norm": 1.2688100337982178,
"learning_rate": 0.0002,
"loss": 0.8417,
"step": 33000
},
{
"epoch": 2.64,
"grad_norm": 1.3345963954925537,
"learning_rate": 0.0002,
"loss": 0.829,
"step": 34000
},
{
"epoch": 2.72,
"grad_norm": 1.4640212059020996,
"learning_rate": 0.0002,
"loss": 0.8264,
"step": 35000
},
{
"epoch": 2.79,
"grad_norm": 1.4818629026412964,
"learning_rate": 0.0002,
"loss": 0.8207,
"step": 36000
},
{
"epoch": 2.87,
"grad_norm": 1.4580553770065308,
"learning_rate": 0.0002,
"loss": 0.8155,
"step": 37000
},
{
"epoch": 2.95,
"grad_norm": 1.8713078498840332,
"learning_rate": 0.0002,
"loss": 0.8134,
"step": 38000
},
{
"epoch": 3.03,
"grad_norm": 1.295332670211792,
"learning_rate": 0.0002,
"loss": 0.787,
"step": 39000
},
{
"epoch": 3.11,
"grad_norm": 1.532378077507019,
"learning_rate": 0.0002,
"loss": 0.762,
"step": 40000
},
{
"epoch": 3.11,
"eval_bleu": 57.5135,
"eval_gen_len": 28.366,
"eval_loss": 0.581108570098877,
"eval_runtime": 98.8348,
"eval_samples_per_second": 8.985,
"eval_steps_per_second": 1.123,
"step": 40000
},
{
"epoch": 3.18,
"grad_norm": 1.3924858570098877,
"learning_rate": 0.0002,
"loss": 0.7577,
"step": 41000
},
{
"epoch": 3.26,
"grad_norm": 1.4161888360977173,
"learning_rate": 0.0002,
"loss": 0.7482,
"step": 42000
},
{
"epoch": 3.34,
"grad_norm": 1.639460802078247,
"learning_rate": 0.0002,
"loss": 0.7582,
"step": 43000
},
{
"epoch": 3.42,
"grad_norm": 1.3779182434082031,
"learning_rate": 0.0002,
"loss": 0.7474,
"step": 44000
},
{
"epoch": 3.49,
"grad_norm": 1.8350883722305298,
"learning_rate": 0.0002,
"loss": 0.751,
"step": 45000
},
{
"epoch": 3.57,
"grad_norm": 1.2075496912002563,
"learning_rate": 0.0002,
"loss": 0.7371,
"step": 46000
},
{
"epoch": 3.65,
"grad_norm": 1.3083984851837158,
"learning_rate": 0.0002,
"loss": 0.7318,
"step": 47000
},
{
"epoch": 3.73,
"grad_norm": 1.5021324157714844,
"learning_rate": 0.0002,
"loss": 0.7376,
"step": 48000
},
{
"epoch": 3.8,
"grad_norm": 1.1597286462783813,
"learning_rate": 0.0002,
"loss": 0.7336,
"step": 49000
},
{
"epoch": 3.88,
"grad_norm": 1.3814338445663452,
"learning_rate": 0.0002,
"loss": 0.734,
"step": 50000
},
{
"epoch": 3.88,
"eval_bleu": 58.6125,
"eval_gen_len": 28.5101,
"eval_loss": 0.5499288439750671,
"eval_runtime": 99.2548,
"eval_samples_per_second": 8.947,
"eval_steps_per_second": 1.118,
"step": 50000
},
{
"epoch": 3.96,
"grad_norm": 1.0968077182769775,
"learning_rate": 0.0002,
"loss": 0.7288,
"step": 51000
},
{
"epoch": 4.04,
"grad_norm": 1.9418740272521973,
"learning_rate": 0.0002,
"loss": 0.7057,
"step": 52000
},
{
"epoch": 4.11,
"grad_norm": 1.1883801221847534,
"learning_rate": 0.0002,
"loss": 0.6761,
"step": 53000
},
{
"epoch": 4.19,
"grad_norm": 1.1581670045852661,
"learning_rate": 0.0002,
"loss": 0.6812,
"step": 54000
},
{
"epoch": 4.27,
"grad_norm": 1.4657800197601318,
"learning_rate": 0.0002,
"loss": 0.6783,
"step": 55000
},
{
"epoch": 4.35,
"grad_norm": 1.1991990804672241,
"learning_rate": 0.0002,
"loss": 0.6764,
"step": 56000
},
{
"epoch": 4.42,
"grad_norm": 1.5376391410827637,
"learning_rate": 0.0002,
"loss": 0.6805,
"step": 57000
},
{
"epoch": 4.5,
"grad_norm": 1.228727102279663,
"learning_rate": 0.0002,
"loss": 0.6734,
"step": 58000
},
{
"epoch": 4.58,
"grad_norm": 0.949891984462738,
"learning_rate": 0.0002,
"loss": 0.6702,
"step": 59000
},
{
"epoch": 4.66,
"grad_norm": 1.5632683038711548,
"learning_rate": 0.0002,
"loss": 0.6722,
"step": 60000
},
{
"epoch": 4.66,
"eval_bleu": 59.6427,
"eval_gen_len": 28.8356,
"eval_loss": 0.522808849811554,
"eval_runtime": 101.7798,
"eval_samples_per_second": 8.725,
"eval_steps_per_second": 1.091,
"step": 60000
},
{
"epoch": 4.74,
"grad_norm": 1.2885149717330933,
"learning_rate": 0.0002,
"loss": 0.67,
"step": 61000
},
{
"epoch": 4.81,
"grad_norm": 1.7992392778396606,
"learning_rate": 0.0002,
"loss": 0.6686,
"step": 62000
},
{
"epoch": 4.89,
"grad_norm": 1.7027188539505005,
"learning_rate": 0.0002,
"loss": 0.6682,
"step": 63000
},
{
"epoch": 4.97,
"grad_norm": 1.2083909511566162,
"learning_rate": 0.0002,
"loss": 0.6545,
"step": 64000
},
{
"epoch": 5.05,
"grad_norm": 1.1508337259292603,
"learning_rate": 0.0002,
"loss": 0.6308,
"step": 65000
},
{
"epoch": 5.12,
"grad_norm": 1.0901002883911133,
"learning_rate": 0.0002,
"loss": 0.6192,
"step": 66000
},
{
"epoch": 5.2,
"grad_norm": 1.3939250707626343,
"learning_rate": 0.0002,
"loss": 0.6217,
"step": 67000
},
{
"epoch": 5.28,
"grad_norm": 0.9105481505393982,
"learning_rate": 0.0002,
"loss": 0.6177,
"step": 68000
},
{
"epoch": 5.36,
"grad_norm": 0.944652795791626,
"learning_rate": 0.0002,
"loss": 0.6227,
"step": 69000
},
{
"epoch": 5.43,
"grad_norm": 1.1488838195800781,
"learning_rate": 0.0002,
"loss": 0.6215,
"step": 70000
},
{
"epoch": 5.43,
"eval_bleu": 60.4701,
"eval_gen_len": 28.7534,
"eval_loss": 0.5161064863204956,
"eval_runtime": 100.914,
"eval_samples_per_second": 8.8,
"eval_steps_per_second": 1.1,
"step": 70000
},
{
"epoch": 5.51,
"grad_norm": 1.0799453258514404,
"learning_rate": 0.0002,
"loss": 0.6173,
"step": 71000
},
{
"epoch": 5.59,
"grad_norm": 1.2405527830123901,
"learning_rate": 0.0002,
"loss": 0.6229,
"step": 72000
},
{
"epoch": 5.67,
"grad_norm": 1.045590877532959,
"learning_rate": 0.0002,
"loss": 0.6183,
"step": 73000
},
{
"epoch": 5.74,
"grad_norm": 1.4318771362304688,
"learning_rate": 0.0002,
"loss": 0.6146,
"step": 74000
},
{
"epoch": 5.82,
"grad_norm": 1.0059374570846558,
"learning_rate": 0.0002,
"loss": 0.6175,
"step": 75000
},
{
"epoch": 5.9,
"grad_norm": 1.0831586122512817,
"learning_rate": 0.0002,
"loss": 0.6165,
"step": 76000
},
{
"epoch": 5.98,
"grad_norm": 1.4094606637954712,
"learning_rate": 0.0002,
"loss": 0.6158,
"step": 77000
},
{
"epoch": 6.05,
"grad_norm": 1.2640748023986816,
"learning_rate": 0.0002,
"loss": 0.5827,
"step": 78000
},
{
"epoch": 6.13,
"grad_norm": 1.0088295936584473,
"learning_rate": 0.0002,
"loss": 0.5699,
"step": 79000
},
{
"epoch": 6.21,
"grad_norm": 0.9942078590393066,
"learning_rate": 0.0002,
"loss": 0.5756,
"step": 80000
},
{
"epoch": 6.21,
"eval_bleu": 62.0864,
"eval_gen_len": 28.6498,
"eval_loss": 0.5067651867866516,
"eval_runtime": 99.2392,
"eval_samples_per_second": 8.948,
"eval_steps_per_second": 1.119,
"step": 80000
},
{
"epoch": 6.29,
"grad_norm": 1.838376522064209,
"learning_rate": 0.0002,
"loss": 0.5784,
"step": 81000
},
{
"epoch": 6.37,
"grad_norm": 1.477137804031372,
"learning_rate": 0.0002,
"loss": 0.5743,
"step": 82000
},
{
"epoch": 6.44,
"grad_norm": 1.1481742858886719,
"learning_rate": 0.0002,
"loss": 0.578,
"step": 83000
},
{
"epoch": 6.52,
"grad_norm": 1.1412264108657837,
"learning_rate": 0.0002,
"loss": 0.5785,
"step": 84000
},
{
"epoch": 6.6,
"grad_norm": 0.9707184433937073,
"learning_rate": 0.0002,
"loss": 0.5746,
"step": 85000
},
{
"epoch": 6.68,
"grad_norm": 0.9683183431625366,
"learning_rate": 0.0002,
"loss": 0.5794,
"step": 86000
},
{
"epoch": 6.75,
"grad_norm": 1.2322285175323486,
"learning_rate": 0.0002,
"loss": 0.5765,
"step": 87000
},
{
"epoch": 6.83,
"grad_norm": 1.1276684999465942,
"learning_rate": 0.0002,
"loss": 0.5767,
"step": 88000
},
{
"epoch": 6.91,
"grad_norm": 1.4353203773498535,
"learning_rate": 0.0002,
"loss": 0.5776,
"step": 89000
},
{
"epoch": 6.99,
"grad_norm": 1.116827368736267,
"learning_rate": 0.0002,
"loss": 0.5738,
"step": 90000
},
{
"epoch": 6.99,
"eval_bleu": 61.9714,
"eval_gen_len": 28.5788,
"eval_loss": 0.5005396008491516,
"eval_runtime": 98.6774,
"eval_samples_per_second": 8.999,
"eval_steps_per_second": 1.125,
"step": 90000
},
{
"epoch": 7.06,
"grad_norm": 1.0088461637496948,
"learning_rate": 0.0002,
"loss": 0.538,
"step": 91000
},
{
"epoch": 7.14,
"grad_norm": 1.1016899347305298,
"learning_rate": 0.0002,
"loss": 0.5349,
"step": 92000
},
{
"epoch": 7.22,
"grad_norm": 1.4406321048736572,
"learning_rate": 0.0002,
"loss": 0.5408,
"step": 93000
},
{
"epoch": 7.3,
"grad_norm": 1.1226301193237305,
"learning_rate": 0.0002,
"loss": 0.5362,
"step": 94000
},
{
"epoch": 7.37,
"grad_norm": 1.4062280654907227,
"learning_rate": 0.0002,
"loss": 0.5381,
"step": 95000
},
{
"epoch": 7.45,
"grad_norm": 1.2851547002792358,
"learning_rate": 0.0002,
"loss": 0.5438,
"step": 96000
},
{
"epoch": 7.53,
"grad_norm": 0.9344896078109741,
"learning_rate": 0.0002,
"loss": 0.5442,
"step": 97000
},
{
"epoch": 7.61,
"grad_norm": 1.7169030904769897,
"learning_rate": 0.0002,
"loss": 0.539,
"step": 98000
},
{
"epoch": 7.69,
"grad_norm": 1.1855800151824951,
"learning_rate": 0.0002,
"loss": 0.543,
"step": 99000
},
{
"epoch": 7.76,
"grad_norm": 1.093122959136963,
"learning_rate": 0.0002,
"loss": 0.5384,
"step": 100000
},
{
"epoch": 7.76,
"eval_bleu": 62.407,
"eval_gen_len": 28.5282,
"eval_loss": 0.49085742235183716,
"eval_runtime": 98.8889,
"eval_samples_per_second": 8.98,
"eval_steps_per_second": 1.122,
"step": 100000
},
{
"epoch": 7.84,
"grad_norm": 1.3849202394485474,
"learning_rate": 0.0002,
"loss": 0.5381,
"step": 101000
},
{
"epoch": 7.92,
"grad_norm": 1.12773859500885,
"learning_rate": 0.0002,
"loss": 0.544,
"step": 102000
},
{
"epoch": 8.0,
"grad_norm": 0.9132428169250488,
"learning_rate": 0.0002,
"loss": 0.5371,
"step": 103000
},
{
"epoch": 8.07,
"grad_norm": 1.5978176593780518,
"learning_rate": 0.0002,
"loss": 0.5015,
"step": 104000
},
{
"epoch": 8.15,
"grad_norm": 1.028082013130188,
"learning_rate": 0.0002,
"loss": 0.4956,
"step": 105000
},
{
"epoch": 8.23,
"grad_norm": 1.0597223043441772,
"learning_rate": 0.0002,
"loss": 0.5074,
"step": 106000
},
{
"epoch": 8.31,
"grad_norm": 1.47709059715271,
"learning_rate": 0.0002,
"loss": 0.5022,
"step": 107000
},
{
"epoch": 8.38,
"grad_norm": 1.1731916666030884,
"learning_rate": 0.0002,
"loss": 0.5039,
"step": 108000
},
{
"epoch": 8.46,
"grad_norm": 1.2147469520568848,
"learning_rate": 0.0002,
"loss": 0.5121,
"step": 109000
},
{
"epoch": 8.54,
"grad_norm": 0.8521010279655457,
"learning_rate": 0.0002,
"loss": 0.5109,
"step": 110000
},
{
"epoch": 8.54,
"eval_bleu": 62.1452,
"eval_gen_len": 28.4617,
"eval_loss": 0.4901648759841919,
"eval_runtime": 98.4744,
"eval_samples_per_second": 9.018,
"eval_steps_per_second": 1.127,
"step": 110000
},
{
"epoch": 8.62,
"grad_norm": 1.250752568244934,
"learning_rate": 0.0002,
"loss": 0.5072,
"step": 111000
},
{
"epoch": 8.69,
"grad_norm": 1.2694823741912842,
"learning_rate": 0.0002,
"loss": 0.5126,
"step": 112000
},
{
"epoch": 8.77,
"grad_norm": 1.0290015935897827,
"learning_rate": 0.0002,
"loss": 0.5065,
"step": 113000
},
{
"epoch": 8.85,
"grad_norm": 1.222034215927124,
"learning_rate": 0.0002,
"loss": 0.514,
"step": 114000
},
{
"epoch": 8.93,
"grad_norm": 1.0359649658203125,
"learning_rate": 0.0002,
"loss": 0.5127,
"step": 115000
},
{
"epoch": 9.0,
"grad_norm": 1.19712495803833,
"learning_rate": 0.0002,
"loss": 0.5114,
"step": 116000
},
{
"epoch": 9.08,
"grad_norm": 1.1766573190689087,
"learning_rate": 0.0002,
"loss": 0.4698,
"step": 117000
},
{
"epoch": 9.16,
"grad_norm": 1.2681427001953125,
"learning_rate": 0.0002,
"loss": 0.4755,
"step": 118000
},
{
"epoch": 9.24,
"grad_norm": 1.2988672256469727,
"learning_rate": 0.0002,
"loss": 0.4772,
"step": 119000
},
{
"epoch": 9.32,
"grad_norm": 1.440721035003662,
"learning_rate": 0.0002,
"loss": 0.4816,
"step": 120000
},
{
"epoch": 9.32,
"eval_bleu": 62.6499,
"eval_gen_len": 28.5518,
"eval_loss": 0.48750796914100647,
"eval_runtime": 99.0223,
"eval_samples_per_second": 8.968,
"eval_steps_per_second": 1.121,
"step": 120000
},
{
"epoch": 9.39,
"grad_norm": 1.038442611694336,
"learning_rate": 0.0002,
"loss": 0.4792,
"step": 121000
},
{
"epoch": 9.47,
"grad_norm": 1.3428473472595215,
"learning_rate": 0.0002,
"loss": 0.4827,
"step": 122000
},
{
"epoch": 9.55,
"grad_norm": 1.4756362438201904,
"learning_rate": 0.0002,
"loss": 0.4832,
"step": 123000
},
{
"epoch": 9.63,
"grad_norm": 1.2109817266464233,
"learning_rate": 0.0002,
"loss": 0.481,
"step": 124000
},
{
"epoch": 9.7,
"grad_norm": 1.2007863521575928,
"learning_rate": 0.0002,
"loss": 0.4815,
"step": 125000
},
{
"epoch": 9.78,
"grad_norm": 1.1711379289627075,
"learning_rate": 0.0002,
"loss": 0.488,
"step": 126000
},
{
"epoch": 9.86,
"grad_norm": 1.1571533679962158,
"learning_rate": 0.0002,
"loss": 0.4827,
"step": 127000
},
{
"epoch": 9.94,
"grad_norm": 1.2341859340667725,
"learning_rate": 0.0002,
"loss": 0.4844,
"step": 128000
},
{
"epoch": 10.01,
"grad_norm": 1.5796501636505127,
"learning_rate": 0.0002,
"loss": 0.4741,
"step": 129000
},
{
"epoch": 10.09,
"grad_norm": 0.8122438788414001,
"learning_rate": 0.0002,
"loss": 0.4493,
"step": 130000
},
{
"epoch": 10.09,
"eval_bleu": 62.6694,
"eval_gen_len": 28.6993,
"eval_loss": 0.4866686761379242,
"eval_runtime": 100.7784,
"eval_samples_per_second": 8.811,
"eval_steps_per_second": 1.101,
"step": 130000
},
{
"epoch": 10.17,
"grad_norm": 1.1835366487503052,
"learning_rate": 0.0002,
"loss": 0.448,
"step": 131000
},
{
"epoch": 10.25,
"grad_norm": 1.0868804454803467,
"learning_rate": 0.0002,
"loss": 0.4517,
"step": 132000
},
{
"epoch": 10.32,
"grad_norm": 0.9316431283950806,
"learning_rate": 0.0002,
"loss": 0.454,
"step": 133000
},
{
"epoch": 10.4,
"grad_norm": 1.5438517332077026,
"learning_rate": 0.0002,
"loss": 0.4526,
"step": 134000
},
{
"epoch": 10.48,
"grad_norm": 1.5842955112457275,
"learning_rate": 0.0002,
"loss": 0.4576,
"step": 135000
},
{
"epoch": 10.56,
"grad_norm": 1.450462818145752,
"learning_rate": 0.0002,
"loss": 0.463,
"step": 136000
},
{
"epoch": 10.63,
"grad_norm": 0.8578802347183228,
"learning_rate": 0.0002,
"loss": 0.4588,
"step": 137000
},
{
"epoch": 10.71,
"grad_norm": 1.1508352756500244,
"learning_rate": 0.0002,
"loss": 0.4542,
"step": 138000
},
{
"epoch": 10.79,
"grad_norm": 1.1183589696884155,
"learning_rate": 0.0002,
"loss": 0.4601,
"step": 139000
},
{
"epoch": 10.87,
"grad_norm": 0.9286684393882751,
"learning_rate": 0.0002,
"loss": 0.4648,
"step": 140000
},
{
"epoch": 10.87,
"eval_bleu": 63.3179,
"eval_gen_len": 28.5495,
"eval_loss": 0.4774705469608307,
"eval_runtime": 98.6639,
"eval_samples_per_second": 9.0,
"eval_steps_per_second": 1.125,
"step": 140000
},
{
"epoch": 10.95,
"grad_norm": 1.3315681219100952,
"learning_rate": 0.0002,
"loss": 0.4627,
"step": 141000
},
{
"epoch": 11.02,
"grad_norm": 1.204750418663025,
"learning_rate": 0.0002,
"loss": 0.4497,
"step": 142000
},
{
"epoch": 11.1,
"grad_norm": 1.0254498720169067,
"learning_rate": 0.0002,
"loss": 0.4242,
"step": 143000
},
{
"epoch": 11.18,
"grad_norm": 1.052018404006958,
"learning_rate": 0.0002,
"loss": 0.4306,
"step": 144000
},
{
"epoch": 11.26,
"grad_norm": 0.9426015019416809,
"learning_rate": 0.0002,
"loss": 0.4275,
"step": 145000
},
{
"epoch": 11.33,
"grad_norm": 1.079633116722107,
"learning_rate": 0.0002,
"loss": 0.4303,
"step": 146000
},
{
"epoch": 11.41,
"grad_norm": 1.025631070137024,
"learning_rate": 0.0002,
"loss": 0.433,
"step": 147000
},
{
"epoch": 11.49,
"grad_norm": 1.21865713596344,
"learning_rate": 0.0002,
"loss": 0.4351,
"step": 148000
},
{
"epoch": 11.57,
"grad_norm": 1.0304579734802246,
"learning_rate": 0.0002,
"loss": 0.4358,
"step": 149000
},
{
"epoch": 11.64,
"grad_norm": 1.297282338142395,
"learning_rate": 0.0002,
"loss": 0.4414,
"step": 150000
},
{
"epoch": 11.64,
"eval_bleu": 63.6928,
"eval_gen_len": 28.4673,
"eval_loss": 0.4786856770515442,
"eval_runtime": 98.6768,
"eval_samples_per_second": 8.999,
"eval_steps_per_second": 1.125,
"step": 150000
},
{
"epoch": 11.72,
"grad_norm": 0.973185658454895,
"learning_rate": 0.0002,
"loss": 0.4375,
"step": 151000
},
{
"epoch": 11.8,
"grad_norm": 0.9765141010284424,
"learning_rate": 0.0002,
"loss": 0.4398,
"step": 152000
},
{
"epoch": 11.88,
"grad_norm": 1.1599891185760498,
"learning_rate": 0.0002,
"loss": 0.4422,
"step": 153000
},
{
"epoch": 11.95,
"grad_norm": 1.0690301656723022,
"learning_rate": 0.0002,
"loss": 0.4396,
"step": 154000
},
{
"epoch": 12.03,
"grad_norm": 0.9184726476669312,
"learning_rate": 0.0002,
"loss": 0.4232,
"step": 155000
},
{
"epoch": 12.11,
"grad_norm": 1.1572961807250977,
"learning_rate": 0.0002,
"loss": 0.4038,
"step": 156000
},
{
"epoch": 12.19,
"grad_norm": 1.1003015041351318,
"learning_rate": 0.0002,
"loss": 0.4088,
"step": 157000
},
{
"epoch": 12.27,
"grad_norm": 1.147965908050537,
"learning_rate": 0.0002,
"loss": 0.4099,
"step": 158000
},
{
"epoch": 12.34,
"grad_norm": 1.3417842388153076,
"learning_rate": 0.0002,
"loss": 0.4108,
"step": 159000
},
{
"epoch": 12.42,
"grad_norm": 0.9816053509712219,
"learning_rate": 0.0002,
"loss": 0.4158,
"step": 160000
},
{
"epoch": 12.42,
"eval_bleu": 63.8752,
"eval_gen_len": 28.5011,
"eval_loss": 0.47919762134552,
"eval_runtime": 98.6149,
"eval_samples_per_second": 9.005,
"eval_steps_per_second": 1.126,
"step": 160000
},
{
"epoch": 12.5,
"grad_norm": 1.1307754516601562,
"learning_rate": 0.0002,
"loss": 0.4139,
"step": 161000
},
{
"epoch": 12.58,
"grad_norm": 1.2909305095672607,
"learning_rate": 0.0002,
"loss": 0.4191,
"step": 162000
},
{
"epoch": 12.65,
"grad_norm": 1.0675512552261353,
"learning_rate": 0.0002,
"loss": 0.4178,
"step": 163000
},
{
"epoch": 12.73,
"grad_norm": 1.062435269355774,
"learning_rate": 0.0002,
"loss": 0.4183,
"step": 164000
},
{
"epoch": 12.81,
"grad_norm": 1.2755943536758423,
"learning_rate": 0.0002,
"loss": 0.42,
"step": 165000
},
{
"epoch": 12.89,
"grad_norm": 1.0879075527191162,
"learning_rate": 0.0002,
"loss": 0.4231,
"step": 166000
},
{
"epoch": 12.96,
"grad_norm": 1.1521817445755005,
"learning_rate": 0.0002,
"loss": 0.4231,
"step": 167000
},
{
"epoch": 13.04,
"grad_norm": 1.038859486579895,
"learning_rate": 0.0002,
"loss": 0.4014,
"step": 168000
},
{
"epoch": 13.12,
"grad_norm": 1.001861333847046,
"learning_rate": 0.0002,
"loss": 0.3875,
"step": 169000
},
{
"epoch": 13.2,
"grad_norm": 0.967998743057251,
"learning_rate": 0.0002,
"loss": 0.3895,
"step": 170000
},
{
"epoch": 13.2,
"eval_bleu": 63.8429,
"eval_gen_len": 28.6498,
"eval_loss": 0.4793872535228729,
"eval_runtime": 99.8944,
"eval_samples_per_second": 8.889,
"eval_steps_per_second": 1.111,
"step": 170000
},
{
"epoch": 13.27,
"grad_norm": 1.1491278409957886,
"learning_rate": 0.0002,
"loss": 0.3947,
"step": 171000
},
{
"epoch": 13.35,
"grad_norm": 1.0739213228225708,
"learning_rate": 0.0002,
"loss": 0.3937,
"step": 172000
},
{
"epoch": 13.43,
"grad_norm": 1.3349049091339111,
"learning_rate": 0.0002,
"loss": 0.3932,
"step": 173000
},
{
"epoch": 13.51,
"grad_norm": 1.4266788959503174,
"learning_rate": 0.0002,
"loss": 0.4012,
"step": 174000
},
{
"epoch": 13.58,
"grad_norm": 1.064070701599121,
"learning_rate": 0.0002,
"loss": 0.4,
"step": 175000
},
{
"epoch": 13.66,
"grad_norm": 1.930474877357483,
"learning_rate": 0.0002,
"loss": 0.399,
"step": 176000
},
{
"epoch": 13.74,
"grad_norm": 0.994195282459259,
"learning_rate": 0.0002,
"loss": 0.4026,
"step": 177000
},
{
"epoch": 13.82,
"grad_norm": 0.9755762815475464,
"learning_rate": 0.0002,
"loss": 0.4019,
"step": 178000
},
{
"epoch": 13.9,
"grad_norm": 1.0802558660507202,
"learning_rate": 0.0002,
"loss": 0.4027,
"step": 179000
},
{
"epoch": 13.97,
"grad_norm": 1.4257205724716187,
"learning_rate": 0.0002,
"loss": 0.4031,
"step": 180000
},
{
"epoch": 13.97,
"eval_bleu": 63.9496,
"eval_gen_len": 28.7264,
"eval_loss": 0.4756912291049957,
"eval_runtime": 100.6936,
"eval_samples_per_second": 8.819,
"eval_steps_per_second": 1.102,
"step": 180000
},
{
"epoch": 14.05,
"grad_norm": 1.0172358751296997,
"learning_rate": 0.0002,
"loss": 0.3785,
"step": 181000
},
{
"epoch": 14.13,
"grad_norm": 0.9525344967842102,
"learning_rate": 0.0002,
"loss": 0.367,
"step": 182000
},
{
"epoch": 14.21,
"grad_norm": 1.0674399137496948,
"learning_rate": 0.0002,
"loss": 0.3723,
"step": 183000
},
{
"epoch": 14.28,
"grad_norm": 1.311464548110962,
"learning_rate": 0.0002,
"loss": 0.3743,
"step": 184000
},
{
"epoch": 14.36,
"grad_norm": 1.020115613937378,
"learning_rate": 0.0002,
"loss": 0.3743,
"step": 185000
},
{
"epoch": 14.44,
"grad_norm": 0.9766080379486084,
"learning_rate": 0.0002,
"loss": 0.3751,
"step": 186000
},
{
"epoch": 14.52,
"grad_norm": 1.0636546611785889,
"learning_rate": 0.0002,
"loss": 0.3839,
"step": 187000
},
{
"epoch": 14.59,
"grad_norm": 1.5485342741012573,
"learning_rate": 0.0002,
"loss": 0.3864,
"step": 188000
},
{
"epoch": 14.67,
"grad_norm": 1.189011812210083,
"learning_rate": 0.0002,
"loss": 0.3836,
"step": 189000
},
{
"epoch": 14.75,
"grad_norm": 1.2171902656555176,
"learning_rate": 0.0002,
"loss": 0.3844,
"step": 190000
},
{
"epoch": 14.75,
"eval_bleu": 63.7498,
"eval_gen_len": 28.8288,
"eval_loss": 0.48547232151031494,
"eval_runtime": 102.2105,
"eval_samples_per_second": 8.688,
"eval_steps_per_second": 1.086,
"step": 190000
},
{
"epoch": 14.83,
"grad_norm": 0.9004954695701599,
"learning_rate": 0.0002,
"loss": 0.3821,
"step": 191000
},
{
"epoch": 14.9,
"grad_norm": 1.2197577953338623,
"learning_rate": 0.0002,
"loss": 0.3871,
"step": 192000
},
{
"epoch": 14.98,
"grad_norm": 1.0094869136810303,
"learning_rate": 0.0002,
"loss": 0.3924,
"step": 193000
},
{
"epoch": 15.06,
"grad_norm": 1.1337696313858032,
"learning_rate": 0.0002,
"loss": 0.3602,
"step": 194000
},
{
"epoch": 15.14,
"grad_norm": 1.5074607133865356,
"learning_rate": 0.0002,
"loss": 0.3548,
"step": 195000
},
{
"epoch": 15.22,
"grad_norm": 0.9171730279922485,
"learning_rate": 0.0002,
"loss": 0.3586,
"step": 196000
},
{
"epoch": 15.29,
"grad_norm": 0.8581980466842651,
"learning_rate": 0.0002,
"loss": 0.3609,
"step": 197000
},
{
"epoch": 15.37,
"grad_norm": 0.8790922164916992,
"learning_rate": 0.0002,
"loss": 0.363,
"step": 198000
},
{
"epoch": 15.45,
"grad_norm": 1.012073278427124,
"learning_rate": 0.0002,
"loss": 0.3618,
"step": 199000
},
{
"epoch": 15.53,
"grad_norm": 0.9808474183082581,
"learning_rate": 0.0002,
"loss": 0.3637,
"step": 200000
},
{
"epoch": 15.53,
"eval_bleu": 64.2277,
"eval_gen_len": 28.661,
"eval_loss": 0.4799739420413971,
"eval_runtime": 99.1042,
"eval_samples_per_second": 8.96,
"eval_steps_per_second": 1.12,
"step": 200000
},
{
"epoch": 15.6,
"grad_norm": 0.995276689529419,
"learning_rate": 0.0002,
"loss": 0.3676,
"step": 201000
},
{
"epoch": 15.68,
"grad_norm": 1.2943910360336304,
"learning_rate": 0.0002,
"loss": 0.3705,
"step": 202000
},
{
"epoch": 15.76,
"grad_norm": 0.9539749026298523,
"learning_rate": 0.0002,
"loss": 0.3706,
"step": 203000
},
{
"epoch": 15.84,
"grad_norm": 0.9351176619529724,
"learning_rate": 0.0002,
"loss": 0.3706,
"step": 204000
},
{
"epoch": 15.91,
"grad_norm": 1.087781310081482,
"learning_rate": 0.0002,
"loss": 0.3713,
"step": 205000
},
{
"epoch": 15.99,
"grad_norm": 1.2164143323898315,
"learning_rate": 0.0002,
"loss": 0.3729,
"step": 206000
},
{
"epoch": 16.07,
"grad_norm": 1.1458275318145752,
"learning_rate": 0.0002,
"loss": 0.3417,
"step": 207000
},
{
"epoch": 16.15,
"grad_norm": 0.9169874787330627,
"learning_rate": 0.0002,
"loss": 0.339,
"step": 208000
},
{
"epoch": 16.22,
"grad_norm": 1.1594195365905762,
"learning_rate": 0.0002,
"loss": 0.3426,
"step": 209000
},
{
"epoch": 16.3,
"grad_norm": 0.8710166215896606,
"learning_rate": 0.0002,
"loss": 0.3473,
"step": 210000
},
{
"epoch": 16.3,
"eval_bleu": 64.4683,
"eval_gen_len": 28.786,
"eval_loss": 0.4854079782962799,
"eval_runtime": 99.2915,
"eval_samples_per_second": 8.943,
"eval_steps_per_second": 1.118,
"step": 210000
},
{
"epoch": 16.38,
"grad_norm": 1.1366904973983765,
"learning_rate": 0.0002,
"loss": 0.3483,
"step": 211000
},
{
"epoch": 16.46,
"grad_norm": 1.135487675666809,
"learning_rate": 0.0002,
"loss": 0.3509,
"step": 212000
},
{
"epoch": 16.53,
"grad_norm": 0.9458820819854736,
"learning_rate": 0.0002,
"loss": 0.3519,
"step": 213000
},
{
"epoch": 16.61,
"grad_norm": 0.8842834830284119,
"learning_rate": 0.0002,
"loss": 0.3551,
"step": 214000
},
{
"epoch": 16.69,
"grad_norm": 1.2772917747497559,
"learning_rate": 0.0002,
"loss": 0.3572,
"step": 215000
},
{
"epoch": 16.77,
"grad_norm": 1.5344486236572266,
"learning_rate": 0.0002,
"loss": 0.3567,
"step": 216000
},
{
"epoch": 16.85,
"grad_norm": 1.4000177383422852,
"learning_rate": 0.0002,
"loss": 0.3605,
"step": 217000
},
{
"epoch": 16.92,
"grad_norm": 1.5617576837539673,
"learning_rate": 0.0002,
"loss": 0.3577,
"step": 218000
},
{
"epoch": 17.0,
"grad_norm": 1.4207055568695068,
"learning_rate": 0.0002,
"loss": 0.3581,
"step": 219000
},
{
"epoch": 17.08,
"grad_norm": 1.6633687019348145,
"learning_rate": 0.0002,
"loss": 0.3243,
"step": 220000
},
{
"epoch": 17.08,
"eval_bleu": 64.7805,
"eval_gen_len": 28.6791,
"eval_loss": 0.490304172039032,
"eval_runtime": 99.3675,
"eval_samples_per_second": 8.937,
"eval_steps_per_second": 1.117,
"step": 220000
},
{
"epoch": 17.16,
"grad_norm": 1.1573091745376587,
"learning_rate": 0.0002,
"loss": 0.3298,
"step": 221000
},
{
"epoch": 17.23,
"grad_norm": 1.046479344367981,
"learning_rate": 0.0002,
"loss": 0.3312,
"step": 222000
},
{
"epoch": 17.31,
"grad_norm": 1.2901638746261597,
"learning_rate": 0.0002,
"loss": 0.3325,
"step": 223000
},
{
"epoch": 17.39,
"grad_norm": 1.0912984609603882,
"learning_rate": 0.0002,
"loss": 0.3351,
"step": 224000
},
{
"epoch": 17.47,
"grad_norm": 1.5278785228729248,
"learning_rate": 0.0002,
"loss": 0.3382,
"step": 225000
},
{
"epoch": 17.54,
"grad_norm": 1.013113021850586,
"learning_rate": 0.0002,
"loss": 0.3386,
"step": 226000
},
{
"epoch": 17.62,
"grad_norm": 1.254299283027649,
"learning_rate": 0.0002,
"loss": 0.3393,
"step": 227000
},
{
"epoch": 17.7,
"grad_norm": 0.8990402221679688,
"learning_rate": 0.0002,
"loss": 0.3404,
"step": 228000
},
{
"epoch": 17.78,
"grad_norm": 1.2247493267059326,
"learning_rate": 0.0002,
"loss": 0.3451,
"step": 229000
},
{
"epoch": 17.85,
"grad_norm": 1.454061508178711,
"learning_rate": 0.0002,
"loss": 0.3426,
"step": 230000
},
{
"epoch": 17.85,
"eval_bleu": 64.679,
"eval_gen_len": 28.4809,
"eval_loss": 0.4818822741508484,
"eval_runtime": 98.7347,
"eval_samples_per_second": 8.994,
"eval_steps_per_second": 1.124,
"step": 230000
},
{
"epoch": 17.93,
"grad_norm": 1.4103410243988037,
"learning_rate": 0.0002,
"loss": 0.3457,
"step": 231000
},
{
"epoch": 18.01,
"grad_norm": 1.0248557329177856,
"learning_rate": 0.0002,
"loss": 0.3449,
"step": 232000
},
{
"epoch": 18.09,
"grad_norm": 1.421231985092163,
"learning_rate": 0.0002,
"loss": 0.3115,
"step": 233000
},
{
"epoch": 18.16,
"grad_norm": 1.0739413499832153,
"learning_rate": 0.0002,
"loss": 0.314,
"step": 234000
},
{
"epoch": 18.24,
"grad_norm": 1.0976619720458984,
"learning_rate": 0.0002,
"loss": 0.3214,
"step": 235000
},
{
"epoch": 18.32,
"grad_norm": 1.480944037437439,
"learning_rate": 0.0002,
"loss": 0.3173,
"step": 236000
},
{
"epoch": 18.4,
"grad_norm": 1.249569296836853,
"learning_rate": 0.0002,
"loss": 0.3227,
"step": 237000
},
{
"epoch": 18.48,
"grad_norm": 1.1228398084640503,
"learning_rate": 0.0002,
"loss": 0.3263,
"step": 238000
},
{
"epoch": 18.55,
"grad_norm": 1.318242073059082,
"learning_rate": 0.0002,
"loss": 0.327,
"step": 239000
},
{
"epoch": 18.63,
"grad_norm": 1.1360143423080444,
"learning_rate": 0.0002,
"loss": 0.3295,
"step": 240000
},
{
"epoch": 18.63,
"eval_bleu": 65.3735,
"eval_gen_len": 28.6014,
"eval_loss": 0.4851875603199005,
"eval_runtime": 98.572,
"eval_samples_per_second": 9.009,
"eval_steps_per_second": 1.126,
"step": 240000
},
{
"epoch": 18.71,
"grad_norm": 1.4588408470153809,
"learning_rate": 0.0002,
"loss": 0.3268,
"step": 241000
},
{
"epoch": 18.79,
"grad_norm": 1.1620702743530273,
"learning_rate": 0.0002,
"loss": 0.3341,
"step": 242000
},
{
"epoch": 18.86,
"grad_norm": 1.0640859603881836,
"learning_rate": 0.0002,
"loss": 0.3346,
"step": 243000
},
{
"epoch": 18.94,
"grad_norm": 1.096739649772644,
"learning_rate": 0.0002,
"loss": 0.3334,
"step": 244000
},
{
"epoch": 19.02,
"grad_norm": 1.23462975025177,
"learning_rate": 0.0002,
"loss": 0.3273,
"step": 245000
},
{
"epoch": 19.1,
"grad_norm": 0.9026219248771667,
"learning_rate": 0.0002,
"loss": 0.3003,
"step": 246000
},
{
"epoch": 19.17,
"grad_norm": 1.0630195140838623,
"learning_rate": 0.0002,
"loss": 0.3042,
"step": 247000
},
{
"epoch": 19.25,
"grad_norm": 1.0800952911376953,
"learning_rate": 0.0002,
"loss": 0.306,
"step": 248000
},
{
"epoch": 19.33,
"grad_norm": 1.2505557537078857,
"learning_rate": 0.0002,
"loss": 0.311,
"step": 249000
},
{
"epoch": 19.41,
"grad_norm": 0.822533369064331,
"learning_rate": 0.0002,
"loss": 0.3124,
"step": 250000
},
{
"epoch": 19.41,
"eval_bleu": 64.5641,
"eval_gen_len": 28.6745,
"eval_loss": 0.4947471022605896,
"eval_runtime": 99.0725,
"eval_samples_per_second": 8.963,
"eval_steps_per_second": 1.12,
"step": 250000
},
{
"epoch": 19.48,
"grad_norm": 1.2964988946914673,
"learning_rate": 0.0002,
"loss": 0.312,
"step": 251000
},
{
"epoch": 19.56,
"grad_norm": 1.0119915008544922,
"learning_rate": 0.0002,
"loss": 0.3151,
"step": 252000
},
{
"epoch": 19.64,
"grad_norm": 1.2384612560272217,
"learning_rate": 0.0002,
"loss": 0.3171,
"step": 253000
},
{
"epoch": 19.72,
"grad_norm": 1.7689512968063354,
"learning_rate": 0.0002,
"loss": 0.3177,
"step": 254000
},
{
"epoch": 19.8,
"grad_norm": 1.3058741092681885,
"learning_rate": 0.0002,
"loss": 0.3195,
"step": 255000
},
{
"epoch": 19.87,
"grad_norm": 1.2293740510940552,
"learning_rate": 0.0002,
"loss": 0.3226,
"step": 256000
},
{
"epoch": 19.95,
"grad_norm": 1.398077368736267,
"learning_rate": 0.0002,
"loss": 0.3222,
"step": 257000
},
{
"epoch": 20.03,
"grad_norm": 0.9053579568862915,
"learning_rate": 0.0002,
"loss": 0.3122,
"step": 258000
},
{
"epoch": 20.11,
"grad_norm": 1.2684714794158936,
"learning_rate": 0.0002,
"loss": 0.2891,
"step": 259000
},
{
"epoch": 20.18,
"grad_norm": 0.9774475693702698,
"learning_rate": 0.0002,
"loss": 0.2933,
"step": 260000
},
{
"epoch": 20.18,
"eval_bleu": 65.1364,
"eval_gen_len": 28.6419,
"eval_loss": 0.49722200632095337,
"eval_runtime": 99.0096,
"eval_samples_per_second": 8.969,
"eval_steps_per_second": 1.121,
"step": 260000
},
{
"epoch": 20.26,
"grad_norm": 1.2156912088394165,
"learning_rate": 0.0002,
"loss": 0.296,
"step": 261000
},
{
"epoch": 20.34,
"grad_norm": 1.221637487411499,
"learning_rate": 0.0002,
"loss": 0.2981,
"step": 262000
},
{
"epoch": 20.42,
"grad_norm": 0.9357077479362488,
"learning_rate": 0.0002,
"loss": 0.3018,
"step": 263000
},
{
"epoch": 20.49,
"grad_norm": 0.9926024079322815,
"learning_rate": 0.0002,
"loss": 0.3012,
"step": 264000
},
{
"epoch": 20.57,
"grad_norm": 1.6473757028579712,
"learning_rate": 0.0002,
"loss": 0.3049,
"step": 265000
},
{
"epoch": 20.65,
"grad_norm": 1.1541528701782227,
"learning_rate": 0.0002,
"loss": 0.3039,
"step": 266000
},
{
"epoch": 20.73,
"grad_norm": 1.220951795578003,
"learning_rate": 0.0002,
"loss": 0.3074,
"step": 267000
},
{
"epoch": 20.8,
"grad_norm": 1.074318289756775,
"learning_rate": 0.0002,
"loss": 0.3119,
"step": 268000
},
{
"epoch": 20.88,
"grad_norm": 1.015864372253418,
"learning_rate": 0.0002,
"loss": 0.3116,
"step": 269000
},
{
"epoch": 20.96,
"grad_norm": 1.0277948379516602,
"learning_rate": 0.0002,
"loss": 0.3101,
"step": 270000
},
{
"epoch": 20.96,
"eval_bleu": 64.6747,
"eval_gen_len": 28.6802,
"eval_loss": 0.4901565611362457,
"eval_runtime": 100.085,
"eval_samples_per_second": 8.872,
"eval_steps_per_second": 1.109,
"step": 270000
},
{
"epoch": 21.04,
"grad_norm": 0.9389250874519348,
"learning_rate": 0.0002,
"loss": 0.2942,
"step": 271000
},
{
"epoch": 21.11,
"grad_norm": 1.2478715181350708,
"learning_rate": 0.0002,
"loss": 0.2806,
"step": 272000
},
{
"epoch": 21.19,
"grad_norm": 0.9297951459884644,
"learning_rate": 0.0002,
"loss": 0.2833,
"step": 273000
},
{
"epoch": 21.27,
"grad_norm": 0.9602841734886169,
"learning_rate": 0.0002,
"loss": 0.2876,
"step": 274000
},
{
"epoch": 21.35,
"grad_norm": 0.9561505317687988,
"learning_rate": 0.0002,
"loss": 0.2898,
"step": 275000
},
{
"epoch": 21.43,
"grad_norm": 1.0724116563796997,
"learning_rate": 0.0002,
"loss": 0.2897,
"step": 276000
},
{
"epoch": 21.5,
"grad_norm": 0.9960470199584961,
"learning_rate": 0.0002,
"loss": 0.2942,
"step": 277000
},
{
"epoch": 21.58,
"grad_norm": 1.1480662822723389,
"learning_rate": 0.0002,
"loss": 0.2915,
"step": 278000
},
{
"epoch": 21.66,
"grad_norm": 1.1175373792648315,
"learning_rate": 0.0002,
"loss": 0.2991,
"step": 279000
},
{
"epoch": 21.74,
"grad_norm": 1.6251972913742065,
"learning_rate": 0.0002,
"loss": 0.2991,
"step": 280000
},
{
"epoch": 21.74,
"eval_bleu": 64.9732,
"eval_gen_len": 28.5653,
"eval_loss": 0.4907251298427582,
"eval_runtime": 98.6604,
"eval_samples_per_second": 9.001,
"eval_steps_per_second": 1.125,
"step": 280000
},
{
"epoch": 21.81,
"grad_norm": 0.8608353137969971,
"learning_rate": 0.0002,
"loss": 0.2994,
"step": 281000
},
{
"epoch": 21.89,
"grad_norm": 1.077614426612854,
"learning_rate": 0.0002,
"loss": 0.3012,
"step": 282000
},
{
"epoch": 21.97,
"grad_norm": 1.6897170543670654,
"learning_rate": 0.0002,
"loss": 0.3012,
"step": 283000
},
{
"epoch": 22.05,
"grad_norm": 1.3074902296066284,
"learning_rate": 0.0002,
"loss": 0.2814,
"step": 284000
},
{
"epoch": 22.12,
"grad_norm": 0.9641602039337158,
"learning_rate": 0.0002,
"loss": 0.2717,
"step": 285000
},
{
"epoch": 22.2,
"grad_norm": 1.634346842765808,
"learning_rate": 0.0002,
"loss": 0.2747,
"step": 286000
},
{
"epoch": 22.28,
"grad_norm": 1.4014965295791626,
"learning_rate": 0.0002,
"loss": 0.2783,
"step": 287000
},
{
"epoch": 22.36,
"grad_norm": 1.3981196880340576,
"learning_rate": 0.0002,
"loss": 0.2831,
"step": 288000
},
{
"epoch": 22.43,
"grad_norm": 0.8403178453445435,
"learning_rate": 0.0002,
"loss": 0.2801,
"step": 289000
},
{
"epoch": 22.51,
"grad_norm": 0.879589319229126,
"learning_rate": 0.0002,
"loss": 0.2828,
"step": 290000
},
{
"epoch": 22.51,
"eval_bleu": 64.7552,
"eval_gen_len": 28.6261,
"eval_loss": 0.5037782192230225,
"eval_runtime": 98.7235,
"eval_samples_per_second": 8.995,
"eval_steps_per_second": 1.124,
"step": 290000
},
{
"epoch": 22.59,
"grad_norm": 1.751582145690918,
"learning_rate": 0.0002,
"loss": 0.2846,
"step": 291000
},
{
"epoch": 22.67,
"grad_norm": 1.2374579906463623,
"learning_rate": 0.0002,
"loss": 0.2855,
"step": 292000
},
{
"epoch": 22.74,
"grad_norm": 1.152079463005066,
"learning_rate": 0.0002,
"loss": 0.2916,
"step": 293000
},
{
"epoch": 22.82,
"grad_norm": 1.2837114334106445,
"learning_rate": 0.0002,
"loss": 0.2889,
"step": 294000
},
{
"epoch": 22.9,
"grad_norm": 1.161375880241394,
"learning_rate": 0.0002,
"loss": 0.2894,
"step": 295000
},
{
"epoch": 22.98,
"grad_norm": 0.8594853281974792,
"learning_rate": 0.0002,
"loss": 0.2936,
"step": 296000
},
{
"epoch": 23.06,
"grad_norm": 0.9489020705223083,
"learning_rate": 0.0002,
"loss": 0.2733,
"step": 297000
},
{
"epoch": 23.13,
"grad_norm": 1.2100919485092163,
"learning_rate": 0.0002,
"loss": 0.263,
"step": 298000
},
{
"epoch": 23.21,
"grad_norm": 2.140540361404419,
"learning_rate": 0.0002,
"loss": 0.2666,
"step": 299000
},
{
"epoch": 23.29,
"grad_norm": 1.070940375328064,
"learning_rate": 0.0002,
"loss": 0.2688,
"step": 300000
},
{
"epoch": 23.29,
"eval_bleu": 65.0702,
"eval_gen_len": 28.7534,
"eval_loss": 0.5042341351509094,
"eval_runtime": 99.9941,
"eval_samples_per_second": 8.881,
"eval_steps_per_second": 1.11,
"step": 300000
},
{
"epoch": 23.37,
"grad_norm": 1.0847973823547363,
"learning_rate": 0.0002,
"loss": 0.2692,
"step": 301000
},
{
"epoch": 23.44,
"grad_norm": 1.098399043083191,
"learning_rate": 0.0002,
"loss": 0.2721,
"step": 302000
},
{
"epoch": 23.52,
"grad_norm": 0.9735555648803711,
"learning_rate": 0.0002,
"loss": 0.2725,
"step": 303000
},
{
"epoch": 23.6,
"grad_norm": 1.2928968667984009,
"learning_rate": 0.0002,
"loss": 0.2772,
"step": 304000
},
{
"epoch": 23.68,
"grad_norm": 1.1871669292449951,
"learning_rate": 0.0002,
"loss": 0.2766,
"step": 305000
},
{
"epoch": 23.75,
"grad_norm": 0.9379162788391113,
"learning_rate": 0.0002,
"loss": 0.2797,
"step": 306000
},
{
"epoch": 23.83,
"grad_norm": 0.8844149112701416,
"learning_rate": 0.0002,
"loss": 0.2813,
"step": 307000
},
{
"epoch": 23.91,
"grad_norm": 1.0218191146850586,
"learning_rate": 0.0002,
"loss": 0.2834,
"step": 308000
},
{
"epoch": 23.99,
"grad_norm": 1.234649896621704,
"learning_rate": 0.0002,
"loss": 0.2827,
"step": 309000
},
{
"epoch": 24.06,
"grad_norm": 0.8998326659202576,
"learning_rate": 0.0002,
"loss": 0.2555,
"step": 310000
},
{
"epoch": 24.06,
"eval_bleu": 65.0378,
"eval_gen_len": 29.089,
"eval_loss": 0.5101344585418701,
"eval_runtime": 101.77,
"eval_samples_per_second": 8.726,
"eval_steps_per_second": 1.091,
"step": 310000
},
{
"epoch": 24.14,
"grad_norm": 0.9993298053741455,
"learning_rate": 0.0002,
"loss": 0.2575,
"step": 311000
},
{
"epoch": 24.22,
"grad_norm": 1.078316569328308,
"learning_rate": 0.0002,
"loss": 0.2577,
"step": 312000
},
{
"epoch": 24.3,
"grad_norm": 1.0775636434555054,
"learning_rate": 0.0002,
"loss": 0.2603,
"step": 313000
},
{
"epoch": 24.38,
"grad_norm": 1.0711839199066162,
"learning_rate": 0.0002,
"loss": 0.2641,
"step": 314000
},
{
"epoch": 24.45,
"grad_norm": 1.1953543424606323,
"learning_rate": 0.0002,
"loss": 0.2666,
"step": 315000
},
{
"epoch": 24.53,
"grad_norm": 0.7338001132011414,
"learning_rate": 0.0002,
"loss": 0.2662,
"step": 316000
},
{
"epoch": 24.61,
"grad_norm": 1.651564121246338,
"learning_rate": 0.0002,
"loss": 0.2658,
"step": 317000
},
{
"epoch": 24.69,
"grad_norm": 0.8356152176856995,
"learning_rate": 0.0002,
"loss": 0.2706,
"step": 318000
},
{
"epoch": 24.76,
"grad_norm": 0.8503906726837158,
"learning_rate": 0.0002,
"loss": 0.2715,
"step": 319000
},
{
"epoch": 24.84,
"grad_norm": 0.9122622609138489,
"learning_rate": 0.0002,
"loss": 0.2692,
"step": 320000
},
{
"epoch": 24.84,
"eval_bleu": 64.9991,
"eval_gen_len": 28.6937,
"eval_loss": 0.5021673440933228,
"eval_runtime": 99.2776,
"eval_samples_per_second": 8.945,
"eval_steps_per_second": 1.118,
"step": 320000
},
{
"epoch": 24.92,
"grad_norm": 1.0263617038726807,
"learning_rate": 0.0002,
"loss": 0.2725,
"step": 321000
},
{
"epoch": 25.0,
"grad_norm": 1.140886902809143,
"learning_rate": 0.0002,
"loss": 0.2748,
"step": 322000
},
{
"epoch": 25.07,
"grad_norm": 0.9275480508804321,
"learning_rate": 0.0002,
"loss": 0.2467,
"step": 323000
},
{
"epoch": 25.15,
"grad_norm": 1.170021653175354,
"learning_rate": 0.0002,
"loss": 0.2465,
"step": 324000
},
{
"epoch": 25.23,
"grad_norm": 1.1251965761184692,
"learning_rate": 0.0002,
"loss": 0.2492,
"step": 325000
},
{
"epoch": 25.31,
"grad_norm": 1.0885039567947388,
"learning_rate": 0.0002,
"loss": 0.2518,
"step": 326000
},
{
"epoch": 25.38,
"grad_norm": 1.2162927389144897,
"learning_rate": 0.0002,
"loss": 0.2535,
"step": 327000
},
{
"epoch": 25.46,
"grad_norm": 1.0869230031967163,
"learning_rate": 0.0002,
"loss": 0.2534,
"step": 328000
},
{
"epoch": 25.54,
"grad_norm": 0.9775025248527527,
"learning_rate": 0.0002,
"loss": 0.2586,
"step": 329000
},
{
"epoch": 25.62,
"grad_norm": 0.8145058155059814,
"learning_rate": 0.0002,
"loss": 0.2593,
"step": 330000
},
{
"epoch": 25.62,
"eval_bleu": 65.2478,
"eval_gen_len": 28.6137,
"eval_loss": 0.508499026298523,
"eval_runtime": 98.6428,
"eval_samples_per_second": 9.002,
"eval_steps_per_second": 1.125,
"step": 330000
},
{
"epoch": 25.69,
"grad_norm": 1.270075798034668,
"learning_rate": 0.0002,
"loss": 0.2613,
"step": 331000
},
{
"epoch": 25.77,
"grad_norm": 1.431252121925354,
"learning_rate": 0.0002,
"loss": 0.2628,
"step": 332000
},
{
"epoch": 25.85,
"grad_norm": 1.3506394624710083,
"learning_rate": 0.0002,
"loss": 0.2651,
"step": 333000
},
{
"epoch": 25.93,
"grad_norm": 1.0612725019454956,
"learning_rate": 0.0002,
"loss": 0.2628,
"step": 334000
},
{
"epoch": 26.01,
"grad_norm": 0.8760356307029724,
"learning_rate": 0.0002,
"loss": 0.2654,
"step": 335000
},
{
"epoch": 26.08,
"grad_norm": 1.0780360698699951,
"learning_rate": 0.0002,
"loss": 0.2363,
"step": 336000
},
{
"epoch": 26.16,
"grad_norm": 1.4022656679153442,
"learning_rate": 0.0002,
"loss": 0.2404,
"step": 337000
},
{
"epoch": 26.24,
"grad_norm": 1.1530039310455322,
"learning_rate": 0.0002,
"loss": 0.2416,
"step": 338000
},
{
"epoch": 26.32,
"grad_norm": 1.028208613395691,
"learning_rate": 0.0002,
"loss": 0.2453,
"step": 339000
},
{
"epoch": 26.39,
"grad_norm": 0.8168412446975708,
"learning_rate": 0.0002,
"loss": 0.2439,
"step": 340000
},
{
"epoch": 26.39,
"eval_bleu": 64.863,
"eval_gen_len": 28.6464,
"eval_loss": 0.5152307748794556,
"eval_runtime": 99.207,
"eval_samples_per_second": 8.951,
"eval_steps_per_second": 1.119,
"step": 340000
},
{
"epoch": 26.47,
"grad_norm": 1.415486216545105,
"learning_rate": 0.0002,
"loss": 0.2481,
"step": 341000
},
{
"epoch": 26.55,
"grad_norm": 1.016444444656372,
"learning_rate": 0.0002,
"loss": 0.2515,
"step": 342000
},
{
"epoch": 26.63,
"grad_norm": 1.0151183605194092,
"learning_rate": 0.0002,
"loss": 0.2519,
"step": 343000
},
{
"epoch": 26.7,
"grad_norm": 0.8286064267158508,
"learning_rate": 0.0002,
"loss": 0.2528,
"step": 344000
},
{
"epoch": 26.78,
"grad_norm": 1.0916731357574463,
"learning_rate": 0.0002,
"loss": 0.2529,
"step": 345000
},
{
"epoch": 26.86,
"grad_norm": 1.0001248121261597,
"learning_rate": 0.0002,
"loss": 0.256,
"step": 346000
},
{
"epoch": 26.94,
"grad_norm": 0.8120971322059631,
"learning_rate": 0.0002,
"loss": 0.2575,
"step": 347000
},
{
"epoch": 27.01,
"grad_norm": 0.9800658822059631,
"learning_rate": 0.0002,
"loss": 0.2522,
"step": 348000
},
{
"epoch": 27.09,
"grad_norm": 1.0135070085525513,
"learning_rate": 0.0002,
"loss": 0.2298,
"step": 349000
},
{
"epoch": 27.17,
"grad_norm": 1.1721863746643066,
"learning_rate": 0.0002,
"loss": 0.2327,
"step": 350000
},
{
"epoch": 27.17,
"eval_bleu": 65.0748,
"eval_gen_len": 28.7286,
"eval_loss": 0.5164603590965271,
"eval_runtime": 99.7391,
"eval_samples_per_second": 8.903,
"eval_steps_per_second": 1.113,
"step": 350000
},
{
"epoch": 27.25,
"grad_norm": 0.7238809466362,
"learning_rate": 0.0002,
"loss": 0.2337,
"step": 351000
},
{
"epoch": 27.32,
"grad_norm": 0.8267261385917664,
"learning_rate": 0.0002,
"loss": 0.2357,
"step": 352000
},
{
"epoch": 27.4,
"grad_norm": 1.0274128913879395,
"learning_rate": 0.0002,
"loss": 0.2398,
"step": 353000
},
{
"epoch": 27.48,
"grad_norm": 0.9916879534721375,
"learning_rate": 0.0002,
"loss": 0.2401,
"step": 354000
},
{
"epoch": 27.56,
"grad_norm": 1.095639944076538,
"learning_rate": 0.0002,
"loss": 0.2428,
"step": 355000
},
{
"epoch": 27.64,
"grad_norm": 0.8598717451095581,
"learning_rate": 0.0002,
"loss": 0.2432,
"step": 356000
},
{
"epoch": 27.71,
"grad_norm": 0.8891191482543945,
"learning_rate": 0.0002,
"loss": 0.2431,
"step": 357000
},
{
"epoch": 27.79,
"grad_norm": 0.9431182146072388,
"learning_rate": 0.0002,
"loss": 0.2468,
"step": 358000
},
{
"epoch": 27.87,
"grad_norm": 1.3781706094741821,
"learning_rate": 0.0002,
"loss": 0.2498,
"step": 359000
},
{
"epoch": 27.95,
"grad_norm": 0.9336220622062683,
"learning_rate": 0.0002,
"loss": 0.249,
"step": 360000
},
{
"epoch": 27.95,
"eval_bleu": 64.7249,
"eval_gen_len": 28.6137,
"eval_loss": 0.5116418600082397,
"eval_runtime": 98.8178,
"eval_samples_per_second": 8.986,
"eval_steps_per_second": 1.123,
"step": 360000
},
{
"epoch": 28.02,
"grad_norm": 1.2862168550491333,
"learning_rate": 0.0002,
"loss": 0.2416,
"step": 361000
},
{
"epoch": 28.1,
"grad_norm": 0.8687452077865601,
"learning_rate": 0.0002,
"loss": 0.2204,
"step": 362000
},
{
"epoch": 28.18,
"grad_norm": 1.8673216104507446,
"learning_rate": 0.0002,
"loss": 0.2254,
"step": 363000
},
{
"epoch": 28.26,
"grad_norm": 0.9244999885559082,
"learning_rate": 0.0002,
"loss": 0.227,
"step": 364000
},
{
"epoch": 28.33,
"grad_norm": 0.7414880990982056,
"learning_rate": 0.0002,
"loss": 0.2302,
"step": 365000
},
{
"epoch": 28.41,
"grad_norm": 1.0677781105041504,
"learning_rate": 0.0002,
"loss": 0.2311,
"step": 366000
},
{
"epoch": 28.49,
"grad_norm": 1.0712281465530396,
"learning_rate": 0.0002,
"loss": 0.2354,
"step": 367000
},
{
"epoch": 28.57,
"grad_norm": 1.0177695751190186,
"learning_rate": 0.0002,
"loss": 0.2349,
"step": 368000
},
{
"epoch": 28.64,
"grad_norm": 1.2082629203796387,
"learning_rate": 0.0002,
"loss": 0.2343,
"step": 369000
},
{
"epoch": 28.72,
"grad_norm": 0.9800160527229309,
"learning_rate": 0.0002,
"loss": 0.238,
"step": 370000
},
{
"epoch": 28.72,
"eval_bleu": 64.7651,
"eval_gen_len": 28.5968,
"eval_loss": 0.5202394723892212,
"eval_runtime": 98.8503,
"eval_samples_per_second": 8.983,
"eval_steps_per_second": 1.123,
"step": 370000
},
{
"epoch": 28.8,
"grad_norm": 1.4668409824371338,
"learning_rate": 0.0002,
"loss": 0.2417,
"step": 371000
},
{
"epoch": 28.88,
"grad_norm": 0.9679712653160095,
"learning_rate": 0.0002,
"loss": 0.2397,
"step": 372000
},
{
"epoch": 28.96,
"grad_norm": 1.0757184028625488,
"learning_rate": 0.0002,
"loss": 0.2419,
"step": 373000
},
{
"epoch": 29.03,
"grad_norm": 1.3961704969406128,
"learning_rate": 0.0002,
"loss": 0.2326,
"step": 374000
},
{
"epoch": 29.11,
"grad_norm": 1.4827901124954224,
"learning_rate": 0.0002,
"loss": 0.2159,
"step": 375000
},
{
"epoch": 29.19,
"grad_norm": 1.065645456314087,
"learning_rate": 0.0002,
"loss": 0.2195,
"step": 376000
},
{
"epoch": 29.27,
"grad_norm": 0.8756958842277527,
"learning_rate": 0.0002,
"loss": 0.2229,
"step": 377000
},
{
"epoch": 29.34,
"grad_norm": 1.2630327939987183,
"learning_rate": 0.0002,
"loss": 0.2251,
"step": 378000
},
{
"epoch": 29.42,
"grad_norm": 0.9434683322906494,
"learning_rate": 0.0002,
"loss": 0.2267,
"step": 379000
},
{
"epoch": 29.5,
"grad_norm": 0.8589434623718262,
"learning_rate": 0.0002,
"loss": 0.2297,
"step": 380000
},
{
"epoch": 29.5,
"eval_bleu": 65.3334,
"eval_gen_len": 28.7005,
"eval_loss": 0.5242559909820557,
"eval_runtime": 99.2088,
"eval_samples_per_second": 8.951,
"eval_steps_per_second": 1.119,
"step": 380000
},
{
"epoch": 29.58,
"grad_norm": 1.0252753496170044,
"learning_rate": 0.0002,
"loss": 0.229,
"step": 381000
},
{
"epoch": 29.65,
"grad_norm": 1.4881134033203125,
"learning_rate": 0.0002,
"loss": 0.2301,
"step": 382000
},
{
"epoch": 29.73,
"grad_norm": 1.0281462669372559,
"learning_rate": 0.0002,
"loss": 0.2307,
"step": 383000
},
{
"epoch": 29.81,
"grad_norm": 1.1244617700576782,
"learning_rate": 0.0002,
"loss": 0.2335,
"step": 384000
},
{
"epoch": 29.89,
"grad_norm": 1.1461416482925415,
"learning_rate": 0.0002,
"loss": 0.2355,
"step": 385000
},
{
"epoch": 29.96,
"grad_norm": 1.742311716079712,
"learning_rate": 0.0002,
"loss": 0.2341,
"step": 386000
},
{
"epoch": 30.04,
"grad_norm": 0.8539097309112549,
"learning_rate": 0.0002,
"loss": 0.2196,
"step": 387000
},
{
"epoch": 30.12,
"grad_norm": 0.9865394830703735,
"learning_rate": 0.0002,
"loss": 0.2118,
"step": 388000
},
{
"epoch": 30.2,
"grad_norm": 1.2487947940826416,
"learning_rate": 0.0002,
"loss": 0.2111,
"step": 389000
},
{
"epoch": 30.27,
"grad_norm": 0.9401417970657349,
"learning_rate": 0.0002,
"loss": 0.2152,
"step": 390000
},
{
"epoch": 30.27,
"eval_bleu": 64.9364,
"eval_gen_len": 28.6081,
"eval_loss": 0.533649206161499,
"eval_runtime": 99.1874,
"eval_samples_per_second": 8.953,
"eval_steps_per_second": 1.119,
"step": 390000
},
{
"epoch": 30.35,
"grad_norm": 1.5141676664352417,
"learning_rate": 0.0002,
"loss": 0.2185,
"step": 391000
},
{
"epoch": 30.43,
"grad_norm": 1.4947956800460815,
"learning_rate": 0.0002,
"loss": 0.221,
"step": 392000
},
{
"epoch": 30.51,
"grad_norm": 0.8870178461074829,
"learning_rate": 0.0002,
"loss": 0.2221,
"step": 393000
},
{
"epoch": 30.59,
"grad_norm": 1.013377070426941,
"learning_rate": 0.0002,
"loss": 0.2223,
"step": 394000
},
{
"epoch": 30.66,
"grad_norm": 1.2745546102523804,
"learning_rate": 0.0002,
"loss": 0.2242,
"step": 395000
},
{
"epoch": 30.74,
"grad_norm": 1.3159047365188599,
"learning_rate": 0.0002,
"loss": 0.2286,
"step": 396000
},
{
"epoch": 30.82,
"grad_norm": 0.8441556096076965,
"learning_rate": 0.0002,
"loss": 0.2269,
"step": 397000
},
{
"epoch": 30.9,
"grad_norm": 1.0391247272491455,
"learning_rate": 0.0002,
"loss": 0.2297,
"step": 398000
},
{
"epoch": 30.97,
"grad_norm": 1.0133869647979736,
"learning_rate": 0.0002,
"loss": 0.2277,
"step": 399000
},
{
"epoch": 31.05,
"grad_norm": 1.5093469619750977,
"learning_rate": 0.0002,
"loss": 0.2106,
"step": 400000
},
{
"epoch": 31.05,
"eval_bleu": 65.117,
"eval_gen_len": 28.6745,
"eval_loss": 0.540839433670044,
"eval_runtime": 98.9473,
"eval_samples_per_second": 8.974,
"eval_steps_per_second": 1.122,
"step": 400000
},
{
"epoch": 31.13,
"grad_norm": 1.0606015920639038,
"learning_rate": 0.0002,
"loss": 0.2031,
"step": 401000
},
{
"epoch": 31.21,
"grad_norm": 1.4048112630844116,
"learning_rate": 0.0002,
"loss": 0.2043,
"step": 402000
},
{
"epoch": 31.28,
"grad_norm": 1.1232408285140991,
"learning_rate": 0.0002,
"loss": 0.211,
"step": 403000
},
{
"epoch": 31.36,
"grad_norm": 1.2367199659347534,
"learning_rate": 0.0002,
"loss": 0.2107,
"step": 404000
},
{
"epoch": 31.44,
"grad_norm": 1.1147772073745728,
"learning_rate": 0.0002,
"loss": 0.2116,
"step": 405000
},
{
"epoch": 31.52,
"grad_norm": 0.9711781740188599,
"learning_rate": 0.0002,
"loss": 0.2147,
"step": 406000
},
{
"epoch": 31.59,
"grad_norm": 1.4205774068832397,
"learning_rate": 0.0002,
"loss": 0.2158,
"step": 407000
},
{
"epoch": 31.67,
"grad_norm": 1.303250789642334,
"learning_rate": 0.0002,
"loss": 0.2187,
"step": 408000
},
{
"epoch": 31.75,
"grad_norm": 2.3327102661132812,
"learning_rate": 0.0002,
"loss": 0.2196,
"step": 409000
},
{
"epoch": 31.83,
"grad_norm": 1.9003146886825562,
"learning_rate": 0.0002,
"loss": 0.2234,
"step": 410000
},
{
"epoch": 31.83,
"eval_bleu": 64.8926,
"eval_gen_len": 28.6318,
"eval_loss": 0.5249429941177368,
"eval_runtime": 98.7286,
"eval_samples_per_second": 8.994,
"eval_steps_per_second": 1.124,
"step": 410000
},
{
"epoch": 31.9,
"grad_norm": 0.9950889348983765,
"learning_rate": 0.0002,
"loss": 0.2232,
"step": 411000
},
{
"epoch": 31.98,
"grad_norm": 0.8693845272064209,
"learning_rate": 0.0002,
"loss": 0.2236,
"step": 412000
},
{
"epoch": 32.06,
"grad_norm": 0.9227551817893982,
"learning_rate": 0.0002,
"loss": 0.204,
"step": 413000
},
{
"epoch": 32.14,
"grad_norm": 1.0269570350646973,
"learning_rate": 0.0002,
"loss": 0.2019,
"step": 414000
},
{
"epoch": 32.22,
"grad_norm": 1.0199569463729858,
"learning_rate": 0.0002,
"loss": 0.2015,
"step": 415000
},
{
"epoch": 32.29,
"grad_norm": 1.4488086700439453,
"learning_rate": 0.0002,
"loss": 0.2036,
"step": 416000
},
{
"epoch": 32.37,
"grad_norm": 0.8843773007392883,
"learning_rate": 0.0002,
"loss": 0.2049,
"step": 417000
},
{
"epoch": 32.45,
"grad_norm": 1.3630881309509277,
"learning_rate": 0.0002,
"loss": 0.2085,
"step": 418000
},
{
"epoch": 32.53,
"grad_norm": 0.9767336845397949,
"learning_rate": 0.0002,
"loss": 0.2097,
"step": 419000
},
{
"epoch": 32.6,
"grad_norm": 0.9147652983665466,
"learning_rate": 0.0002,
"loss": 0.2085,
"step": 420000
},
{
"epoch": 32.6,
"eval_bleu": 65.5715,
"eval_gen_len": 28.7984,
"eval_loss": 0.5305626392364502,
"eval_runtime": 100.0595,
"eval_samples_per_second": 8.875,
"eval_steps_per_second": 1.109,
"step": 420000
},
{
"epoch": 32.68,
"grad_norm": 1.4235540628433228,
"learning_rate": 0.0002,
"loss": 0.212,
"step": 421000
},
{
"epoch": 32.76,
"grad_norm": 0.9653807282447815,
"learning_rate": 0.0002,
"loss": 0.2129,
"step": 422000
},
{
"epoch": 32.84,
"grad_norm": 1.0437246561050415,
"learning_rate": 0.0002,
"loss": 0.2153,
"step": 423000
},
{
"epoch": 32.91,
"grad_norm": 1.0093231201171875,
"learning_rate": 0.0002,
"loss": 0.2146,
"step": 424000
},
{
"epoch": 32.99,
"grad_norm": 0.9372303485870361,
"learning_rate": 0.0002,
"loss": 0.2176,
"step": 425000
},
{
"epoch": 33.07,
"grad_norm": 0.990990161895752,
"learning_rate": 0.0002,
"loss": 0.1946,
"step": 426000
},
{
"epoch": 33.15,
"grad_norm": 1.221752405166626,
"learning_rate": 0.0002,
"loss": 0.1937,
"step": 427000
},
{
"epoch": 33.22,
"grad_norm": 1.0376135110855103,
"learning_rate": 0.0002,
"loss": 0.1971,
"step": 428000
},
{
"epoch": 33.3,
"grad_norm": 1.2878087759017944,
"learning_rate": 0.0002,
"loss": 0.1993,
"step": 429000
},
{
"epoch": 33.38,
"grad_norm": 1.702043890953064,
"learning_rate": 0.0002,
"loss": 0.2018,
"step": 430000
},
{
"epoch": 33.38,
"eval_bleu": 64.9154,
"eval_gen_len": 28.6351,
"eval_loss": 0.5428734421730042,
"eval_runtime": 99.9626,
"eval_samples_per_second": 8.883,
"eval_steps_per_second": 1.11,
"step": 430000
},
{
"epoch": 33.46,
"grad_norm": 1.171934723854065,
"learning_rate": 0.0002,
"loss": 0.2042,
"step": 431000
},
{
"epoch": 33.54,
"grad_norm": 0.9023895859718323,
"learning_rate": 0.0002,
"loss": 0.2032,
"step": 432000
},
{
"epoch": 33.61,
"grad_norm": 1.5410844087600708,
"learning_rate": 0.0002,
"loss": 0.204,
"step": 433000
},
{
"epoch": 33.69,
"grad_norm": 1.297434687614441,
"learning_rate": 0.0002,
"loss": 0.2057,
"step": 434000
},
{
"epoch": 33.77,
"grad_norm": 1.636635422706604,
"learning_rate": 0.0002,
"loss": 0.2085,
"step": 435000
},
{
"epoch": 33.85,
"grad_norm": 1.3059121370315552,
"learning_rate": 0.0002,
"loss": 0.2099,
"step": 436000
},
{
"epoch": 33.92,
"grad_norm": 1.1616836786270142,
"learning_rate": 0.0002,
"loss": 0.2098,
"step": 437000
},
{
"epoch": 34.0,
"grad_norm": 0.9708386063575745,
"learning_rate": 0.0002,
"loss": 0.2103,
"step": 438000
},
{
"epoch": 34.08,
"grad_norm": 1.1958973407745361,
"learning_rate": 0.0002,
"loss": 0.1868,
"step": 439000
},
{
"epoch": 34.16,
"grad_norm": 0.9669882655143738,
"learning_rate": 0.0002,
"loss": 0.1885,
"step": 440000
},
{
"epoch": 34.16,
"eval_bleu": 65.0538,
"eval_gen_len": 28.8525,
"eval_loss": 0.5453199148178101,
"eval_runtime": 98.9637,
"eval_samples_per_second": 8.973,
"eval_steps_per_second": 1.122,
"step": 440000
},
{
"epoch": 34.23,
"grad_norm": 1.3960009813308716,
"learning_rate": 0.0002,
"loss": 0.192,
"step": 441000
},
{
"epoch": 34.31,
"grad_norm": 1.1039202213287354,
"learning_rate": 0.0002,
"loss": 0.1928,
"step": 442000
},
{
"epoch": 34.39,
"grad_norm": 1.4681973457336426,
"learning_rate": 0.0002,
"loss": 0.1963,
"step": 443000
},
{
"epoch": 34.47,
"grad_norm": 1.1876535415649414,
"learning_rate": 0.0002,
"loss": 0.1955,
"step": 444000
},
{
"epoch": 34.54,
"grad_norm": 1.0030099153518677,
"learning_rate": 0.0002,
"loss": 0.197,
"step": 445000
},
{
"epoch": 34.62,
"grad_norm": 1.262609839439392,
"learning_rate": 0.0002,
"loss": 0.1965,
"step": 446000
},
{
"epoch": 34.7,
"grad_norm": 4.133481979370117,
"learning_rate": 0.0002,
"loss": 0.2009,
"step": 447000
},
{
"epoch": 34.78,
"grad_norm": 1.3214054107666016,
"learning_rate": 0.0002,
"loss": 0.2011,
"step": 448000
},
{
"epoch": 34.85,
"grad_norm": 1.061333417892456,
"learning_rate": 0.0002,
"loss": 0.205,
"step": 449000
},
{
"epoch": 34.93,
"grad_norm": 1.487025260925293,
"learning_rate": 0.0002,
"loss": 0.2049,
"step": 450000
},
{
"epoch": 34.93,
"eval_bleu": 65.2857,
"eval_gen_len": 28.7207,
"eval_loss": 0.5434128046035767,
"eval_runtime": 98.7363,
"eval_samples_per_second": 8.994,
"eval_steps_per_second": 1.124,
"step": 450000
},
{
"epoch": 35.01,
"grad_norm": 1.3061411380767822,
"learning_rate": 0.0002,
"loss": 0.2042,
"step": 451000
},
{
"epoch": 35.09,
"grad_norm": 0.9900358319282532,
"learning_rate": 0.0002,
"loss": 0.1803,
"step": 452000
},
{
"epoch": 35.17,
"grad_norm": 1.2118251323699951,
"learning_rate": 0.0002,
"loss": 0.183,
"step": 453000
},
{
"epoch": 35.24,
"grad_norm": 1.1625529527664185,
"learning_rate": 0.0002,
"loss": 0.1879,
"step": 454000
},
{
"epoch": 35.32,
"grad_norm": 1.0669846534729004,
"learning_rate": 0.0002,
"loss": 0.1888,
"step": 455000
},
{
"epoch": 35.4,
"grad_norm": 1.285409688949585,
"learning_rate": 0.0002,
"loss": 0.1908,
"step": 456000
},
{
"epoch": 35.48,
"grad_norm": 1.292738914489746,
"learning_rate": 0.0002,
"loss": 0.1944,
"step": 457000
},
{
"epoch": 35.55,
"grad_norm": 0.9169420599937439,
"learning_rate": 0.0002,
"loss": 0.1924,
"step": 458000
},
{
"epoch": 35.63,
"grad_norm": 1.1117466688156128,
"learning_rate": 0.0002,
"loss": 0.1944,
"step": 459000
},
{
"epoch": 35.71,
"grad_norm": 1.400664210319519,
"learning_rate": 0.0002,
"loss": 0.1957,
"step": 460000
},
{
"epoch": 35.71,
"eval_bleu": 65.3436,
"eval_gen_len": 28.714,
"eval_loss": 0.549137532711029,
"eval_runtime": 99.1079,
"eval_samples_per_second": 8.96,
"eval_steps_per_second": 1.12,
"step": 460000
},
{
"epoch": 35.79,
"grad_norm": 1.1465002298355103,
"learning_rate": 0.0002,
"loss": 0.1974,
"step": 461000
},
{
"epoch": 35.86,
"grad_norm": 0.9425164461135864,
"learning_rate": 0.0002,
"loss": 0.1967,
"step": 462000
},
{
"epoch": 35.94,
"grad_norm": 1.0649182796478271,
"learning_rate": 0.0002,
"loss": 0.1974,
"step": 463000
},
{
"epoch": 36.02,
"grad_norm": 0.9610468149185181,
"learning_rate": 0.0002,
"loss": 0.1943,
"step": 464000
},
{
"epoch": 36.1,
"grad_norm": 1.0697602033615112,
"learning_rate": 0.0002,
"loss": 0.1785,
"step": 465000
},
{
"epoch": 36.17,
"grad_norm": 0.8167102336883545,
"learning_rate": 0.0002,
"loss": 0.181,
"step": 466000
},
{
"epoch": 36.25,
"grad_norm": 1.155148983001709,
"learning_rate": 0.0002,
"loss": 0.1804,
"step": 467000
},
{
"epoch": 36.33,
"grad_norm": 1.036157250404358,
"learning_rate": 0.0002,
"loss": 0.1811,
"step": 468000
},
{
"epoch": 36.41,
"grad_norm": 0.9966660141944885,
"learning_rate": 0.0002,
"loss": 0.1825,
"step": 469000
},
{
"epoch": 36.49,
"grad_norm": 1.3554514646530151,
"learning_rate": 0.0002,
"loss": 0.1867,
"step": 470000
},
{
"epoch": 36.49,
"eval_bleu": 65.4934,
"eval_gen_len": 28.7939,
"eval_loss": 0.5535929203033447,
"eval_runtime": 99.8585,
"eval_samples_per_second": 8.893,
"eval_steps_per_second": 1.112,
"step": 470000
},
{
"epoch": 36.56,
"grad_norm": 1.1400065422058105,
"learning_rate": 0.0002,
"loss": 0.1889,
"step": 471000
},
{
"epoch": 36.64,
"grad_norm": 1.2936526536941528,
"learning_rate": 0.0002,
"loss": 0.1892,
"step": 472000
},
{
"epoch": 36.72,
"grad_norm": 1.3375158309936523,
"learning_rate": 0.0002,
"loss": 0.1932,
"step": 473000
},
{
"epoch": 36.8,
"grad_norm": 1.3976365327835083,
"learning_rate": 0.0002,
"loss": 0.1931,
"step": 474000
},
{
"epoch": 36.87,
"grad_norm": 1.2075397968292236,
"learning_rate": 0.0002,
"loss": 0.1938,
"step": 475000
},
{
"epoch": 36.95,
"grad_norm": 1.2333601713180542,
"learning_rate": 0.0002,
"loss": 0.1918,
"step": 476000
},
{
"epoch": 37.03,
"grad_norm": 0.9724763631820679,
"learning_rate": 0.0002,
"loss": 0.1857,
"step": 477000
},
{
"epoch": 37.11,
"grad_norm": 1.305141568183899,
"learning_rate": 0.0002,
"loss": 0.1741,
"step": 478000
},
{
"epoch": 37.18,
"grad_norm": 1.2358112335205078,
"learning_rate": 0.0002,
"loss": 0.1727,
"step": 479000
},
{
"epoch": 37.26,
"grad_norm": 1.040460228919983,
"learning_rate": 0.0002,
"loss": 0.1765,
"step": 480000
},
{
"epoch": 37.26,
"eval_bleu": 65.5595,
"eval_gen_len": 28.8255,
"eval_loss": 0.5582976341247559,
"eval_runtime": 99.6921,
"eval_samples_per_second": 8.907,
"eval_steps_per_second": 1.113,
"step": 480000
},
{
"epoch": 37.34,
"grad_norm": 1.0049262046813965,
"learning_rate": 0.0002,
"loss": 0.1801,
"step": 481000
},
{
"epoch": 37.42,
"grad_norm": 0.9716454148292542,
"learning_rate": 0.0002,
"loss": 0.1806,
"step": 482000
},
{
"epoch": 37.49,
"grad_norm": 1.2684077024459839,
"learning_rate": 0.0002,
"loss": 0.1809,
"step": 483000
},
{
"epoch": 37.57,
"grad_norm": 1.4772919416427612,
"learning_rate": 0.0002,
"loss": 0.1798,
"step": 484000
},
{
"epoch": 37.65,
"grad_norm": 0.8240026831626892,
"learning_rate": 0.0002,
"loss": 0.1849,
"step": 485000
},
{
"epoch": 37.73,
"grad_norm": 1.2247587442398071,
"learning_rate": 0.0002,
"loss": 0.1872,
"step": 486000
},
{
"epoch": 37.8,
"grad_norm": 1.4645825624465942,
"learning_rate": 0.0002,
"loss": 0.1889,
"step": 487000
},
{
"epoch": 37.88,
"grad_norm": 1.0552102327346802,
"learning_rate": 0.0002,
"loss": 0.1866,
"step": 488000
},
{
"epoch": 37.96,
"grad_norm": 1.2899285554885864,
"learning_rate": 0.0002,
"loss": 0.1897,
"step": 489000
},
{
"epoch": 38.04,
"grad_norm": 1.0461792945861816,
"learning_rate": 0.0002,
"loss": 0.1786,
"step": 490000
},
{
"epoch": 38.04,
"eval_bleu": 65.6358,
"eval_gen_len": 28.7691,
"eval_loss": 0.5611980557441711,
"eval_runtime": 99.4338,
"eval_samples_per_second": 8.931,
"eval_steps_per_second": 1.116,
"step": 490000
},
{
"epoch": 38.12,
"grad_norm": 1.1956135034561157,
"learning_rate": 0.0002,
"loss": 0.1708,
"step": 491000
},
{
"epoch": 38.19,
"grad_norm": 1.903419852256775,
"learning_rate": 0.0002,
"loss": 0.1726,
"step": 492000
},
{
"epoch": 38.27,
"grad_norm": 1.4714049100875854,
"learning_rate": 0.0002,
"loss": 0.174,
"step": 493000
},
{
"epoch": 38.35,
"grad_norm": 1.117650032043457,
"learning_rate": 0.0002,
"loss": 0.1753,
"step": 494000
},
{
"epoch": 38.43,
"grad_norm": 0.9286689162254333,
"learning_rate": 0.0002,
"loss": 0.1766,
"step": 495000
},
{
"epoch": 38.5,
"grad_norm": 1.0359840393066406,
"learning_rate": 0.0002,
"loss": 0.1774,
"step": 496000
},
{
"epoch": 38.58,
"grad_norm": 0.9324952363967896,
"learning_rate": 0.0002,
"loss": 0.1795,
"step": 497000
},
{
"epoch": 38.66,
"grad_norm": 1.2552545070648193,
"learning_rate": 0.0002,
"loss": 0.1795,
"step": 498000
},
{
"epoch": 38.74,
"grad_norm": 0.9712297916412354,
"learning_rate": 0.0002,
"loss": 0.1798,
"step": 499000
},
{
"epoch": 38.81,
"grad_norm": 1.3964751958847046,
"learning_rate": 0.0002,
"loss": 0.1809,
"step": 500000
},
{
"epoch": 38.81,
"eval_bleu": 65.0266,
"eval_gen_len": 28.7455,
"eval_loss": 0.5573469996452332,
"eval_runtime": 99.5788,
"eval_samples_per_second": 8.918,
"eval_steps_per_second": 1.115,
"step": 500000
},
{
"epoch": 38.81,
"step": 500000,
"total_flos": 5.131418179149005e+17,
"train_loss": 0.40471466763305664,
"train_runtime": 141171.7116,
"train_samples_per_second": 56.669,
"train_steps_per_second": 3.542
}
],
"logging_steps": 1000,
"max_steps": 500000,
"num_input_tokens_seen": 0,
"num_train_epochs": 39,
"save_steps": 10000,
"total_flos": 5.131418179149005e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}