{ "best_metric": 65.6358, "best_model_checkpoint": "/tmp/finetuned_models/iwslt_aligned_smallT5_cont0/checkpoint-490000", "epoch": 38.81384878124515, "eval_steps": 10000, "global_step": 500000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "grad_norm": 3.0081684589385986, "learning_rate": 0.0002, "loss": 3.3371, "step": 1000 }, { "epoch": 0.16, "grad_norm": 2.347439765930176, "learning_rate": 0.0002, "loss": 1.9825, "step": 2000 }, { "epoch": 0.23, "grad_norm": 3.2636098861694336, "learning_rate": 0.0002, "loss": 1.7503, "step": 3000 }, { "epoch": 0.31, "grad_norm": 2.1704177856445312, "learning_rate": 0.0002, "loss": 1.6063, "step": 4000 }, { "epoch": 0.39, "grad_norm": 1.8257893323898315, "learning_rate": 0.0002, "loss": 1.5193, "step": 5000 }, { "epoch": 0.47, "grad_norm": 2.1923136711120605, "learning_rate": 0.0002, "loss": 1.4439, "step": 6000 }, { "epoch": 0.54, "grad_norm": 2.198930501937866, "learning_rate": 0.0002, "loss": 1.3742, "step": 7000 }, { "epoch": 0.62, "grad_norm": 1.8519206047058105, "learning_rate": 0.0002, "loss": 1.3327, "step": 8000 }, { "epoch": 0.7, "grad_norm": 2.371457576751709, "learning_rate": 0.0002, "loss": 1.2872, "step": 9000 }, { "epoch": 0.78, "grad_norm": 2.2528538703918457, "learning_rate": 0.0002, "loss": 1.2426, "step": 10000 }, { "epoch": 0.78, "eval_bleu": 46.2793, "eval_gen_len": 28.6532, "eval_loss": 0.8300400972366333, "eval_runtime": 100.4495, "eval_samples_per_second": 8.84, "eval_steps_per_second": 1.105, "step": 10000 }, { "epoch": 0.85, "grad_norm": 1.8507146835327148, "learning_rate": 0.0002, "loss": 1.2154, "step": 11000 }, { "epoch": 0.93, "grad_norm": 1.629703164100647, "learning_rate": 0.0002, "loss": 1.1822, "step": 12000 }, { "epoch": 1.01, "grad_norm": 1.9470340013504028, "learning_rate": 0.0002, "loss": 1.1505, "step": 13000 }, { "epoch": 1.09, "grad_norm": 1.733299970626831, "learning_rate": 0.0002, "loss": 1.0877, "step": 14000 }, { "epoch": 1.16, "grad_norm": 1.5334522724151611, "learning_rate": 0.0002, "loss": 1.0647, "step": 15000 }, { "epoch": 1.24, "grad_norm": 1.8881198167800903, "learning_rate": 0.0002, "loss": 1.0595, "step": 16000 }, { "epoch": 1.32, "grad_norm": 2.1781082153320312, "learning_rate": 0.0002, "loss": 1.0441, "step": 17000 }, { "epoch": 1.4, "grad_norm": 1.509994387626648, "learning_rate": 0.0002, "loss": 1.0307, "step": 18000 }, { "epoch": 1.47, "grad_norm": 2.5609610080718994, "learning_rate": 0.0002, "loss": 1.0165, "step": 19000 }, { "epoch": 1.55, "grad_norm": 1.512005090713501, "learning_rate": 0.0002, "loss": 0.9931, "step": 20000 }, { "epoch": 1.55, "eval_bleu": 52.2709, "eval_gen_len": 28.6441, "eval_loss": 0.6755661368370056, "eval_runtime": 99.4907, "eval_samples_per_second": 8.925, "eval_steps_per_second": 1.116, "step": 20000 }, { "epoch": 1.63, "grad_norm": 1.4830211400985718, "learning_rate": 0.0002, "loss": 0.9854, "step": 21000 }, { "epoch": 1.71, "grad_norm": 1.8581557273864746, "learning_rate": 0.0002, "loss": 0.9736, "step": 22000 }, { "epoch": 1.79, "grad_norm": 1.589917778968811, "learning_rate": 0.0002, "loss": 0.9642, "step": 23000 }, { "epoch": 1.86, "grad_norm": 2.332538604736328, "learning_rate": 0.0002, "loss": 0.9476, "step": 24000 }, { "epoch": 1.94, "grad_norm": 1.61520516872406, "learning_rate": 0.0002, "loss": 0.943, "step": 25000 }, { "epoch": 2.02, "grad_norm": 1.2320265769958496, "learning_rate": 0.0002, "loss": 0.9135, "step": 26000 }, { "epoch": 2.1, "grad_norm": 1.543626308441162, "learning_rate": 0.0002, "loss": 0.8763, "step": 27000 }, { "epoch": 2.17, "grad_norm": 1.7634472846984863, "learning_rate": 0.0002, "loss": 0.8664, "step": 28000 }, { "epoch": 2.25, "grad_norm": 1.1254847049713135, "learning_rate": 0.0002, "loss": 0.8625, "step": 29000 }, { "epoch": 2.33, "grad_norm": 1.390243649482727, "learning_rate": 0.0002, "loss": 0.8573, "step": 30000 }, { "epoch": 2.33, "eval_bleu": 55.8294, "eval_gen_len": 28.5405, "eval_loss": 0.6142958998680115, "eval_runtime": 100.5081, "eval_samples_per_second": 8.835, "eval_steps_per_second": 1.104, "step": 30000 }, { "epoch": 2.41, "grad_norm": 1.592313289642334, "learning_rate": 0.0002, "loss": 0.855, "step": 31000 }, { "epoch": 2.48, "grad_norm": 1.1361020803451538, "learning_rate": 0.0002, "loss": 0.8487, "step": 32000 }, { "epoch": 2.56, "grad_norm": 1.2688100337982178, "learning_rate": 0.0002, "loss": 0.8417, "step": 33000 }, { "epoch": 2.64, "grad_norm": 1.3345963954925537, "learning_rate": 0.0002, "loss": 0.829, "step": 34000 }, { "epoch": 2.72, "grad_norm": 1.4640212059020996, "learning_rate": 0.0002, "loss": 0.8264, "step": 35000 }, { "epoch": 2.79, "grad_norm": 1.4818629026412964, "learning_rate": 0.0002, "loss": 0.8207, "step": 36000 }, { "epoch": 2.87, "grad_norm": 1.4580553770065308, "learning_rate": 0.0002, "loss": 0.8155, "step": 37000 }, { "epoch": 2.95, "grad_norm": 1.8713078498840332, "learning_rate": 0.0002, "loss": 0.8134, "step": 38000 }, { "epoch": 3.03, "grad_norm": 1.295332670211792, "learning_rate": 0.0002, "loss": 0.787, "step": 39000 }, { "epoch": 3.11, "grad_norm": 1.532378077507019, "learning_rate": 0.0002, "loss": 0.762, "step": 40000 }, { "epoch": 3.11, "eval_bleu": 57.5135, "eval_gen_len": 28.366, "eval_loss": 0.581108570098877, "eval_runtime": 98.8348, "eval_samples_per_second": 8.985, "eval_steps_per_second": 1.123, "step": 40000 }, { "epoch": 3.18, "grad_norm": 1.3924858570098877, "learning_rate": 0.0002, "loss": 0.7577, "step": 41000 }, { "epoch": 3.26, "grad_norm": 1.4161888360977173, "learning_rate": 0.0002, "loss": 0.7482, "step": 42000 }, { "epoch": 3.34, "grad_norm": 1.639460802078247, "learning_rate": 0.0002, "loss": 0.7582, "step": 43000 }, { "epoch": 3.42, "grad_norm": 1.3779182434082031, "learning_rate": 0.0002, "loss": 0.7474, "step": 44000 }, { "epoch": 3.49, "grad_norm": 1.8350883722305298, "learning_rate": 0.0002, "loss": 0.751, "step": 45000 }, { "epoch": 3.57, "grad_norm": 1.2075496912002563, "learning_rate": 0.0002, "loss": 0.7371, "step": 46000 }, { "epoch": 3.65, "grad_norm": 1.3083984851837158, "learning_rate": 0.0002, "loss": 0.7318, "step": 47000 }, { "epoch": 3.73, "grad_norm": 1.5021324157714844, "learning_rate": 0.0002, "loss": 0.7376, "step": 48000 }, { "epoch": 3.8, "grad_norm": 1.1597286462783813, "learning_rate": 0.0002, "loss": 0.7336, "step": 49000 }, { "epoch": 3.88, "grad_norm": 1.3814338445663452, "learning_rate": 0.0002, "loss": 0.734, "step": 50000 }, { "epoch": 3.88, "eval_bleu": 58.6125, "eval_gen_len": 28.5101, "eval_loss": 0.5499288439750671, "eval_runtime": 99.2548, "eval_samples_per_second": 8.947, "eval_steps_per_second": 1.118, "step": 50000 }, { "epoch": 3.96, "grad_norm": 1.0968077182769775, "learning_rate": 0.0002, "loss": 0.7288, "step": 51000 }, { "epoch": 4.04, "grad_norm": 1.9418740272521973, "learning_rate": 0.0002, "loss": 0.7057, "step": 52000 }, { "epoch": 4.11, "grad_norm": 1.1883801221847534, "learning_rate": 0.0002, "loss": 0.6761, "step": 53000 }, { "epoch": 4.19, "grad_norm": 1.1581670045852661, "learning_rate": 0.0002, "loss": 0.6812, "step": 54000 }, { "epoch": 4.27, "grad_norm": 1.4657800197601318, "learning_rate": 0.0002, "loss": 0.6783, "step": 55000 }, { "epoch": 4.35, "grad_norm": 1.1991990804672241, "learning_rate": 0.0002, "loss": 0.6764, "step": 56000 }, { "epoch": 4.42, "grad_norm": 1.5376391410827637, "learning_rate": 0.0002, "loss": 0.6805, "step": 57000 }, { "epoch": 4.5, "grad_norm": 1.228727102279663, "learning_rate": 0.0002, "loss": 0.6734, "step": 58000 }, { "epoch": 4.58, "grad_norm": 0.949891984462738, "learning_rate": 0.0002, "loss": 0.6702, "step": 59000 }, { "epoch": 4.66, "grad_norm": 1.5632683038711548, "learning_rate": 0.0002, "loss": 0.6722, "step": 60000 }, { "epoch": 4.66, "eval_bleu": 59.6427, "eval_gen_len": 28.8356, "eval_loss": 0.522808849811554, "eval_runtime": 101.7798, "eval_samples_per_second": 8.725, "eval_steps_per_second": 1.091, "step": 60000 }, { "epoch": 4.74, "grad_norm": 1.2885149717330933, "learning_rate": 0.0002, "loss": 0.67, "step": 61000 }, { "epoch": 4.81, "grad_norm": 1.7992392778396606, "learning_rate": 0.0002, "loss": 0.6686, "step": 62000 }, { "epoch": 4.89, "grad_norm": 1.7027188539505005, "learning_rate": 0.0002, "loss": 0.6682, "step": 63000 }, { "epoch": 4.97, "grad_norm": 1.2083909511566162, "learning_rate": 0.0002, "loss": 0.6545, "step": 64000 }, { "epoch": 5.05, "grad_norm": 1.1508337259292603, "learning_rate": 0.0002, "loss": 0.6308, "step": 65000 }, { "epoch": 5.12, "grad_norm": 1.0901002883911133, "learning_rate": 0.0002, "loss": 0.6192, "step": 66000 }, { "epoch": 5.2, "grad_norm": 1.3939250707626343, "learning_rate": 0.0002, "loss": 0.6217, "step": 67000 }, { "epoch": 5.28, "grad_norm": 0.9105481505393982, "learning_rate": 0.0002, "loss": 0.6177, "step": 68000 }, { "epoch": 5.36, "grad_norm": 0.944652795791626, "learning_rate": 0.0002, "loss": 0.6227, "step": 69000 }, { "epoch": 5.43, "grad_norm": 1.1488838195800781, "learning_rate": 0.0002, "loss": 0.6215, "step": 70000 }, { "epoch": 5.43, "eval_bleu": 60.4701, "eval_gen_len": 28.7534, "eval_loss": 0.5161064863204956, "eval_runtime": 100.914, "eval_samples_per_second": 8.8, "eval_steps_per_second": 1.1, "step": 70000 }, { "epoch": 5.51, "grad_norm": 1.0799453258514404, "learning_rate": 0.0002, "loss": 0.6173, "step": 71000 }, { "epoch": 5.59, "grad_norm": 1.2405527830123901, "learning_rate": 0.0002, "loss": 0.6229, "step": 72000 }, { "epoch": 5.67, "grad_norm": 1.045590877532959, "learning_rate": 0.0002, "loss": 0.6183, "step": 73000 }, { "epoch": 5.74, "grad_norm": 1.4318771362304688, "learning_rate": 0.0002, "loss": 0.6146, "step": 74000 }, { "epoch": 5.82, "grad_norm": 1.0059374570846558, "learning_rate": 0.0002, "loss": 0.6175, "step": 75000 }, { "epoch": 5.9, "grad_norm": 1.0831586122512817, "learning_rate": 0.0002, "loss": 0.6165, "step": 76000 }, { "epoch": 5.98, "grad_norm": 1.4094606637954712, "learning_rate": 0.0002, "loss": 0.6158, "step": 77000 }, { "epoch": 6.05, "grad_norm": 1.2640748023986816, "learning_rate": 0.0002, "loss": 0.5827, "step": 78000 }, { "epoch": 6.13, "grad_norm": 1.0088295936584473, "learning_rate": 0.0002, "loss": 0.5699, "step": 79000 }, { "epoch": 6.21, "grad_norm": 0.9942078590393066, "learning_rate": 0.0002, "loss": 0.5756, "step": 80000 }, { "epoch": 6.21, "eval_bleu": 62.0864, "eval_gen_len": 28.6498, "eval_loss": 0.5067651867866516, "eval_runtime": 99.2392, "eval_samples_per_second": 8.948, "eval_steps_per_second": 1.119, "step": 80000 }, { "epoch": 6.29, "grad_norm": 1.838376522064209, "learning_rate": 0.0002, "loss": 0.5784, "step": 81000 }, { "epoch": 6.37, "grad_norm": 1.477137804031372, "learning_rate": 0.0002, "loss": 0.5743, "step": 82000 }, { "epoch": 6.44, "grad_norm": 1.1481742858886719, "learning_rate": 0.0002, "loss": 0.578, "step": 83000 }, { "epoch": 6.52, "grad_norm": 1.1412264108657837, "learning_rate": 0.0002, "loss": 0.5785, "step": 84000 }, { "epoch": 6.6, "grad_norm": 0.9707184433937073, "learning_rate": 0.0002, "loss": 0.5746, "step": 85000 }, { "epoch": 6.68, "grad_norm": 0.9683183431625366, "learning_rate": 0.0002, "loss": 0.5794, "step": 86000 }, { "epoch": 6.75, "grad_norm": 1.2322285175323486, "learning_rate": 0.0002, "loss": 0.5765, "step": 87000 }, { "epoch": 6.83, "grad_norm": 1.1276684999465942, "learning_rate": 0.0002, "loss": 0.5767, "step": 88000 }, { "epoch": 6.91, "grad_norm": 1.4353203773498535, "learning_rate": 0.0002, "loss": 0.5776, "step": 89000 }, { "epoch": 6.99, "grad_norm": 1.116827368736267, "learning_rate": 0.0002, "loss": 0.5738, "step": 90000 }, { "epoch": 6.99, "eval_bleu": 61.9714, "eval_gen_len": 28.5788, "eval_loss": 0.5005396008491516, "eval_runtime": 98.6774, "eval_samples_per_second": 8.999, "eval_steps_per_second": 1.125, "step": 90000 }, { "epoch": 7.06, "grad_norm": 1.0088461637496948, "learning_rate": 0.0002, "loss": 0.538, "step": 91000 }, { "epoch": 7.14, "grad_norm": 1.1016899347305298, "learning_rate": 0.0002, "loss": 0.5349, "step": 92000 }, { "epoch": 7.22, "grad_norm": 1.4406321048736572, "learning_rate": 0.0002, "loss": 0.5408, "step": 93000 }, { "epoch": 7.3, "grad_norm": 1.1226301193237305, "learning_rate": 0.0002, "loss": 0.5362, "step": 94000 }, { "epoch": 7.37, "grad_norm": 1.4062280654907227, "learning_rate": 0.0002, "loss": 0.5381, "step": 95000 }, { "epoch": 7.45, "grad_norm": 1.2851547002792358, "learning_rate": 0.0002, "loss": 0.5438, "step": 96000 }, { "epoch": 7.53, "grad_norm": 0.9344896078109741, "learning_rate": 0.0002, "loss": 0.5442, "step": 97000 }, { "epoch": 7.61, "grad_norm": 1.7169030904769897, "learning_rate": 0.0002, "loss": 0.539, "step": 98000 }, { "epoch": 7.69, "grad_norm": 1.1855800151824951, "learning_rate": 0.0002, "loss": 0.543, "step": 99000 }, { "epoch": 7.76, "grad_norm": 1.093122959136963, "learning_rate": 0.0002, "loss": 0.5384, "step": 100000 }, { "epoch": 7.76, "eval_bleu": 62.407, "eval_gen_len": 28.5282, "eval_loss": 0.49085742235183716, "eval_runtime": 98.8889, "eval_samples_per_second": 8.98, "eval_steps_per_second": 1.122, "step": 100000 }, { "epoch": 7.84, "grad_norm": 1.3849202394485474, "learning_rate": 0.0002, "loss": 0.5381, "step": 101000 }, { "epoch": 7.92, "grad_norm": 1.12773859500885, "learning_rate": 0.0002, "loss": 0.544, "step": 102000 }, { "epoch": 8.0, "grad_norm": 0.9132428169250488, "learning_rate": 0.0002, "loss": 0.5371, "step": 103000 }, { "epoch": 8.07, "grad_norm": 1.5978176593780518, "learning_rate": 0.0002, "loss": 0.5015, "step": 104000 }, { "epoch": 8.15, "grad_norm": 1.028082013130188, "learning_rate": 0.0002, "loss": 0.4956, "step": 105000 }, { "epoch": 8.23, "grad_norm": 1.0597223043441772, "learning_rate": 0.0002, "loss": 0.5074, "step": 106000 }, { "epoch": 8.31, "grad_norm": 1.47709059715271, "learning_rate": 0.0002, "loss": 0.5022, "step": 107000 }, { "epoch": 8.38, "grad_norm": 1.1731916666030884, "learning_rate": 0.0002, "loss": 0.5039, "step": 108000 }, { "epoch": 8.46, "grad_norm": 1.2147469520568848, "learning_rate": 0.0002, "loss": 0.5121, "step": 109000 }, { "epoch": 8.54, "grad_norm": 0.8521010279655457, "learning_rate": 0.0002, "loss": 0.5109, "step": 110000 }, { "epoch": 8.54, "eval_bleu": 62.1452, "eval_gen_len": 28.4617, "eval_loss": 0.4901648759841919, "eval_runtime": 98.4744, "eval_samples_per_second": 9.018, "eval_steps_per_second": 1.127, "step": 110000 }, { "epoch": 8.62, "grad_norm": 1.250752568244934, "learning_rate": 0.0002, "loss": 0.5072, "step": 111000 }, { "epoch": 8.69, "grad_norm": 1.2694823741912842, "learning_rate": 0.0002, "loss": 0.5126, "step": 112000 }, { "epoch": 8.77, "grad_norm": 1.0290015935897827, "learning_rate": 0.0002, "loss": 0.5065, "step": 113000 }, { "epoch": 8.85, "grad_norm": 1.222034215927124, "learning_rate": 0.0002, "loss": 0.514, "step": 114000 }, { "epoch": 8.93, "grad_norm": 1.0359649658203125, "learning_rate": 0.0002, "loss": 0.5127, "step": 115000 }, { "epoch": 9.0, "grad_norm": 1.19712495803833, "learning_rate": 0.0002, "loss": 0.5114, "step": 116000 }, { "epoch": 9.08, "grad_norm": 1.1766573190689087, "learning_rate": 0.0002, "loss": 0.4698, "step": 117000 }, { "epoch": 9.16, "grad_norm": 1.2681427001953125, "learning_rate": 0.0002, "loss": 0.4755, "step": 118000 }, { "epoch": 9.24, "grad_norm": 1.2988672256469727, "learning_rate": 0.0002, "loss": 0.4772, "step": 119000 }, { "epoch": 9.32, "grad_norm": 1.440721035003662, "learning_rate": 0.0002, "loss": 0.4816, "step": 120000 }, { "epoch": 9.32, "eval_bleu": 62.6499, "eval_gen_len": 28.5518, "eval_loss": 0.48750796914100647, "eval_runtime": 99.0223, "eval_samples_per_second": 8.968, "eval_steps_per_second": 1.121, "step": 120000 }, { "epoch": 9.39, "grad_norm": 1.038442611694336, "learning_rate": 0.0002, "loss": 0.4792, "step": 121000 }, { "epoch": 9.47, "grad_norm": 1.3428473472595215, "learning_rate": 0.0002, "loss": 0.4827, "step": 122000 }, { "epoch": 9.55, "grad_norm": 1.4756362438201904, "learning_rate": 0.0002, "loss": 0.4832, "step": 123000 }, { "epoch": 9.63, "grad_norm": 1.2109817266464233, "learning_rate": 0.0002, "loss": 0.481, "step": 124000 }, { "epoch": 9.7, "grad_norm": 1.2007863521575928, "learning_rate": 0.0002, "loss": 0.4815, "step": 125000 }, { "epoch": 9.78, "grad_norm": 1.1711379289627075, "learning_rate": 0.0002, "loss": 0.488, "step": 126000 }, { "epoch": 9.86, "grad_norm": 1.1571533679962158, "learning_rate": 0.0002, "loss": 0.4827, "step": 127000 }, { "epoch": 9.94, "grad_norm": 1.2341859340667725, "learning_rate": 0.0002, "loss": 0.4844, "step": 128000 }, { "epoch": 10.01, "grad_norm": 1.5796501636505127, "learning_rate": 0.0002, "loss": 0.4741, "step": 129000 }, { "epoch": 10.09, "grad_norm": 0.8122438788414001, "learning_rate": 0.0002, "loss": 0.4493, "step": 130000 }, { "epoch": 10.09, "eval_bleu": 62.6694, "eval_gen_len": 28.6993, "eval_loss": 0.4866686761379242, "eval_runtime": 100.7784, "eval_samples_per_second": 8.811, "eval_steps_per_second": 1.101, "step": 130000 }, { "epoch": 10.17, "grad_norm": 1.1835366487503052, "learning_rate": 0.0002, "loss": 0.448, "step": 131000 }, { "epoch": 10.25, "grad_norm": 1.0868804454803467, "learning_rate": 0.0002, "loss": 0.4517, "step": 132000 }, { "epoch": 10.32, "grad_norm": 0.9316431283950806, "learning_rate": 0.0002, "loss": 0.454, "step": 133000 }, { "epoch": 10.4, "grad_norm": 1.5438517332077026, "learning_rate": 0.0002, "loss": 0.4526, "step": 134000 }, { "epoch": 10.48, "grad_norm": 1.5842955112457275, "learning_rate": 0.0002, "loss": 0.4576, "step": 135000 }, { "epoch": 10.56, "grad_norm": 1.450462818145752, "learning_rate": 0.0002, "loss": 0.463, "step": 136000 }, { "epoch": 10.63, "grad_norm": 0.8578802347183228, "learning_rate": 0.0002, "loss": 0.4588, "step": 137000 }, { "epoch": 10.71, "grad_norm": 1.1508352756500244, "learning_rate": 0.0002, "loss": 0.4542, "step": 138000 }, { "epoch": 10.79, "grad_norm": 1.1183589696884155, "learning_rate": 0.0002, "loss": 0.4601, "step": 139000 }, { "epoch": 10.87, "grad_norm": 0.9286684393882751, "learning_rate": 0.0002, "loss": 0.4648, "step": 140000 }, { "epoch": 10.87, "eval_bleu": 63.3179, "eval_gen_len": 28.5495, "eval_loss": 0.4774705469608307, "eval_runtime": 98.6639, "eval_samples_per_second": 9.0, "eval_steps_per_second": 1.125, "step": 140000 }, { "epoch": 10.95, "grad_norm": 1.3315681219100952, "learning_rate": 0.0002, "loss": 0.4627, "step": 141000 }, { "epoch": 11.02, "grad_norm": 1.204750418663025, "learning_rate": 0.0002, "loss": 0.4497, "step": 142000 }, { "epoch": 11.1, "grad_norm": 1.0254498720169067, "learning_rate": 0.0002, "loss": 0.4242, "step": 143000 }, { "epoch": 11.18, "grad_norm": 1.052018404006958, "learning_rate": 0.0002, "loss": 0.4306, "step": 144000 }, { "epoch": 11.26, "grad_norm": 0.9426015019416809, "learning_rate": 0.0002, "loss": 0.4275, "step": 145000 }, { "epoch": 11.33, "grad_norm": 1.079633116722107, "learning_rate": 0.0002, "loss": 0.4303, "step": 146000 }, { "epoch": 11.41, "grad_norm": 1.025631070137024, "learning_rate": 0.0002, "loss": 0.433, "step": 147000 }, { "epoch": 11.49, "grad_norm": 1.21865713596344, "learning_rate": 0.0002, "loss": 0.4351, "step": 148000 }, { "epoch": 11.57, "grad_norm": 1.0304579734802246, "learning_rate": 0.0002, "loss": 0.4358, "step": 149000 }, { "epoch": 11.64, "grad_norm": 1.297282338142395, "learning_rate": 0.0002, "loss": 0.4414, "step": 150000 }, { "epoch": 11.64, "eval_bleu": 63.6928, "eval_gen_len": 28.4673, "eval_loss": 0.4786856770515442, "eval_runtime": 98.6768, "eval_samples_per_second": 8.999, "eval_steps_per_second": 1.125, "step": 150000 }, { "epoch": 11.72, "grad_norm": 0.973185658454895, "learning_rate": 0.0002, "loss": 0.4375, "step": 151000 }, { "epoch": 11.8, "grad_norm": 0.9765141010284424, "learning_rate": 0.0002, "loss": 0.4398, "step": 152000 }, { "epoch": 11.88, "grad_norm": 1.1599891185760498, "learning_rate": 0.0002, "loss": 0.4422, "step": 153000 }, { "epoch": 11.95, "grad_norm": 1.0690301656723022, "learning_rate": 0.0002, "loss": 0.4396, "step": 154000 }, { "epoch": 12.03, "grad_norm": 0.9184726476669312, "learning_rate": 0.0002, "loss": 0.4232, "step": 155000 }, { "epoch": 12.11, "grad_norm": 1.1572961807250977, "learning_rate": 0.0002, "loss": 0.4038, "step": 156000 }, { "epoch": 12.19, "grad_norm": 1.1003015041351318, "learning_rate": 0.0002, "loss": 0.4088, "step": 157000 }, { "epoch": 12.27, "grad_norm": 1.147965908050537, "learning_rate": 0.0002, "loss": 0.4099, "step": 158000 }, { "epoch": 12.34, "grad_norm": 1.3417842388153076, "learning_rate": 0.0002, "loss": 0.4108, "step": 159000 }, { "epoch": 12.42, "grad_norm": 0.9816053509712219, "learning_rate": 0.0002, "loss": 0.4158, "step": 160000 }, { "epoch": 12.42, "eval_bleu": 63.8752, "eval_gen_len": 28.5011, "eval_loss": 0.47919762134552, "eval_runtime": 98.6149, "eval_samples_per_second": 9.005, "eval_steps_per_second": 1.126, "step": 160000 }, { "epoch": 12.5, "grad_norm": 1.1307754516601562, "learning_rate": 0.0002, "loss": 0.4139, "step": 161000 }, { "epoch": 12.58, "grad_norm": 1.2909305095672607, "learning_rate": 0.0002, "loss": 0.4191, "step": 162000 }, { "epoch": 12.65, "grad_norm": 1.0675512552261353, "learning_rate": 0.0002, "loss": 0.4178, "step": 163000 }, { "epoch": 12.73, "grad_norm": 1.062435269355774, "learning_rate": 0.0002, "loss": 0.4183, "step": 164000 }, { "epoch": 12.81, "grad_norm": 1.2755943536758423, "learning_rate": 0.0002, "loss": 0.42, "step": 165000 }, { "epoch": 12.89, "grad_norm": 1.0879075527191162, "learning_rate": 0.0002, "loss": 0.4231, "step": 166000 }, { "epoch": 12.96, "grad_norm": 1.1521817445755005, "learning_rate": 0.0002, "loss": 0.4231, "step": 167000 }, { "epoch": 13.04, "grad_norm": 1.038859486579895, "learning_rate": 0.0002, "loss": 0.4014, "step": 168000 }, { "epoch": 13.12, "grad_norm": 1.001861333847046, "learning_rate": 0.0002, "loss": 0.3875, "step": 169000 }, { "epoch": 13.2, "grad_norm": 0.967998743057251, "learning_rate": 0.0002, "loss": 0.3895, "step": 170000 }, { "epoch": 13.2, "eval_bleu": 63.8429, "eval_gen_len": 28.6498, "eval_loss": 0.4793872535228729, "eval_runtime": 99.8944, "eval_samples_per_second": 8.889, "eval_steps_per_second": 1.111, "step": 170000 }, { "epoch": 13.27, "grad_norm": 1.1491278409957886, "learning_rate": 0.0002, "loss": 0.3947, "step": 171000 }, { "epoch": 13.35, "grad_norm": 1.0739213228225708, "learning_rate": 0.0002, "loss": 0.3937, "step": 172000 }, { "epoch": 13.43, "grad_norm": 1.3349049091339111, "learning_rate": 0.0002, "loss": 0.3932, "step": 173000 }, { "epoch": 13.51, "grad_norm": 1.4266788959503174, "learning_rate": 0.0002, "loss": 0.4012, "step": 174000 }, { "epoch": 13.58, "grad_norm": 1.064070701599121, "learning_rate": 0.0002, "loss": 0.4, "step": 175000 }, { "epoch": 13.66, "grad_norm": 1.930474877357483, "learning_rate": 0.0002, "loss": 0.399, "step": 176000 }, { "epoch": 13.74, "grad_norm": 0.994195282459259, "learning_rate": 0.0002, "loss": 0.4026, "step": 177000 }, { "epoch": 13.82, "grad_norm": 0.9755762815475464, "learning_rate": 0.0002, "loss": 0.4019, "step": 178000 }, { "epoch": 13.9, "grad_norm": 1.0802558660507202, "learning_rate": 0.0002, "loss": 0.4027, "step": 179000 }, { "epoch": 13.97, "grad_norm": 1.4257205724716187, "learning_rate": 0.0002, "loss": 0.4031, "step": 180000 }, { "epoch": 13.97, "eval_bleu": 63.9496, "eval_gen_len": 28.7264, "eval_loss": 0.4756912291049957, "eval_runtime": 100.6936, "eval_samples_per_second": 8.819, "eval_steps_per_second": 1.102, "step": 180000 }, { "epoch": 14.05, "grad_norm": 1.0172358751296997, "learning_rate": 0.0002, "loss": 0.3785, "step": 181000 }, { "epoch": 14.13, "grad_norm": 0.9525344967842102, "learning_rate": 0.0002, "loss": 0.367, "step": 182000 }, { "epoch": 14.21, "grad_norm": 1.0674399137496948, "learning_rate": 0.0002, "loss": 0.3723, "step": 183000 }, { "epoch": 14.28, "grad_norm": 1.311464548110962, "learning_rate": 0.0002, "loss": 0.3743, "step": 184000 }, { "epoch": 14.36, "grad_norm": 1.020115613937378, "learning_rate": 0.0002, "loss": 0.3743, "step": 185000 }, { "epoch": 14.44, "grad_norm": 0.9766080379486084, "learning_rate": 0.0002, "loss": 0.3751, "step": 186000 }, { "epoch": 14.52, "grad_norm": 1.0636546611785889, "learning_rate": 0.0002, "loss": 0.3839, "step": 187000 }, { "epoch": 14.59, "grad_norm": 1.5485342741012573, "learning_rate": 0.0002, "loss": 0.3864, "step": 188000 }, { "epoch": 14.67, "grad_norm": 1.189011812210083, "learning_rate": 0.0002, "loss": 0.3836, "step": 189000 }, { "epoch": 14.75, "grad_norm": 1.2171902656555176, "learning_rate": 0.0002, "loss": 0.3844, "step": 190000 }, { "epoch": 14.75, "eval_bleu": 63.7498, "eval_gen_len": 28.8288, "eval_loss": 0.48547232151031494, "eval_runtime": 102.2105, "eval_samples_per_second": 8.688, "eval_steps_per_second": 1.086, "step": 190000 }, { "epoch": 14.83, "grad_norm": 0.9004954695701599, "learning_rate": 0.0002, "loss": 0.3821, "step": 191000 }, { "epoch": 14.9, "grad_norm": 1.2197577953338623, "learning_rate": 0.0002, "loss": 0.3871, "step": 192000 }, { "epoch": 14.98, "grad_norm": 1.0094869136810303, "learning_rate": 0.0002, "loss": 0.3924, "step": 193000 }, { "epoch": 15.06, "grad_norm": 1.1337696313858032, "learning_rate": 0.0002, "loss": 0.3602, "step": 194000 }, { "epoch": 15.14, "grad_norm": 1.5074607133865356, "learning_rate": 0.0002, "loss": 0.3548, "step": 195000 }, { "epoch": 15.22, "grad_norm": 0.9171730279922485, "learning_rate": 0.0002, "loss": 0.3586, "step": 196000 }, { "epoch": 15.29, "grad_norm": 0.8581980466842651, "learning_rate": 0.0002, "loss": 0.3609, "step": 197000 }, { "epoch": 15.37, "grad_norm": 0.8790922164916992, "learning_rate": 0.0002, "loss": 0.363, "step": 198000 }, { "epoch": 15.45, "grad_norm": 1.012073278427124, "learning_rate": 0.0002, "loss": 0.3618, "step": 199000 }, { "epoch": 15.53, "grad_norm": 0.9808474183082581, "learning_rate": 0.0002, "loss": 0.3637, "step": 200000 }, { "epoch": 15.53, "eval_bleu": 64.2277, "eval_gen_len": 28.661, "eval_loss": 0.4799739420413971, "eval_runtime": 99.1042, "eval_samples_per_second": 8.96, "eval_steps_per_second": 1.12, "step": 200000 }, { "epoch": 15.6, "grad_norm": 0.995276689529419, "learning_rate": 0.0002, "loss": 0.3676, "step": 201000 }, { "epoch": 15.68, "grad_norm": 1.2943910360336304, "learning_rate": 0.0002, "loss": 0.3705, "step": 202000 }, { "epoch": 15.76, "grad_norm": 0.9539749026298523, "learning_rate": 0.0002, "loss": 0.3706, "step": 203000 }, { "epoch": 15.84, "grad_norm": 0.9351176619529724, "learning_rate": 0.0002, "loss": 0.3706, "step": 204000 }, { "epoch": 15.91, "grad_norm": 1.087781310081482, "learning_rate": 0.0002, "loss": 0.3713, "step": 205000 }, { "epoch": 15.99, "grad_norm": 1.2164143323898315, "learning_rate": 0.0002, "loss": 0.3729, "step": 206000 }, { "epoch": 16.07, "grad_norm": 1.1458275318145752, "learning_rate": 0.0002, "loss": 0.3417, "step": 207000 }, { "epoch": 16.15, "grad_norm": 0.9169874787330627, "learning_rate": 0.0002, "loss": 0.339, "step": 208000 }, { "epoch": 16.22, "grad_norm": 1.1594195365905762, "learning_rate": 0.0002, "loss": 0.3426, "step": 209000 }, { "epoch": 16.3, "grad_norm": 0.8710166215896606, "learning_rate": 0.0002, "loss": 0.3473, "step": 210000 }, { "epoch": 16.3, "eval_bleu": 64.4683, "eval_gen_len": 28.786, "eval_loss": 0.4854079782962799, "eval_runtime": 99.2915, "eval_samples_per_second": 8.943, "eval_steps_per_second": 1.118, "step": 210000 }, { "epoch": 16.38, "grad_norm": 1.1366904973983765, "learning_rate": 0.0002, "loss": 0.3483, "step": 211000 }, { "epoch": 16.46, "grad_norm": 1.135487675666809, "learning_rate": 0.0002, "loss": 0.3509, "step": 212000 }, { "epoch": 16.53, "grad_norm": 0.9458820819854736, "learning_rate": 0.0002, "loss": 0.3519, "step": 213000 }, { "epoch": 16.61, "grad_norm": 0.8842834830284119, "learning_rate": 0.0002, "loss": 0.3551, "step": 214000 }, { "epoch": 16.69, "grad_norm": 1.2772917747497559, "learning_rate": 0.0002, "loss": 0.3572, "step": 215000 }, { "epoch": 16.77, "grad_norm": 1.5344486236572266, "learning_rate": 0.0002, "loss": 0.3567, "step": 216000 }, { "epoch": 16.85, "grad_norm": 1.4000177383422852, "learning_rate": 0.0002, "loss": 0.3605, "step": 217000 }, { "epoch": 16.92, "grad_norm": 1.5617576837539673, "learning_rate": 0.0002, "loss": 0.3577, "step": 218000 }, { "epoch": 17.0, "grad_norm": 1.4207055568695068, "learning_rate": 0.0002, "loss": 0.3581, "step": 219000 }, { "epoch": 17.08, "grad_norm": 1.6633687019348145, "learning_rate": 0.0002, "loss": 0.3243, "step": 220000 }, { "epoch": 17.08, "eval_bleu": 64.7805, "eval_gen_len": 28.6791, "eval_loss": 0.490304172039032, "eval_runtime": 99.3675, "eval_samples_per_second": 8.937, "eval_steps_per_second": 1.117, "step": 220000 }, { "epoch": 17.16, "grad_norm": 1.1573091745376587, "learning_rate": 0.0002, "loss": 0.3298, "step": 221000 }, { "epoch": 17.23, "grad_norm": 1.046479344367981, "learning_rate": 0.0002, "loss": 0.3312, "step": 222000 }, { "epoch": 17.31, "grad_norm": 1.2901638746261597, "learning_rate": 0.0002, "loss": 0.3325, "step": 223000 }, { "epoch": 17.39, "grad_norm": 1.0912984609603882, "learning_rate": 0.0002, "loss": 0.3351, "step": 224000 }, { "epoch": 17.47, "grad_norm": 1.5278785228729248, "learning_rate": 0.0002, "loss": 0.3382, "step": 225000 }, { "epoch": 17.54, "grad_norm": 1.013113021850586, "learning_rate": 0.0002, "loss": 0.3386, "step": 226000 }, { "epoch": 17.62, "grad_norm": 1.254299283027649, "learning_rate": 0.0002, "loss": 0.3393, "step": 227000 }, { "epoch": 17.7, "grad_norm": 0.8990402221679688, "learning_rate": 0.0002, "loss": 0.3404, "step": 228000 }, { "epoch": 17.78, "grad_norm": 1.2247493267059326, "learning_rate": 0.0002, "loss": 0.3451, "step": 229000 }, { "epoch": 17.85, "grad_norm": 1.454061508178711, "learning_rate": 0.0002, "loss": 0.3426, "step": 230000 }, { "epoch": 17.85, "eval_bleu": 64.679, "eval_gen_len": 28.4809, "eval_loss": 0.4818822741508484, "eval_runtime": 98.7347, "eval_samples_per_second": 8.994, "eval_steps_per_second": 1.124, "step": 230000 }, { "epoch": 17.93, "grad_norm": 1.4103410243988037, "learning_rate": 0.0002, "loss": 0.3457, "step": 231000 }, { "epoch": 18.01, "grad_norm": 1.0248557329177856, "learning_rate": 0.0002, "loss": 0.3449, "step": 232000 }, { "epoch": 18.09, "grad_norm": 1.421231985092163, "learning_rate": 0.0002, "loss": 0.3115, "step": 233000 }, { "epoch": 18.16, "grad_norm": 1.0739413499832153, "learning_rate": 0.0002, "loss": 0.314, "step": 234000 }, { "epoch": 18.24, "grad_norm": 1.0976619720458984, "learning_rate": 0.0002, "loss": 0.3214, "step": 235000 }, { "epoch": 18.32, "grad_norm": 1.480944037437439, "learning_rate": 0.0002, "loss": 0.3173, "step": 236000 }, { "epoch": 18.4, "grad_norm": 1.249569296836853, "learning_rate": 0.0002, "loss": 0.3227, "step": 237000 }, { "epoch": 18.48, "grad_norm": 1.1228398084640503, "learning_rate": 0.0002, "loss": 0.3263, "step": 238000 }, { "epoch": 18.55, "grad_norm": 1.318242073059082, "learning_rate": 0.0002, "loss": 0.327, "step": 239000 }, { "epoch": 18.63, "grad_norm": 1.1360143423080444, "learning_rate": 0.0002, "loss": 0.3295, "step": 240000 }, { "epoch": 18.63, "eval_bleu": 65.3735, "eval_gen_len": 28.6014, "eval_loss": 0.4851875603199005, "eval_runtime": 98.572, "eval_samples_per_second": 9.009, "eval_steps_per_second": 1.126, "step": 240000 }, { "epoch": 18.71, "grad_norm": 1.4588408470153809, "learning_rate": 0.0002, "loss": 0.3268, "step": 241000 }, { "epoch": 18.79, "grad_norm": 1.1620702743530273, "learning_rate": 0.0002, "loss": 0.3341, "step": 242000 }, { "epoch": 18.86, "grad_norm": 1.0640859603881836, "learning_rate": 0.0002, "loss": 0.3346, "step": 243000 }, { "epoch": 18.94, "grad_norm": 1.096739649772644, "learning_rate": 0.0002, "loss": 0.3334, "step": 244000 }, { "epoch": 19.02, "grad_norm": 1.23462975025177, "learning_rate": 0.0002, "loss": 0.3273, "step": 245000 }, { "epoch": 19.1, "grad_norm": 0.9026219248771667, "learning_rate": 0.0002, "loss": 0.3003, "step": 246000 }, { "epoch": 19.17, "grad_norm": 1.0630195140838623, "learning_rate": 0.0002, "loss": 0.3042, "step": 247000 }, { "epoch": 19.25, "grad_norm": 1.0800952911376953, "learning_rate": 0.0002, "loss": 0.306, "step": 248000 }, { "epoch": 19.33, "grad_norm": 1.2505557537078857, "learning_rate": 0.0002, "loss": 0.311, "step": 249000 }, { "epoch": 19.41, "grad_norm": 0.822533369064331, "learning_rate": 0.0002, "loss": 0.3124, "step": 250000 }, { "epoch": 19.41, "eval_bleu": 64.5641, "eval_gen_len": 28.6745, "eval_loss": 0.4947471022605896, "eval_runtime": 99.0725, "eval_samples_per_second": 8.963, "eval_steps_per_second": 1.12, "step": 250000 }, { "epoch": 19.48, "grad_norm": 1.2964988946914673, "learning_rate": 0.0002, "loss": 0.312, "step": 251000 }, { "epoch": 19.56, "grad_norm": 1.0119915008544922, "learning_rate": 0.0002, "loss": 0.3151, "step": 252000 }, { "epoch": 19.64, "grad_norm": 1.2384612560272217, "learning_rate": 0.0002, "loss": 0.3171, "step": 253000 }, { "epoch": 19.72, "grad_norm": 1.7689512968063354, "learning_rate": 0.0002, "loss": 0.3177, "step": 254000 }, { "epoch": 19.8, "grad_norm": 1.3058741092681885, "learning_rate": 0.0002, "loss": 0.3195, "step": 255000 }, { "epoch": 19.87, "grad_norm": 1.2293740510940552, "learning_rate": 0.0002, "loss": 0.3226, "step": 256000 }, { "epoch": 19.95, "grad_norm": 1.398077368736267, "learning_rate": 0.0002, "loss": 0.3222, "step": 257000 }, { "epoch": 20.03, "grad_norm": 0.9053579568862915, "learning_rate": 0.0002, "loss": 0.3122, "step": 258000 }, { "epoch": 20.11, "grad_norm": 1.2684714794158936, "learning_rate": 0.0002, "loss": 0.2891, "step": 259000 }, { "epoch": 20.18, "grad_norm": 0.9774475693702698, "learning_rate": 0.0002, "loss": 0.2933, "step": 260000 }, { "epoch": 20.18, "eval_bleu": 65.1364, "eval_gen_len": 28.6419, "eval_loss": 0.49722200632095337, "eval_runtime": 99.0096, "eval_samples_per_second": 8.969, "eval_steps_per_second": 1.121, "step": 260000 }, { "epoch": 20.26, "grad_norm": 1.2156912088394165, "learning_rate": 0.0002, "loss": 0.296, "step": 261000 }, { "epoch": 20.34, "grad_norm": 1.221637487411499, "learning_rate": 0.0002, "loss": 0.2981, "step": 262000 }, { "epoch": 20.42, "grad_norm": 0.9357077479362488, "learning_rate": 0.0002, "loss": 0.3018, "step": 263000 }, { "epoch": 20.49, "grad_norm": 0.9926024079322815, "learning_rate": 0.0002, "loss": 0.3012, "step": 264000 }, { "epoch": 20.57, "grad_norm": 1.6473757028579712, "learning_rate": 0.0002, "loss": 0.3049, "step": 265000 }, { "epoch": 20.65, "grad_norm": 1.1541528701782227, "learning_rate": 0.0002, "loss": 0.3039, "step": 266000 }, { "epoch": 20.73, "grad_norm": 1.220951795578003, "learning_rate": 0.0002, "loss": 0.3074, "step": 267000 }, { "epoch": 20.8, "grad_norm": 1.074318289756775, "learning_rate": 0.0002, "loss": 0.3119, "step": 268000 }, { "epoch": 20.88, "grad_norm": 1.015864372253418, "learning_rate": 0.0002, "loss": 0.3116, "step": 269000 }, { "epoch": 20.96, "grad_norm": 1.0277948379516602, "learning_rate": 0.0002, "loss": 0.3101, "step": 270000 }, { "epoch": 20.96, "eval_bleu": 64.6747, "eval_gen_len": 28.6802, "eval_loss": 0.4901565611362457, "eval_runtime": 100.085, "eval_samples_per_second": 8.872, "eval_steps_per_second": 1.109, "step": 270000 }, { "epoch": 21.04, "grad_norm": 0.9389250874519348, "learning_rate": 0.0002, "loss": 0.2942, "step": 271000 }, { "epoch": 21.11, "grad_norm": 1.2478715181350708, "learning_rate": 0.0002, "loss": 0.2806, "step": 272000 }, { "epoch": 21.19, "grad_norm": 0.9297951459884644, "learning_rate": 0.0002, "loss": 0.2833, "step": 273000 }, { "epoch": 21.27, "grad_norm": 0.9602841734886169, "learning_rate": 0.0002, "loss": 0.2876, "step": 274000 }, { "epoch": 21.35, "grad_norm": 0.9561505317687988, "learning_rate": 0.0002, "loss": 0.2898, "step": 275000 }, { "epoch": 21.43, "grad_norm": 1.0724116563796997, "learning_rate": 0.0002, "loss": 0.2897, "step": 276000 }, { "epoch": 21.5, "grad_norm": 0.9960470199584961, "learning_rate": 0.0002, "loss": 0.2942, "step": 277000 }, { "epoch": 21.58, "grad_norm": 1.1480662822723389, "learning_rate": 0.0002, "loss": 0.2915, "step": 278000 }, { "epoch": 21.66, "grad_norm": 1.1175373792648315, "learning_rate": 0.0002, "loss": 0.2991, "step": 279000 }, { "epoch": 21.74, "grad_norm": 1.6251972913742065, "learning_rate": 0.0002, "loss": 0.2991, "step": 280000 }, { "epoch": 21.74, "eval_bleu": 64.9732, "eval_gen_len": 28.5653, "eval_loss": 0.4907251298427582, "eval_runtime": 98.6604, "eval_samples_per_second": 9.001, "eval_steps_per_second": 1.125, "step": 280000 }, { "epoch": 21.81, "grad_norm": 0.8608353137969971, "learning_rate": 0.0002, "loss": 0.2994, "step": 281000 }, { "epoch": 21.89, "grad_norm": 1.077614426612854, "learning_rate": 0.0002, "loss": 0.3012, "step": 282000 }, { "epoch": 21.97, "grad_norm": 1.6897170543670654, "learning_rate": 0.0002, "loss": 0.3012, "step": 283000 }, { "epoch": 22.05, "grad_norm": 1.3074902296066284, "learning_rate": 0.0002, "loss": 0.2814, "step": 284000 }, { "epoch": 22.12, "grad_norm": 0.9641602039337158, "learning_rate": 0.0002, "loss": 0.2717, "step": 285000 }, { "epoch": 22.2, "grad_norm": 1.634346842765808, "learning_rate": 0.0002, "loss": 0.2747, "step": 286000 }, { "epoch": 22.28, "grad_norm": 1.4014965295791626, "learning_rate": 0.0002, "loss": 0.2783, "step": 287000 }, { "epoch": 22.36, "grad_norm": 1.3981196880340576, "learning_rate": 0.0002, "loss": 0.2831, "step": 288000 }, { "epoch": 22.43, "grad_norm": 0.8403178453445435, "learning_rate": 0.0002, "loss": 0.2801, "step": 289000 }, { "epoch": 22.51, "grad_norm": 0.879589319229126, "learning_rate": 0.0002, "loss": 0.2828, "step": 290000 }, { "epoch": 22.51, "eval_bleu": 64.7552, "eval_gen_len": 28.6261, "eval_loss": 0.5037782192230225, "eval_runtime": 98.7235, "eval_samples_per_second": 8.995, "eval_steps_per_second": 1.124, "step": 290000 }, { "epoch": 22.59, "grad_norm": 1.751582145690918, "learning_rate": 0.0002, "loss": 0.2846, "step": 291000 }, { "epoch": 22.67, "grad_norm": 1.2374579906463623, "learning_rate": 0.0002, "loss": 0.2855, "step": 292000 }, { "epoch": 22.74, "grad_norm": 1.152079463005066, "learning_rate": 0.0002, "loss": 0.2916, "step": 293000 }, { "epoch": 22.82, "grad_norm": 1.2837114334106445, "learning_rate": 0.0002, "loss": 0.2889, "step": 294000 }, { "epoch": 22.9, "grad_norm": 1.161375880241394, "learning_rate": 0.0002, "loss": 0.2894, "step": 295000 }, { "epoch": 22.98, "grad_norm": 0.8594853281974792, "learning_rate": 0.0002, "loss": 0.2936, "step": 296000 }, { "epoch": 23.06, "grad_norm": 0.9489020705223083, "learning_rate": 0.0002, "loss": 0.2733, "step": 297000 }, { "epoch": 23.13, "grad_norm": 1.2100919485092163, "learning_rate": 0.0002, "loss": 0.263, "step": 298000 }, { "epoch": 23.21, "grad_norm": 2.140540361404419, "learning_rate": 0.0002, "loss": 0.2666, "step": 299000 }, { "epoch": 23.29, "grad_norm": 1.070940375328064, "learning_rate": 0.0002, "loss": 0.2688, "step": 300000 }, { "epoch": 23.29, "eval_bleu": 65.0702, "eval_gen_len": 28.7534, "eval_loss": 0.5042341351509094, "eval_runtime": 99.9941, "eval_samples_per_second": 8.881, "eval_steps_per_second": 1.11, "step": 300000 }, { "epoch": 23.37, "grad_norm": 1.0847973823547363, "learning_rate": 0.0002, "loss": 0.2692, "step": 301000 }, { "epoch": 23.44, "grad_norm": 1.098399043083191, "learning_rate": 0.0002, "loss": 0.2721, "step": 302000 }, { "epoch": 23.52, "grad_norm": 0.9735555648803711, "learning_rate": 0.0002, "loss": 0.2725, "step": 303000 }, { "epoch": 23.6, "grad_norm": 1.2928968667984009, "learning_rate": 0.0002, "loss": 0.2772, "step": 304000 }, { "epoch": 23.68, "grad_norm": 1.1871669292449951, "learning_rate": 0.0002, "loss": 0.2766, "step": 305000 }, { "epoch": 23.75, "grad_norm": 0.9379162788391113, "learning_rate": 0.0002, "loss": 0.2797, "step": 306000 }, { "epoch": 23.83, "grad_norm": 0.8844149112701416, "learning_rate": 0.0002, "loss": 0.2813, "step": 307000 }, { "epoch": 23.91, "grad_norm": 1.0218191146850586, "learning_rate": 0.0002, "loss": 0.2834, "step": 308000 }, { "epoch": 23.99, "grad_norm": 1.234649896621704, "learning_rate": 0.0002, "loss": 0.2827, "step": 309000 }, { "epoch": 24.06, "grad_norm": 0.8998326659202576, "learning_rate": 0.0002, "loss": 0.2555, "step": 310000 }, { "epoch": 24.06, "eval_bleu": 65.0378, "eval_gen_len": 29.089, "eval_loss": 0.5101344585418701, "eval_runtime": 101.77, "eval_samples_per_second": 8.726, "eval_steps_per_second": 1.091, "step": 310000 }, { "epoch": 24.14, "grad_norm": 0.9993298053741455, "learning_rate": 0.0002, "loss": 0.2575, "step": 311000 }, { "epoch": 24.22, "grad_norm": 1.078316569328308, "learning_rate": 0.0002, "loss": 0.2577, "step": 312000 }, { "epoch": 24.3, "grad_norm": 1.0775636434555054, "learning_rate": 0.0002, "loss": 0.2603, "step": 313000 }, { "epoch": 24.38, "grad_norm": 1.0711839199066162, "learning_rate": 0.0002, "loss": 0.2641, "step": 314000 }, { "epoch": 24.45, "grad_norm": 1.1953543424606323, "learning_rate": 0.0002, "loss": 0.2666, "step": 315000 }, { "epoch": 24.53, "grad_norm": 0.7338001132011414, "learning_rate": 0.0002, "loss": 0.2662, "step": 316000 }, { "epoch": 24.61, "grad_norm": 1.651564121246338, "learning_rate": 0.0002, "loss": 0.2658, "step": 317000 }, { "epoch": 24.69, "grad_norm": 0.8356152176856995, "learning_rate": 0.0002, "loss": 0.2706, "step": 318000 }, { "epoch": 24.76, "grad_norm": 0.8503906726837158, "learning_rate": 0.0002, "loss": 0.2715, "step": 319000 }, { "epoch": 24.84, "grad_norm": 0.9122622609138489, "learning_rate": 0.0002, "loss": 0.2692, "step": 320000 }, { "epoch": 24.84, "eval_bleu": 64.9991, "eval_gen_len": 28.6937, "eval_loss": 0.5021673440933228, "eval_runtime": 99.2776, "eval_samples_per_second": 8.945, "eval_steps_per_second": 1.118, "step": 320000 }, { "epoch": 24.92, "grad_norm": 1.0263617038726807, "learning_rate": 0.0002, "loss": 0.2725, "step": 321000 }, { "epoch": 25.0, "grad_norm": 1.140886902809143, "learning_rate": 0.0002, "loss": 0.2748, "step": 322000 }, { "epoch": 25.07, "grad_norm": 0.9275480508804321, "learning_rate": 0.0002, "loss": 0.2467, "step": 323000 }, { "epoch": 25.15, "grad_norm": 1.170021653175354, "learning_rate": 0.0002, "loss": 0.2465, "step": 324000 }, { "epoch": 25.23, "grad_norm": 1.1251965761184692, "learning_rate": 0.0002, "loss": 0.2492, "step": 325000 }, { "epoch": 25.31, "grad_norm": 1.0885039567947388, "learning_rate": 0.0002, "loss": 0.2518, "step": 326000 }, { "epoch": 25.38, "grad_norm": 1.2162927389144897, "learning_rate": 0.0002, "loss": 0.2535, "step": 327000 }, { "epoch": 25.46, "grad_norm": 1.0869230031967163, "learning_rate": 0.0002, "loss": 0.2534, "step": 328000 }, { "epoch": 25.54, "grad_norm": 0.9775025248527527, "learning_rate": 0.0002, "loss": 0.2586, "step": 329000 }, { "epoch": 25.62, "grad_norm": 0.8145058155059814, "learning_rate": 0.0002, "loss": 0.2593, "step": 330000 }, { "epoch": 25.62, "eval_bleu": 65.2478, "eval_gen_len": 28.6137, "eval_loss": 0.508499026298523, "eval_runtime": 98.6428, "eval_samples_per_second": 9.002, "eval_steps_per_second": 1.125, "step": 330000 }, { "epoch": 25.69, "grad_norm": 1.270075798034668, "learning_rate": 0.0002, "loss": 0.2613, "step": 331000 }, { "epoch": 25.77, "grad_norm": 1.431252121925354, "learning_rate": 0.0002, "loss": 0.2628, "step": 332000 }, { "epoch": 25.85, "grad_norm": 1.3506394624710083, "learning_rate": 0.0002, "loss": 0.2651, "step": 333000 }, { "epoch": 25.93, "grad_norm": 1.0612725019454956, "learning_rate": 0.0002, "loss": 0.2628, "step": 334000 }, { "epoch": 26.01, "grad_norm": 0.8760356307029724, "learning_rate": 0.0002, "loss": 0.2654, "step": 335000 }, { "epoch": 26.08, "grad_norm": 1.0780360698699951, "learning_rate": 0.0002, "loss": 0.2363, "step": 336000 }, { "epoch": 26.16, "grad_norm": 1.4022656679153442, "learning_rate": 0.0002, "loss": 0.2404, "step": 337000 }, { "epoch": 26.24, "grad_norm": 1.1530039310455322, "learning_rate": 0.0002, "loss": 0.2416, "step": 338000 }, { "epoch": 26.32, "grad_norm": 1.028208613395691, "learning_rate": 0.0002, "loss": 0.2453, "step": 339000 }, { "epoch": 26.39, "grad_norm": 0.8168412446975708, "learning_rate": 0.0002, "loss": 0.2439, "step": 340000 }, { "epoch": 26.39, "eval_bleu": 64.863, "eval_gen_len": 28.6464, "eval_loss": 0.5152307748794556, "eval_runtime": 99.207, "eval_samples_per_second": 8.951, "eval_steps_per_second": 1.119, "step": 340000 }, { "epoch": 26.47, "grad_norm": 1.415486216545105, "learning_rate": 0.0002, "loss": 0.2481, "step": 341000 }, { "epoch": 26.55, "grad_norm": 1.016444444656372, "learning_rate": 0.0002, "loss": 0.2515, "step": 342000 }, { "epoch": 26.63, "grad_norm": 1.0151183605194092, "learning_rate": 0.0002, "loss": 0.2519, "step": 343000 }, { "epoch": 26.7, "grad_norm": 0.8286064267158508, "learning_rate": 0.0002, "loss": 0.2528, "step": 344000 }, { "epoch": 26.78, "grad_norm": 1.0916731357574463, "learning_rate": 0.0002, "loss": 0.2529, "step": 345000 }, { "epoch": 26.86, "grad_norm": 1.0001248121261597, "learning_rate": 0.0002, "loss": 0.256, "step": 346000 }, { "epoch": 26.94, "grad_norm": 0.8120971322059631, "learning_rate": 0.0002, "loss": 0.2575, "step": 347000 }, { "epoch": 27.01, "grad_norm": 0.9800658822059631, "learning_rate": 0.0002, "loss": 0.2522, "step": 348000 }, { "epoch": 27.09, "grad_norm": 1.0135070085525513, "learning_rate": 0.0002, "loss": 0.2298, "step": 349000 }, { "epoch": 27.17, "grad_norm": 1.1721863746643066, "learning_rate": 0.0002, "loss": 0.2327, "step": 350000 }, { "epoch": 27.17, "eval_bleu": 65.0748, "eval_gen_len": 28.7286, "eval_loss": 0.5164603590965271, "eval_runtime": 99.7391, "eval_samples_per_second": 8.903, "eval_steps_per_second": 1.113, "step": 350000 }, { "epoch": 27.25, "grad_norm": 0.7238809466362, "learning_rate": 0.0002, "loss": 0.2337, "step": 351000 }, { "epoch": 27.32, "grad_norm": 0.8267261385917664, "learning_rate": 0.0002, "loss": 0.2357, "step": 352000 }, { "epoch": 27.4, "grad_norm": 1.0274128913879395, "learning_rate": 0.0002, "loss": 0.2398, "step": 353000 }, { "epoch": 27.48, "grad_norm": 0.9916879534721375, "learning_rate": 0.0002, "loss": 0.2401, "step": 354000 }, { "epoch": 27.56, "grad_norm": 1.095639944076538, "learning_rate": 0.0002, "loss": 0.2428, "step": 355000 }, { "epoch": 27.64, "grad_norm": 0.8598717451095581, "learning_rate": 0.0002, "loss": 0.2432, "step": 356000 }, { "epoch": 27.71, "grad_norm": 0.8891191482543945, "learning_rate": 0.0002, "loss": 0.2431, "step": 357000 }, { "epoch": 27.79, "grad_norm": 0.9431182146072388, "learning_rate": 0.0002, "loss": 0.2468, "step": 358000 }, { "epoch": 27.87, "grad_norm": 1.3781706094741821, "learning_rate": 0.0002, "loss": 0.2498, "step": 359000 }, { "epoch": 27.95, "grad_norm": 0.9336220622062683, "learning_rate": 0.0002, "loss": 0.249, "step": 360000 }, { "epoch": 27.95, "eval_bleu": 64.7249, "eval_gen_len": 28.6137, "eval_loss": 0.5116418600082397, "eval_runtime": 98.8178, "eval_samples_per_second": 8.986, "eval_steps_per_second": 1.123, "step": 360000 }, { "epoch": 28.02, "grad_norm": 1.2862168550491333, "learning_rate": 0.0002, "loss": 0.2416, "step": 361000 }, { "epoch": 28.1, "grad_norm": 0.8687452077865601, "learning_rate": 0.0002, "loss": 0.2204, "step": 362000 }, { "epoch": 28.18, "grad_norm": 1.8673216104507446, "learning_rate": 0.0002, "loss": 0.2254, "step": 363000 }, { "epoch": 28.26, "grad_norm": 0.9244999885559082, "learning_rate": 0.0002, "loss": 0.227, "step": 364000 }, { "epoch": 28.33, "grad_norm": 0.7414880990982056, "learning_rate": 0.0002, "loss": 0.2302, "step": 365000 }, { "epoch": 28.41, "grad_norm": 1.0677781105041504, "learning_rate": 0.0002, "loss": 0.2311, "step": 366000 }, { "epoch": 28.49, "grad_norm": 1.0712281465530396, "learning_rate": 0.0002, "loss": 0.2354, "step": 367000 }, { "epoch": 28.57, "grad_norm": 1.0177695751190186, "learning_rate": 0.0002, "loss": 0.2349, "step": 368000 }, { "epoch": 28.64, "grad_norm": 1.2082629203796387, "learning_rate": 0.0002, "loss": 0.2343, "step": 369000 }, { "epoch": 28.72, "grad_norm": 0.9800160527229309, "learning_rate": 0.0002, "loss": 0.238, "step": 370000 }, { "epoch": 28.72, "eval_bleu": 64.7651, "eval_gen_len": 28.5968, "eval_loss": 0.5202394723892212, "eval_runtime": 98.8503, "eval_samples_per_second": 8.983, "eval_steps_per_second": 1.123, "step": 370000 }, { "epoch": 28.8, "grad_norm": 1.4668409824371338, "learning_rate": 0.0002, "loss": 0.2417, "step": 371000 }, { "epoch": 28.88, "grad_norm": 0.9679712653160095, "learning_rate": 0.0002, "loss": 0.2397, "step": 372000 }, { "epoch": 28.96, "grad_norm": 1.0757184028625488, "learning_rate": 0.0002, "loss": 0.2419, "step": 373000 }, { "epoch": 29.03, "grad_norm": 1.3961704969406128, "learning_rate": 0.0002, "loss": 0.2326, "step": 374000 }, { "epoch": 29.11, "grad_norm": 1.4827901124954224, "learning_rate": 0.0002, "loss": 0.2159, "step": 375000 }, { "epoch": 29.19, "grad_norm": 1.065645456314087, "learning_rate": 0.0002, "loss": 0.2195, "step": 376000 }, { "epoch": 29.27, "grad_norm": 0.8756958842277527, "learning_rate": 0.0002, "loss": 0.2229, "step": 377000 }, { "epoch": 29.34, "grad_norm": 1.2630327939987183, "learning_rate": 0.0002, "loss": 0.2251, "step": 378000 }, { "epoch": 29.42, "grad_norm": 0.9434683322906494, "learning_rate": 0.0002, "loss": 0.2267, "step": 379000 }, { "epoch": 29.5, "grad_norm": 0.8589434623718262, "learning_rate": 0.0002, "loss": 0.2297, "step": 380000 }, { "epoch": 29.5, "eval_bleu": 65.3334, "eval_gen_len": 28.7005, "eval_loss": 0.5242559909820557, "eval_runtime": 99.2088, "eval_samples_per_second": 8.951, "eval_steps_per_second": 1.119, "step": 380000 }, { "epoch": 29.58, "grad_norm": 1.0252753496170044, "learning_rate": 0.0002, "loss": 0.229, "step": 381000 }, { "epoch": 29.65, "grad_norm": 1.4881134033203125, "learning_rate": 0.0002, "loss": 0.2301, "step": 382000 }, { "epoch": 29.73, "grad_norm": 1.0281462669372559, "learning_rate": 0.0002, "loss": 0.2307, "step": 383000 }, { "epoch": 29.81, "grad_norm": 1.1244617700576782, "learning_rate": 0.0002, "loss": 0.2335, "step": 384000 }, { "epoch": 29.89, "grad_norm": 1.1461416482925415, "learning_rate": 0.0002, "loss": 0.2355, "step": 385000 }, { "epoch": 29.96, "grad_norm": 1.742311716079712, "learning_rate": 0.0002, "loss": 0.2341, "step": 386000 }, { "epoch": 30.04, "grad_norm": 0.8539097309112549, "learning_rate": 0.0002, "loss": 0.2196, "step": 387000 }, { "epoch": 30.12, "grad_norm": 0.9865394830703735, "learning_rate": 0.0002, "loss": 0.2118, "step": 388000 }, { "epoch": 30.2, "grad_norm": 1.2487947940826416, "learning_rate": 0.0002, "loss": 0.2111, "step": 389000 }, { "epoch": 30.27, "grad_norm": 0.9401417970657349, "learning_rate": 0.0002, "loss": 0.2152, "step": 390000 }, { "epoch": 30.27, "eval_bleu": 64.9364, "eval_gen_len": 28.6081, "eval_loss": 0.533649206161499, "eval_runtime": 99.1874, "eval_samples_per_second": 8.953, "eval_steps_per_second": 1.119, "step": 390000 }, { "epoch": 30.35, "grad_norm": 1.5141676664352417, "learning_rate": 0.0002, "loss": 0.2185, "step": 391000 }, { "epoch": 30.43, "grad_norm": 1.4947956800460815, "learning_rate": 0.0002, "loss": 0.221, "step": 392000 }, { "epoch": 30.51, "grad_norm": 0.8870178461074829, "learning_rate": 0.0002, "loss": 0.2221, "step": 393000 }, { "epoch": 30.59, "grad_norm": 1.013377070426941, "learning_rate": 0.0002, "loss": 0.2223, "step": 394000 }, { "epoch": 30.66, "grad_norm": 1.2745546102523804, "learning_rate": 0.0002, "loss": 0.2242, "step": 395000 }, { "epoch": 30.74, "grad_norm": 1.3159047365188599, "learning_rate": 0.0002, "loss": 0.2286, "step": 396000 }, { "epoch": 30.82, "grad_norm": 0.8441556096076965, "learning_rate": 0.0002, "loss": 0.2269, "step": 397000 }, { "epoch": 30.9, "grad_norm": 1.0391247272491455, "learning_rate": 0.0002, "loss": 0.2297, "step": 398000 }, { "epoch": 30.97, "grad_norm": 1.0133869647979736, "learning_rate": 0.0002, "loss": 0.2277, "step": 399000 }, { "epoch": 31.05, "grad_norm": 1.5093469619750977, "learning_rate": 0.0002, "loss": 0.2106, "step": 400000 }, { "epoch": 31.05, "eval_bleu": 65.117, "eval_gen_len": 28.6745, "eval_loss": 0.540839433670044, "eval_runtime": 98.9473, "eval_samples_per_second": 8.974, "eval_steps_per_second": 1.122, "step": 400000 }, { "epoch": 31.13, "grad_norm": 1.0606015920639038, "learning_rate": 0.0002, "loss": 0.2031, "step": 401000 }, { "epoch": 31.21, "grad_norm": 1.4048112630844116, "learning_rate": 0.0002, "loss": 0.2043, "step": 402000 }, { "epoch": 31.28, "grad_norm": 1.1232408285140991, "learning_rate": 0.0002, "loss": 0.211, "step": 403000 }, { "epoch": 31.36, "grad_norm": 1.2367199659347534, "learning_rate": 0.0002, "loss": 0.2107, "step": 404000 }, { "epoch": 31.44, "grad_norm": 1.1147772073745728, "learning_rate": 0.0002, "loss": 0.2116, "step": 405000 }, { "epoch": 31.52, "grad_norm": 0.9711781740188599, "learning_rate": 0.0002, "loss": 0.2147, "step": 406000 }, { "epoch": 31.59, "grad_norm": 1.4205774068832397, "learning_rate": 0.0002, "loss": 0.2158, "step": 407000 }, { "epoch": 31.67, "grad_norm": 1.303250789642334, "learning_rate": 0.0002, "loss": 0.2187, "step": 408000 }, { "epoch": 31.75, "grad_norm": 2.3327102661132812, "learning_rate": 0.0002, "loss": 0.2196, "step": 409000 }, { "epoch": 31.83, "grad_norm": 1.9003146886825562, "learning_rate": 0.0002, "loss": 0.2234, "step": 410000 }, { "epoch": 31.83, "eval_bleu": 64.8926, "eval_gen_len": 28.6318, "eval_loss": 0.5249429941177368, "eval_runtime": 98.7286, "eval_samples_per_second": 8.994, "eval_steps_per_second": 1.124, "step": 410000 }, { "epoch": 31.9, "grad_norm": 0.9950889348983765, "learning_rate": 0.0002, "loss": 0.2232, "step": 411000 }, { "epoch": 31.98, "grad_norm": 0.8693845272064209, "learning_rate": 0.0002, "loss": 0.2236, "step": 412000 }, { "epoch": 32.06, "grad_norm": 0.9227551817893982, "learning_rate": 0.0002, "loss": 0.204, "step": 413000 }, { "epoch": 32.14, "grad_norm": 1.0269570350646973, "learning_rate": 0.0002, "loss": 0.2019, "step": 414000 }, { "epoch": 32.22, "grad_norm": 1.0199569463729858, "learning_rate": 0.0002, "loss": 0.2015, "step": 415000 }, { "epoch": 32.29, "grad_norm": 1.4488086700439453, "learning_rate": 0.0002, "loss": 0.2036, "step": 416000 }, { "epoch": 32.37, "grad_norm": 0.8843773007392883, "learning_rate": 0.0002, "loss": 0.2049, "step": 417000 }, { "epoch": 32.45, "grad_norm": 1.3630881309509277, "learning_rate": 0.0002, "loss": 0.2085, "step": 418000 }, { "epoch": 32.53, "grad_norm": 0.9767336845397949, "learning_rate": 0.0002, "loss": 0.2097, "step": 419000 }, { "epoch": 32.6, "grad_norm": 0.9147652983665466, "learning_rate": 0.0002, "loss": 0.2085, "step": 420000 }, { "epoch": 32.6, "eval_bleu": 65.5715, "eval_gen_len": 28.7984, "eval_loss": 0.5305626392364502, "eval_runtime": 100.0595, "eval_samples_per_second": 8.875, "eval_steps_per_second": 1.109, "step": 420000 }, { "epoch": 32.68, "grad_norm": 1.4235540628433228, "learning_rate": 0.0002, "loss": 0.212, "step": 421000 }, { "epoch": 32.76, "grad_norm": 0.9653807282447815, "learning_rate": 0.0002, "loss": 0.2129, "step": 422000 }, { "epoch": 32.84, "grad_norm": 1.0437246561050415, "learning_rate": 0.0002, "loss": 0.2153, "step": 423000 }, { "epoch": 32.91, "grad_norm": 1.0093231201171875, "learning_rate": 0.0002, "loss": 0.2146, "step": 424000 }, { "epoch": 32.99, "grad_norm": 0.9372303485870361, "learning_rate": 0.0002, "loss": 0.2176, "step": 425000 }, { "epoch": 33.07, "grad_norm": 0.990990161895752, "learning_rate": 0.0002, "loss": 0.1946, "step": 426000 }, { "epoch": 33.15, "grad_norm": 1.221752405166626, "learning_rate": 0.0002, "loss": 0.1937, "step": 427000 }, { "epoch": 33.22, "grad_norm": 1.0376135110855103, "learning_rate": 0.0002, "loss": 0.1971, "step": 428000 }, { "epoch": 33.3, "grad_norm": 1.2878087759017944, "learning_rate": 0.0002, "loss": 0.1993, "step": 429000 }, { "epoch": 33.38, "grad_norm": 1.702043890953064, "learning_rate": 0.0002, "loss": 0.2018, "step": 430000 }, { "epoch": 33.38, "eval_bleu": 64.9154, "eval_gen_len": 28.6351, "eval_loss": 0.5428734421730042, "eval_runtime": 99.9626, "eval_samples_per_second": 8.883, "eval_steps_per_second": 1.11, "step": 430000 }, { "epoch": 33.46, "grad_norm": 1.171934723854065, "learning_rate": 0.0002, "loss": 0.2042, "step": 431000 }, { "epoch": 33.54, "grad_norm": 0.9023895859718323, "learning_rate": 0.0002, "loss": 0.2032, "step": 432000 }, { "epoch": 33.61, "grad_norm": 1.5410844087600708, "learning_rate": 0.0002, "loss": 0.204, "step": 433000 }, { "epoch": 33.69, "grad_norm": 1.297434687614441, "learning_rate": 0.0002, "loss": 0.2057, "step": 434000 }, { "epoch": 33.77, "grad_norm": 1.636635422706604, "learning_rate": 0.0002, "loss": 0.2085, "step": 435000 }, { "epoch": 33.85, "grad_norm": 1.3059121370315552, "learning_rate": 0.0002, "loss": 0.2099, "step": 436000 }, { "epoch": 33.92, "grad_norm": 1.1616836786270142, "learning_rate": 0.0002, "loss": 0.2098, "step": 437000 }, { "epoch": 34.0, "grad_norm": 0.9708386063575745, "learning_rate": 0.0002, "loss": 0.2103, "step": 438000 }, { "epoch": 34.08, "grad_norm": 1.1958973407745361, "learning_rate": 0.0002, "loss": 0.1868, "step": 439000 }, { "epoch": 34.16, "grad_norm": 0.9669882655143738, "learning_rate": 0.0002, "loss": 0.1885, "step": 440000 }, { "epoch": 34.16, "eval_bleu": 65.0538, "eval_gen_len": 28.8525, "eval_loss": 0.5453199148178101, "eval_runtime": 98.9637, "eval_samples_per_second": 8.973, "eval_steps_per_second": 1.122, "step": 440000 }, { "epoch": 34.23, "grad_norm": 1.3960009813308716, "learning_rate": 0.0002, "loss": 0.192, "step": 441000 }, { "epoch": 34.31, "grad_norm": 1.1039202213287354, "learning_rate": 0.0002, "loss": 0.1928, "step": 442000 }, { "epoch": 34.39, "grad_norm": 1.4681973457336426, "learning_rate": 0.0002, "loss": 0.1963, "step": 443000 }, { "epoch": 34.47, "grad_norm": 1.1876535415649414, "learning_rate": 0.0002, "loss": 0.1955, "step": 444000 }, { "epoch": 34.54, "grad_norm": 1.0030099153518677, "learning_rate": 0.0002, "loss": 0.197, "step": 445000 }, { "epoch": 34.62, "grad_norm": 1.262609839439392, "learning_rate": 0.0002, "loss": 0.1965, "step": 446000 }, { "epoch": 34.7, "grad_norm": 4.133481979370117, "learning_rate": 0.0002, "loss": 0.2009, "step": 447000 }, { "epoch": 34.78, "grad_norm": 1.3214054107666016, "learning_rate": 0.0002, "loss": 0.2011, "step": 448000 }, { "epoch": 34.85, "grad_norm": 1.061333417892456, "learning_rate": 0.0002, "loss": 0.205, "step": 449000 }, { "epoch": 34.93, "grad_norm": 1.487025260925293, "learning_rate": 0.0002, "loss": 0.2049, "step": 450000 }, { "epoch": 34.93, "eval_bleu": 65.2857, "eval_gen_len": 28.7207, "eval_loss": 0.5434128046035767, "eval_runtime": 98.7363, "eval_samples_per_second": 8.994, "eval_steps_per_second": 1.124, "step": 450000 }, { "epoch": 35.01, "grad_norm": 1.3061411380767822, "learning_rate": 0.0002, "loss": 0.2042, "step": 451000 }, { "epoch": 35.09, "grad_norm": 0.9900358319282532, "learning_rate": 0.0002, "loss": 0.1803, "step": 452000 }, { "epoch": 35.17, "grad_norm": 1.2118251323699951, "learning_rate": 0.0002, "loss": 0.183, "step": 453000 }, { "epoch": 35.24, "grad_norm": 1.1625529527664185, "learning_rate": 0.0002, "loss": 0.1879, "step": 454000 }, { "epoch": 35.32, "grad_norm": 1.0669846534729004, "learning_rate": 0.0002, "loss": 0.1888, "step": 455000 }, { "epoch": 35.4, "grad_norm": 1.285409688949585, "learning_rate": 0.0002, "loss": 0.1908, "step": 456000 }, { "epoch": 35.48, "grad_norm": 1.292738914489746, "learning_rate": 0.0002, "loss": 0.1944, "step": 457000 }, { "epoch": 35.55, "grad_norm": 0.9169420599937439, "learning_rate": 0.0002, "loss": 0.1924, "step": 458000 }, { "epoch": 35.63, "grad_norm": 1.1117466688156128, "learning_rate": 0.0002, "loss": 0.1944, "step": 459000 }, { "epoch": 35.71, "grad_norm": 1.400664210319519, "learning_rate": 0.0002, "loss": 0.1957, "step": 460000 }, { "epoch": 35.71, "eval_bleu": 65.3436, "eval_gen_len": 28.714, "eval_loss": 0.549137532711029, "eval_runtime": 99.1079, "eval_samples_per_second": 8.96, "eval_steps_per_second": 1.12, "step": 460000 }, { "epoch": 35.79, "grad_norm": 1.1465002298355103, "learning_rate": 0.0002, "loss": 0.1974, "step": 461000 }, { "epoch": 35.86, "grad_norm": 0.9425164461135864, "learning_rate": 0.0002, "loss": 0.1967, "step": 462000 }, { "epoch": 35.94, "grad_norm": 1.0649182796478271, "learning_rate": 0.0002, "loss": 0.1974, "step": 463000 }, { "epoch": 36.02, "grad_norm": 0.9610468149185181, "learning_rate": 0.0002, "loss": 0.1943, "step": 464000 }, { "epoch": 36.1, "grad_norm": 1.0697602033615112, "learning_rate": 0.0002, "loss": 0.1785, "step": 465000 }, { "epoch": 36.17, "grad_norm": 0.8167102336883545, "learning_rate": 0.0002, "loss": 0.181, "step": 466000 }, { "epoch": 36.25, "grad_norm": 1.155148983001709, "learning_rate": 0.0002, "loss": 0.1804, "step": 467000 }, { "epoch": 36.33, "grad_norm": 1.036157250404358, "learning_rate": 0.0002, "loss": 0.1811, "step": 468000 }, { "epoch": 36.41, "grad_norm": 0.9966660141944885, "learning_rate": 0.0002, "loss": 0.1825, "step": 469000 }, { "epoch": 36.49, "grad_norm": 1.3554514646530151, "learning_rate": 0.0002, "loss": 0.1867, "step": 470000 }, { "epoch": 36.49, "eval_bleu": 65.4934, "eval_gen_len": 28.7939, "eval_loss": 0.5535929203033447, "eval_runtime": 99.8585, "eval_samples_per_second": 8.893, "eval_steps_per_second": 1.112, "step": 470000 }, { "epoch": 36.56, "grad_norm": 1.1400065422058105, "learning_rate": 0.0002, "loss": 0.1889, "step": 471000 }, { "epoch": 36.64, "grad_norm": 1.2936526536941528, "learning_rate": 0.0002, "loss": 0.1892, "step": 472000 }, { "epoch": 36.72, "grad_norm": 1.3375158309936523, "learning_rate": 0.0002, "loss": 0.1932, "step": 473000 }, { "epoch": 36.8, "grad_norm": 1.3976365327835083, "learning_rate": 0.0002, "loss": 0.1931, "step": 474000 }, { "epoch": 36.87, "grad_norm": 1.2075397968292236, "learning_rate": 0.0002, "loss": 0.1938, "step": 475000 }, { "epoch": 36.95, "grad_norm": 1.2333601713180542, "learning_rate": 0.0002, "loss": 0.1918, "step": 476000 }, { "epoch": 37.03, "grad_norm": 0.9724763631820679, "learning_rate": 0.0002, "loss": 0.1857, "step": 477000 }, { "epoch": 37.11, "grad_norm": 1.305141568183899, "learning_rate": 0.0002, "loss": 0.1741, "step": 478000 }, { "epoch": 37.18, "grad_norm": 1.2358112335205078, "learning_rate": 0.0002, "loss": 0.1727, "step": 479000 }, { "epoch": 37.26, "grad_norm": 1.040460228919983, "learning_rate": 0.0002, "loss": 0.1765, "step": 480000 }, { "epoch": 37.26, "eval_bleu": 65.5595, "eval_gen_len": 28.8255, "eval_loss": 0.5582976341247559, "eval_runtime": 99.6921, "eval_samples_per_second": 8.907, "eval_steps_per_second": 1.113, "step": 480000 }, { "epoch": 37.34, "grad_norm": 1.0049262046813965, "learning_rate": 0.0002, "loss": 0.1801, "step": 481000 }, { "epoch": 37.42, "grad_norm": 0.9716454148292542, "learning_rate": 0.0002, "loss": 0.1806, "step": 482000 }, { "epoch": 37.49, "grad_norm": 1.2684077024459839, "learning_rate": 0.0002, "loss": 0.1809, "step": 483000 }, { "epoch": 37.57, "grad_norm": 1.4772919416427612, "learning_rate": 0.0002, "loss": 0.1798, "step": 484000 }, { "epoch": 37.65, "grad_norm": 0.8240026831626892, "learning_rate": 0.0002, "loss": 0.1849, "step": 485000 }, { "epoch": 37.73, "grad_norm": 1.2247587442398071, "learning_rate": 0.0002, "loss": 0.1872, "step": 486000 }, { "epoch": 37.8, "grad_norm": 1.4645825624465942, "learning_rate": 0.0002, "loss": 0.1889, "step": 487000 }, { "epoch": 37.88, "grad_norm": 1.0552102327346802, "learning_rate": 0.0002, "loss": 0.1866, "step": 488000 }, { "epoch": 37.96, "grad_norm": 1.2899285554885864, "learning_rate": 0.0002, "loss": 0.1897, "step": 489000 }, { "epoch": 38.04, "grad_norm": 1.0461792945861816, "learning_rate": 0.0002, "loss": 0.1786, "step": 490000 }, { "epoch": 38.04, "eval_bleu": 65.6358, "eval_gen_len": 28.7691, "eval_loss": 0.5611980557441711, "eval_runtime": 99.4338, "eval_samples_per_second": 8.931, "eval_steps_per_second": 1.116, "step": 490000 }, { "epoch": 38.12, "grad_norm": 1.1956135034561157, "learning_rate": 0.0002, "loss": 0.1708, "step": 491000 }, { "epoch": 38.19, "grad_norm": 1.903419852256775, "learning_rate": 0.0002, "loss": 0.1726, "step": 492000 }, { "epoch": 38.27, "grad_norm": 1.4714049100875854, "learning_rate": 0.0002, "loss": 0.174, "step": 493000 }, { "epoch": 38.35, "grad_norm": 1.117650032043457, "learning_rate": 0.0002, "loss": 0.1753, "step": 494000 }, { "epoch": 38.43, "grad_norm": 0.9286689162254333, "learning_rate": 0.0002, "loss": 0.1766, "step": 495000 }, { "epoch": 38.5, "grad_norm": 1.0359840393066406, "learning_rate": 0.0002, "loss": 0.1774, "step": 496000 }, { "epoch": 38.58, "grad_norm": 0.9324952363967896, "learning_rate": 0.0002, "loss": 0.1795, "step": 497000 }, { "epoch": 38.66, "grad_norm": 1.2552545070648193, "learning_rate": 0.0002, "loss": 0.1795, "step": 498000 }, { "epoch": 38.74, "grad_norm": 0.9712297916412354, "learning_rate": 0.0002, "loss": 0.1798, "step": 499000 }, { "epoch": 38.81, "grad_norm": 1.3964751958847046, "learning_rate": 0.0002, "loss": 0.1809, "step": 500000 }, { "epoch": 38.81, "eval_bleu": 65.0266, "eval_gen_len": 28.7455, "eval_loss": 0.5573469996452332, "eval_runtime": 99.5788, "eval_samples_per_second": 8.918, "eval_steps_per_second": 1.115, "step": 500000 }, { "epoch": 38.81, "step": 500000, "total_flos": 5.131418179149005e+17, "train_loss": 0.40471466763305664, "train_runtime": 141171.7116, "train_samples_per_second": 56.669, "train_steps_per_second": 3.542 } ], "logging_steps": 1000, "max_steps": 500000, "num_input_tokens_seen": 0, "num_train_epochs": 39, "save_steps": 10000, "total_flos": 5.131418179149005e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }