diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7061 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.99891038666549, + "eval_steps": 1000, + "global_step": 84890, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01177960361633831, + "grad_norm": 0.00416200328618288, + "learning_rate": 2.355897596984451e-06, + "loss": 0.0049, + "step": 100 + }, + { + "epoch": 0.02355920723267662, + "grad_norm": 0.0011447813594713807, + "learning_rate": 4.711795193968902e-06, + "loss": 0.004, + "step": 200 + }, + { + "epoch": 0.03533881084901493, + "grad_norm": 0.0007957653142511845, + "learning_rate": 7.067692790953352e-06, + "loss": 0.0038, + "step": 300 + }, + { + "epoch": 0.04711841446535324, + "grad_norm": 0.000676482159178704, + "learning_rate": 9.423590387937803e-06, + "loss": 0.0037, + "step": 400 + }, + { + "epoch": 0.05889801808169155, + "grad_norm": 0.0006113381823524833, + "learning_rate": 1.1779487984922255e-05, + "loss": 0.0037, + "step": 500 + }, + { + "epoch": 0.07067762169802987, + "grad_norm": 0.0006428304477594793, + "learning_rate": 1.4135385581906704e-05, + "loss": 0.0037, + "step": 600 + }, + { + "epoch": 0.08245722531436817, + "grad_norm": 0.005013163201510906, + "learning_rate": 1.6491283178891155e-05, + "loss": 0.0036, + "step": 700 + }, + { + "epoch": 0.09423682893070648, + "grad_norm": 0.0020105824805796146, + "learning_rate": 1.8847180775875607e-05, + "loss": 0.0036, + "step": 800 + }, + { + "epoch": 0.10601643254704479, + "grad_norm": 0.0011909848544746637, + "learning_rate": 2.1203078372860058e-05, + "loss": 0.0035, + "step": 900 + }, + { + "epoch": 0.1177960361633831, + "grad_norm": 0.006216967944055796, + "learning_rate": 2.355897596984451e-05, + "loss": 0.0035, + "step": 1000 + }, + { + "epoch": 0.1177960361633831, + "eval_en-ja_loss": 0.0034329018089920282, + "eval_en-ja_mean_accuracy": 0.012147984538928768, + "eval_en-ja_negative_mse": -0.3447231352329254, + "eval_en-ja_runtime": 18.1178, + "eval_en-ja_samples_per_second": 399.828, + "eval_en-ja_src2trg_accuracy": 0.011457758144671452, + "eval_en-ja_steps_per_second": 1.601, + "eval_en-ja_trg2src_accuracy": 0.012838210933186085, + "eval_sequential_score": -0.1662875753469983, + "step": 1000 + }, + { + "epoch": 0.12957563977972142, + "grad_norm": 0.0032309722155332565, + "learning_rate": 2.5914873566828957e-05, + "loss": 0.0035, + "step": 1100 + }, + { + "epoch": 0.14135524339605973, + "grad_norm": 0.0020738590974360704, + "learning_rate": 2.827077116381341e-05, + "loss": 0.0034, + "step": 1200 + }, + { + "epoch": 0.15313484701239805, + "grad_norm": 0.00246366742067039, + "learning_rate": 3.062666876079786e-05, + "loss": 0.0034, + "step": 1300 + }, + { + "epoch": 0.16491445062873633, + "grad_norm": 0.002395735355094075, + "learning_rate": 3.298256635778231e-05, + "loss": 0.0034, + "step": 1400 + }, + { + "epoch": 0.17669405424507464, + "grad_norm": 0.001857622410170734, + "learning_rate": 3.533846395476676e-05, + "loss": 0.0033, + "step": 1500 + }, + { + "epoch": 0.18847365786141296, + "grad_norm": 0.0025074367877095938, + "learning_rate": 3.7694361551751213e-05, + "loss": 0.0033, + "step": 1600 + }, + { + "epoch": 0.20025326147775127, + "grad_norm": 0.0014156590914353728, + "learning_rate": 4.005025914873566e-05, + "loss": 0.0032, + "step": 1700 + }, + { + "epoch": 0.21203286509408958, + "grad_norm": 0.002935826312750578, + "learning_rate": 4.2406156745720116e-05, + "loss": 0.0032, + "step": 1800 + }, + { + "epoch": 0.2238124687104279, + "grad_norm": 0.0022236239165067673, + "learning_rate": 4.476205434270457e-05, + "loss": 0.0032, + "step": 1900 + }, + { + "epoch": 0.2355920723267662, + "grad_norm": 0.0017197765409946442, + "learning_rate": 4.711795193968902e-05, + "loss": 0.0031, + "step": 2000 + }, + { + "epoch": 0.2355920723267662, + "eval_en-ja_loss": 0.003080214373767376, + "eval_en-ja_mean_accuracy": 0.02788514632799558, + "eval_en-ja_negative_mse": -0.30219781398773193, + "eval_en-ja_runtime": 17.0719, + "eval_en-ja_samples_per_second": 424.323, + "eval_en-ja_src2trg_accuracy": 0.024710104914411928, + "eval_en-ja_steps_per_second": 1.699, + "eval_en-ja_trg2src_accuracy": 0.03106018774157924, + "eval_sequential_score": -0.13715633382986817, + "step": 2000 + }, + { + "epoch": 0.24737167594310452, + "grad_norm": 0.002077859826385975, + "learning_rate": 4.947384953667347e-05, + "loss": 0.0031, + "step": 2100 + }, + { + "epoch": 0.25915127955944284, + "grad_norm": 0.0019501947099342942, + "learning_rate": 5.1829747133657914e-05, + "loss": 0.0031, + "step": 2200 + }, + { + "epoch": 0.2709308831757811, + "grad_norm": 0.0019857033621519804, + "learning_rate": 5.418564473064237e-05, + "loss": 0.0031, + "step": 2300 + }, + { + "epoch": 0.28271048679211946, + "grad_norm": 0.0015554423443973064, + "learning_rate": 5.654154232762682e-05, + "loss": 0.003, + "step": 2400 + }, + { + "epoch": 0.29449009040845775, + "grad_norm": 0.0012266829144209623, + "learning_rate": 5.8897439924611275e-05, + "loss": 0.003, + "step": 2500 + }, + { + "epoch": 0.3062696940247961, + "grad_norm": 0.001571476343087852, + "learning_rate": 6.125333752159572e-05, + "loss": 0.003, + "step": 2600 + }, + { + "epoch": 0.3180492976411344, + "grad_norm": 0.00157955102622509, + "learning_rate": 6.360923511858017e-05, + "loss": 0.0029, + "step": 2700 + }, + { + "epoch": 0.32982890125747266, + "grad_norm": 0.0017730980180203915, + "learning_rate": 6.596513271556462e-05, + "loss": 0.0029, + "step": 2800 + }, + { + "epoch": 0.341608504873811, + "grad_norm": 0.002129686763510108, + "learning_rate": 6.832103031254907e-05, + "loss": 0.0029, + "step": 2900 + }, + { + "epoch": 0.3533881084901493, + "grad_norm": 0.0010249328333884478, + "learning_rate": 7.067692790953352e-05, + "loss": 0.0028, + "step": 3000 + }, + { + "epoch": 0.3533881084901493, + "eval_en-ja_loss": 0.002744419267401099, + "eval_en-ja_mean_accuracy": 0.08876311430149089, + "eval_en-ja_negative_mse": -0.273660272359848, + "eval_en-ja_runtime": 16.5603, + "eval_en-ja_samples_per_second": 437.431, + "eval_en-ja_src2trg_accuracy": 0.07771949199337383, + "eval_en-ja_steps_per_second": 1.751, + "eval_en-ja_trg2src_accuracy": 0.09980673660960795, + "eval_sequential_score": -0.09244857902917857, + "step": 3000 + }, + { + "epoch": 0.36516771210648763, + "grad_norm": 0.001475668279454112, + "learning_rate": 7.303282550651798e-05, + "loss": 0.0028, + "step": 3100 + }, + { + "epoch": 0.3769473157228259, + "grad_norm": 0.0015636230818927288, + "learning_rate": 7.538872310350243e-05, + "loss": 0.0028, + "step": 3200 + }, + { + "epoch": 0.38872691933916426, + "grad_norm": 0.0015552254626527429, + "learning_rate": 7.774462070048688e-05, + "loss": 0.0027, + "step": 3300 + }, + { + "epoch": 0.40050652295550254, + "grad_norm": 0.0012137816520407796, + "learning_rate": 8.010051829747132e-05, + "loss": 0.0027, + "step": 3400 + }, + { + "epoch": 0.4122861265718409, + "grad_norm": 0.0014763087965548038, + "learning_rate": 8.245641589445578e-05, + "loss": 0.0027, + "step": 3500 + }, + { + "epoch": 0.42406573018817917, + "grad_norm": 0.001373777282424271, + "learning_rate": 8.481231349144023e-05, + "loss": 0.0026, + "step": 3600 + }, + { + "epoch": 0.43584533380451745, + "grad_norm": 0.001388333854265511, + "learning_rate": 8.716821108842468e-05, + "loss": 0.0026, + "step": 3700 + }, + { + "epoch": 0.4476249374208558, + "grad_norm": 0.0010845715878531337, + "learning_rate": 8.952410868540913e-05, + "loss": 0.0026, + "step": 3800 + }, + { + "epoch": 0.4594045410371941, + "grad_norm": 0.0011637168936431408, + "learning_rate": 9.188000628239359e-05, + "loss": 0.0025, + "step": 3900 + }, + { + "epoch": 0.4711841446535324, + "grad_norm": 0.001119203050620854, + "learning_rate": 9.423590387937804e-05, + "loss": 0.0025, + "step": 4000 + }, + { + "epoch": 0.4711841446535324, + "eval_en-ja_loss": 0.0024201483465731144, + "eval_en-ja_mean_accuracy": 0.2314329099944782, + "eval_en-ja_negative_mse": -0.25388333201408386, + "eval_en-ja_runtime": 18.2806, + "eval_en-ja_samples_per_second": 396.268, + "eval_en-ja_src2trg_accuracy": 0.22239094422970734, + "eval_en-ja_steps_per_second": 1.586, + "eval_en-ja_trg2src_accuracy": 0.24047487575924903, + "eval_sequential_score": -0.011225211009802838, + "step": 4000 + }, + { + "epoch": 0.4829637482698707, + "grad_norm": 0.0011038570664823055, + "learning_rate": 9.659180147636247e-05, + "loss": 0.0025, + "step": 4100 + }, + { + "epoch": 0.49474335188620905, + "grad_norm": 0.0009608145337551832, + "learning_rate": 9.894769907334694e-05, + "loss": 0.0024, + "step": 4200 + }, + { + "epoch": 0.5065229555025473, + "grad_norm": 0.0010591832688078284, + "learning_rate": 0.00010130359667033139, + "loss": 0.0024, + "step": 4300 + }, + { + "epoch": 0.5183025591188857, + "grad_norm": 0.001131002209149301, + "learning_rate": 0.00010365949426731583, + "loss": 0.0024, + "step": 4400 + }, + { + "epoch": 0.5300821627352239, + "grad_norm": 0.0009896925184875727, + "learning_rate": 0.00010601539186430028, + "loss": 0.0023, + "step": 4500 + }, + { + "epoch": 0.5418617663515622, + "grad_norm": 0.001094823470339179, + "learning_rate": 0.00010837128946128474, + "loss": 0.0023, + "step": 4600 + }, + { + "epoch": 0.5536413699679006, + "grad_norm": 0.0010250096675008535, + "learning_rate": 0.0001107271870582692, + "loss": 0.0023, + "step": 4700 + }, + { + "epoch": 0.5654209735842389, + "grad_norm": 0.001149253686890006, + "learning_rate": 0.00011308308465525363, + "loss": 0.0023, + "step": 4800 + }, + { + "epoch": 0.5772005772005772, + "grad_norm": 0.0009457394480705261, + "learning_rate": 0.0001154389822522381, + "loss": 0.0022, + "step": 4900 + }, + { + "epoch": 0.5889801808169155, + "grad_norm": 0.0009560625185258687, + "learning_rate": 0.00011779487984922255, + "loss": 0.0022, + "step": 5000 + }, + { + "epoch": 0.5889801808169155, + "eval_en-ja_loss": 0.0021485858596861362, + "eval_en-ja_mean_accuracy": 0.37824406405300937, + "eval_en-ja_negative_mse": -0.23925437033176422, + "eval_en-ja_runtime": 16.4826, + "eval_en-ja_samples_per_second": 439.493, + "eval_en-ja_src2trg_accuracy": 0.37990060739922693, + "eval_en-ja_steps_per_second": 1.759, + "eval_en-ja_trg2src_accuracy": 0.3765875207067918, + "eval_sequential_score": 0.06949484686062257, + "step": 5000 + }, + { + "epoch": 0.6007597844332538, + "grad_norm": 0.00087635254021734, + "learning_rate": 0.00012015077744620699, + "loss": 0.0022, + "step": 5100 + }, + { + "epoch": 0.6125393880495922, + "grad_norm": 0.0008195164846256375, + "learning_rate": 0.00012250667504319144, + "loss": 0.0022, + "step": 5200 + }, + { + "epoch": 0.6243189916659304, + "grad_norm": 0.0008407826535403728, + "learning_rate": 0.0001248625726401759, + "loss": 0.0021, + "step": 5300 + }, + { + "epoch": 0.6360985952822688, + "grad_norm": 0.0008125107269734144, + "learning_rate": 0.00012721847023716034, + "loss": 0.0021, + "step": 5400 + }, + { + "epoch": 0.6478781988986071, + "grad_norm": 0.000785350042860955, + "learning_rate": 0.0001295743678341448, + "loss": 0.0021, + "step": 5500 + }, + { + "epoch": 0.6596578025149453, + "grad_norm": 0.0008201107266359031, + "learning_rate": 0.00013193026543112924, + "loss": 0.0021, + "step": 5600 + }, + { + "epoch": 0.6714374061312837, + "grad_norm": 0.0008951873751357198, + "learning_rate": 0.0001342861630281137, + "loss": 0.0021, + "step": 5700 + }, + { + "epoch": 0.683217009747622, + "grad_norm": 0.0008410403388552368, + "learning_rate": 0.00013664206062509815, + "loss": 0.002, + "step": 5800 + }, + { + "epoch": 0.6949966133639603, + "grad_norm": 0.0008280111360363662, + "learning_rate": 0.0001389979582220826, + "loss": 0.002, + "step": 5900 + }, + { + "epoch": 0.7067762169802986, + "grad_norm": 0.0008616075501777232, + "learning_rate": 0.00014135385581906705, + "loss": 0.002, + "step": 6000 + }, + { + "epoch": 0.7067762169802986, + "eval_en-ja_loss": 0.001952564693056047, + "eval_en-ja_mean_accuracy": 0.4659718387631143, + "eval_en-ja_negative_mse": -0.22806446254253387, + "eval_en-ja_runtime": 16.2611, + "eval_en-ja_samples_per_second": 445.481, + "eval_en-ja_src2trg_accuracy": 0.47390944229707344, + "eval_en-ja_steps_per_second": 1.783, + "eval_en-ja_trg2src_accuracy": 0.45803423522915515, + "eval_sequential_score": 0.11895368811029022, + "step": 6000 + }, + { + "epoch": 0.7185558205966369, + "grad_norm": 0.0009338635136373341, + "learning_rate": 0.0001437097534160515, + "loss": 0.002, + "step": 6100 + }, + { + "epoch": 0.7303354242129753, + "grad_norm": 0.0009498377912677824, + "learning_rate": 0.00014606565101303595, + "loss": 0.002, + "step": 6200 + }, + { + "epoch": 0.7421150278293135, + "grad_norm": 0.0008307321113534272, + "learning_rate": 0.0001484215486100204, + "loss": 0.002, + "step": 6300 + }, + { + "epoch": 0.7538946314456518, + "grad_norm": 0.0008533311192877591, + "learning_rate": 0.00015077744620700485, + "loss": 0.0019, + "step": 6400 + }, + { + "epoch": 0.7656742350619902, + "grad_norm": 0.0007098555797711015, + "learning_rate": 0.0001531333438039893, + "loss": 0.0019, + "step": 6500 + }, + { + "epoch": 0.7774538386783285, + "grad_norm": 0.0008207617793232203, + "learning_rate": 0.00015548924140097376, + "loss": 0.0019, + "step": 6600 + }, + { + "epoch": 0.7892334422946667, + "grad_norm": 0.0007547519053332508, + "learning_rate": 0.0001578451389979582, + "loss": 0.0019, + "step": 6700 + }, + { + "epoch": 0.8010130459110051, + "grad_norm": 0.0006804427248425782, + "learning_rate": 0.00016020103659494263, + "loss": 0.0019, + "step": 6800 + }, + { + "epoch": 0.8127926495273434, + "grad_norm": 0.0006934762350283563, + "learning_rate": 0.00016255693419192714, + "loss": 0.0019, + "step": 6900 + }, + { + "epoch": 0.8245722531436818, + "grad_norm": 0.0007264446467161179, + "learning_rate": 0.00016491283178891156, + "loss": 0.0018, + "step": 7000 + }, + { + "epoch": 0.8245722531436818, + "eval_en-ja_loss": 0.001812125789001584, + "eval_en-ja_mean_accuracy": 0.5271258972943125, + "eval_en-ja_negative_mse": -0.2197086066007614, + "eval_en-ja_runtime": 15.9421, + "eval_en-ja_samples_per_second": 454.396, + "eval_en-ja_src2trg_accuracy": 0.537548315847598, + "eval_en-ja_steps_per_second": 1.819, + "eval_en-ja_trg2src_accuracy": 0.5167034787410271, + "eval_sequential_score": 0.15370864534677553, + "step": 7000 + }, + { + "epoch": 0.83635185676002, + "grad_norm": 0.0007104870746843517, + "learning_rate": 0.000167268729385896, + "loss": 0.0018, + "step": 7100 + }, + { + "epoch": 0.8481314603763583, + "grad_norm": 0.0006919393199495971, + "learning_rate": 0.00016962462698288046, + "loss": 0.0018, + "step": 7200 + }, + { + "epoch": 0.8599110639926967, + "grad_norm": 0.0007842793711461127, + "learning_rate": 0.00017198052457986492, + "loss": 0.0018, + "step": 7300 + }, + { + "epoch": 0.8716906676090349, + "grad_norm": 0.0007096115732565522, + "learning_rate": 0.00017433642217684937, + "loss": 0.0018, + "step": 7400 + }, + { + "epoch": 0.8834702712253732, + "grad_norm": 0.0006692445022054017, + "learning_rate": 0.0001766923197738338, + "loss": 0.0018, + "step": 7500 + }, + { + "epoch": 0.8952498748417116, + "grad_norm": 0.0007711151847615838, + "learning_rate": 0.00017904821737081827, + "loss": 0.0018, + "step": 7600 + }, + { + "epoch": 0.9070294784580499, + "grad_norm": 0.000663991435430944, + "learning_rate": 0.00018140411496780272, + "loss": 0.0018, + "step": 7700 + }, + { + "epoch": 0.9188090820743882, + "grad_norm": 0.0006886798655614257, + "learning_rate": 0.00018376001256478717, + "loss": 0.0018, + "step": 7800 + }, + { + "epoch": 0.9305886856907265, + "grad_norm": 0.0006660849903710186, + "learning_rate": 0.00018611591016177162, + "loss": 0.0018, + "step": 7900 + }, + { + "epoch": 0.9423682893070648, + "grad_norm": 0.0006549587124027312, + "learning_rate": 0.00018847180775875607, + "loss": 0.0017, + "step": 8000 + }, + { + "epoch": 0.9423682893070648, + "eval_en-ja_loss": 0.0017117271199822426, + "eval_en-ja_mean_accuracy": 0.5574268360022088, + "eval_en-ja_negative_mse": -0.21387897431850433, + "eval_en-ja_runtime": 15.8954, + "eval_en-ja_samples_per_second": 455.73, + "eval_en-ja_src2trg_accuracy": 0.5705411374930978, + "eval_en-ja_steps_per_second": 1.824, + "eval_en-ja_trg2src_accuracy": 0.5443125345113197, + "eval_sequential_score": 0.17177393084185222, + "step": 8000 + }, + { + "epoch": 0.9541478929234031, + "grad_norm": 0.0006846050964668393, + "learning_rate": 0.00019082770535574053, + "loss": 0.0017, + "step": 8100 + }, + { + "epoch": 0.9659274965397414, + "grad_norm": 0.0007053768495097756, + "learning_rate": 0.00019318360295272495, + "loss": 0.0017, + "step": 8200 + }, + { + "epoch": 0.9777071001560798, + "grad_norm": 0.0007028156542219222, + "learning_rate": 0.00019553950054970943, + "loss": 0.0017, + "step": 8300 + }, + { + "epoch": 0.9894867037724181, + "grad_norm": 0.0006561035406775773, + "learning_rate": 0.00019789539814669388, + "loss": 0.0017, + "step": 8400 + }, + { + "epoch": 1.0011779603616338, + "grad_norm": 0.0008072027703747153, + "learning_rate": 0.00020025129574367833, + "loss": 0.0017, + "step": 8500 + }, + { + "epoch": 1.0129575639779722, + "grad_norm": 0.0007024264778010547, + "learning_rate": 0.00020260719334066278, + "loss": 0.0017, + "step": 8600 + }, + { + "epoch": 1.0247371675943104, + "grad_norm": 0.0007213215576484799, + "learning_rate": 0.00020496309093764723, + "loss": 0.0017, + "step": 8700 + }, + { + "epoch": 1.0365167712106487, + "grad_norm": 0.000628623238299042, + "learning_rate": 0.00020731898853463166, + "loss": 0.0017, + "step": 8800 + }, + { + "epoch": 1.0482963748269871, + "grad_norm": 0.0006642696680501103, + "learning_rate": 0.0002096748861316161, + "loss": 0.0017, + "step": 8900 + }, + { + "epoch": 1.0600759784433254, + "grad_norm": 0.0005983785958960652, + "learning_rate": 0.00021203078372860056, + "loss": 0.0017, + "step": 9000 + }, + { + "epoch": 1.0600759784433254, + "eval_en-ja_loss": 0.001634490443393588, + "eval_en-ja_mean_accuracy": 0.5752346769740475, + "eval_en-ja_negative_mse": -0.20854459702968597, + "eval_en-ja_runtime": 16.6796, + "eval_en-ja_samples_per_second": 434.302, + "eval_en-ja_src2trg_accuracy": 0.5904196576477084, + "eval_en-ja_steps_per_second": 1.739, + "eval_en-ja_trg2src_accuracy": 0.5600496963003865, + "eval_sequential_score": 0.18334503997218077, + "step": 9000 + }, + { + "epoch": 1.0718555820596638, + "grad_norm": 0.0006104575004428625, + "learning_rate": 0.00021438668132558504, + "loss": 0.0016, + "step": 9100 + }, + { + "epoch": 1.083635185676002, + "grad_norm": 0.0006298807566054165, + "learning_rate": 0.0002167425789225695, + "loss": 0.0016, + "step": 9200 + }, + { + "epoch": 1.0954147892923403, + "grad_norm": 0.0006076506688259542, + "learning_rate": 0.00021909847651955394, + "loss": 0.0016, + "step": 9300 + }, + { + "epoch": 1.1071943929086787, + "grad_norm": 0.0006297025829553604, + "learning_rate": 0.0002214543741165384, + "loss": 0.0016, + "step": 9400 + }, + { + "epoch": 1.118973996525017, + "grad_norm": 0.0005593663081526756, + "learning_rate": 0.00022381027171352282, + "loss": 0.0016, + "step": 9500 + }, + { + "epoch": 1.1307536001413552, + "grad_norm": 0.000578422041144222, + "learning_rate": 0.00022616616931050727, + "loss": 0.0016, + "step": 9600 + }, + { + "epoch": 1.1425332037576936, + "grad_norm": 0.0006351738120429218, + "learning_rate": 0.00022852206690749172, + "loss": 0.0016, + "step": 9700 + }, + { + "epoch": 1.1543128073740319, + "grad_norm": 0.0005694125429727137, + "learning_rate": 0.0002308779645044762, + "loss": 0.0016, + "step": 9800 + }, + { + "epoch": 1.16609241099037, + "grad_norm": 0.0005846963031217456, + "learning_rate": 0.00023323386210146065, + "loss": 0.0016, + "step": 9900 + }, + { + "epoch": 1.1778720146067085, + "grad_norm": 0.0005989396595396101, + "learning_rate": 0.0002355897596984451, + "loss": 0.0016, + "step": 10000 + }, + { + "epoch": 1.1778720146067085, + "eval_en-ja_loss": 0.001572986482642591, + "eval_en-ja_mean_accuracy": 0.5946300386526782, + "eval_en-ja_negative_mse": -0.2045869082212448, + "eval_en-ja_runtime": 16.5276, + "eval_en-ja_samples_per_second": 438.297, + "eval_en-ja_src2trg_accuracy": 0.609055770292656, + "eval_en-ja_steps_per_second": 1.755, + "eval_en-ja_trg2src_accuracy": 0.5802043070127002, + "eval_sequential_score": 0.19502156521571667, + "step": 10000 + }, + { + "epoch": 1.1896516182230468, + "grad_norm": 0.0005703960196115077, + "learning_rate": 0.00023794565729542952, + "loss": 0.0016, + "step": 10100 + }, + { + "epoch": 1.201431221839385, + "grad_norm": 0.0005499667022377253, + "learning_rate": 0.00024030155489241397, + "loss": 0.0016, + "step": 10200 + }, + { + "epoch": 1.2132108254557235, + "grad_norm": 0.0005858128424733877, + "learning_rate": 0.00024265745248939843, + "loss": 0.0016, + "step": 10300 + }, + { + "epoch": 1.2249904290720617, + "grad_norm": 0.0005962181021459401, + "learning_rate": 0.0002450133500863829, + "loss": 0.0016, + "step": 10400 + }, + { + "epoch": 1.2367700326884, + "grad_norm": 0.0006342564010992646, + "learning_rate": 0.00024736924768336736, + "loss": 0.0016, + "step": 10500 + }, + { + "epoch": 1.2485496363047384, + "grad_norm": 0.0006601563072763383, + "learning_rate": 0.0002497251452803518, + "loss": 0.0016, + "step": 10600 + }, + { + "epoch": 1.2603292399210766, + "grad_norm": 0.0005563644226640463, + "learning_rate": 0.00025208104287733626, + "loss": 0.0015, + "step": 10700 + }, + { + "epoch": 1.272108843537415, + "grad_norm": 0.0005471475305967033, + "learning_rate": 0.0002544369404743207, + "loss": 0.0015, + "step": 10800 + }, + { + "epoch": 1.2838884471537533, + "grad_norm": 0.000617621059063822, + "learning_rate": 0.00025679283807130516, + "loss": 0.0015, + "step": 10900 + }, + { + "epoch": 1.2956680507700915, + "grad_norm": 0.0005345616373233497, + "learning_rate": 0.0002591487356682896, + "loss": 0.0015, + "step": 11000 + }, + { + "epoch": 1.2956680507700915, + "eval_en-ja_loss": 0.001528572873212397, + "eval_en-ja_mean_accuracy": 0.6009801214798454, + "eval_en-ja_negative_mse": -0.20193737745285034, + "eval_en-ja_runtime": 15.6662, + "eval_en-ja_samples_per_second": 462.395, + "eval_en-ja_src2trg_accuracy": 0.6151297625621204, + "eval_en-ja_steps_per_second": 1.851, + "eval_en-ja_trg2src_accuracy": 0.5868304803975704, + "eval_sequential_score": 0.1995213720134975, + "step": 11000 + }, + { + "epoch": 1.30744765438643, + "grad_norm": 0.0005438519874587655, + "learning_rate": 0.000261504633265274, + "loss": 0.0015, + "step": 11100 + }, + { + "epoch": 1.3192272580027682, + "grad_norm": 0.0005742206703871489, + "learning_rate": 0.0002638605308622585, + "loss": 0.0015, + "step": 11200 + }, + { + "epoch": 1.3310068616191066, + "grad_norm": 0.0005811781156808138, + "learning_rate": 0.00026621642845924297, + "loss": 0.0015, + "step": 11300 + }, + { + "epoch": 1.3427864652354449, + "grad_norm": 0.0005445885471999645, + "learning_rate": 0.0002685723260562274, + "loss": 0.0015, + "step": 11400 + }, + { + "epoch": 1.354566068851783, + "grad_norm": 0.000584499619435519, + "learning_rate": 0.00027092822365321187, + "loss": 0.0015, + "step": 11500 + }, + { + "epoch": 1.3663456724681216, + "grad_norm": 0.0005839588702656329, + "learning_rate": 0.0002732841212501963, + "loss": 0.0015, + "step": 11600 + }, + { + "epoch": 1.3781252760844598, + "grad_norm": 0.0005749035626649857, + "learning_rate": 0.0002756400188471807, + "loss": 0.0015, + "step": 11700 + }, + { + "epoch": 1.389904879700798, + "grad_norm": 0.0005430486053228378, + "learning_rate": 0.0002779959164441652, + "loss": 0.0015, + "step": 11800 + }, + { + "epoch": 1.4016844833171365, + "grad_norm": 0.0005305896047502756, + "learning_rate": 0.0002803518140411497, + "loss": 0.0015, + "step": 11900 + }, + { + "epoch": 1.4134640869334747, + "grad_norm": 0.0005816232296638191, + "learning_rate": 0.0002827077116381341, + "loss": 0.0015, + "step": 12000 + }, + { + "epoch": 1.4134640869334747, + "eval_en-ja_loss": 0.001488846493884921, + "eval_en-ja_mean_accuracy": 0.6131971286581999, + "eval_en-ja_negative_mse": -0.19939835369586945, + "eval_en-ja_runtime": 16.5948, + "eval_en-ja_samples_per_second": 436.523, + "eval_en-ja_src2trg_accuracy": 0.6271397018221977, + "eval_en-ja_steps_per_second": 1.748, + "eval_en-ja_trg2src_accuracy": 0.5992545554942021, + "eval_sequential_score": 0.2068993874811652, + "step": 12000 + }, + { + "epoch": 1.425243690549813, + "grad_norm": 0.0005349160637706518, + "learning_rate": 0.0002850636092351186, + "loss": 0.0015, + "step": 12100 + }, + { + "epoch": 1.4370232941661514, + "grad_norm": 0.0005729619297198951, + "learning_rate": 0.000287419506832103, + "loss": 0.0015, + "step": 12200 + }, + { + "epoch": 1.4488028977824896, + "grad_norm": 0.0005212129326537251, + "learning_rate": 0.0002897754044290875, + "loss": 0.0015, + "step": 12300 + }, + { + "epoch": 1.4605825013988278, + "grad_norm": 0.0005661833565682173, + "learning_rate": 0.0002921313020260719, + "loss": 0.0015, + "step": 12400 + }, + { + "epoch": 1.4723621050151663, + "grad_norm": 0.0005432522157207131, + "learning_rate": 0.0002944871996230563, + "loss": 0.0015, + "step": 12500 + }, + { + "epoch": 1.4841417086315045, + "grad_norm": 0.0005210420349612832, + "learning_rate": 0.0002968430972200408, + "loss": 0.0015, + "step": 12600 + }, + { + "epoch": 1.4959213122478427, + "grad_norm": 0.0005303003708831966, + "learning_rate": 0.0002991989948170253, + "loss": 0.0015, + "step": 12700 + }, + { + "epoch": 1.5077009158641812, + "grad_norm": 0.0005309931002557278, + "learning_rate": 0.00029972559454515213, + "loss": 0.0015, + "step": 12800 + }, + { + "epoch": 1.5194805194805194, + "grad_norm": 0.0005355182220228016, + "learning_rate": 0.00029930982870447363, + "loss": 0.0015, + "step": 12900 + }, + { + "epoch": 1.5312601230968577, + "grad_norm": 0.0005120536661706865, + "learning_rate": 0.0002988940628637951, + "loss": 0.0015, + "step": 13000 + }, + { + "epoch": 1.5312601230968577, + "eval_en-ja_loss": 0.0014483643462881446, + "eval_en-ja_mean_accuracy": 0.6252070679182772, + "eval_en-ja_negative_mse": -0.19621890783309937, + "eval_en-ja_runtime": 16.3086, + "eval_en-ja_samples_per_second": 444.182, + "eval_en-ja_src2trg_accuracy": 0.6399779127553837, + "eval_en-ja_steps_per_second": 1.778, + "eval_en-ja_trg2src_accuracy": 0.6104362230811706, + "eval_sequential_score": 0.2144940800425889, + "step": 13000 + }, + { + "epoch": 1.5430397267131961, + "grad_norm": 0.0005324038793332875, + "learning_rate": 0.0002984782970231166, + "loss": 0.0014, + "step": 13100 + }, + { + "epoch": 1.5548193303295346, + "grad_norm": 0.0005366950063034892, + "learning_rate": 0.000298062531182438, + "loss": 0.0014, + "step": 13200 + }, + { + "epoch": 1.5665989339458726, + "grad_norm": 0.0005314123118296266, + "learning_rate": 0.0002976467653417595, + "loss": 0.0014, + "step": 13300 + }, + { + "epoch": 1.578378537562211, + "grad_norm": 0.0005177312414161861, + "learning_rate": 0.00029723099950108097, + "loss": 0.0014, + "step": 13400 + }, + { + "epoch": 1.5901581411785495, + "grad_norm": 0.0004988547880202532, + "learning_rate": 0.00029681523366040247, + "loss": 0.0014, + "step": 13500 + }, + { + "epoch": 1.6019377447948875, + "grad_norm": 0.0005393567262217402, + "learning_rate": 0.0002963994678197239, + "loss": 0.0014, + "step": 13600 + }, + { + "epoch": 1.613717348411226, + "grad_norm": 0.0005347069818526506, + "learning_rate": 0.00029598370197904536, + "loss": 0.0014, + "step": 13700 + }, + { + "epoch": 1.6254969520275644, + "grad_norm": 0.00047747965436428785, + "learning_rate": 0.00029556793613836686, + "loss": 0.0014, + "step": 13800 + }, + { + "epoch": 1.6372765556439026, + "grad_norm": 0.0004842175403609872, + "learning_rate": 0.0002951521702976883, + "loss": 0.0014, + "step": 13900 + }, + { + "epoch": 1.6490561592602409, + "grad_norm": 0.0004956109332852066, + "learning_rate": 0.0002947364044570098, + "loss": 0.0014, + "step": 14000 + }, + { + "epoch": 1.6490561592602409, + "eval_en-ja_loss": 0.0014226146740838885, + "eval_en-ja_mean_accuracy": 0.6284511319712865, + "eval_en-ja_negative_mse": -0.19430138170719147, + "eval_en-ja_runtime": 16.06, + "eval_en-ja_samples_per_second": 451.057, + "eval_en-ja_src2trg_accuracy": 0.6423246824958586, + "eval_en-ja_steps_per_second": 1.806, + "eval_en-ja_trg2src_accuracy": 0.6145775814467145, + "eval_sequential_score": 0.21707487513204754, + "step": 14000 + }, + { + "epoch": 1.6608357628765793, + "grad_norm": 0.0005480212857946754, + "learning_rate": 0.00029432063861633125, + "loss": 0.0014, + "step": 14100 + }, + { + "epoch": 1.6726153664929175, + "grad_norm": 0.0005384628311730921, + "learning_rate": 0.00029390487277565274, + "loss": 0.0014, + "step": 14200 + }, + { + "epoch": 1.6843949701092558, + "grad_norm": 0.0005003534024581313, + "learning_rate": 0.0002934891069349742, + "loss": 0.0014, + "step": 14300 + }, + { + "epoch": 1.6961745737255942, + "grad_norm": 0.0004881983040831983, + "learning_rate": 0.0002930733410942957, + "loss": 0.0014, + "step": 14400 + }, + { + "epoch": 1.7079541773419324, + "grad_norm": 0.0004753917164634913, + "learning_rate": 0.00029265757525361713, + "loss": 0.0014, + "step": 14500 + }, + { + "epoch": 1.7197337809582707, + "grad_norm": 0.0004935594624839723, + "learning_rate": 0.00029224180941293863, + "loss": 0.0014, + "step": 14600 + }, + { + "epoch": 1.7315133845746091, + "grad_norm": 0.0004387231601867825, + "learning_rate": 0.0002918260435722601, + "loss": 0.0014, + "step": 14700 + }, + { + "epoch": 1.7432929881909474, + "grad_norm": 0.0005050732870586216, + "learning_rate": 0.0002914102777315816, + "loss": 0.0014, + "step": 14800 + }, + { + "epoch": 1.7550725918072856, + "grad_norm": 0.00045474476064555347, + "learning_rate": 0.000290994511890903, + "loss": 0.0014, + "step": 14900 + }, + { + "epoch": 1.766852195423624, + "grad_norm": 0.00047812945558689535, + "learning_rate": 0.0002905787460502245, + "loss": 0.0014, + "step": 15000 + }, + { + "epoch": 1.766852195423624, + "eval_en-ja_loss": 0.0013963269302621484, + "eval_en-ja_mean_accuracy": 0.6341109884041966, + "eval_en-ja_negative_mse": -0.19249634444713593, + "eval_en-ja_runtime": 16.0558, + "eval_en-ja_samples_per_second": 451.177, + "eval_en-ja_src2trg_accuracy": 0.6472943125345113, + "eval_en-ja_steps_per_second": 1.806, + "eval_en-ja_trg2src_accuracy": 0.6209276642738818, + "eval_sequential_score": 0.2208073219785303, + "step": 15000 + }, + { + "epoch": 1.7786317990399623, + "grad_norm": 0.0004608567978721112, + "learning_rate": 0.00029016298020954597, + "loss": 0.0014, + "step": 15100 + }, + { + "epoch": 1.7904114026563005, + "grad_norm": 0.0004582498222589493, + "learning_rate": 0.00028974721436886747, + "loss": 0.0014, + "step": 15200 + }, + { + "epoch": 1.802191006272639, + "grad_norm": 0.00048188300570473075, + "learning_rate": 0.0002893314485281889, + "loss": 0.0014, + "step": 15300 + }, + { + "epoch": 1.8139706098889772, + "grad_norm": 0.0004572905309032649, + "learning_rate": 0.00028891568268751036, + "loss": 0.0014, + "step": 15400 + }, + { + "epoch": 1.8257502135053154, + "grad_norm": 0.0004831681726500392, + "learning_rate": 0.0002884999168468318, + "loss": 0.0014, + "step": 15500 + }, + { + "epoch": 1.8375298171216539, + "grad_norm": 0.00048430077731609344, + "learning_rate": 0.0002880841510061533, + "loss": 0.0014, + "step": 15600 + }, + { + "epoch": 1.8493094207379923, + "grad_norm": 0.0004495294124353677, + "learning_rate": 0.00028766838516547475, + "loss": 0.0014, + "step": 15700 + }, + { + "epoch": 1.8610890243543303, + "grad_norm": 0.00046128584654070437, + "learning_rate": 0.00028725261932479625, + "loss": 0.0014, + "step": 15800 + }, + { + "epoch": 1.8728686279706688, + "grad_norm": 0.0004842077032662928, + "learning_rate": 0.0002868368534841177, + "loss": 0.0014, + "step": 15900 + }, + { + "epoch": 1.8846482315870072, + "grad_norm": 0.0004887509276159108, + "learning_rate": 0.0002864210876434392, + "loss": 0.0014, + "step": 16000 + }, + { + "epoch": 1.8846482315870072, + "eval_en-ja_loss": 0.0013752784579992294, + "eval_en-ja_mean_accuracy": 0.6385974599668691, + "eval_en-ja_negative_mse": -0.19071045517921448, + "eval_en-ja_runtime": 16.1718, + "eval_en-ja_samples_per_second": 447.941, + "eval_en-ja_src2trg_accuracy": 0.6510215350635008, + "eval_en-ja_steps_per_second": 1.793, + "eval_en-ja_trg2src_accuracy": 0.6261733848702374, + "eval_sequential_score": 0.22394350239382732, + "step": 16000 + }, + { + "epoch": 1.8964278352033455, + "grad_norm": 0.0005181377637200058, + "learning_rate": 0.00028600532180276064, + "loss": 0.0014, + "step": 16100 + }, + { + "epoch": 1.9082074388196837, + "grad_norm": 0.0005232061957940459, + "learning_rate": 0.00028558955596208214, + "loss": 0.0014, + "step": 16200 + }, + { + "epoch": 1.9199870424360221, + "grad_norm": 0.00047923726378940046, + "learning_rate": 0.0002851737901214036, + "loss": 0.0014, + "step": 16300 + }, + { + "epoch": 1.9317666460523604, + "grad_norm": 0.0005227027577348053, + "learning_rate": 0.0002847580242807251, + "loss": 0.0014, + "step": 16400 + }, + { + "epoch": 1.9435462496686986, + "grad_norm": 0.0004532890161499381, + "learning_rate": 0.0002843422584400465, + "loss": 0.0014, + "step": 16500 + }, + { + "epoch": 1.955325853285037, + "grad_norm": 0.00045343692181631923, + "learning_rate": 0.000283926492599368, + "loss": 0.0014, + "step": 16600 + }, + { + "epoch": 1.9671054569013753, + "grad_norm": 0.000506916840095073, + "learning_rate": 0.00028351072675868947, + "loss": 0.0014, + "step": 16700 + }, + { + "epoch": 1.9788850605177135, + "grad_norm": 0.0004299329302739352, + "learning_rate": 0.00028309496091801097, + "loss": 0.0014, + "step": 16800 + }, + { + "epoch": 1.990664664134052, + "grad_norm": 0.0004374514101073146, + "learning_rate": 0.0002826791950773324, + "loss": 0.0014, + "step": 16900 + }, + { + "epoch": 2.0023559207232675, + "grad_norm": 0.0004381030157674104, + "learning_rate": 0.0002822634292366539, + "loss": 0.0013, + "step": 17000 + }, + { + "epoch": 2.0023559207232675, + "eval_en-ja_loss": 0.0013553217286244035, + "eval_en-ja_mean_accuracy": 0.6415654334621756, + "eval_en-ja_negative_mse": -0.1893077939748764, + "eval_en-ja_runtime": 15.6258, + "eval_en-ja_samples_per_second": 463.592, + "eval_en-ja_src2trg_accuracy": 0.6565433462175594, + "eval_en-ja_steps_per_second": 1.856, + "eval_en-ja_trg2src_accuracy": 0.6265875207067918, + "eval_sequential_score": 0.22612881974364962, + "step": 17000 + }, + { + "epoch": 2.014135524339606, + "grad_norm": 0.00044455964234657586, + "learning_rate": 0.00028184766339597536, + "loss": 0.0013, + "step": 17100 + }, + { + "epoch": 2.0259151279559444, + "grad_norm": 0.00049649114953354, + "learning_rate": 0.00028143189755529686, + "loss": 0.0013, + "step": 17200 + }, + { + "epoch": 2.0376947315722824, + "grad_norm": 0.0004488581034820527, + "learning_rate": 0.0002810161317146183, + "loss": 0.0013, + "step": 17300 + }, + { + "epoch": 2.049474335188621, + "grad_norm": 0.00048318435437977314, + "learning_rate": 0.00028060036587393975, + "loss": 0.0013, + "step": 17400 + }, + { + "epoch": 2.0612539388049593, + "grad_norm": 0.0004453736764844507, + "learning_rate": 0.00028018460003326125, + "loss": 0.0013, + "step": 17500 + }, + { + "epoch": 2.0730335424212973, + "grad_norm": 0.00043465656926855445, + "learning_rate": 0.0002797688341925827, + "loss": 0.0013, + "step": 17600 + }, + { + "epoch": 2.084813146037636, + "grad_norm": 0.0004763482138514519, + "learning_rate": 0.0002793530683519042, + "loss": 0.0013, + "step": 17700 + }, + { + "epoch": 2.0965927496539742, + "grad_norm": 0.0004519970971159637, + "learning_rate": 0.00027893730251122564, + "loss": 0.0013, + "step": 17800 + }, + { + "epoch": 2.1083723532703122, + "grad_norm": 0.0004956221673637629, + "learning_rate": 0.00027852153667054714, + "loss": 0.0013, + "step": 17900 + }, + { + "epoch": 2.1201519568866507, + "grad_norm": 0.0004852432757616043, + "learning_rate": 0.0002781057708298686, + "loss": 0.0013, + "step": 18000 + }, + { + "epoch": 2.1201519568866507, + "eval_en-ja_loss": 0.001341437455266714, + "eval_en-ja_mean_accuracy": 0.6452926559911651, + "eval_en-ja_negative_mse": -0.18814200162887573, + "eval_en-ja_runtime": 16.451, + "eval_en-ja_samples_per_second": 440.339, + "eval_en-ja_src2trg_accuracy": 0.6598564329099945, + "eval_en-ja_steps_per_second": 1.763, + "eval_en-ja_trg2src_accuracy": 0.6307288790723358, + "eval_sequential_score": 0.2285753271811447, + "step": 18000 + }, + { + "epoch": 2.131931560502989, + "grad_norm": 0.0004702436563093215, + "learning_rate": 0.0002776900049891901, + "loss": 0.0013, + "step": 18100 + }, + { + "epoch": 2.1437111641193276, + "grad_norm": 0.0004409950051922351, + "learning_rate": 0.00027727423914851153, + "loss": 0.0013, + "step": 18200 + }, + { + "epoch": 2.1554907677356656, + "grad_norm": 0.00045741169014945626, + "learning_rate": 0.00027685847330783303, + "loss": 0.0013, + "step": 18300 + }, + { + "epoch": 2.167270371352004, + "grad_norm": 0.00047351213288493454, + "learning_rate": 0.0002764427074671545, + "loss": 0.0013, + "step": 18400 + }, + { + "epoch": 2.1790499749683425, + "grad_norm": 0.000433657958637923, + "learning_rate": 0.000276026941626476, + "loss": 0.0013, + "step": 18500 + }, + { + "epoch": 2.1908295785846805, + "grad_norm": 0.0004532559250947088, + "learning_rate": 0.0002756111757857974, + "loss": 0.0013, + "step": 18600 + }, + { + "epoch": 2.202609182201019, + "grad_norm": 0.00046346973977051675, + "learning_rate": 0.0002751954099451189, + "loss": 0.0013, + "step": 18700 + }, + { + "epoch": 2.2143887858173574, + "grad_norm": 0.00044125530985184014, + "learning_rate": 0.00027477964410444036, + "loss": 0.0013, + "step": 18800 + }, + { + "epoch": 2.2261683894336954, + "grad_norm": 0.0004909245180897415, + "learning_rate": 0.00027436387826376186, + "loss": 0.0013, + "step": 18900 + }, + { + "epoch": 2.237947993050034, + "grad_norm": 0.0004369357484392822, + "learning_rate": 0.0002739481124230833, + "loss": 0.0013, + "step": 19000 + }, + { + "epoch": 2.237947993050034, + "eval_en-ja_loss": 0.0013303811429068446, + "eval_en-ja_mean_accuracy": 0.6468801766979569, + "eval_en-ja_negative_mse": -0.18757328391075134, + "eval_en-ja_runtime": 16.2031, + "eval_en-ja_samples_per_second": 447.076, + "eval_en-ja_src2trg_accuracy": 0.6619271120927664, + "eval_en-ja_steps_per_second": 1.79, + "eval_en-ja_trg2src_accuracy": 0.6318332413031474, + "eval_sequential_score": 0.2296534463936028, + "step": 19000 + }, + { + "epoch": 2.2497275966663723, + "grad_norm": 0.00043820307473652065, + "learning_rate": 0.0002735323465824048, + "loss": 0.0013, + "step": 19100 + }, + { + "epoch": 2.2615072002827104, + "grad_norm": 0.0004597469815053046, + "learning_rate": 0.0002731165807417262, + "loss": 0.0013, + "step": 19200 + }, + { + "epoch": 2.273286803899049, + "grad_norm": 0.00047903601080179214, + "learning_rate": 0.0002727008149010477, + "loss": 0.0013, + "step": 19300 + }, + { + "epoch": 2.2850664075153873, + "grad_norm": 0.00043814408127218485, + "learning_rate": 0.00027228504906036914, + "loss": 0.0013, + "step": 19400 + }, + { + "epoch": 2.2968460111317253, + "grad_norm": 0.0004361189785413444, + "learning_rate": 0.00027186928321969064, + "loss": 0.0013, + "step": 19500 + }, + { + "epoch": 2.3086256147480637, + "grad_norm": 0.00047146808356046677, + "learning_rate": 0.0002714535173790121, + "loss": 0.0013, + "step": 19600 + }, + { + "epoch": 2.320405218364402, + "grad_norm": 0.00045028969179838896, + "learning_rate": 0.0002710377515383336, + "loss": 0.0013, + "step": 19700 + }, + { + "epoch": 2.33218482198074, + "grad_norm": 0.0004939438658766448, + "learning_rate": 0.00027062198569765503, + "loss": 0.0013, + "step": 19800 + }, + { + "epoch": 2.3439644255970786, + "grad_norm": 0.0004272513906471431, + "learning_rate": 0.00027020621985697653, + "loss": 0.0013, + "step": 19900 + }, + { + "epoch": 2.355744029213417, + "grad_norm": 0.00044389726826921105, + "learning_rate": 0.000269790454016298, + "loss": 0.0013, + "step": 20000 + }, + { + "epoch": 2.355744029213417, + "eval_en-ja_loss": 0.0013153383042663336, + "eval_en-ja_mean_accuracy": 0.6433600220872446, + "eval_en-ja_negative_mse": -0.1863013356924057, + "eval_en-ja_runtime": 15.8253, + "eval_en-ja_samples_per_second": 457.747, + "eval_en-ja_src2trg_accuracy": 0.6577857537272225, + "eval_en-ja_steps_per_second": 1.833, + "eval_en-ja_trg2src_accuracy": 0.6289342904472667, + "eval_sequential_score": 0.22852934319741947, + "step": 20000 + }, + { + "epoch": 2.367523632829755, + "grad_norm": 0.0004159809905104339, + "learning_rate": 0.0002693746881756195, + "loss": 0.0013, + "step": 20100 + }, + { + "epoch": 2.3793032364460935, + "grad_norm": 0.0004204251745250076, + "learning_rate": 0.0002689589223349409, + "loss": 0.0013, + "step": 20200 + }, + { + "epoch": 2.391082840062432, + "grad_norm": 0.00044815154979005456, + "learning_rate": 0.0002685431564942624, + "loss": 0.0013, + "step": 20300 + }, + { + "epoch": 2.40286244367877, + "grad_norm": 0.0005053160712122917, + "learning_rate": 0.00026812739065358387, + "loss": 0.0013, + "step": 20400 + }, + { + "epoch": 2.4146420472951085, + "grad_norm": 0.00046544845099560916, + "learning_rate": 0.00026771162481290537, + "loss": 0.0013, + "step": 20500 + }, + { + "epoch": 2.426421650911447, + "grad_norm": 0.00048804539255797863, + "learning_rate": 0.0002672958589722268, + "loss": 0.0013, + "step": 20600 + }, + { + "epoch": 2.438201254527785, + "grad_norm": 0.00045345915714278817, + "learning_rate": 0.0002668800931315483, + "loss": 0.0013, + "step": 20700 + }, + { + "epoch": 2.4499808581441234, + "grad_norm": 0.0004578677471727133, + "learning_rate": 0.00026646432729086976, + "loss": 0.0013, + "step": 20800 + }, + { + "epoch": 2.461760461760462, + "grad_norm": 0.00045993231469765306, + "learning_rate": 0.00026604856145019125, + "loss": 0.0013, + "step": 20900 + }, + { + "epoch": 2.4735400653768, + "grad_norm": 0.000428997038397938, + "learning_rate": 0.0002656327956095127, + "loss": 0.0013, + "step": 21000 + }, + { + "epoch": 2.4735400653768, + "eval_en-ja_loss": 0.0013059406774118543, + "eval_en-ja_mean_accuracy": 0.6488128106018773, + "eval_en-ja_negative_mse": -0.18556642532348633, + "eval_en-ja_runtime": 16.2458, + "eval_en-ja_samples_per_second": 445.901, + "eval_en-ja_src2trg_accuracy": 0.6630314743235781, + "eval_en-ja_steps_per_second": 1.785, + "eval_en-ja_trg2src_accuracy": 0.6345941468801767, + "eval_sequential_score": 0.2316231926391955, + "step": 21000 + }, + { + "epoch": 2.4853196689931383, + "grad_norm": 0.0004242642899043858, + "learning_rate": 0.00026521702976883415, + "loss": 0.0013, + "step": 21100 + }, + { + "epoch": 2.4970992726094767, + "grad_norm": 0.00046763860154896975, + "learning_rate": 0.00026480126392815564, + "loss": 0.0013, + "step": 21200 + }, + { + "epoch": 2.5088788762258147, + "grad_norm": 0.00039489008486270905, + "learning_rate": 0.0002643854980874771, + "loss": 0.0013, + "step": 21300 + }, + { + "epoch": 2.520658479842153, + "grad_norm": 0.0004224838921800256, + "learning_rate": 0.0002639697322467986, + "loss": 0.0013, + "step": 21400 + }, + { + "epoch": 2.5324380834584916, + "grad_norm": 0.00042137960554100573, + "learning_rate": 0.00026355396640612003, + "loss": 0.0013, + "step": 21500 + }, + { + "epoch": 2.54421768707483, + "grad_norm": 0.00045712743303738534, + "learning_rate": 0.00026313820056544153, + "loss": 0.0013, + "step": 21600 + }, + { + "epoch": 2.555997290691168, + "grad_norm": 0.0004368867084849626, + "learning_rate": 0.000262722434724763, + "loss": 0.0013, + "step": 21700 + }, + { + "epoch": 2.5677768943075066, + "grad_norm": 0.0004699106502812356, + "learning_rate": 0.0002623066688840845, + "loss": 0.0013, + "step": 21800 + }, + { + "epoch": 2.579556497923845, + "grad_norm": 0.00045353127643465996, + "learning_rate": 0.0002618909030434059, + "loss": 0.0013, + "step": 21900 + }, + { + "epoch": 2.591336101540183, + "grad_norm": 0.00043989199912175536, + "learning_rate": 0.0002614751372027274, + "loss": 0.0013, + "step": 22000 + }, + { + "epoch": 2.591336101540183, + "eval_en-ja_loss": 0.001299669616855681, + "eval_en-ja_mean_accuracy": 0.6485367200441745, + "eval_en-ja_negative_mse": -0.1850576549768448, + "eval_en-ja_runtime": 16.2221, + "eval_en-ja_samples_per_second": 446.55, + "eval_en-ja_src2trg_accuracy": 0.6639977912755384, + "eval_en-ja_steps_per_second": 1.788, + "eval_en-ja_trg2src_accuracy": 0.6330756488128106, + "eval_sequential_score": 0.23173953253366486, + "step": 22000 + }, + { + "epoch": 2.6031157051565215, + "grad_norm": 0.0004401329788379371, + "learning_rate": 0.00026105937136204887, + "loss": 0.0013, + "step": 22100 + }, + { + "epoch": 2.61489530877286, + "grad_norm": 0.00043456064304336905, + "learning_rate": 0.00026064360552137037, + "loss": 0.0013, + "step": 22200 + }, + { + "epoch": 2.6266749123891984, + "grad_norm": 0.00045636334107257426, + "learning_rate": 0.0002602278396806918, + "loss": 0.0013, + "step": 22300 + }, + { + "epoch": 2.6384545160055364, + "grad_norm": 0.00040117601747624576, + "learning_rate": 0.0002598120738400133, + "loss": 0.0013, + "step": 22400 + }, + { + "epoch": 2.650234119621875, + "grad_norm": 0.00039009543252177536, + "learning_rate": 0.00025939630799933476, + "loss": 0.0013, + "step": 22500 + }, + { + "epoch": 2.6620137232382133, + "grad_norm": 0.00041120583773590624, + "learning_rate": 0.00025898054215865626, + "loss": 0.0013, + "step": 22600 + }, + { + "epoch": 2.6737933268545513, + "grad_norm": 0.00045477598905563354, + "learning_rate": 0.0002585647763179777, + "loss": 0.0013, + "step": 22700 + }, + { + "epoch": 2.6855729304708897, + "grad_norm": 0.0003941941831726581, + "learning_rate": 0.0002581490104772992, + "loss": 0.0013, + "step": 22800 + }, + { + "epoch": 2.697352534087228, + "grad_norm": 0.0004140040837228298, + "learning_rate": 0.0002577332446366206, + "loss": 0.0013, + "step": 22900 + }, + { + "epoch": 2.709132137703566, + "grad_norm": 0.0004226738528814167, + "learning_rate": 0.0002573174787959421, + "loss": 0.0013, + "step": 23000 + }, + { + "epoch": 2.709132137703566, + "eval_en-ja_loss": 0.0012869955971837044, + "eval_en-ja_mean_accuracy": 0.6517807840971839, + "eval_en-ja_negative_mse": -0.18397435545921326, + "eval_en-ja_runtime": 15.8046, + "eval_en-ja_samples_per_second": 458.347, + "eval_en-ja_src2trg_accuracy": 0.6668967421314191, + "eval_en-ja_steps_per_second": 1.835, + "eval_en-ja_trg2src_accuracy": 0.6366648260629486, + "eval_sequential_score": 0.2339032143189853, + "step": 23000 + }, + { + "epoch": 2.7209117413199047, + "grad_norm": 0.00044005323434248567, + "learning_rate": 0.00025690171295526354, + "loss": 0.0013, + "step": 23100 + }, + { + "epoch": 2.732691344936243, + "grad_norm": 0.00042875733925029635, + "learning_rate": 0.00025648594711458504, + "loss": 0.0013, + "step": 23200 + }, + { + "epoch": 2.744470948552581, + "grad_norm": 0.0004314716497901827, + "learning_rate": 0.0002560701812739065, + "loss": 0.0013, + "step": 23300 + }, + { + "epoch": 2.7562505521689196, + "grad_norm": 0.0004338954167906195, + "learning_rate": 0.000255654415433228, + "loss": 0.0013, + "step": 23400 + }, + { + "epoch": 2.768030155785258, + "grad_norm": 0.000435475172707811, + "learning_rate": 0.0002552386495925494, + "loss": 0.0013, + "step": 23500 + }, + { + "epoch": 2.779809759401596, + "grad_norm": 0.0004142407269682735, + "learning_rate": 0.0002548228837518709, + "loss": 0.0013, + "step": 23600 + }, + { + "epoch": 2.7915893630179345, + "grad_norm": 0.000396355171687901, + "learning_rate": 0.00025440711791119237, + "loss": 0.0013, + "step": 23700 + }, + { + "epoch": 2.803368966634273, + "grad_norm": 0.0004667534085456282, + "learning_rate": 0.00025399135207051387, + "loss": 0.0013, + "step": 23800 + }, + { + "epoch": 2.815148570250611, + "grad_norm": 0.0004124873667024076, + "learning_rate": 0.0002535755862298353, + "loss": 0.0013, + "step": 23900 + }, + { + "epoch": 2.8269281738669494, + "grad_norm": 0.00040506647201254964, + "learning_rate": 0.0002531598203891568, + "loss": 0.0013, + "step": 24000 + }, + { + "epoch": 2.8269281738669494, + "eval_en-ja_loss": 0.0012779892422258854, + "eval_en-ja_mean_accuracy": 0.6546797349530646, + "eval_en-ja_negative_mse": -0.18347077071666718, + "eval_en-ja_runtime": 16.2483, + "eval_en-ja_samples_per_second": 445.831, + "eval_en-ja_src2trg_accuracy": 0.6711761457758144, + "eval_en-ja_steps_per_second": 1.785, + "eval_en-ja_trg2src_accuracy": 0.6381833241303148, + "eval_sequential_score": 0.23560448211819873, + "step": 24000 + }, + { + "epoch": 2.838707777483288, + "grad_norm": 0.00043864856706932187, + "learning_rate": 0.00025274405454847826, + "loss": 0.0013, + "step": 24100 + }, + { + "epoch": 2.850487381099626, + "grad_norm": 0.00041525359847582877, + "learning_rate": 0.00025232828870779976, + "loss": 0.0013, + "step": 24200 + }, + { + "epoch": 2.8622669847159643, + "grad_norm": 0.0004281564033590257, + "learning_rate": 0.0002519125228671212, + "loss": 0.0013, + "step": 24300 + }, + { + "epoch": 2.8740465883323028, + "grad_norm": 0.00040322105633094907, + "learning_rate": 0.0002514967570264427, + "loss": 0.0013, + "step": 24400 + }, + { + "epoch": 2.8858261919486408, + "grad_norm": 0.00043089487007819116, + "learning_rate": 0.00025108099118576415, + "loss": 0.0013, + "step": 24500 + }, + { + "epoch": 2.897605795564979, + "grad_norm": 0.0004425602965056896, + "learning_rate": 0.00025066522534508565, + "loss": 0.0013, + "step": 24600 + }, + { + "epoch": 2.9093853991813177, + "grad_norm": 0.0003883236204273999, + "learning_rate": 0.0002502494595044071, + "loss": 0.0013, + "step": 24700 + }, + { + "epoch": 2.9211650027976557, + "grad_norm": 0.0004450333653949201, + "learning_rate": 0.00024983369366372854, + "loss": 0.0013, + "step": 24800 + }, + { + "epoch": 2.932944606413994, + "grad_norm": 0.0004472383589018136, + "learning_rate": 0.00024941792782305004, + "loss": 0.0013, + "step": 24900 + }, + { + "epoch": 2.9447242100303326, + "grad_norm": 0.00044354653800837696, + "learning_rate": 0.0002490021619823715, + "loss": 0.0013, + "step": 25000 + }, + { + "epoch": 2.9447242100303326, + "eval_en-ja_loss": 0.0012722989777103066, + "eval_en-ja_mean_accuracy": 0.6518498067366096, + "eval_en-ja_negative_mse": -0.18285994231700897, + "eval_en-ja_runtime": 16.0974, + "eval_en-ja_samples_per_second": 450.01, + "eval_en-ja_src2trg_accuracy": 0.6668967421314191, + "eval_en-ja_steps_per_second": 1.802, + "eval_en-ja_trg2src_accuracy": 0.6368028713418001, + "eval_sequential_score": 0.23449493220980033, + "step": 25000 + }, + { + "epoch": 2.9565038136466706, + "grad_norm": 0.00044725442421622574, + "learning_rate": 0.000248586396141693, + "loss": 0.0013, + "step": 25100 + }, + { + "epoch": 2.968283417263009, + "grad_norm": 0.00042140312143601477, + "learning_rate": 0.00024817063030101443, + "loss": 0.0013, + "step": 25200 + }, + { + "epoch": 2.9800630208793475, + "grad_norm": 0.0004184370918665081, + "learning_rate": 0.00024775486446033593, + "loss": 0.0013, + "step": 25300 + }, + { + "epoch": 2.9918426244956855, + "grad_norm": 0.00042939913691952825, + "learning_rate": 0.0002473390986196574, + "loss": 0.0013, + "step": 25400 + }, + { + "epoch": 3.0035338810849015, + "grad_norm": 0.0004355109704192728, + "learning_rate": 0.0002469233327789789, + "loss": 0.0012, + "step": 25500 + }, + { + "epoch": 3.01531348470124, + "grad_norm": 0.0003800842387136072, + "learning_rate": 0.0002465075669383003, + "loss": 0.0013, + "step": 25600 + }, + { + "epoch": 3.027093088317578, + "grad_norm": 0.00040753348730504513, + "learning_rate": 0.0002460918010976218, + "loss": 0.0012, + "step": 25700 + }, + { + "epoch": 3.0388726919339164, + "grad_norm": 0.00040229971637018025, + "learning_rate": 0.00024567603525694326, + "loss": 0.0013, + "step": 25800 + }, + { + "epoch": 3.050652295550255, + "grad_norm": 0.0004158740339335054, + "learning_rate": 0.00024526026941626476, + "loss": 0.0012, + "step": 25900 + }, + { + "epoch": 3.062431899166593, + "grad_norm": 0.00043101966730318964, + "learning_rate": 0.0002448445035755862, + "loss": 0.0012, + "step": 26000 + }, + { + "epoch": 3.062431899166593, + "eval_en-ja_loss": 0.0012646203394979239, + "eval_en-ja_mean_accuracy": 0.6535753727222529, + "eval_en-ja_negative_mse": -0.18248549103736877, + "eval_en-ja_runtime": 16.8043, + "eval_en-ja_samples_per_second": 431.079, + "eval_en-ja_src2trg_accuracy": 0.6681391496410822, + "eval_en-ja_steps_per_second": 1.726, + "eval_en-ja_trg2src_accuracy": 0.6390115958034235, + "eval_sequential_score": 0.23554494084244204, + "step": 26000 + }, + { + "epoch": 3.0742115027829313, + "grad_norm": 0.00043358991388231516, + "learning_rate": 0.0002444287377349077, + "loss": 0.0012, + "step": 26100 + }, + { + "epoch": 3.0859911063992698, + "grad_norm": 0.0004287803894840181, + "learning_rate": 0.00024401297189422915, + "loss": 0.0012, + "step": 26200 + }, + { + "epoch": 3.0977707100156078, + "grad_norm": 0.00041712954407557845, + "learning_rate": 0.00024359720605355062, + "loss": 0.0012, + "step": 26300 + }, + { + "epoch": 3.1095503136319462, + "grad_norm": 0.0004254399973433465, + "learning_rate": 0.0002431814402128721, + "loss": 0.0012, + "step": 26400 + }, + { + "epoch": 3.1213299172482847, + "grad_norm": 0.00040419885772280395, + "learning_rate": 0.00024276567437219357, + "loss": 0.0012, + "step": 26500 + }, + { + "epoch": 3.1331095208646227, + "grad_norm": 0.00042851027683354914, + "learning_rate": 0.00024234990853151501, + "loss": 0.0012, + "step": 26600 + }, + { + "epoch": 3.144889124480961, + "grad_norm": 0.0004129032895434648, + "learning_rate": 0.0002419341426908365, + "loss": 0.0012, + "step": 26700 + }, + { + "epoch": 3.1566687280972996, + "grad_norm": 0.0004250045749358833, + "learning_rate": 0.00024151837685015796, + "loss": 0.0012, + "step": 26800 + }, + { + "epoch": 3.168448331713638, + "grad_norm": 0.00040028098737820983, + "learning_rate": 0.00024110261100947943, + "loss": 0.0012, + "step": 26900 + }, + { + "epoch": 3.180227935329976, + "grad_norm": 0.00039669900434091687, + "learning_rate": 0.0002406868451688009, + "loss": 0.0012, + "step": 27000 + }, + { + "epoch": 3.180227935329976, + "eval_en-ja_loss": 0.0012597093591466546, + "eval_en-ja_mean_accuracy": 0.656888459414688, + "eval_en-ja_negative_mse": -0.18206669390201569, + "eval_en-ja_runtime": 16.3707, + "eval_en-ja_samples_per_second": 442.497, + "eval_en-ja_src2trg_accuracy": 0.6691054665930425, + "eval_en-ja_steps_per_second": 1.771, + "eval_en-ja_trg2src_accuracy": 0.6446714522363335, + "eval_sequential_score": 0.23741088275633615, + "step": 27000 + }, + { + "epoch": 3.1920075389463145, + "grad_norm": 0.000373284361558035, + "learning_rate": 0.00024027107932812238, + "loss": 0.0012, + "step": 27100 + }, + { + "epoch": 3.203787142562653, + "grad_norm": 0.00043669273145496845, + "learning_rate": 0.00023985531348744385, + "loss": 0.0012, + "step": 27200 + }, + { + "epoch": 3.215566746178991, + "grad_norm": 0.0004902679356746376, + "learning_rate": 0.00023943954764676532, + "loss": 0.0012, + "step": 27300 + }, + { + "epoch": 3.2273463497953294, + "grad_norm": 0.00042229739483445883, + "learning_rate": 0.0002390237818060868, + "loss": 0.0012, + "step": 27400 + }, + { + "epoch": 3.239125953411668, + "grad_norm": 0.0003798280085902661, + "learning_rate": 0.00023860801596540827, + "loss": 0.0012, + "step": 27500 + }, + { + "epoch": 3.250905557028006, + "grad_norm": 0.00037951284321025014, + "learning_rate": 0.00023819225012472974, + "loss": 0.0012, + "step": 27600 + }, + { + "epoch": 3.2626851606443443, + "grad_norm": 0.00042725802632048726, + "learning_rate": 0.0002377764842840512, + "loss": 0.0012, + "step": 27700 + }, + { + "epoch": 3.274464764260683, + "grad_norm": 0.00043336834642104805, + "learning_rate": 0.00023736071844337268, + "loss": 0.0012, + "step": 27800 + }, + { + "epoch": 3.286244367877021, + "grad_norm": 0.00040534150321036577, + "learning_rate": 0.00023694495260269415, + "loss": 0.0012, + "step": 27900 + }, + { + "epoch": 3.2980239714933592, + "grad_norm": 0.0004134470073040575, + "learning_rate": 0.00023652918676201563, + "loss": 0.0012, + "step": 28000 + }, + { + "epoch": 3.2980239714933592, + "eval_en-ja_loss": 0.0012517004506662488, + "eval_en-ja_mean_accuracy": 0.6595803423522916, + "eval_en-ja_negative_mse": -0.1813642680644989, + "eval_en-ja_runtime": 16.0819, + "eval_en-ja_samples_per_second": 450.444, + "eval_en-ja_src2trg_accuracy": 0.6742131419105467, + "eval_en-ja_steps_per_second": 1.803, + "eval_en-ja_trg2src_accuracy": 0.6449475427940364, + "eval_sequential_score": 0.23910803714389633, + "step": 28000 + }, + { + "epoch": 3.3098035751096977, + "grad_norm": 0.00041689653880894184, + "learning_rate": 0.0002361134209213371, + "loss": 0.0012, + "step": 28100 + }, + { + "epoch": 3.3215831787260357, + "grad_norm": 0.0004240849521011114, + "learning_rate": 0.00023569765508065857, + "loss": 0.0012, + "step": 28200 + }, + { + "epoch": 3.333362782342374, + "grad_norm": 0.0004039300256408751, + "learning_rate": 0.00023528188923998004, + "loss": 0.0012, + "step": 28300 + }, + { + "epoch": 3.3451423859587126, + "grad_norm": 0.0003837968106381595, + "learning_rate": 0.00023486612339930152, + "loss": 0.0012, + "step": 28400 + }, + { + "epoch": 3.3569219895750506, + "grad_norm": 0.0004096463380847126, + "learning_rate": 0.00023445035755862293, + "loss": 0.0012, + "step": 28500 + }, + { + "epoch": 3.368701593191389, + "grad_norm": 0.00042112142546102405, + "learning_rate": 0.0002340345917179444, + "loss": 0.0012, + "step": 28600 + }, + { + "epoch": 3.3804811968077275, + "grad_norm": 0.000393895898014307, + "learning_rate": 0.00023361882587726588, + "loss": 0.0012, + "step": 28700 + }, + { + "epoch": 3.392260800424066, + "grad_norm": 0.0004040790081489831, + "learning_rate": 0.00023320306003658735, + "loss": 0.0012, + "step": 28800 + }, + { + "epoch": 3.404040404040404, + "grad_norm": 0.00040151781286112964, + "learning_rate": 0.00023278729419590882, + "loss": 0.0012, + "step": 28900 + }, + { + "epoch": 3.4158200076567424, + "grad_norm": 0.0004011776181869209, + "learning_rate": 0.0002323715283552303, + "loss": 0.0012, + "step": 29000 + }, + { + "epoch": 3.4158200076567424, + "eval_en-ja_loss": 0.0012456197291612625, + "eval_en-ja_mean_accuracy": 0.6586140254003313, + "eval_en-ja_negative_mse": -0.18090644478797913, + "eval_en-ja_runtime": 15.8859, + "eval_en-ja_samples_per_second": 456.003, + "eval_en-ja_src2trg_accuracy": 0.6746272777471011, + "eval_en-ja_steps_per_second": 1.826, + "eval_en-ja_trg2src_accuracy": 0.6426007730535616, + "eval_sequential_score": 0.2388537903061761, + "step": 29000 + }, + { + "epoch": 3.427599611273081, + "grad_norm": 0.00037304588477127254, + "learning_rate": 0.00023195576251455177, + "loss": 0.0012, + "step": 29100 + }, + { + "epoch": 3.439379214889419, + "grad_norm": 0.00042709140689112246, + "learning_rate": 0.00023153999667387324, + "loss": 0.0012, + "step": 29200 + }, + { + "epoch": 3.4511588185057573, + "grad_norm": 0.0003809314512182027, + "learning_rate": 0.0002311242308331947, + "loss": 0.0012, + "step": 29300 + }, + { + "epoch": 3.462938422122096, + "grad_norm": 0.00043137872125953436, + "learning_rate": 0.00023070846499251619, + "loss": 0.0012, + "step": 29400 + }, + { + "epoch": 3.474718025738434, + "grad_norm": 0.00039477323298342526, + "learning_rate": 0.00023029269915183766, + "loss": 0.0012, + "step": 29500 + }, + { + "epoch": 3.4864976293547723, + "grad_norm": 0.0004039652703795582, + "learning_rate": 0.00022987693331115913, + "loss": 0.0012, + "step": 29600 + }, + { + "epoch": 3.4982772329711107, + "grad_norm": 0.0004246470925863832, + "learning_rate": 0.0002294611674704806, + "loss": 0.0012, + "step": 29700 + }, + { + "epoch": 3.5100568365874487, + "grad_norm": 0.00042530911741778255, + "learning_rate": 0.00022904540162980207, + "loss": 0.0012, + "step": 29800 + }, + { + "epoch": 3.521836440203787, + "grad_norm": 0.00041012169094756246, + "learning_rate": 0.00022862963578912355, + "loss": 0.0012, + "step": 29900 + }, + { + "epoch": 3.5336160438201256, + "grad_norm": 0.0003883550816681236, + "learning_rate": 0.00022821386994844502, + "loss": 0.0012, + "step": 30000 + }, + { + "epoch": 3.5336160438201256, + "eval_en-ja_loss": 0.0012427582405507565, + "eval_en-ja_mean_accuracy": 0.6602015461071231, + "eval_en-ja_negative_mse": -0.1808730661869049, + "eval_en-ja_runtime": 16.3352, + "eval_en-ja_samples_per_second": 443.46, + "eval_en-ja_src2trg_accuracy": 0.6739370513528438, + "eval_en-ja_steps_per_second": 1.775, + "eval_en-ja_trg2src_accuracy": 0.6464660408614026, + "eval_sequential_score": 0.2396642399601091, + "step": 30000 + }, + { + "epoch": 3.5453956474364636, + "grad_norm": 0.00040749143227003515, + "learning_rate": 0.0002277981041077665, + "loss": 0.0012, + "step": 30100 + }, + { + "epoch": 3.557175251052802, + "grad_norm": 0.0004207154852338135, + "learning_rate": 0.00022738233826708796, + "loss": 0.0012, + "step": 30200 + }, + { + "epoch": 3.5689548546691405, + "grad_norm": 0.00041166969458572567, + "learning_rate": 0.0002269665724264094, + "loss": 0.0012, + "step": 30300 + }, + { + "epoch": 3.5807344582854785, + "grad_norm": 0.0004028420662507415, + "learning_rate": 0.00022655080658573088, + "loss": 0.0012, + "step": 30400 + }, + { + "epoch": 3.592514061901817, + "grad_norm": 0.00039729077252559364, + "learning_rate": 0.00022613504074505235, + "loss": 0.0012, + "step": 30500 + }, + { + "epoch": 3.6042936655181554, + "grad_norm": 0.00039063766598701477, + "learning_rate": 0.00022571927490437383, + "loss": 0.0012, + "step": 30600 + }, + { + "epoch": 3.6160732691344935, + "grad_norm": 0.0004220531554892659, + "learning_rate": 0.0002253035090636953, + "loss": 0.0012, + "step": 30700 + }, + { + "epoch": 3.627852872750832, + "grad_norm": 0.00041758501902222633, + "learning_rate": 0.00022488774322301677, + "loss": 0.0012, + "step": 30800 + }, + { + "epoch": 3.6396324763671704, + "grad_norm": 0.00042649926035664976, + "learning_rate": 0.00022447197738233824, + "loss": 0.0012, + "step": 30900 + }, + { + "epoch": 3.6514120799835084, + "grad_norm": 0.000397971278289333, + "learning_rate": 0.00022405621154165972, + "loss": 0.0012, + "step": 31000 + }, + { + "epoch": 3.6514120799835084, + "eval_en-ja_loss": 0.0012356024235486984, + "eval_en-ja_mean_accuracy": 0.6612368856985091, + "eval_en-ja_negative_mse": -0.17998678982257843, + "eval_en-ja_runtime": 16.6594, + "eval_en-ja_samples_per_second": 434.829, + "eval_en-ja_src2trg_accuracy": 0.6762838210933186, + "eval_en-ja_steps_per_second": 1.741, + "eval_en-ja_trg2src_accuracy": 0.6461899503036996, + "eval_sequential_score": 0.24062504793796535, + "step": 31000 + }, + { + "epoch": 3.663191683599847, + "grad_norm": 0.0003889747313223779, + "learning_rate": 0.0002236404457009812, + "loss": 0.0012, + "step": 31100 + }, + { + "epoch": 3.6749712872161853, + "grad_norm": 0.0004040562198497355, + "learning_rate": 0.00022322467986030266, + "loss": 0.0012, + "step": 31200 + }, + { + "epoch": 3.6867508908325233, + "grad_norm": 0.0003806521708611399, + "learning_rate": 0.00022280891401962413, + "loss": 0.0012, + "step": 31300 + }, + { + "epoch": 3.6985304944488617, + "grad_norm": 0.00042317816405557096, + "learning_rate": 0.0002223931481789456, + "loss": 0.0012, + "step": 31400 + }, + { + "epoch": 3.7103100980652, + "grad_norm": 0.0004032641008961946, + "learning_rate": 0.00022197738233826708, + "loss": 0.0012, + "step": 31500 + }, + { + "epoch": 3.722089701681538, + "grad_norm": 0.00039092215592972934, + "learning_rate": 0.00022156161649758855, + "loss": 0.0012, + "step": 31600 + }, + { + "epoch": 3.7338693052978766, + "grad_norm": 0.0004013598372694105, + "learning_rate": 0.00022114585065691002, + "loss": 0.0012, + "step": 31700 + }, + { + "epoch": 3.745648908914215, + "grad_norm": 0.0004221323470119387, + "learning_rate": 0.0002207300848162315, + "loss": 0.0012, + "step": 31800 + }, + { + "epoch": 3.757428512530553, + "grad_norm": 0.00042156135896220803, + "learning_rate": 0.00022031431897555297, + "loss": 0.0012, + "step": 31900 + }, + { + "epoch": 3.7692081161468916, + "grad_norm": 0.00043856134288944304, + "learning_rate": 0.00021989855313487444, + "loss": 0.0012, + "step": 32000 + }, + { + "epoch": 3.7692081161468916, + "eval_en-ja_loss": 0.0012327299918979406, + "eval_en-ja_mean_accuracy": 0.6630314743235781, + "eval_en-ja_negative_mse": -0.1799149215221405, + "eval_en-ja_runtime": 16.368, + "eval_en-ja_samples_per_second": 442.57, + "eval_en-ja_src2trg_accuracy": 0.6778023191606847, + "eval_en-ja_steps_per_second": 1.772, + "eval_en-ja_trg2src_accuracy": 0.6482606294864716, + "eval_sequential_score": 0.2415582764007188, + "step": 32000 + }, + { + "epoch": 3.78098771976323, + "grad_norm": 0.00044048833660781384, + "learning_rate": 0.0002194827872941959, + "loss": 0.0012, + "step": 32100 + }, + { + "epoch": 3.792767323379568, + "grad_norm": 0.00042770113213919103, + "learning_rate": 0.00021906702145351736, + "loss": 0.0012, + "step": 32200 + }, + { + "epoch": 3.8045469269959065, + "grad_norm": 0.00039903837023302913, + "learning_rate": 0.00021865125561283883, + "loss": 0.0012, + "step": 32300 + }, + { + "epoch": 3.816326530612245, + "grad_norm": 0.00040695304051041603, + "learning_rate": 0.0002182354897721603, + "loss": 0.0012, + "step": 32400 + }, + { + "epoch": 3.8281061342285834, + "grad_norm": 0.0004184556019026786, + "learning_rate": 0.00021781972393148177, + "loss": 0.0012, + "step": 32500 + }, + { + "epoch": 3.8398857378449214, + "grad_norm": 0.00042907221359200776, + "learning_rate": 0.00021740395809080322, + "loss": 0.0012, + "step": 32600 + }, + { + "epoch": 3.85166534146126, + "grad_norm": 0.0003954211715608835, + "learning_rate": 0.0002169881922501247, + "loss": 0.0012, + "step": 32700 + }, + { + "epoch": 3.8634449450775983, + "grad_norm": 0.0003964339557569474, + "learning_rate": 0.00021657242640944616, + "loss": 0.0012, + "step": 32800 + }, + { + "epoch": 3.8752245486939363, + "grad_norm": 0.00038019847124814987, + "learning_rate": 0.00021615666056876764, + "loss": 0.0012, + "step": 32900 + }, + { + "epoch": 3.8870041523102747, + "grad_norm": 0.0003944500058423728, + "learning_rate": 0.0002157408947280891, + "loss": 0.0012, + "step": 33000 + }, + { + "epoch": 3.8870041523102747, + "eval_en-ja_loss": 0.0012272456660866737, + "eval_en-ja_mean_accuracy": 0.6631695196024296, + "eval_en-ja_negative_mse": -0.17947280406951904, + "eval_en-ja_runtime": 16.4811, + "eval_en-ja_samples_per_second": 439.535, + "eval_en-ja_src2trg_accuracy": 0.676697956929873, + "eval_en-ja_steps_per_second": 1.76, + "eval_en-ja_trg2src_accuracy": 0.6496410822749862, + "eval_sequential_score": 0.2418483577664553, + "step": 33000 + }, + { + "epoch": 3.898783755926613, + "grad_norm": 0.00040569185512140393, + "learning_rate": 0.00021532512888741058, + "loss": 0.0012, + "step": 33100 + }, + { + "epoch": 3.9105633595429516, + "grad_norm": 0.00040382929728366435, + "learning_rate": 0.00021490936304673205, + "loss": 0.0012, + "step": 33200 + }, + { + "epoch": 3.9223429631592897, + "grad_norm": 0.0004069951828569174, + "learning_rate": 0.00021449359720605352, + "loss": 0.0012, + "step": 33300 + }, + { + "epoch": 3.934122566775628, + "grad_norm": 0.0003631413565017283, + "learning_rate": 0.000214077831365375, + "loss": 0.0012, + "step": 33400 + }, + { + "epoch": 3.9459021703919666, + "grad_norm": 0.00043017862481065094, + "learning_rate": 0.00021366206552469647, + "loss": 0.0012, + "step": 33500 + }, + { + "epoch": 3.9576817740083046, + "grad_norm": 0.0004062317020725459, + "learning_rate": 0.00021324629968401794, + "loss": 0.0012, + "step": 33600 + }, + { + "epoch": 3.969461377624643, + "grad_norm": 0.0004248351906426251, + "learning_rate": 0.00021283053384333941, + "loss": 0.0012, + "step": 33700 + }, + { + "epoch": 3.9812409812409815, + "grad_norm": 0.00038846390089020133, + "learning_rate": 0.0002124147680026609, + "loss": 0.0012, + "step": 33800 + }, + { + "epoch": 3.9930205848573195, + "grad_norm": 0.00041310692904517055, + "learning_rate": 0.00021199900216198236, + "loss": 0.0012, + "step": 33900 + }, + { + "epoch": 4.004711841446535, + "grad_norm": 0.00039199512684717774, + "learning_rate": 0.00021158323632130383, + "loss": 0.0012, + "step": 34000 + }, + { + "epoch": 4.004711841446535, + "eval_en-ja_loss": 0.0012215257156640291, + "eval_en-ja_mean_accuracy": 0.6642738818332413, + "eval_en-ja_negative_mse": -0.17887508869171143, + "eval_en-ja_runtime": 15.9288, + "eval_en-ja_samples_per_second": 454.774, + "eval_en-ja_src2trg_accuracy": 0.6786305908337935, + "eval_en-ja_steps_per_second": 1.821, + "eval_en-ja_trg2src_accuracy": 0.6499171728326891, + "eval_sequential_score": 0.24269939657076495, + "step": 34000 + }, + { + "epoch": 4.016491445062874, + "grad_norm": 0.00038304130430333316, + "learning_rate": 0.00021116747048062528, + "loss": 0.0012, + "step": 34100 + }, + { + "epoch": 4.028271048679212, + "grad_norm": 0.00038354500429704785, + "learning_rate": 0.00021075170463994675, + "loss": 0.0012, + "step": 34200 + }, + { + "epoch": 4.04005065229555, + "grad_norm": 0.00041586393490433693, + "learning_rate": 0.00021033593879926822, + "loss": 0.0012, + "step": 34300 + }, + { + "epoch": 4.051830255911889, + "grad_norm": 0.00042095009121112525, + "learning_rate": 0.0002099201729585897, + "loss": 0.0012, + "step": 34400 + }, + { + "epoch": 4.063609859528227, + "grad_norm": 0.000394458940718323, + "learning_rate": 0.00020950440711791117, + "loss": 0.0012, + "step": 34500 + }, + { + "epoch": 4.075389463144565, + "grad_norm": 0.00041008874541148543, + "learning_rate": 0.00020908864127723264, + "loss": 0.0012, + "step": 34600 + }, + { + "epoch": 4.087169066760904, + "grad_norm": 0.00039519424899481237, + "learning_rate": 0.0002086728754365541, + "loss": 0.0012, + "step": 34700 + }, + { + "epoch": 4.098948670377242, + "grad_norm": 0.0004004191723652184, + "learning_rate": 0.00020825710959587558, + "loss": 0.0012, + "step": 34800 + }, + { + "epoch": 4.11072827399358, + "grad_norm": 0.0004183524288237095, + "learning_rate": 0.00020784134375519706, + "loss": 0.0012, + "step": 34900 + }, + { + "epoch": 4.122507877609919, + "grad_norm": 0.000399937474867329, + "learning_rate": 0.00020742557791451853, + "loss": 0.0012, + "step": 35000 + }, + { + "epoch": 4.122507877609919, + "eval_en-ja_loss": 0.0012192502617835999, + "eval_en-ja_mean_accuracy": 0.6647570403092213, + "eval_en-ja_negative_mse": -0.17876982688903809, + "eval_en-ja_runtime": 16.6606, + "eval_en-ja_samples_per_second": 434.797, + "eval_en-ja_src2trg_accuracy": 0.6794588625069022, + "eval_en-ja_steps_per_second": 1.741, + "eval_en-ja_trg2src_accuracy": 0.6500552181115405, + "eval_sequential_score": 0.24299360671009163, + "step": 35000 + }, + { + "epoch": 4.134287481226257, + "grad_norm": 0.0003754813806153834, + "learning_rate": 0.00020700981207384, + "loss": 0.0012, + "step": 35100 + }, + { + "epoch": 4.146067084842595, + "grad_norm": 0.0004425659717526287, + "learning_rate": 0.00020659404623316147, + "loss": 0.0012, + "step": 35200 + }, + { + "epoch": 4.157846688458934, + "grad_norm": 0.0003931986866518855, + "learning_rate": 0.00020617828039248294, + "loss": 0.0012, + "step": 35300 + }, + { + "epoch": 4.169626292075272, + "grad_norm": 0.000405570724979043, + "learning_rate": 0.00020576251455180442, + "loss": 0.0012, + "step": 35400 + }, + { + "epoch": 4.18140589569161, + "grad_norm": 0.000407026702305302, + "learning_rate": 0.0002053467487111259, + "loss": 0.0012, + "step": 35500 + }, + { + "epoch": 4.1931854993079485, + "grad_norm": 0.00041934460750781, + "learning_rate": 0.00020493098287044736, + "loss": 0.0012, + "step": 35600 + }, + { + "epoch": 4.2049651029242865, + "grad_norm": 0.00039289152482524514, + "learning_rate": 0.00020451521702976883, + "loss": 0.0012, + "step": 35700 + }, + { + "epoch": 4.2167447065406245, + "grad_norm": 0.00039810172165744007, + "learning_rate": 0.0002040994511890903, + "loss": 0.0012, + "step": 35800 + }, + { + "epoch": 4.228524310156963, + "grad_norm": 0.0003941806498914957, + "learning_rate": 0.00020368368534841175, + "loss": 0.0012, + "step": 35900 + }, + { + "epoch": 4.240303913773301, + "grad_norm": 0.0004113887553103268, + "learning_rate": 0.00020326791950773322, + "loss": 0.0012, + "step": 36000 + }, + { + "epoch": 4.240303913773301, + "eval_en-ja_loss": 0.0012173178838565946, + "eval_en-ja_mean_accuracy": 0.6628934290447267, + "eval_en-ja_negative_mse": -0.17866188287734985, + "eval_en-ja_runtime": 16.3574, + "eval_en-ja_samples_per_second": 442.857, + "eval_en-ja_src2trg_accuracy": 0.6776642738818333, + "eval_en-ja_steps_per_second": 1.773, + "eval_en-ja_trg2src_accuracy": 0.6481225842076201, + "eval_sequential_score": 0.24211577308368842, + "step": 36000 + }, + { + "epoch": 4.252083517389639, + "grad_norm": 0.00036664155777543783, + "learning_rate": 0.0002028521536670547, + "loss": 0.0012, + "step": 36100 + }, + { + "epoch": 4.263863121005978, + "grad_norm": 0.0003922642208635807, + "learning_rate": 0.00020243638782637617, + "loss": 0.0012, + "step": 36200 + }, + { + "epoch": 4.275642724622316, + "grad_norm": 0.0003725190181285143, + "learning_rate": 0.00020202062198569764, + "loss": 0.0012, + "step": 36300 + }, + { + "epoch": 4.287422328238655, + "grad_norm": 0.0004184598510619253, + "learning_rate": 0.0002016048561450191, + "loss": 0.0012, + "step": 36400 + }, + { + "epoch": 4.299201931854993, + "grad_norm": 0.0004114755429327488, + "learning_rate": 0.00020118909030434059, + "loss": 0.0012, + "step": 36500 + }, + { + "epoch": 4.310981535471331, + "grad_norm": 0.0003939278540201485, + "learning_rate": 0.00020077332446366206, + "loss": 0.0012, + "step": 36600 + }, + { + "epoch": 4.322761139087669, + "grad_norm": 0.00038292151293717325, + "learning_rate": 0.0002003575586229835, + "loss": 0.0012, + "step": 36700 + }, + { + "epoch": 4.334540742704008, + "grad_norm": 0.0003719251253642142, + "learning_rate": 0.00019994179278230498, + "loss": 0.0012, + "step": 36800 + }, + { + "epoch": 4.346320346320346, + "grad_norm": 0.000418243434978649, + "learning_rate": 0.00019952602694162645, + "loss": 0.0012, + "step": 36900 + }, + { + "epoch": 4.358099949936685, + "grad_norm": 0.00039279472548514605, + "learning_rate": 0.00019911026110094792, + "loss": 0.0012, + "step": 37000 + }, + { + "epoch": 4.358099949936685, + "eval_en-ja_loss": 0.001213202252984047, + "eval_en-ja_mean_accuracy": 0.6642048591938156, + "eval_en-ja_negative_mse": -0.17826373875141144, + "eval_en-ja_runtime": 16.0311, + "eval_en-ja_samples_per_second": 451.872, + "eval_en-ja_src2trg_accuracy": 0.6787686361126449, + "eval_en-ja_steps_per_second": 1.809, + "eval_en-ja_trg2src_accuracy": 0.6496410822749862, + "eval_sequential_score": 0.24297056022120206, + "step": 37000 + }, + { + "epoch": 4.369879553553023, + "grad_norm": 0.00037332347710616887, + "learning_rate": 0.0001986944952602694, + "loss": 0.0012, + "step": 37100 + }, + { + "epoch": 4.381659157169361, + "grad_norm": 0.00040860893204808235, + "learning_rate": 0.00019827872941959086, + "loss": 0.0012, + "step": 37200 + }, + { + "epoch": 4.3934387607857, + "grad_norm": 0.00038928238791413605, + "learning_rate": 0.00019786296357891234, + "loss": 0.0012, + "step": 37300 + }, + { + "epoch": 4.405218364402038, + "grad_norm": 0.0004379466117825359, + "learning_rate": 0.0001974471977382338, + "loss": 0.0012, + "step": 37400 + }, + { + "epoch": 4.416997968018376, + "grad_norm": 0.000410744221881032, + "learning_rate": 0.00019703143189755528, + "loss": 0.0012, + "step": 37500 + }, + { + "epoch": 4.428777571634715, + "grad_norm": 0.0004211909254081547, + "learning_rate": 0.00019661566605687675, + "loss": 0.0012, + "step": 37600 + }, + { + "epoch": 4.440557175251053, + "grad_norm": 0.00037585542304441333, + "learning_rate": 0.00019619990021619823, + "loss": 0.0012, + "step": 37700 + }, + { + "epoch": 4.452336778867391, + "grad_norm": 0.00038279069121927023, + "learning_rate": 0.00019578413437551967, + "loss": 0.0012, + "step": 37800 + }, + { + "epoch": 4.46411638248373, + "grad_norm": 0.00039621832547709346, + "learning_rate": 0.00019536836853484114, + "loss": 0.0012, + "step": 37900 + }, + { + "epoch": 4.475895986100068, + "grad_norm": 0.0004314736579544842, + "learning_rate": 0.00019495260269416262, + "loss": 0.0012, + "step": 38000 + }, + { + "epoch": 4.475895986100068, + "eval_en-ja_loss": 0.0012103316839784384, + "eval_en-ja_mean_accuracy": 0.665378244064053, + "eval_en-ja_negative_mse": -0.17813009023666382, + "eval_en-ja_runtime": 16.0967, + "eval_en-ja_samples_per_second": 450.03, + "eval_en-ja_src2trg_accuracy": 0.6804251794588625, + "eval_en-ja_steps_per_second": 1.802, + "eval_en-ja_trg2src_accuracy": 0.6503313086692435, + "eval_sequential_score": 0.2436240769136946, + "step": 38000 + }, + { + "epoch": 4.487675589716406, + "grad_norm": 0.0004273569502402097, + "learning_rate": 0.0001945368368534841, + "loss": 0.0012, + "step": 38100 + }, + { + "epoch": 4.499455193332745, + "grad_norm": 0.00037945678923279047, + "learning_rate": 0.00019412107101280556, + "loss": 0.0012, + "step": 38200 + }, + { + "epoch": 4.511234796949083, + "grad_norm": 0.0004015901358798146, + "learning_rate": 0.00019370530517212703, + "loss": 0.0012, + "step": 38300 + }, + { + "epoch": 4.523014400565421, + "grad_norm": 0.0003696521744132042, + "learning_rate": 0.0001932895393314485, + "loss": 0.0012, + "step": 38400 + }, + { + "epoch": 4.53479400418176, + "grad_norm": 0.0004108089196961373, + "learning_rate": 0.00019287377349076998, + "loss": 0.0012, + "step": 38500 + }, + { + "epoch": 4.546573607798098, + "grad_norm": 0.00040406297193840146, + "learning_rate": 0.00019245800765009145, + "loss": 0.0012, + "step": 38600 + }, + { + "epoch": 4.558353211414436, + "grad_norm": 0.0003694118349812925, + "learning_rate": 0.00019204224180941292, + "loss": 0.0012, + "step": 38700 + }, + { + "epoch": 4.5701328150307745, + "grad_norm": 0.00039542344165965915, + "learning_rate": 0.0001916264759687344, + "loss": 0.0012, + "step": 38800 + }, + { + "epoch": 4.5819124186471125, + "grad_norm": 0.00037531484849750996, + "learning_rate": 0.00019121071012805587, + "loss": 0.0012, + "step": 38900 + }, + { + "epoch": 4.5936920222634505, + "grad_norm": 0.00040082831401377916, + "learning_rate": 0.00019079494428737734, + "loss": 0.0012, + "step": 39000 + }, + { + "epoch": 4.5936920222634505, + "eval_en-ja_loss": 0.0012068564537912607, + "eval_en-ja_mean_accuracy": 0.6677250138045279, + "eval_en-ja_negative_mse": -0.17787422239780426, + "eval_en-ja_runtime": 15.7089, + "eval_en-ja_samples_per_second": 461.141, + "eval_en-ja_src2trg_accuracy": 0.6812534511319713, + "eval_en-ja_steps_per_second": 1.846, + "eval_en-ja_trg2src_accuracy": 0.6541965764770845, + "eval_sequential_score": 0.2449253957033618, + "step": 39000 + }, + { + "epoch": 4.605471625879789, + "grad_norm": 0.00040375528624281287, + "learning_rate": 0.0001903791784466988, + "loss": 0.0012, + "step": 39100 + }, + { + "epoch": 4.617251229496127, + "grad_norm": 0.0003803670988418162, + "learning_rate": 0.00018996341260602028, + "loss": 0.0012, + "step": 39200 + }, + { + "epoch": 4.629030833112465, + "grad_norm": 0.00040498352609574795, + "learning_rate": 0.00018954764676534176, + "loss": 0.0012, + "step": 39300 + }, + { + "epoch": 4.640810436728804, + "grad_norm": 0.00039148807991296053, + "learning_rate": 0.00018913188092466323, + "loss": 0.0012, + "step": 39400 + }, + { + "epoch": 4.652590040345142, + "grad_norm": 0.00041940875235013664, + "learning_rate": 0.0001887161150839847, + "loss": 0.0012, + "step": 39500 + }, + { + "epoch": 4.66436964396148, + "grad_norm": 0.0003799602563958615, + "learning_rate": 0.00018830034924330617, + "loss": 0.0012, + "step": 39600 + }, + { + "epoch": 4.676149247577819, + "grad_norm": 0.0003974252031184733, + "learning_rate": 0.00018788458340262762, + "loss": 0.0012, + "step": 39700 + }, + { + "epoch": 4.687928851194157, + "grad_norm": 0.0003942203475162387, + "learning_rate": 0.0001874688175619491, + "loss": 0.0012, + "step": 39800 + }, + { + "epoch": 4.699708454810495, + "grad_norm": 0.00040850575896911323, + "learning_rate": 0.00018705305172127056, + "loss": 0.0012, + "step": 39900 + }, + { + "epoch": 4.711488058426834, + "grad_norm": 0.0003978704335168004, + "learning_rate": 0.00018663728588059204, + "loss": 0.0012, + "step": 40000 + }, + { + "epoch": 4.711488058426834, + "eval_en-ja_loss": 0.0012028900673612952, + "eval_en-ja_mean_accuracy": 0.6631004969630039, + "eval_en-ja_negative_mse": -0.17763866484165192, + "eval_en-ja_runtime": 16.2435, + "eval_en-ja_samples_per_second": 445.964, + "eval_en-ja_src2trg_accuracy": 0.6771120927664274, + "eval_en-ja_steps_per_second": 1.785, + "eval_en-ja_trg2src_accuracy": 0.6490889011595803, + "eval_sequential_score": 0.24273091606067598, + "step": 40000 + }, + { + "epoch": 4.723267662043172, + "grad_norm": 0.0004426835512276739, + "learning_rate": 0.0001862215200399135, + "loss": 0.0012, + "step": 40100 + }, + { + "epoch": 4.73504726565951, + "grad_norm": 0.00042884275899268687, + "learning_rate": 0.00018580575419923498, + "loss": 0.0012, + "step": 40200 + }, + { + "epoch": 4.746826869275849, + "grad_norm": 0.0004092632734682411, + "learning_rate": 0.00018538998835855645, + "loss": 0.0012, + "step": 40300 + }, + { + "epoch": 4.758606472892187, + "grad_norm": 0.0003698091022670269, + "learning_rate": 0.00018497422251787792, + "loss": 0.0012, + "step": 40400 + }, + { + "epoch": 4.770386076508526, + "grad_norm": 0.00040019475272856653, + "learning_rate": 0.0001845584566771994, + "loss": 0.0012, + "step": 40500 + }, + { + "epoch": 4.782165680124864, + "grad_norm": 0.00038194056833162904, + "learning_rate": 0.00018414269083652087, + "loss": 0.0012, + "step": 40600 + }, + { + "epoch": 4.793945283741202, + "grad_norm": 0.0004225564480293542, + "learning_rate": 0.00018372692499584234, + "loss": 0.0012, + "step": 40700 + }, + { + "epoch": 4.80572488735754, + "grad_norm": 0.0004254919185768813, + "learning_rate": 0.00018331115915516381, + "loss": 0.0012, + "step": 40800 + }, + { + "epoch": 4.817504490973879, + "grad_norm": 0.00041228727786801755, + "learning_rate": 0.00018289539331448526, + "loss": 0.0012, + "step": 40900 + }, + { + "epoch": 4.829284094590217, + "grad_norm": 0.000392022542655468, + "learning_rate": 0.00018247962747380673, + "loss": 0.0012, + "step": 41000 + }, + { + "epoch": 4.829284094590217, + "eval_en-ja_loss": 0.001201606821268797, + "eval_en-ja_mean_accuracy": 0.6683462175593595, + "eval_en-ja_negative_mse": -0.17747028172016144, + "eval_en-ja_runtime": 16.2684, + "eval_en-ja_samples_per_second": 445.28, + "eval_en-ja_src2trg_accuracy": 0.6824958586416344, + "eval_en-ja_steps_per_second": 1.783, + "eval_en-ja_trg2src_accuracy": 0.6541965764770845, + "eval_sequential_score": 0.24543796791959904, + "step": 41000 + }, + { + "epoch": 4.841063698206556, + "grad_norm": 0.0004006191447842866, + "learning_rate": 0.0001820638616331282, + "loss": 0.0012, + "step": 41100 + }, + { + "epoch": 4.852843301822894, + "grad_norm": 0.0004167920851614326, + "learning_rate": 0.00018164809579244968, + "loss": 0.0012, + "step": 41200 + }, + { + "epoch": 4.864622905439232, + "grad_norm": 0.000408497522585094, + "learning_rate": 0.00018123232995177115, + "loss": 0.0012, + "step": 41300 + }, + { + "epoch": 4.87640250905557, + "grad_norm": 0.0004027536779176444, + "learning_rate": 0.00018081656411109262, + "loss": 0.0012, + "step": 41400 + }, + { + "epoch": 4.888182112671909, + "grad_norm": 0.00039491569623351097, + "learning_rate": 0.00018040079827041407, + "loss": 0.0012, + "step": 41500 + }, + { + "epoch": 4.899961716288247, + "grad_norm": 0.0004136124043725431, + "learning_rate": 0.00017998503242973554, + "loss": 0.0012, + "step": 41600 + }, + { + "epoch": 4.911741319904586, + "grad_norm": 0.00038105872226879, + "learning_rate": 0.000179569266589057, + "loss": 0.0012, + "step": 41700 + }, + { + "epoch": 4.923520923520924, + "grad_norm": 0.0003911652020178735, + "learning_rate": 0.00017915350074837848, + "loss": 0.0012, + "step": 41800 + }, + { + "epoch": 4.935300527137262, + "grad_norm": 0.0003846845356747508, + "learning_rate": 0.00017873773490769996, + "loss": 0.0012, + "step": 41900 + }, + { + "epoch": 4.9470801307536, + "grad_norm": 0.00038711552042514086, + "learning_rate": 0.00017832196906702143, + "loss": 0.0012, + "step": 42000 + }, + { + "epoch": 4.9470801307536, + "eval_en-ja_loss": 0.0011976456735283136, + "eval_en-ja_mean_accuracy": 0.6677250138045279, + "eval_en-ja_negative_mse": -0.17697350680828094, + "eval_en-ja_runtime": 16.1613, + "eval_en-ja_samples_per_second": 448.23, + "eval_en-ja_src2trg_accuracy": 0.6813914964108228, + "eval_en-ja_steps_per_second": 1.794, + "eval_en-ja_trg2src_accuracy": 0.654058531198233, + "eval_sequential_score": 0.24537575349812346, + "step": 42000 + }, + { + "epoch": 4.9588597343699385, + "grad_norm": 0.00038468558341264725, + "learning_rate": 0.0001779062032263429, + "loss": 0.0012, + "step": 42100 + }, + { + "epoch": 4.9706393379862766, + "grad_norm": 0.0003667905693873763, + "learning_rate": 0.00017749043738566437, + "loss": 0.0012, + "step": 42200 + }, + { + "epoch": 4.9824189416026154, + "grad_norm": 0.00038799893809482455, + "learning_rate": 0.00017707467154498584, + "loss": 0.0012, + "step": 42300 + }, + { + "epoch": 4.9941985452189535, + "grad_norm": 0.00041846206295304, + "learning_rate": 0.00017665890570430732, + "loss": 0.0012, + "step": 42400 + }, + { + "epoch": 5.005889801808169, + "grad_norm": 0.00035967648727819324, + "learning_rate": 0.0001762431398636288, + "loss": 0.0012, + "step": 42500 + }, + { + "epoch": 5.017669405424508, + "grad_norm": 0.00040002024616114795, + "learning_rate": 0.00017582737402295026, + "loss": 0.0012, + "step": 42600 + }, + { + "epoch": 5.029449009040846, + "grad_norm": 0.00041002099169418216, + "learning_rate": 0.00017541160818227173, + "loss": 0.0012, + "step": 42700 + }, + { + "epoch": 5.041228612657184, + "grad_norm": 0.0004048975824844092, + "learning_rate": 0.0001749958423415932, + "loss": 0.0012, + "step": 42800 + }, + { + "epoch": 5.053008216273523, + "grad_norm": 0.0003687113930936903, + "learning_rate": 0.00017458007650091468, + "loss": 0.0012, + "step": 42900 + }, + { + "epoch": 5.064787819889861, + "grad_norm": 0.000392257614294067, + "learning_rate": 0.00017416431066023615, + "loss": 0.0012, + "step": 43000 + }, + { + "epoch": 5.064787819889861, + "eval_en-ja_loss": 0.0011978611582890153, + "eval_en-ja_mean_accuracy": 0.6653092214246272, + "eval_en-ja_negative_mse": -0.1771976202726364, + "eval_en-ja_runtime": 16.4191, + "eval_en-ja_samples_per_second": 441.194, + "eval_en-ja_src2trg_accuracy": 0.6798729983434566, + "eval_en-ja_steps_per_second": 1.766, + "eval_en-ja_trg2src_accuracy": 0.6507454445057979, + "eval_sequential_score": 0.2440558005759954, + "step": 43000 + }, + { + "epoch": 5.076567423506199, + "grad_norm": 0.00038272241363301873, + "learning_rate": 0.00017374854481955762, + "loss": 0.0012, + "step": 43100 + }, + { + "epoch": 5.088347027122538, + "grad_norm": 0.0003854671085719019, + "learning_rate": 0.0001733327789788791, + "loss": 0.0012, + "step": 43200 + }, + { + "epoch": 5.100126630738876, + "grad_norm": 0.00039120338624343276, + "learning_rate": 0.00017291701313820057, + "loss": 0.0012, + "step": 43300 + }, + { + "epoch": 5.111906234355214, + "grad_norm": 0.00042545076576061547, + "learning_rate": 0.000172501247297522, + "loss": 0.0012, + "step": 43400 + }, + { + "epoch": 5.123685837971553, + "grad_norm": 0.0003778001118917018, + "learning_rate": 0.00017208548145684349, + "loss": 0.0012, + "step": 43500 + }, + { + "epoch": 5.135465441587891, + "grad_norm": 0.0003942952025681734, + "learning_rate": 0.00017166971561616496, + "loss": 0.0012, + "step": 43600 + }, + { + "epoch": 5.147245045204229, + "grad_norm": 0.0003515453136060387, + "learning_rate": 0.00017125394977548643, + "loss": 0.0012, + "step": 43700 + }, + { + "epoch": 5.1590246488205675, + "grad_norm": 0.00040901461034081876, + "learning_rate": 0.0001708381839348079, + "loss": 0.0012, + "step": 43800 + }, + { + "epoch": 5.170804252436906, + "grad_norm": 0.00042175903217867017, + "learning_rate": 0.00017042241809412937, + "loss": 0.0012, + "step": 43900 + }, + { + "epoch": 5.182583856053244, + "grad_norm": 0.0003721881948877126, + "learning_rate": 0.00017000665225345085, + "loss": 0.0012, + "step": 44000 + }, + { + "epoch": 5.182583856053244, + "eval_en-ja_loss": 0.0011926753213629127, + "eval_en-ja_mean_accuracy": 0.6693125345113198, + "eval_en-ja_negative_mse": -0.17653656005859375, + "eval_en-ja_runtime": 16.6302, + "eval_en-ja_samples_per_second": 435.592, + "eval_en-ja_src2trg_accuracy": 0.6847045831032579, + "eval_en-ja_steps_per_second": 1.744, + "eval_en-ja_trg2src_accuracy": 0.6539204859193816, + "eval_sequential_score": 0.24638798722636301, + "step": 44000 + }, + { + "epoch": 5.1943634596695825, + "grad_norm": 0.0003989835677202791, + "learning_rate": 0.00016959088641277232, + "loss": 0.0012, + "step": 44100 + }, + { + "epoch": 5.2061430632859205, + "grad_norm": 0.0004079790669493377, + "learning_rate": 0.0001691751205720938, + "loss": 0.0012, + "step": 44200 + }, + { + "epoch": 5.2179226669022585, + "grad_norm": 0.0004024998052045703, + "learning_rate": 0.00016875935473141526, + "loss": 0.0012, + "step": 44300 + }, + { + "epoch": 5.229702270518597, + "grad_norm": 0.00039305054815486073, + "learning_rate": 0.00016834358889073674, + "loss": 0.0012, + "step": 44400 + }, + { + "epoch": 5.241481874134935, + "grad_norm": 0.0003982719499617815, + "learning_rate": 0.0001679278230500582, + "loss": 0.0012, + "step": 44500 + }, + { + "epoch": 5.253261477751273, + "grad_norm": 0.00039064843440428376, + "learning_rate": 0.00016751205720937968, + "loss": 0.0012, + "step": 44600 + }, + { + "epoch": 5.265041081367612, + "grad_norm": 0.00038375219446606934, + "learning_rate": 0.00016709629136870115, + "loss": 0.0012, + "step": 44700 + }, + { + "epoch": 5.27682068498395, + "grad_norm": 0.00037349684862419963, + "learning_rate": 0.00016668052552802263, + "loss": 0.0012, + "step": 44800 + }, + { + "epoch": 5.288600288600288, + "grad_norm": 0.0003691337478812784, + "learning_rate": 0.0001662647596873441, + "loss": 0.0012, + "step": 44900 + }, + { + "epoch": 5.300379892216627, + "grad_norm": 0.00039388175355270505, + "learning_rate": 0.00016584899384666554, + "loss": 0.0012, + "step": 45000 + }, + { + "epoch": 5.300379892216627, + "eval_en-ja_loss": 0.0011914001079276204, + "eval_en-ja_mean_accuracy": 0.6682771949199338, + "eval_en-ja_negative_mse": -0.17657607793807983, + "eval_en-ja_runtime": 16.7433, + "eval_en-ja_samples_per_second": 432.65, + "eval_en-ja_src2trg_accuracy": 0.6819436775262286, + "eval_en-ja_steps_per_second": 1.732, + "eval_en-ja_trg2src_accuracy": 0.6546107123136389, + "eval_sequential_score": 0.24585055849092696, + "step": 45000 + }, + { + "epoch": 5.312159495832965, + "grad_norm": 0.000359655445208773, + "learning_rate": 0.00016543322800598702, + "loss": 0.0012, + "step": 45100 + }, + { + "epoch": 5.323939099449303, + "grad_norm": 0.00042170396773144603, + "learning_rate": 0.00016501746216530846, + "loss": 0.0012, + "step": 45200 + }, + { + "epoch": 5.335718703065642, + "grad_norm": 0.00041019340278580785, + "learning_rate": 0.00016460169632462993, + "loss": 0.0012, + "step": 45300 + }, + { + "epoch": 5.34749830668198, + "grad_norm": 0.00040671046008355916, + "learning_rate": 0.0001641859304839514, + "loss": 0.0012, + "step": 45400 + }, + { + "epoch": 5.359277910298318, + "grad_norm": 0.00043165666284039617, + "learning_rate": 0.00016377016464327288, + "loss": 0.0012, + "step": 45500 + }, + { + "epoch": 5.371057513914657, + "grad_norm": 0.0003742928965948522, + "learning_rate": 0.00016335439880259435, + "loss": 0.0012, + "step": 45600 + }, + { + "epoch": 5.382837117530995, + "grad_norm": 0.00035031032166443765, + "learning_rate": 0.00016293863296191582, + "loss": 0.0012, + "step": 45700 + }, + { + "epoch": 5.394616721147333, + "grad_norm": 0.00040437004645355046, + "learning_rate": 0.0001625228671212373, + "loss": 0.0012, + "step": 45800 + }, + { + "epoch": 5.406396324763672, + "grad_norm": 0.00039041691343300045, + "learning_rate": 0.00016210710128055877, + "loss": 0.0012, + "step": 45900 + }, + { + "epoch": 5.41817592838001, + "grad_norm": 0.00038605445297434926, + "learning_rate": 0.00016169133543988024, + "loss": 0.0012, + "step": 46000 + }, + { + "epoch": 5.41817592838001, + "eval_en-ja_loss": 0.0011886211577802896, + "eval_en-ja_mean_accuracy": 0.6693815571507454, + "eval_en-ja_negative_mse": -0.17643414437770844, + "eval_en-ja_runtime": 16.2944, + "eval_en-ja_samples_per_second": 444.57, + "eval_en-ja_src2trg_accuracy": 0.6824958586416344, + "eval_en-ja_steps_per_second": 1.78, + "eval_en-ja_trg2src_accuracy": 0.6562672556598564, + "eval_sequential_score": 0.2464737063865185, + "step": 46000 + }, + { + "epoch": 5.429955531996348, + "grad_norm": 0.00036408661981113255, + "learning_rate": 0.0001612755695992017, + "loss": 0.0012, + "step": 46100 + }, + { + "epoch": 5.441735135612687, + "grad_norm": 0.00039911747444421053, + "learning_rate": 0.00016085980375852318, + "loss": 0.0012, + "step": 46200 + }, + { + "epoch": 5.453514739229025, + "grad_norm": 0.0004047693801112473, + "learning_rate": 0.00016044403791784466, + "loss": 0.0012, + "step": 46300 + }, + { + "epoch": 5.465294342845363, + "grad_norm": 0.0003829447377938777, + "learning_rate": 0.00016002827207716613, + "loss": 0.0012, + "step": 46400 + }, + { + "epoch": 5.477073946461702, + "grad_norm": 0.00040052857366390526, + "learning_rate": 0.0001596125062364876, + "loss": 0.0012, + "step": 46500 + }, + { + "epoch": 5.48885355007804, + "grad_norm": 0.00037631968734785914, + "learning_rate": 0.00015919674039580907, + "loss": 0.0012, + "step": 46600 + }, + { + "epoch": 5.500633153694379, + "grad_norm": 0.0004114199837204069, + "learning_rate": 0.00015878097455513055, + "loss": 0.0012, + "step": 46700 + }, + { + "epoch": 5.512412757310717, + "grad_norm": 0.0004358474106993526, + "learning_rate": 0.00015836520871445202, + "loss": 0.0012, + "step": 46800 + }, + { + "epoch": 5.524192360927055, + "grad_norm": 0.0003654324682429433, + "learning_rate": 0.0001579494428737735, + "loss": 0.0012, + "step": 46900 + }, + { + "epoch": 5.535971964543393, + "grad_norm": 0.0003593885339796543, + "learning_rate": 0.00015753367703309496, + "loss": 0.0012, + "step": 47000 + }, + { + "epoch": 5.535971964543393, + "eval_en-ja_loss": 0.0011859071673825383, + "eval_en-ja_mean_accuracy": 0.6678630590833794, + "eval_en-ja_negative_mse": -0.17619922757148743, + "eval_en-ja_runtime": 15.6634, + "eval_en-ja_samples_per_second": 462.478, + "eval_en-ja_src2trg_accuracy": 0.6815295416896742, + "eval_en-ja_steps_per_second": 1.851, + "eval_en-ja_trg2src_accuracy": 0.6541965764770845, + "eval_sequential_score": 0.24583191575594598, + "step": 47000 + }, + { + "epoch": 5.547751568159732, + "grad_norm": 0.00039393085171468556, + "learning_rate": 0.0001571179111924164, + "loss": 0.0012, + "step": 47100 + }, + { + "epoch": 5.55953117177607, + "grad_norm": 0.0003785512817557901, + "learning_rate": 0.00015670214535173788, + "loss": 0.0012, + "step": 47200 + }, + { + "epoch": 5.5713107753924085, + "grad_norm": 0.0003934150154236704, + "learning_rate": 0.00015628637951105935, + "loss": 0.0012, + "step": 47300 + }, + { + "epoch": 5.5830903790087465, + "grad_norm": 0.0003620222269091755, + "learning_rate": 0.00015587061367038082, + "loss": 0.0012, + "step": 47400 + }, + { + "epoch": 5.5948699826250845, + "grad_norm": 0.00037242332473397255, + "learning_rate": 0.0001554548478297023, + "loss": 0.0012, + "step": 47500 + }, + { + "epoch": 5.6066495862414225, + "grad_norm": 0.0004254773666616529, + "learning_rate": 0.00015503908198902377, + "loss": 0.0012, + "step": 47600 + }, + { + "epoch": 5.618429189857761, + "grad_norm": 0.0003814904484897852, + "learning_rate": 0.00015462331614834524, + "loss": 0.0012, + "step": 47700 + }, + { + "epoch": 5.630208793474099, + "grad_norm": 0.0003692252212204039, + "learning_rate": 0.00015420755030766671, + "loss": 0.0012, + "step": 47800 + }, + { + "epoch": 5.641988397090438, + "grad_norm": 0.0003839476266875863, + "learning_rate": 0.00015379178446698819, + "loss": 0.0012, + "step": 47900 + }, + { + "epoch": 5.653768000706776, + "grad_norm": 0.0004012791032437235, + "learning_rate": 0.00015337601862630966, + "loss": 0.0012, + "step": 48000 + }, + { + "epoch": 5.653768000706776, + "eval_en-ja_loss": 0.0011836208868771791, + "eval_en-ja_mean_accuracy": 0.6693815571507454, + "eval_en-ja_negative_mse": -0.17586451768875122, + "eval_en-ja_runtime": 16.7845, + "eval_en-ja_samples_per_second": 431.59, + "eval_en-ja_src2trg_accuracy": 0.6830480397570403, + "eval_en-ja_steps_per_second": 1.728, + "eval_en-ja_trg2src_accuracy": 0.6557150745444505, + "eval_sequential_score": 0.2467585197309971, + "step": 48000 + }, + { + "epoch": 5.665547604323114, + "grad_norm": 0.0003722771944012493, + "learning_rate": 0.00015296025278563113, + "loss": 0.0012, + "step": 48100 + }, + { + "epoch": 5.677327207939453, + "grad_norm": 0.0003710024175234139, + "learning_rate": 0.0001525444869449526, + "loss": 0.0012, + "step": 48200 + }, + { + "epoch": 5.689106811555791, + "grad_norm": 0.0003682856331579387, + "learning_rate": 0.00015212872110427408, + "loss": 0.0012, + "step": 48300 + }, + { + "epoch": 5.700886415172129, + "grad_norm": 0.0004051876603625715, + "learning_rate": 0.00015171295526359555, + "loss": 0.0012, + "step": 48400 + }, + { + "epoch": 5.712666018788468, + "grad_norm": 0.00038708909414708614, + "learning_rate": 0.00015129718942291702, + "loss": 0.0012, + "step": 48500 + }, + { + "epoch": 5.724445622404806, + "grad_norm": 0.00037619852810166776, + "learning_rate": 0.0001508814235822385, + "loss": 0.0012, + "step": 48600 + }, + { + "epoch": 5.736225226021144, + "grad_norm": 0.0003671462764032185, + "learning_rate": 0.00015046565774155996, + "loss": 0.0012, + "step": 48700 + }, + { + "epoch": 5.748004829637483, + "grad_norm": 0.00037034478737041354, + "learning_rate": 0.00015004989190088144, + "loss": 0.0012, + "step": 48800 + }, + { + "epoch": 5.759784433253821, + "grad_norm": 0.00038947086432017386, + "learning_rate": 0.00014963412606020288, + "loss": 0.0012, + "step": 48900 + }, + { + "epoch": 5.771564036870159, + "grad_norm": 0.000386897154385224, + "learning_rate": 0.00014921836021952435, + "loss": 0.0012, + "step": 49000 + }, + { + "epoch": 5.771564036870159, + "eval_en-ja_loss": 0.0011831930605694652, + "eval_en-ja_mean_accuracy": 0.6686913307564881, + "eval_en-ja_negative_mse": -0.17605459690093994, + "eval_en-ja_runtime": 16.2634, + "eval_en-ja_samples_per_second": 445.418, + "eval_en-ja_src2trg_accuracy": 0.6815295416896742, + "eval_en-ja_steps_per_second": 1.783, + "eval_en-ja_trg2src_accuracy": 0.6558531198233021, + "eval_sequential_score": 0.2463183669277741, + "step": 49000 + }, + { + "epoch": 5.783343640486498, + "grad_norm": 0.000380645360564813, + "learning_rate": 0.00014880259437884583, + "loss": 0.0012, + "step": 49100 + }, + { + "epoch": 5.795123244102836, + "grad_norm": 0.00039909081533551216, + "learning_rate": 0.0001483868285381673, + "loss": 0.0012, + "step": 49200 + }, + { + "epoch": 5.806902847719174, + "grad_norm": 0.00038039733772166073, + "learning_rate": 0.00014797106269748877, + "loss": 0.0012, + "step": 49300 + }, + { + "epoch": 5.818682451335513, + "grad_norm": 0.0003781640261877328, + "learning_rate": 0.00014755529685681022, + "loss": 0.0012, + "step": 49400 + }, + { + "epoch": 5.830462054951851, + "grad_norm": 0.0003868028870783746, + "learning_rate": 0.0001471395310161317, + "loss": 0.0012, + "step": 49500 + }, + { + "epoch": 5.842241658568189, + "grad_norm": 0.00038260227302089334, + "learning_rate": 0.00014672376517545316, + "loss": 0.0012, + "step": 49600 + }, + { + "epoch": 5.854021262184528, + "grad_norm": 0.00037628598511219025, + "learning_rate": 0.00014630799933477463, + "loss": 0.0012, + "step": 49700 + }, + { + "epoch": 5.865800865800866, + "grad_norm": 0.0003797787067014724, + "learning_rate": 0.0001458922334940961, + "loss": 0.0012, + "step": 49800 + }, + { + "epoch": 5.877580469417204, + "grad_norm": 0.00037431600503623486, + "learning_rate": 0.00014547646765341758, + "loss": 0.0012, + "step": 49900 + }, + { + "epoch": 5.889360073033543, + "grad_norm": 0.0004018870531581342, + "learning_rate": 0.00014506070181273905, + "loss": 0.0012, + "step": 50000 + }, + { + "epoch": 5.889360073033543, + "eval_en-ja_loss": 0.0011818819912150502, + "eval_en-ja_mean_accuracy": 0.6676559911651021, + "eval_en-ja_negative_mse": -0.17587146162986755, + "eval_en-ja_runtime": 15.5251, + "eval_en-ja_samples_per_second": 466.6, + "eval_en-ja_src2trg_accuracy": 0.6811154058531198, + "eval_en-ja_steps_per_second": 1.868, + "eval_en-ja_trg2src_accuracy": 0.6541965764770845, + "eval_sequential_score": 0.24589226476761727, + "step": 50000 + }, + { + "epoch": 5.901139676649881, + "grad_norm": 0.0003732093027792871, + "learning_rate": 0.00014464493597206052, + "loss": 0.0012, + "step": 50100 + }, + { + "epoch": 5.912919280266219, + "grad_norm": 0.00037999494816176593, + "learning_rate": 0.000144229170131382, + "loss": 0.0012, + "step": 50200 + }, + { + "epoch": 5.924698883882558, + "grad_norm": 0.0003807779576163739, + "learning_rate": 0.00014381340429070347, + "loss": 0.0012, + "step": 50300 + }, + { + "epoch": 5.936478487498896, + "grad_norm": 0.0004126227577216923, + "learning_rate": 0.0001433976384500249, + "loss": 0.0012, + "step": 50400 + }, + { + "epoch": 5.948258091115234, + "grad_norm": 0.000385318067856133, + "learning_rate": 0.00014298187260934639, + "loss": 0.0012, + "step": 50500 + }, + { + "epoch": 5.9600376947315725, + "grad_norm": 0.00037791082286275923, + "learning_rate": 0.00014256610676866786, + "loss": 0.0012, + "step": 50600 + }, + { + "epoch": 5.9718172983479105, + "grad_norm": 0.0003764526336453855, + "learning_rate": 0.00014215034092798933, + "loss": 0.0012, + "step": 50700 + }, + { + "epoch": 5.9835969019642485, + "grad_norm": 0.00038927022251300514, + "learning_rate": 0.0001417345750873108, + "loss": 0.0012, + "step": 50800 + }, + { + "epoch": 5.995376505580587, + "grad_norm": 0.00038241027505137026, + "learning_rate": 0.00014131880924663227, + "loss": 0.0012, + "step": 50900 + }, + { + "epoch": 6.007067762169803, + "grad_norm": 0.0003638810303527862, + "learning_rate": 0.00014090304340595375, + "loss": 0.0011, + "step": 51000 + }, + { + "epoch": 6.007067762169803, + "eval_en-ja_loss": 0.0011762846261262894, + "eval_en-ja_mean_accuracy": 0.6716593042517947, + "eval_en-ja_negative_mse": -0.17524662613868713, + "eval_en-ja_runtime": 15.9531, + "eval_en-ja_samples_per_second": 454.08, + "eval_en-ja_src2trg_accuracy": 0.6858089453340696, + "eval_en-ja_steps_per_second": 1.818, + "eval_en-ja_trg2src_accuracy": 0.6575096631695196, + "eval_sequential_score": 0.24820633905655376, + "step": 51000 + }, + { + "epoch": 6.018847365786141, + "grad_norm": 0.00038111716276034713, + "learning_rate": 0.00014048727756527522, + "loss": 0.0012, + "step": 51100 + }, + { + "epoch": 6.03062696940248, + "grad_norm": 0.00038617817335762084, + "learning_rate": 0.0001400715117245967, + "loss": 0.0012, + "step": 51200 + }, + { + "epoch": 6.042406573018818, + "grad_norm": 0.00035966778523288667, + "learning_rate": 0.00013965574588391816, + "loss": 0.0012, + "step": 51300 + }, + { + "epoch": 6.054186176635156, + "grad_norm": 0.0003945520147681236, + "learning_rate": 0.00013923998004323964, + "loss": 0.0012, + "step": 51400 + }, + { + "epoch": 6.065965780251495, + "grad_norm": 0.0003713990154210478, + "learning_rate": 0.0001388242142025611, + "loss": 0.0012, + "step": 51500 + }, + { + "epoch": 6.077745383867833, + "grad_norm": 0.0003901557647623122, + "learning_rate": 0.00013840844836188258, + "loss": 0.0012, + "step": 51600 + }, + { + "epoch": 6.089524987484171, + "grad_norm": 0.0004185380239505321, + "learning_rate": 0.00013799268252120405, + "loss": 0.0012, + "step": 51700 + }, + { + "epoch": 6.10130459110051, + "grad_norm": 0.0003854265087284148, + "learning_rate": 0.00013757691668052553, + "loss": 0.0012, + "step": 51800 + }, + { + "epoch": 6.113084194716848, + "grad_norm": 0.0003727605508174747, + "learning_rate": 0.000137161150839847, + "loss": 0.0012, + "step": 51900 + }, + { + "epoch": 6.124863798333186, + "grad_norm": 0.00038400900666601956, + "learning_rate": 0.00013674538499916847, + "loss": 0.0012, + "step": 52000 + }, + { + "epoch": 6.124863798333186, + "eval_en-ja_loss": 0.0011772337602451444, + "eval_en-ja_mean_accuracy": 0.6695886250690226, + "eval_en-ja_negative_mse": -0.175467386841774, + "eval_en-ja_runtime": 17.3911, + "eval_en-ja_samples_per_second": 416.534, + "eval_en-ja_src2trg_accuracy": 0.6844284925455549, + "eval_en-ja_steps_per_second": 1.668, + "eval_en-ja_trg2src_accuracy": 0.6547487575924903, + "eval_sequential_score": 0.2470606191136243, + "step": 52000 + }, + { + "epoch": 6.136643401949525, + "grad_norm": 0.00039985417970456183, + "learning_rate": 0.00013632961915848994, + "loss": 0.0012, + "step": 52100 + }, + { + "epoch": 6.148423005565863, + "grad_norm": 0.00042014484643004835, + "learning_rate": 0.0001359138533178114, + "loss": 0.0012, + "step": 52200 + }, + { + "epoch": 6.160202609182201, + "grad_norm": 0.0004050545976497233, + "learning_rate": 0.00013549808747713286, + "loss": 0.0012, + "step": 52300 + }, + { + "epoch": 6.1719822127985395, + "grad_norm": 0.0003727364237420261, + "learning_rate": 0.00013508232163645433, + "loss": 0.0012, + "step": 52400 + }, + { + "epoch": 6.1837618164148775, + "grad_norm": 0.0003856962430290878, + "learning_rate": 0.0001346665557957758, + "loss": 0.0012, + "step": 52500 + }, + { + "epoch": 6.1955414200312156, + "grad_norm": 0.0003911394451279193, + "learning_rate": 0.00013425078995509728, + "loss": 0.0012, + "step": 52600 + }, + { + "epoch": 6.2073210236475544, + "grad_norm": 0.00036832279874943197, + "learning_rate": 0.00013383502411441875, + "loss": 0.0012, + "step": 52700 + }, + { + "epoch": 6.2191006272638925, + "grad_norm": 0.0003894712426699698, + "learning_rate": 0.00013341925827374022, + "loss": 0.0012, + "step": 52800 + }, + { + "epoch": 6.2308802308802305, + "grad_norm": 0.00039791001472622156, + "learning_rate": 0.0001330034924330617, + "loss": 0.0012, + "step": 52900 + }, + { + "epoch": 6.242659834496569, + "grad_norm": 0.0003839196579065174, + "learning_rate": 0.00013258772659238317, + "loss": 0.0012, + "step": 53000 + }, + { + "epoch": 6.242659834496569, + "eval_en-ja_loss": 0.0011730262776836753, + "eval_en-ja_mean_accuracy": 0.6686913307564881, + "eval_en-ja_negative_mse": -0.1750001162290573, + "eval_en-ja_runtime": 18.1015, + "eval_en-ja_samples_per_second": 400.188, + "eval_en-ja_src2trg_accuracy": 0.6834621755935947, + "eval_en-ja_steps_per_second": 1.602, + "eval_en-ja_trg2src_accuracy": 0.6539204859193816, + "eval_sequential_score": 0.2468456072637154, + "step": 53000 + }, + { + "epoch": 6.254439438112907, + "grad_norm": 0.0003898798313457519, + "learning_rate": 0.00013217196075170464, + "loss": 0.0012, + "step": 53100 + }, + { + "epoch": 6.266219041729245, + "grad_norm": 0.00037188423448242247, + "learning_rate": 0.00013175619491102608, + "loss": 0.0012, + "step": 53200 + }, + { + "epoch": 6.277998645345584, + "grad_norm": 0.00035944601404480636, + "learning_rate": 0.00013134042907034756, + "loss": 0.0012, + "step": 53300 + }, + { + "epoch": 6.289778248961922, + "grad_norm": 0.0003827316395472735, + "learning_rate": 0.00013092466322966903, + "loss": 0.0012, + "step": 53400 + }, + { + "epoch": 6.301557852578261, + "grad_norm": 0.0003913321706932038, + "learning_rate": 0.0001305088973889905, + "loss": 0.0011, + "step": 53500 + }, + { + "epoch": 6.313337456194599, + "grad_norm": 0.00037501187762245536, + "learning_rate": 0.00013009313154831197, + "loss": 0.0012, + "step": 53600 + }, + { + "epoch": 6.325117059810937, + "grad_norm": 0.0004179560346528888, + "learning_rate": 0.00012967736570763345, + "loss": 0.0012, + "step": 53700 + }, + { + "epoch": 6.336896663427276, + "grad_norm": 0.00039296175236813724, + "learning_rate": 0.00012926159986695492, + "loss": 0.0012, + "step": 53800 + }, + { + "epoch": 6.348676267043614, + "grad_norm": 0.00038269319338724017, + "learning_rate": 0.0001288458340262764, + "loss": 0.0012, + "step": 53900 + }, + { + "epoch": 6.360455870659952, + "grad_norm": 0.0003961716138292104, + "learning_rate": 0.00012843006818559786, + "loss": 0.0012, + "step": 54000 + }, + { + "epoch": 6.360455870659952, + "eval_en-ja_loss": 0.0011721947230398655, + "eval_en-ja_mean_accuracy": 0.6727636664826062, + "eval_en-ja_negative_mse": -0.17505289614200592, + "eval_en-ja_runtime": 15.2258, + "eval_en-ja_samples_per_second": 475.77, + "eval_en-ja_src2trg_accuracy": 0.6869133075648812, + "eval_en-ja_steps_per_second": 1.905, + "eval_en-ja_trg2src_accuracy": 0.6586140254003313, + "eval_sequential_score": 0.24885538517030015, + "step": 54000 + }, + { + "epoch": 6.372235474276291, + "grad_norm": 0.0003851813671644777, + "learning_rate": 0.00012801430234491934, + "loss": 0.0012, + "step": 54100 + }, + { + "epoch": 6.384015077892629, + "grad_norm": 0.00038847135147079825, + "learning_rate": 0.0001275985365042408, + "loss": 0.0012, + "step": 54200 + }, + { + "epoch": 6.395794681508967, + "grad_norm": 0.00038582016713917255, + "learning_rate": 0.00012718277066356228, + "loss": 0.0012, + "step": 54300 + }, + { + "epoch": 6.407574285125306, + "grad_norm": 0.00038994336500763893, + "learning_rate": 0.00012676700482288375, + "loss": 0.0012, + "step": 54400 + }, + { + "epoch": 6.419353888741644, + "grad_norm": 0.00037291229818947613, + "learning_rate": 0.0001263512389822052, + "loss": 0.0012, + "step": 54500 + }, + { + "epoch": 6.431133492357982, + "grad_norm": 0.00037391993100754917, + "learning_rate": 0.00012593547314152667, + "loss": 0.0011, + "step": 54600 + }, + { + "epoch": 6.442913095974321, + "grad_norm": 0.00037975850864313543, + "learning_rate": 0.00012551970730084814, + "loss": 0.0012, + "step": 54700 + }, + { + "epoch": 6.454692699590659, + "grad_norm": 0.00040658892248757184, + "learning_rate": 0.00012510394146016961, + "loss": 0.0012, + "step": 54800 + }, + { + "epoch": 6.466472303206997, + "grad_norm": 0.00040094659198075533, + "learning_rate": 0.0001246881756194911, + "loss": 0.0011, + "step": 54900 + }, + { + "epoch": 6.478251906823336, + "grad_norm": 0.00041930668521672487, + "learning_rate": 0.00012427240977881256, + "loss": 0.0011, + "step": 55000 + }, + { + "epoch": 6.478251906823336, + "eval_en-ja_loss": 0.0011713720159605145, + "eval_en-ja_mean_accuracy": 0.6686913307564881, + "eval_en-ja_negative_mse": -0.1749197542667389, + "eval_en-ja_runtime": 15.2462, + "eval_en-ja_samples_per_second": 475.135, + "eval_en-ja_src2trg_accuracy": 0.6819436775262286, + "eval_en-ja_steps_per_second": 1.902, + "eval_en-ja_trg2src_accuracy": 0.6554389839867476, + "eval_sequential_score": 0.24688578824487462, + "step": 55000 + }, + { + "epoch": 6.490031510439674, + "grad_norm": 0.0003573725407477468, + "learning_rate": 0.00012385664393813403, + "loss": 0.0012, + "step": 55100 + }, + { + "epoch": 6.501811114056012, + "grad_norm": 0.0003651538281701505, + "learning_rate": 0.0001234408780974555, + "loss": 0.0012, + "step": 55200 + }, + { + "epoch": 6.513590717672351, + "grad_norm": 0.0004064366512466222, + "learning_rate": 0.00012302511225677698, + "loss": 0.0011, + "step": 55300 + }, + { + "epoch": 6.525370321288689, + "grad_norm": 0.0004017501778434962, + "learning_rate": 0.00012260934641609845, + "loss": 0.0011, + "step": 55400 + }, + { + "epoch": 6.537149924905027, + "grad_norm": 0.0003772377676796168, + "learning_rate": 0.00012219358057541992, + "loss": 0.0011, + "step": 55500 + }, + { + "epoch": 6.548929528521366, + "grad_norm": 0.0003791556810028851, + "learning_rate": 0.00012177781473474138, + "loss": 0.0011, + "step": 55600 + }, + { + "epoch": 6.560709132137704, + "grad_norm": 0.00041642726864665747, + "learning_rate": 0.00012136204889406285, + "loss": 0.0011, + "step": 55700 + }, + { + "epoch": 6.572488735754042, + "grad_norm": 0.0003852172230836004, + "learning_rate": 0.00012094628305338432, + "loss": 0.0011, + "step": 55800 + }, + { + "epoch": 6.5842683393703805, + "grad_norm": 0.0003827541077043861, + "learning_rate": 0.0001205305172127058, + "loss": 0.0012, + "step": 55900 + }, + { + "epoch": 6.5960479429867185, + "grad_norm": 0.0004128233122173697, + "learning_rate": 0.00012011475137202725, + "loss": 0.0012, + "step": 56000 + }, + { + "epoch": 6.5960479429867185, + "eval_en-ja_loss": 0.0011674802517518401, + "eval_en-ja_mean_accuracy": 0.6725565985643291, + "eval_en-ja_negative_mse": -0.1745622605085373, + "eval_en-ja_runtime": 15.6956, + "eval_en-ja_samples_per_second": 461.529, + "eval_en-ja_src2trg_accuracy": 0.6859469906129211, + "eval_en-ja_steps_per_second": 1.848, + "eval_en-ja_trg2src_accuracy": 0.6591662065157372, + "eval_sequential_score": 0.24899716902789593, + "step": 56000 + }, + { + "epoch": 6.6078275466030565, + "grad_norm": 0.00039105219184421003, + "learning_rate": 0.00011969898553134873, + "loss": 0.0012, + "step": 56100 + }, + { + "epoch": 6.619607150219395, + "grad_norm": 0.00042517733527347445, + "learning_rate": 0.0001192832196906702, + "loss": 0.0011, + "step": 56200 + }, + { + "epoch": 6.631386753835733, + "grad_norm": 0.00039977877167984843, + "learning_rate": 0.00011886745384999167, + "loss": 0.0012, + "step": 56300 + }, + { + "epoch": 6.643166357452071, + "grad_norm": 0.00036832160549238324, + "learning_rate": 0.00011845168800931314, + "loss": 0.0011, + "step": 56400 + }, + { + "epoch": 6.65494596106841, + "grad_norm": 0.00037057173904031515, + "learning_rate": 0.00011803592216863462, + "loss": 0.0011, + "step": 56500 + }, + { + "epoch": 6.666725564684748, + "grad_norm": 0.00040815569809637964, + "learning_rate": 0.00011762015632795609, + "loss": 0.0011, + "step": 56600 + }, + { + "epoch": 6.678505168301086, + "grad_norm": 0.00038996554212644696, + "learning_rate": 0.00011720439048727756, + "loss": 0.0011, + "step": 56700 + }, + { + "epoch": 6.690284771917425, + "grad_norm": 0.00037325904122553766, + "learning_rate": 0.00011678862464659903, + "loss": 0.0011, + "step": 56800 + }, + { + "epoch": 6.702064375533763, + "grad_norm": 0.00040903699118644, + "learning_rate": 0.00011637285880592049, + "loss": 0.0012, + "step": 56900 + }, + { + "epoch": 6.713843979150101, + "grad_norm": 0.00036679740878753364, + "learning_rate": 0.00011595709296524196, + "loss": 0.0011, + "step": 57000 + }, + { + "epoch": 6.713843979150101, + "eval_en-ja_loss": 0.0011687715305015445, + "eval_en-ja_mean_accuracy": 0.672073440088349, + "eval_en-ja_negative_mse": -0.1747935712337494, + "eval_en-ja_runtime": 15.1235, + "eval_en-ja_samples_per_second": 478.989, + "eval_en-ja_src2trg_accuracy": 0.6859469906129211, + "eval_en-ja_steps_per_second": 1.918, + "eval_en-ja_trg2src_accuracy": 0.658199889563777, + "eval_sequential_score": 0.2486399344272998, + "step": 57000 + }, + { + "epoch": 6.72562358276644, + "grad_norm": 0.0003858786076307297, + "learning_rate": 0.00011554132712456344, + "loss": 0.0011, + "step": 57100 + }, + { + "epoch": 6.737403186382778, + "grad_norm": 0.00040288810851052403, + "learning_rate": 0.00011512556128388491, + "loss": 0.0012, + "step": 57200 + }, + { + "epoch": 6.749182789999116, + "grad_norm": 0.00040118524339050055, + "learning_rate": 0.00011470979544320638, + "loss": 0.0011, + "step": 57300 + }, + { + "epoch": 6.760962393615455, + "grad_norm": 0.00038556699291802943, + "learning_rate": 0.00011429402960252785, + "loss": 0.0011, + "step": 57400 + }, + { + "epoch": 6.772741997231793, + "grad_norm": 0.00044391959090717137, + "learning_rate": 0.00011387826376184933, + "loss": 0.0011, + "step": 57500 + }, + { + "epoch": 6.784521600848132, + "grad_norm": 0.00038290160591714084, + "learning_rate": 0.00011346249792117079, + "loss": 0.0011, + "step": 57600 + }, + { + "epoch": 6.79630120446447, + "grad_norm": 0.0003911529202014208, + "learning_rate": 0.00011304673208049226, + "loss": 0.0011, + "step": 57700 + }, + { + "epoch": 6.808080808080808, + "grad_norm": 0.0003912771353498101, + "learning_rate": 0.00011263096623981372, + "loss": 0.0011, + "step": 57800 + }, + { + "epoch": 6.819860411697146, + "grad_norm": 0.0003660422225948423, + "learning_rate": 0.00011221520039913519, + "loss": 0.0011, + "step": 57900 + }, + { + "epoch": 6.831640015313485, + "grad_norm": 0.00039444954018108547, + "learning_rate": 0.00011179943455845666, + "loss": 0.0011, + "step": 58000 + }, + { + "epoch": 6.831640015313485, + "eval_en-ja_loss": 0.0011658320436254144, + "eval_en-ja_mean_accuracy": 0.6709690778575372, + "eval_en-ja_negative_mse": -0.1743980348110199, + "eval_en-ja_runtime": 15.2875, + "eval_en-ja_samples_per_second": 473.852, + "eval_en-ja_src2trg_accuracy": 0.683876311430149, + "eval_en-ja_steps_per_second": 1.897, + "eval_en-ja_trg2src_accuracy": 0.6580618442849254, + "eval_sequential_score": 0.24828552152325867, + "step": 58000 + }, + { + "epoch": 6.843419618929823, + "grad_norm": 0.00040171254659071565, + "learning_rate": 0.00011138366871777813, + "loss": 0.0011, + "step": 58100 + }, + { + "epoch": 6.855199222546162, + "grad_norm": 0.00040757787064649165, + "learning_rate": 0.0001109679028770996, + "loss": 0.0011, + "step": 58200 + }, + { + "epoch": 6.8669788261625, + "grad_norm": 0.00039746586116962135, + "learning_rate": 0.00011055213703642108, + "loss": 0.0011, + "step": 58300 + }, + { + "epoch": 6.878758429778838, + "grad_norm": 0.00036546625779010355, + "learning_rate": 0.00011013637119574255, + "loss": 0.0011, + "step": 58400 + }, + { + "epoch": 6.890538033395176, + "grad_norm": 0.0003577105817385018, + "learning_rate": 0.00010972060535506402, + "loss": 0.0011, + "step": 58500 + }, + { + "epoch": 6.902317637011515, + "grad_norm": 0.0003869399952236563, + "learning_rate": 0.0001093048395143855, + "loss": 0.0011, + "step": 58600 + }, + { + "epoch": 6.914097240627853, + "grad_norm": 0.0004132670001126826, + "learning_rate": 0.00010888907367370697, + "loss": 0.0011, + "step": 58700 + }, + { + "epoch": 6.925876844244192, + "grad_norm": 0.0003834750968962908, + "learning_rate": 0.00010847330783302843, + "loss": 0.0011, + "step": 58800 + }, + { + "epoch": 6.93765644786053, + "grad_norm": 0.000377436401322484, + "learning_rate": 0.0001080575419923499, + "loss": 0.0011, + "step": 58900 + }, + { + "epoch": 6.949436051476868, + "grad_norm": 0.0003678147913888097, + "learning_rate": 0.00010764177615167137, + "loss": 0.0011, + "step": 59000 + }, + { + "epoch": 6.949436051476868, + "eval_en-ja_loss": 0.0011661696480587125, + "eval_en-ja_mean_accuracy": 0.671797349530646, + "eval_en-ja_negative_mse": -0.1747504323720932, + "eval_en-ja_runtime": 15.8684, + "eval_en-ja_samples_per_second": 456.506, + "eval_en-ja_src2trg_accuracy": 0.684152401987852, + "eval_en-ja_steps_per_second": 1.828, + "eval_en-ja_trg2src_accuracy": 0.65944229707344, + "eval_sequential_score": 0.24852345857927638, + "step": 59000 + }, + { + "epoch": 6.9612156550932065, + "grad_norm": 0.0003904175537172705, + "learning_rate": 0.00010722601031099284, + "loss": 0.0011, + "step": 59100 + }, + { + "epoch": 6.9729952587095445, + "grad_norm": 0.00037075518048368394, + "learning_rate": 0.00010681024447031432, + "loss": 0.0011, + "step": 59200 + }, + { + "epoch": 6.9847748623258825, + "grad_norm": 0.00041074954788200557, + "learning_rate": 0.00010639447862963579, + "loss": 0.0011, + "step": 59300 + }, + { + "epoch": 6.996554465942221, + "grad_norm": 0.0003889407671522349, + "learning_rate": 0.00010597871278895726, + "loss": 0.0011, + "step": 59400 + }, + { + "epoch": 7.008245722531437, + "grad_norm": 0.00039450384792871773, + "learning_rate": 0.00010556294694827873, + "loss": 0.0011, + "step": 59500 + }, + { + "epoch": 7.020025326147775, + "grad_norm": 0.00037633461761288345, + "learning_rate": 0.0001051471811076002, + "loss": 0.0011, + "step": 59600 + }, + { + "epoch": 7.031804929764114, + "grad_norm": 0.0003587510727811605, + "learning_rate": 0.00010473141526692165, + "loss": 0.0011, + "step": 59700 + }, + { + "epoch": 7.043584533380452, + "grad_norm": 0.000375708332285285, + "learning_rate": 0.00010431564942624312, + "loss": 0.0011, + "step": 59800 + }, + { + "epoch": 7.05536413699679, + "grad_norm": 0.0003931323590222746, + "learning_rate": 0.0001038998835855646, + "loss": 0.0011, + "step": 59900 + }, + { + "epoch": 7.067143740613129, + "grad_norm": 0.0003840556601062417, + "learning_rate": 0.00010348411774488607, + "loss": 0.0011, + "step": 60000 + }, + { + "epoch": 7.067143740613129, + "eval_en-ja_loss": 0.0011633113026618958, + "eval_en-ja_mean_accuracy": 0.6722114853672004, + "eval_en-ja_negative_mse": -0.17430946230888367, + "eval_en-ja_runtime": 16.2404, + "eval_en-ja_samples_per_second": 446.049, + "eval_en-ja_src2trg_accuracy": 0.683876311430149, + "eval_en-ja_steps_per_second": 1.786, + "eval_en-ja_trg2src_accuracy": 0.6605466593042518, + "eval_sequential_score": 0.24895101152915838, + "step": 60000 + }, + { + "epoch": 7.078923344229467, + "grad_norm": 0.00038004378438927233, + "learning_rate": 0.00010306835190420754, + "loss": 0.0011, + "step": 60100 + }, + { + "epoch": 7.090702947845805, + "grad_norm": 0.00040805505705066025, + "learning_rate": 0.00010265258606352901, + "loss": 0.0011, + "step": 60200 + }, + { + "epoch": 7.102482551462144, + "grad_norm": 0.0003804980078712106, + "learning_rate": 0.00010223682022285048, + "loss": 0.0011, + "step": 60300 + }, + { + "epoch": 7.114262155078482, + "grad_norm": 0.000404447695473209, + "learning_rate": 0.00010182105438217196, + "loss": 0.0011, + "step": 60400 + }, + { + "epoch": 7.12604175869482, + "grad_norm": 0.00037697446532547474, + "learning_rate": 0.00010140528854149343, + "loss": 0.0011, + "step": 60500 + }, + { + "epoch": 7.137821362311159, + "grad_norm": 0.000377972872229293, + "learning_rate": 0.00010098952270081489, + "loss": 0.0011, + "step": 60600 + }, + { + "epoch": 7.149600965927497, + "grad_norm": 0.000384141894755885, + "learning_rate": 0.00010057375686013636, + "loss": 0.0011, + "step": 60700 + }, + { + "epoch": 7.161380569543835, + "grad_norm": 0.0003973060520365834, + "learning_rate": 0.00010015799101945783, + "loss": 0.0011, + "step": 60800 + }, + { + "epoch": 7.1731601731601735, + "grad_norm": 0.00037429051008075476, + "learning_rate": 9.97422251787793e-05, + "loss": 0.0011, + "step": 60900 + }, + { + "epoch": 7.1849397767765115, + "grad_norm": 0.00039109771023504436, + "learning_rate": 9.932645933810078e-05, + "loss": 0.0011, + "step": 61000 + }, + { + "epoch": 7.1849397767765115, + "eval_en-ja_loss": 0.0011624135076999664, + "eval_en-ja_mean_accuracy": 0.6731778023191607, + "eval_en-ja_negative_mse": -0.1742713451385498, + "eval_en-ja_runtime": 16.1357, + "eval_en-ja_samples_per_second": 448.943, + "eval_en-ja_src2trg_accuracy": 0.688293760353396, + "eval_en-ja_steps_per_second": 1.797, + "eval_en-ja_trg2src_accuracy": 0.6580618442849254, + "eval_sequential_score": 0.24945322859030544, + "step": 61000 + }, + { + "epoch": 7.1967193803928495, + "grad_norm": 0.0003723310655914247, + "learning_rate": 9.891069349742225e-05, + "loss": 0.0011, + "step": 61100 + }, + { + "epoch": 7.208498984009188, + "grad_norm": 0.0003875307156704366, + "learning_rate": 9.849492765674372e-05, + "loss": 0.0011, + "step": 61200 + }, + { + "epoch": 7.220278587625526, + "grad_norm": 0.0004053867014590651, + "learning_rate": 9.80791618160652e-05, + "loss": 0.0011, + "step": 61300 + }, + { + "epoch": 7.232058191241864, + "grad_norm": 0.0004063684609718621, + "learning_rate": 9.766339597538667e-05, + "loss": 0.0011, + "step": 61400 + }, + { + "epoch": 7.243837794858203, + "grad_norm": 0.000408061285270378, + "learning_rate": 9.724763013470814e-05, + "loss": 0.0011, + "step": 61500 + }, + { + "epoch": 7.255617398474541, + "grad_norm": 0.0003542134363669902, + "learning_rate": 9.683186429402958e-05, + "loss": 0.0011, + "step": 61600 + }, + { + "epoch": 7.267397002090879, + "grad_norm": 0.0003746738948393613, + "learning_rate": 9.641609845335106e-05, + "loss": 0.0011, + "step": 61700 + }, + { + "epoch": 7.279176605707218, + "grad_norm": 0.00038777096779085696, + "learning_rate": 9.600033261267253e-05, + "loss": 0.0011, + "step": 61800 + }, + { + "epoch": 7.290956209323556, + "grad_norm": 0.00038884810055606067, + "learning_rate": 9.5584566771994e-05, + "loss": 0.0011, + "step": 61900 + }, + { + "epoch": 7.302735812939894, + "grad_norm": 0.00038354366552084684, + "learning_rate": 9.516880093131547e-05, + "loss": 0.0011, + "step": 62000 + }, + { + "epoch": 7.302735812939894, + "eval_en-ja_loss": 0.0011605541221797466, + "eval_en-ja_mean_accuracy": 0.6719353948094975, + "eval_en-ja_negative_mse": -0.1740083396434784, + "eval_en-ja_runtime": 15.6112, + "eval_en-ja_samples_per_second": 464.025, + "eval_en-ja_src2trg_accuracy": 0.6864991717283269, + "eval_en-ja_steps_per_second": 1.858, + "eval_en-ja_trg2src_accuracy": 0.6573716178906681, + "eval_sequential_score": 0.24896352758300955, + "step": 62000 + }, + { + "epoch": 7.314515416556233, + "grad_norm": 0.0003730931202881038, + "learning_rate": 9.475303509063694e-05, + "loss": 0.0011, + "step": 62100 + }, + { + "epoch": 7.326295020172571, + "grad_norm": 0.00039628075319342315, + "learning_rate": 9.433726924995842e-05, + "loss": 0.0011, + "step": 62200 + }, + { + "epoch": 7.338074623788909, + "grad_norm": 0.00039051880594342947, + "learning_rate": 9.392150340927989e-05, + "loss": 0.0011, + "step": 62300 + }, + { + "epoch": 7.349854227405248, + "grad_norm": 0.00039308969280682504, + "learning_rate": 9.350573756860136e-05, + "loss": 0.0011, + "step": 62400 + }, + { + "epoch": 7.361633831021586, + "grad_norm": 0.00040232320316135883, + "learning_rate": 9.308997172792282e-05, + "loss": 0.0011, + "step": 62500 + }, + { + "epoch": 7.373413434637924, + "grad_norm": 0.0003875276306644082, + "learning_rate": 9.267420588724429e-05, + "loss": 0.0011, + "step": 62600 + }, + { + "epoch": 7.385193038254263, + "grad_norm": 0.00039769316208548844, + "learning_rate": 9.225844004656577e-05, + "loss": 0.0011, + "step": 62700 + }, + { + "epoch": 7.396972641870601, + "grad_norm": 0.0003912523970939219, + "learning_rate": 9.184267420588724e-05, + "loss": 0.0011, + "step": 62800 + }, + { + "epoch": 7.408752245486939, + "grad_norm": 0.00037197049823589623, + "learning_rate": 9.142690836520871e-05, + "loss": 0.0011, + "step": 62900 + }, + { + "epoch": 7.420531849103278, + "grad_norm": 0.0003954895946662873, + "learning_rate": 9.101114252453018e-05, + "loss": 0.0011, + "step": 63000 + }, + { + "epoch": 7.420531849103278, + "eval_en-ja_loss": 0.0011597032425925136, + "eval_en-ja_mean_accuracy": 0.6725565985643291, + "eval_en-ja_negative_mse": -0.17402510344982147, + "eval_en-ja_runtime": 16.0558, + "eval_en-ja_samples_per_second": 451.178, + "eval_en-ja_src2trg_accuracy": 0.6863611264494754, + "eval_en-ja_steps_per_second": 1.806, + "eval_en-ja_trg2src_accuracy": 0.6587520706791827, + "eval_sequential_score": 0.24926574755725384, + "step": 63000 + }, + { + "epoch": 7.432311452719616, + "grad_norm": 0.00037373058148659766, + "learning_rate": 9.059537668385165e-05, + "loss": 0.0011, + "step": 63100 + }, + { + "epoch": 7.444091056335954, + "grad_norm": 0.0003724014386534691, + "learning_rate": 9.017961084317313e-05, + "loss": 0.0011, + "step": 63200 + }, + { + "epoch": 7.455870659952293, + "grad_norm": 0.000386463274480775, + "learning_rate": 8.97638450024946e-05, + "loss": 0.0011, + "step": 63300 + }, + { + "epoch": 7.467650263568631, + "grad_norm": 0.00037366541801020503, + "learning_rate": 8.934807916181604e-05, + "loss": 0.0011, + "step": 63400 + }, + { + "epoch": 7.479429867184969, + "grad_norm": 0.0003651261213235557, + "learning_rate": 8.893231332113752e-05, + "loss": 0.0011, + "step": 63500 + }, + { + "epoch": 7.491209470801308, + "grad_norm": 0.00042331754229962826, + "learning_rate": 8.851654748045899e-05, + "loss": 0.0011, + "step": 63600 + }, + { + "epoch": 7.502989074417646, + "grad_norm": 0.00036620654282160103, + "learning_rate": 8.810078163978046e-05, + "loss": 0.0011, + "step": 63700 + }, + { + "epoch": 7.514768678033985, + "grad_norm": 0.00037081693881191313, + "learning_rate": 8.768501579910193e-05, + "loss": 0.0011, + "step": 63800 + }, + { + "epoch": 7.526548281650323, + "grad_norm": 0.00037995251477696, + "learning_rate": 8.72692499584234e-05, + "loss": 0.0011, + "step": 63900 + }, + { + "epoch": 7.538327885266661, + "grad_norm": 0.00041717124986462295, + "learning_rate": 8.685348411774488e-05, + "loss": 0.0011, + "step": 64000 + }, + { + "epoch": 7.538327885266661, + "eval_en-ja_loss": 0.0011579494457691908, + "eval_en-ja_mean_accuracy": 0.6712451684152402, + "eval_en-ja_negative_mse": -0.1738053560256958, + "eval_en-ja_runtime": 16.1096, + "eval_en-ja_samples_per_second": 449.67, + "eval_en-ja_src2trg_accuracy": 0.6845665378244064, + "eval_en-ja_steps_per_second": 1.8, + "eval_en-ja_trg2src_accuracy": 0.657923799006074, + "eval_sequential_score": 0.2487199061947722, + "step": 64000 + }, + { + "epoch": 7.550107488882999, + "grad_norm": 0.00037772778887301683, + "learning_rate": 8.643771827706635e-05, + "loss": 0.0011, + "step": 64100 + }, + { + "epoch": 7.5618870924993375, + "grad_norm": 0.00040009853546507657, + "learning_rate": 8.602195243638782e-05, + "loss": 0.0011, + "step": 64200 + }, + { + "epoch": 7.573666696115676, + "grad_norm": 0.000418604671722278, + "learning_rate": 8.560618659570928e-05, + "loss": 0.0011, + "step": 64300 + }, + { + "epoch": 7.5854462997320145, + "grad_norm": 0.00037799408892169595, + "learning_rate": 8.519042075503075e-05, + "loss": 0.0011, + "step": 64400 + }, + { + "epoch": 7.5972259033483525, + "grad_norm": 0.0003827895852737129, + "learning_rate": 8.477465491435223e-05, + "loss": 0.0011, + "step": 64500 + }, + { + "epoch": 7.6090055069646905, + "grad_norm": 0.0003785111475735903, + "learning_rate": 8.43588890736737e-05, + "loss": 0.0011, + "step": 64600 + }, + { + "epoch": 7.6207851105810285, + "grad_norm": 0.00037619651993736625, + "learning_rate": 8.394312323299517e-05, + "loss": 0.0011, + "step": 64700 + }, + { + "epoch": 7.632564714197367, + "grad_norm": 0.0003860403085127473, + "learning_rate": 8.352735739231664e-05, + "loss": 0.0011, + "step": 64800 + }, + { + "epoch": 7.644344317813705, + "grad_norm": 0.0004239362315274775, + "learning_rate": 8.311159155163812e-05, + "loss": 0.0011, + "step": 64900 + }, + { + "epoch": 7.656123921430044, + "grad_norm": 0.00036785047268494964, + "learning_rate": 8.269582571095959e-05, + "loss": 0.0011, + "step": 65000 + }, + { + "epoch": 7.656123921430044, + "eval_en-ja_loss": 0.0011565688764676452, + "eval_en-ja_mean_accuracy": 0.6722805080066262, + "eval_en-ja_negative_mse": -0.17366579174995422, + "eval_en-ja_runtime": 15.4407, + "eval_en-ja_samples_per_second": 469.15, + "eval_en-ja_src2trg_accuracy": 0.6856709000552181, + "eval_en-ja_steps_per_second": 1.878, + "eval_en-ja_trg2src_accuracy": 0.6588901159580343, + "eval_sequential_score": 0.249307358128336, + "step": 65000 + }, + { + "epoch": 7.667903525046382, + "grad_norm": 0.00038455682806670666, + "learning_rate": 8.228005987028106e-05, + "loss": 0.0011, + "step": 65100 + }, + { + "epoch": 7.67968312866272, + "grad_norm": 0.00035886719706468284, + "learning_rate": 8.186429402960253e-05, + "loss": 0.0011, + "step": 65200 + }, + { + "epoch": 7.691462732279059, + "grad_norm": 0.0003581951605156064, + "learning_rate": 8.144852818892398e-05, + "loss": 0.0011, + "step": 65300 + }, + { + "epoch": 7.703242335895397, + "grad_norm": 0.0003670606529340148, + "learning_rate": 8.103276234824545e-05, + "loss": 0.0011, + "step": 65400 + }, + { + "epoch": 7.715021939511735, + "grad_norm": 0.00037795593379996717, + "learning_rate": 8.061699650756692e-05, + "loss": 0.0011, + "step": 65500 + }, + { + "epoch": 7.726801543128074, + "grad_norm": 0.000354167481418699, + "learning_rate": 8.02012306668884e-05, + "loss": 0.0011, + "step": 65600 + }, + { + "epoch": 7.738581146744412, + "grad_norm": 0.00039969783392734826, + "learning_rate": 7.978546482620987e-05, + "loss": 0.0011, + "step": 65700 + }, + { + "epoch": 7.75036075036075, + "grad_norm": 0.00039526625187136233, + "learning_rate": 7.936969898553134e-05, + "loss": 0.0011, + "step": 65800 + }, + { + "epoch": 7.762140353977089, + "grad_norm": 0.0003688740252982825, + "learning_rate": 7.895393314485281e-05, + "loss": 0.0011, + "step": 65900 + }, + { + "epoch": 7.773919957593427, + "grad_norm": 0.00037402415182441473, + "learning_rate": 7.853816730417428e-05, + "loss": 0.0011, + "step": 66000 + }, + { + "epoch": 7.773919957593427, + "eval_en-ja_loss": 0.001155746984295547, + "eval_en-ja_mean_accuracy": 0.6745582551076753, + "eval_en-ja_negative_mse": -0.1736912578344345, + "eval_en-ja_runtime": 15.4732, + "eval_en-ja_samples_per_second": 468.164, + "eval_en-ja_src2trg_accuracy": 0.6881557150745444, + "eval_en-ja_steps_per_second": 1.874, + "eval_en-ja_trg2src_accuracy": 0.6609607951408062, + "eval_sequential_score": 0.2504334986366204, + "step": 66000 + }, + { + "epoch": 7.785699561209765, + "grad_norm": 0.00035557872615754604, + "learning_rate": 7.812240146349576e-05, + "loss": 0.0011, + "step": 66100 + }, + { + "epoch": 7.797479164826104, + "grad_norm": 0.0003937767760362476, + "learning_rate": 7.770663562281722e-05, + "loss": 0.0011, + "step": 66200 + }, + { + "epoch": 7.809258768442442, + "grad_norm": 0.00036812914186157286, + "learning_rate": 7.729086978213869e-05, + "loss": 0.0011, + "step": 66300 + }, + { + "epoch": 7.82103837205878, + "grad_norm": 0.000401312398025766, + "learning_rate": 7.687510394146016e-05, + "loss": 0.0011, + "step": 66400 + }, + { + "epoch": 7.832817975675119, + "grad_norm": 0.00037549060652963817, + "learning_rate": 7.645933810078163e-05, + "loss": 0.0011, + "step": 66500 + }, + { + "epoch": 7.844597579291457, + "grad_norm": 0.00036597222788259387, + "learning_rate": 7.60435722601031e-05, + "loss": 0.0011, + "step": 66600 + }, + { + "epoch": 7.856377182907795, + "grad_norm": 0.00038033645250834525, + "learning_rate": 7.562780641942458e-05, + "loss": 0.0011, + "step": 66700 + }, + { + "epoch": 7.868156786524134, + "grad_norm": 0.00037783014704473317, + "learning_rate": 7.521204057874605e-05, + "loss": 0.0011, + "step": 66800 + }, + { + "epoch": 7.879936390140472, + "grad_norm": 0.00038337422301992774, + "learning_rate": 7.479627473806751e-05, + "loss": 0.0011, + "step": 66900 + }, + { + "epoch": 7.89171599375681, + "grad_norm": 0.00038828441756777465, + "learning_rate": 7.438050889738898e-05, + "loss": 0.0011, + "step": 67000 + }, + { + "epoch": 7.89171599375681, + "eval_en-ja_loss": 0.0011530497577041388, + "eval_en-ja_mean_accuracy": 0.6740060739922695, + "eval_en-ja_negative_mse": -0.17330332100391388, + "eval_en-ja_runtime": 16.0243, + "eval_en-ja_samples_per_second": 452.064, + "eval_en-ja_src2trg_accuracy": 0.6870513528437328, + "eval_en-ja_steps_per_second": 1.81, + "eval_en-ja_trg2src_accuracy": 0.6609607951408062, + "eval_sequential_score": 0.2503513764941778, + "step": 67000 + }, + { + "epoch": 7.903495597373149, + "grad_norm": 0.00037329448969103396, + "learning_rate": 7.396474305671045e-05, + "loss": 0.0011, + "step": 67100 + }, + { + "epoch": 7.915275200989487, + "grad_norm": 0.00039515344542451203, + "learning_rate": 7.354897721603193e-05, + "loss": 0.0011, + "step": 67200 + }, + { + "epoch": 7.927054804605825, + "grad_norm": 0.00038708417559973896, + "learning_rate": 7.31332113753534e-05, + "loss": 0.0011, + "step": 67300 + }, + { + "epoch": 7.938834408222164, + "grad_norm": 0.000391502893762663, + "learning_rate": 7.271744553467486e-05, + "loss": 0.0011, + "step": 67400 + }, + { + "epoch": 7.950614011838502, + "grad_norm": 0.00037589779822155833, + "learning_rate": 7.230167969399633e-05, + "loss": 0.0011, + "step": 67500 + }, + { + "epoch": 7.96239361545484, + "grad_norm": 0.0004028193943668157, + "learning_rate": 7.18859138533178e-05, + "loss": 0.0011, + "step": 67600 + }, + { + "epoch": 7.9741732190711785, + "grad_norm": 0.0004133171169087291, + "learning_rate": 7.147014801263927e-05, + "loss": 0.0011, + "step": 67700 + }, + { + "epoch": 7.9859528226875165, + "grad_norm": 0.0003792022180277854, + "learning_rate": 7.105438217196075e-05, + "loss": 0.0011, + "step": 67800 + }, + { + "epoch": 7.9977324263038545, + "grad_norm": 0.000405277096433565, + "learning_rate": 7.063861633128222e-05, + "loss": 0.0011, + "step": 67900 + }, + { + "epoch": 8.00942368289307, + "grad_norm": 0.00037854581023566425, + "learning_rate": 7.022285049060369e-05, + "loss": 0.0011, + "step": 68000 + }, + { + "epoch": 8.00942368289307, + "eval_en-ja_loss": 0.0011519509134814143, + "eval_en-ja_mean_accuracy": 0.6732468249585863, + "eval_en-ja_negative_mse": -0.1732640564441681, + "eval_en-ja_runtime": 16.3197, + "eval_en-ja_samples_per_second": 443.881, + "eval_en-ja_src2trg_accuracy": 0.6870513528437328, + "eval_en-ja_steps_per_second": 1.777, + "eval_en-ja_trg2src_accuracy": 0.65944229707344, + "eval_sequential_score": 0.24999138425720913, + "step": 68000 + }, + { + "epoch": 8.02120328650941, + "grad_norm": 0.00038805027725175023, + "learning_rate": 6.980708464992516e-05, + "loss": 0.0011, + "step": 68100 + }, + { + "epoch": 8.032982890125748, + "grad_norm": 0.00037932037957943976, + "learning_rate": 6.939131880924663e-05, + "loss": 0.0011, + "step": 68200 + }, + { + "epoch": 8.044762493742086, + "grad_norm": 0.0003810868365690112, + "learning_rate": 6.89755529685681e-05, + "loss": 0.0011, + "step": 68300 + }, + { + "epoch": 8.056542097358424, + "grad_norm": 0.00039596392889507115, + "learning_rate": 6.855978712788957e-05, + "loss": 0.0011, + "step": 68400 + }, + { + "epoch": 8.068321700974762, + "grad_norm": 0.0003704438859131187, + "learning_rate": 6.814402128721104e-05, + "loss": 0.0011, + "step": 68500 + }, + { + "epoch": 8.0801013045911, + "grad_norm": 0.00036145004560239613, + "learning_rate": 6.772825544653251e-05, + "loss": 0.0011, + "step": 68600 + }, + { + "epoch": 8.09188090820744, + "grad_norm": 0.00036129524232819676, + "learning_rate": 6.731248960585398e-05, + "loss": 0.0011, + "step": 68700 + }, + { + "epoch": 8.103660511823778, + "grad_norm": 0.0003919534501619637, + "learning_rate": 6.689672376517544e-05, + "loss": 0.0011, + "step": 68800 + }, + { + "epoch": 8.115440115440116, + "grad_norm": 0.00038478715578094125, + "learning_rate": 6.648095792449691e-05, + "loss": 0.0011, + "step": 68900 + }, + { + "epoch": 8.127219719056454, + "grad_norm": 0.0003716874634847045, + "learning_rate": 6.606519208381839e-05, + "loss": 0.0011, + "step": 69000 + }, + { + "epoch": 8.127219719056454, + "eval_en-ja_loss": 0.0011516198283061385, + "eval_en-ja_mean_accuracy": 0.6729707344008835, + "eval_en-ja_negative_mse": -0.17333927750587463, + "eval_en-ja_runtime": 16.8456, + "eval_en-ja_samples_per_second": 430.023, + "eval_en-ja_src2trg_accuracy": 0.6860850358917725, + "eval_en-ja_steps_per_second": 1.722, + "eval_en-ja_trg2src_accuracy": 0.6598564329099945, + "eval_sequential_score": 0.24981572844750444, + "step": 69000 + }, + { + "epoch": 8.138999322672792, + "grad_norm": 0.0003551229601725936, + "learning_rate": 6.564942624313986e-05, + "loss": 0.0011, + "step": 69100 + }, + { + "epoch": 8.15077892628913, + "grad_norm": 0.0003837377589661628, + "learning_rate": 6.523366040246132e-05, + "loss": 0.0011, + "step": 69200 + }, + { + "epoch": 8.16255852990547, + "grad_norm": 0.00039518391713500023, + "learning_rate": 6.481789456178279e-05, + "loss": 0.0011, + "step": 69300 + }, + { + "epoch": 8.174338133521807, + "grad_norm": 0.0003781222039833665, + "learning_rate": 6.440212872110426e-05, + "loss": 0.0011, + "step": 69400 + }, + { + "epoch": 8.186117737138146, + "grad_norm": 0.0003702257527038455, + "learning_rate": 6.398636288042573e-05, + "loss": 0.0011, + "step": 69500 + }, + { + "epoch": 8.197897340754484, + "grad_norm": 0.00038342276820912957, + "learning_rate": 6.35705970397472e-05, + "loss": 0.0011, + "step": 69600 + }, + { + "epoch": 8.209676944370822, + "grad_norm": 0.00036476284731179476, + "learning_rate": 6.315483119906868e-05, + "loss": 0.0011, + "step": 69700 + }, + { + "epoch": 8.22145654798716, + "grad_norm": 0.0003710215096361935, + "learning_rate": 6.273906535839015e-05, + "loss": 0.0011, + "step": 69800 + }, + { + "epoch": 8.2332361516035, + "grad_norm": 0.00038622686406597495, + "learning_rate": 6.232329951771162e-05, + "loss": 0.0011, + "step": 69900 + }, + { + "epoch": 8.245015755219837, + "grad_norm": 0.00037663665716536343, + "learning_rate": 6.19075336770331e-05, + "loss": 0.0011, + "step": 70000 + }, + { + "epoch": 8.245015755219837, + "eval_en-ja_loss": 0.001151022152043879, + "eval_en-ja_mean_accuracy": 0.6725565985643291, + "eval_en-ja_negative_mse": -0.1732015758752823, + "eval_en-ja_runtime": 16.2396, + "eval_en-ja_samples_per_second": 446.069, + "eval_en-ja_src2trg_accuracy": 0.6864991717283269, + "eval_en-ja_steps_per_second": 1.786, + "eval_en-ja_trg2src_accuracy": 0.6586140254003313, + "eval_sequential_score": 0.24967751134452343, + "step": 70000 + }, + { + "epoch": 8.256795358836175, + "grad_norm": 0.0003928474325221032, + "learning_rate": 6.149176783635457e-05, + "loss": 0.0011, + "step": 70100 + }, + { + "epoch": 8.268574962452513, + "grad_norm": 0.0003761306288652122, + "learning_rate": 6.107600199567603e-05, + "loss": 0.0011, + "step": 70200 + }, + { + "epoch": 8.280354566068851, + "grad_norm": 0.0003697881766129285, + "learning_rate": 6.06602361549975e-05, + "loss": 0.0011, + "step": 70300 + }, + { + "epoch": 8.29213416968519, + "grad_norm": 0.000399710435885936, + "learning_rate": 6.024447031431897e-05, + "loss": 0.0011, + "step": 70400 + }, + { + "epoch": 8.30391377330153, + "grad_norm": 0.0004011922574136406, + "learning_rate": 5.9828704473640444e-05, + "loss": 0.0011, + "step": 70500 + }, + { + "epoch": 8.315693376917867, + "grad_norm": 0.00036803894909098744, + "learning_rate": 5.941293863296191e-05, + "loss": 0.0011, + "step": 70600 + }, + { + "epoch": 8.327472980534205, + "grad_norm": 0.00035756686702370644, + "learning_rate": 5.899717279228338e-05, + "loss": 0.0011, + "step": 70700 + }, + { + "epoch": 8.339252584150543, + "grad_norm": 0.0004143210535403341, + "learning_rate": 5.858140695160485e-05, + "loss": 0.0011, + "step": 70800 + }, + { + "epoch": 8.351032187766881, + "grad_norm": 0.0003596473834477365, + "learning_rate": 5.816564111092632e-05, + "loss": 0.0011, + "step": 70900 + }, + { + "epoch": 8.36281179138322, + "grad_norm": 0.0003588312538340688, + "learning_rate": 5.774987527024779e-05, + "loss": 0.0011, + "step": 71000 + }, + { + "epoch": 8.36281179138322, + "eval_en-ja_loss": 0.0011520846746861935, + "eval_en-ja_mean_accuracy": 0.6726256212037549, + "eval_en-ja_negative_mse": -0.17346236109733582, + "eval_en-ja_runtime": 15.8415, + "eval_en-ja_samples_per_second": 457.28, + "eval_en-ja_src2trg_accuracy": 0.6856709000552181, + "eval_en-ja_steps_per_second": 1.831, + "eval_en-ja_trg2src_accuracy": 0.6595803423522916, + "eval_sequential_score": 0.24958163005320955, + "step": 71000 + }, + { + "epoch": 8.374591394999559, + "grad_norm": 0.00040090797119773924, + "learning_rate": 5.733410942956926e-05, + "loss": 0.0011, + "step": 71100 + }, + { + "epoch": 8.386370998615897, + "grad_norm": 0.0003644197713583708, + "learning_rate": 5.691834358889073e-05, + "loss": 0.0011, + "step": 71200 + }, + { + "epoch": 8.398150602232235, + "grad_norm": 0.00037489531678147614, + "learning_rate": 5.65025777482122e-05, + "loss": 0.0011, + "step": 71300 + }, + { + "epoch": 8.409930205848573, + "grad_norm": 0.00038432059227488935, + "learning_rate": 5.6086811907533675e-05, + "loss": 0.0011, + "step": 71400 + }, + { + "epoch": 8.421709809464911, + "grad_norm": 0.0003840674180537462, + "learning_rate": 5.567104606685515e-05, + "loss": 0.0011, + "step": 71500 + }, + { + "epoch": 8.433489413081249, + "grad_norm": 0.00038858296466059983, + "learning_rate": 5.525528022617661e-05, + "loss": 0.0011, + "step": 71600 + }, + { + "epoch": 8.445269016697589, + "grad_norm": 0.0003745088470168412, + "learning_rate": 5.4839514385498085e-05, + "loss": 0.0011, + "step": 71700 + }, + { + "epoch": 8.457048620313927, + "grad_norm": 0.00037446763599291444, + "learning_rate": 5.442374854481955e-05, + "loss": 0.0011, + "step": 71800 + }, + { + "epoch": 8.468828223930265, + "grad_norm": 0.0003915918350685388, + "learning_rate": 5.400798270414102e-05, + "loss": 0.0011, + "step": 71900 + }, + { + "epoch": 8.480607827546603, + "grad_norm": 0.0003920751914847642, + "learning_rate": 5.359221686346249e-05, + "loss": 0.0011, + "step": 72000 + }, + { + "epoch": 8.480607827546603, + "eval_en-ja_loss": 0.0011500016553327441, + "eval_en-ja_mean_accuracy": 0.674903368304804, + "eval_en-ja_negative_mse": -0.1732206493616104, + "eval_en-ja_runtime": 16.3938, + "eval_en-ja_samples_per_second": 441.876, + "eval_en-ja_src2trg_accuracy": 0.6878796245168415, + "eval_en-ja_steps_per_second": 1.769, + "eval_en-ja_trg2src_accuracy": 0.6619271120927664, + "eval_sequential_score": 0.2508413594715968, + "step": 72000 + }, + { + "epoch": 8.49238743116294, + "grad_norm": 0.0003714688937179744, + "learning_rate": 5.317645102278396e-05, + "loss": 0.0011, + "step": 72100 + }, + { + "epoch": 8.504167034779279, + "grad_norm": 0.0003740395186468959, + "learning_rate": 5.276068518210543e-05, + "loss": 0.0011, + "step": 72200 + }, + { + "epoch": 8.515946638395619, + "grad_norm": 0.0003702123067341745, + "learning_rate": 5.2344919341426905e-05, + "loss": 0.0011, + "step": 72300 + }, + { + "epoch": 8.527726242011957, + "grad_norm": 0.0003866225015372038, + "learning_rate": 5.192915350074838e-05, + "loss": 0.0011, + "step": 72400 + }, + { + "epoch": 8.539505845628295, + "grad_norm": 0.00037718715611845255, + "learning_rate": 5.151338766006984e-05, + "loss": 0.0011, + "step": 72500 + }, + { + "epoch": 8.551285449244633, + "grad_norm": 0.00036451805499382317, + "learning_rate": 5.1097621819391315e-05, + "loss": 0.0011, + "step": 72600 + }, + { + "epoch": 8.56306505286097, + "grad_norm": 0.00039494872908107936, + "learning_rate": 5.068185597871279e-05, + "loss": 0.0011, + "step": 72700 + }, + { + "epoch": 8.57484465647731, + "grad_norm": 0.00037671206519007683, + "learning_rate": 5.026609013803426e-05, + "loss": 0.0011, + "step": 72800 + }, + { + "epoch": 8.586624260093648, + "grad_norm": 0.0003754513163585216, + "learning_rate": 4.9850324297355726e-05, + "loss": 0.0011, + "step": 72900 + }, + { + "epoch": 8.598403863709986, + "grad_norm": 0.0003859667631331831, + "learning_rate": 4.943455845667719e-05, + "loss": 0.0011, + "step": 73000 + }, + { + "epoch": 8.598403863709986, + "eval_en-ja_loss": 0.0011476819636300206, + "eval_en-ja_mean_accuracy": 0.6724875759249034, + "eval_en-ja_negative_mse": -0.17299410700798035, + "eval_en-ja_runtime": 15.2569, + "eval_en-ja_samples_per_second": 474.802, + "eval_en-ja_src2trg_accuracy": 0.6863611264494754, + "eval_en-ja_steps_per_second": 1.901, + "eval_en-ja_trg2src_accuracy": 0.6586140254003313, + "eval_sequential_score": 0.24974673445846152, + "step": 73000 + }, + { + "epoch": 8.610183467326324, + "grad_norm": 0.0003951348189730197, + "learning_rate": 4.9018792615998664e-05, + "loss": 0.0011, + "step": 73100 + }, + { + "epoch": 8.621963070942662, + "grad_norm": 0.00040716808871366084, + "learning_rate": 4.8603026775320136e-05, + "loss": 0.0011, + "step": 73200 + }, + { + "epoch": 8.633742674559, + "grad_norm": 0.00038438240881077945, + "learning_rate": 4.818726093464161e-05, + "loss": 0.0011, + "step": 73300 + }, + { + "epoch": 8.645522278175338, + "grad_norm": 0.00038383385981433094, + "learning_rate": 4.7771495093963074e-05, + "loss": 0.0011, + "step": 73400 + }, + { + "epoch": 8.657301881791678, + "grad_norm": 0.00034663823316805065, + "learning_rate": 4.7355729253284546e-05, + "loss": 0.0011, + "step": 73500 + }, + { + "epoch": 8.669081485408016, + "grad_norm": 0.000398988660890609, + "learning_rate": 4.693996341260602e-05, + "loss": 0.0011, + "step": 73600 + }, + { + "epoch": 8.680861089024354, + "grad_norm": 0.00037689783493988216, + "learning_rate": 4.652419757192749e-05, + "loss": 0.0011, + "step": 73700 + }, + { + "epoch": 8.692640692640692, + "grad_norm": 0.0004017073370050639, + "learning_rate": 4.610843173124896e-05, + "loss": 0.0011, + "step": 73800 + }, + { + "epoch": 8.70442029625703, + "grad_norm": 0.00036805891431868076, + "learning_rate": 4.569266589057042e-05, + "loss": 0.0011, + "step": 73900 + }, + { + "epoch": 8.71619989987337, + "grad_norm": 0.0003851047367788851, + "learning_rate": 4.5276900049891894e-05, + "loss": 0.0011, + "step": 74000 + }, + { + "epoch": 8.71619989987337, + "eval_en-ja_loss": 0.0011471901088953018, + "eval_en-ja_mean_accuracy": 0.6719353948094975, + "eval_en-ja_negative_mse": -0.172978475689888, + "eval_en-ja_runtime": 16.6342, + "eval_en-ja_samples_per_second": 435.488, + "eval_en-ja_src2trg_accuracy": 0.6849806736609608, + "eval_en-ja_steps_per_second": 1.743, + "eval_en-ja_trg2src_accuracy": 0.6588901159580343, + "eval_sequential_score": 0.24947845955980474, + "step": 74000 + }, + { + "epoch": 8.727979503489708, + "grad_norm": 0.00038057490019127727, + "learning_rate": 4.4861134209213366e-05, + "loss": 0.0011, + "step": 74100 + }, + { + "epoch": 8.739759107106046, + "grad_norm": 0.0003939093148801476, + "learning_rate": 4.444536836853484e-05, + "loss": 0.0011, + "step": 74200 + }, + { + "epoch": 8.751538710722384, + "grad_norm": 0.0003625132085289806, + "learning_rate": 4.402960252785631e-05, + "loss": 0.0011, + "step": 74300 + }, + { + "epoch": 8.763318314338722, + "grad_norm": 0.0003674387698993087, + "learning_rate": 4.361383668717778e-05, + "loss": 0.0011, + "step": 74400 + }, + { + "epoch": 8.77509791795506, + "grad_norm": 0.0003783796855714172, + "learning_rate": 4.319807084649925e-05, + "loss": 0.0011, + "step": 74500 + }, + { + "epoch": 8.7868775215714, + "grad_norm": 0.0003727516159415245, + "learning_rate": 4.278230500582072e-05, + "loss": 0.0011, + "step": 74600 + }, + { + "epoch": 8.798657125187738, + "grad_norm": 0.00037171467556618154, + "learning_rate": 4.2366539165142194e-05, + "loss": 0.0011, + "step": 74700 + }, + { + "epoch": 8.810436728804076, + "grad_norm": 0.0003745635913219303, + "learning_rate": 4.195077332446365e-05, + "loss": 0.0011, + "step": 74800 + }, + { + "epoch": 8.822216332420414, + "grad_norm": 0.0003792518109548837, + "learning_rate": 4.1535007483785125e-05, + "loss": 0.0011, + "step": 74900 + }, + { + "epoch": 8.833995936036752, + "grad_norm": 0.0003704579721670598, + "learning_rate": 4.11192416431066e-05, + "loss": 0.0011, + "step": 75000 + }, + { + "epoch": 8.833995936036752, + "eval_en-ja_loss": 0.0011465653078630567, + "eval_en-ja_mean_accuracy": 0.6727636664826063, + "eval_en-ja_negative_mse": -0.17291390895843506, + "eval_en-ja_runtime": 16.5423, + "eval_en-ja_samples_per_second": 437.907, + "eval_en-ja_src2trg_accuracy": 0.6863611264494754, + "eval_en-ja_steps_per_second": 1.753, + "eval_en-ja_trg2src_accuracy": 0.6591662065157372, + "eval_sequential_score": 0.24992487876208563, + "step": 75000 + }, + { + "epoch": 8.84577553965309, + "grad_norm": 0.0003647334815468639, + "learning_rate": 4.070347580242807e-05, + "loss": 0.0011, + "step": 75100 + }, + { + "epoch": 8.85755514326943, + "grad_norm": 0.0003647030680440366, + "learning_rate": 4.028770996174954e-05, + "loss": 0.0011, + "step": 75200 + }, + { + "epoch": 8.869334746885768, + "grad_norm": 0.00038341653998941183, + "learning_rate": 3.987194412107101e-05, + "loss": 0.0011, + "step": 75300 + }, + { + "epoch": 8.881114350502106, + "grad_norm": 0.00035631400533020496, + "learning_rate": 3.945617828039248e-05, + "loss": 0.0011, + "step": 75400 + }, + { + "epoch": 8.892893954118444, + "grad_norm": 0.00038093674811534584, + "learning_rate": 3.904041243971395e-05, + "loss": 0.0011, + "step": 75500 + }, + { + "epoch": 8.904673557734782, + "grad_norm": 0.00036318585625849664, + "learning_rate": 3.8624646599035424e-05, + "loss": 0.0011, + "step": 75600 + }, + { + "epoch": 8.91645316135112, + "grad_norm": 0.0003946650249417871, + "learning_rate": 3.8208880758356897e-05, + "loss": 0.0011, + "step": 75700 + }, + { + "epoch": 8.92823276496746, + "grad_norm": 0.0004018346662633121, + "learning_rate": 3.7793114917678355e-05, + "loss": 0.0011, + "step": 75800 + }, + { + "epoch": 8.940012368583798, + "grad_norm": 0.0003674339095596224, + "learning_rate": 3.737734907699983e-05, + "loss": 0.0011, + "step": 75900 + }, + { + "epoch": 8.951791972200136, + "grad_norm": 0.00034675252391025424, + "learning_rate": 3.69615832363213e-05, + "loss": 0.0011, + "step": 76000 + }, + { + "epoch": 8.951791972200136, + "eval_en-ja_loss": 0.0011459133820608258, + "eval_en-ja_mean_accuracy": 0.6741441192711208, + "eval_en-ja_negative_mse": -0.17281439900398254, + "eval_en-ja_runtime": 16.2218, + "eval_en-ja_samples_per_second": 446.56, + "eval_en-ja_src2trg_accuracy": 0.6869133075648812, + "eval_en-ja_steps_per_second": 1.788, + "eval_en-ja_trg2src_accuracy": 0.6613749309773606, + "eval_sequential_score": 0.25066486013356915, + "step": 76000 + }, + { + "epoch": 8.963571575816474, + "grad_norm": 0.0003767320595216006, + "learning_rate": 3.654581739564277e-05, + "loss": 0.0011, + "step": 76100 + }, + { + "epoch": 8.975351179432812, + "grad_norm": 0.0003760862455237657, + "learning_rate": 3.6130051554964245e-05, + "loss": 0.0011, + "step": 76200 + }, + { + "epoch": 8.98713078304915, + "grad_norm": 0.00038563867565244436, + "learning_rate": 3.571428571428571e-05, + "loss": 0.0011, + "step": 76300 + }, + { + "epoch": 8.99891038666549, + "grad_norm": 0.00037326893652789295, + "learning_rate": 3.529851987360718e-05, + "loss": 0.0011, + "step": 76400 + }, + { + "epoch": 9.010601643254704, + "grad_norm": 0.00037344417069107294, + "learning_rate": 3.4882754032928655e-05, + "loss": 0.0011, + "step": 76500 + }, + { + "epoch": 9.022381246871042, + "grad_norm": 0.0003961708571296185, + "learning_rate": 3.446698819225012e-05, + "loss": 0.0011, + "step": 76600 + }, + { + "epoch": 9.034160850487382, + "grad_norm": 0.0003513297706376761, + "learning_rate": 3.405122235157159e-05, + "loss": 0.0011, + "step": 76700 + }, + { + "epoch": 9.04594045410372, + "grad_norm": 0.00036019927938468754, + "learning_rate": 3.363545651089306e-05, + "loss": 0.0011, + "step": 76800 + }, + { + "epoch": 9.057720057720058, + "grad_norm": 0.0003701628011185676, + "learning_rate": 3.321969067021453e-05, + "loss": 0.0011, + "step": 76900 + }, + { + "epoch": 9.069499661336396, + "grad_norm": 0.00038925156695768237, + "learning_rate": 3.2803924829536e-05, + "loss": 0.0011, + "step": 77000 + }, + { + "epoch": 9.069499661336396, + "eval_en-ja_loss": 0.001144262496381998, + "eval_en-ja_mean_accuracy": 0.672832689122032, + "eval_en-ja_negative_mse": -0.17259259521961212, + "eval_en-ja_runtime": 16.7877, + "eval_en-ja_samples_per_second": 431.507, + "eval_en-ja_src2trg_accuracy": 0.6858089453340696, + "eval_en-ja_steps_per_second": 1.727, + "eval_en-ja_trg2src_accuracy": 0.6598564329099945, + "eval_sequential_score": 0.25012004695120993, + "step": 77000 + }, + { + "epoch": 9.081279264952734, + "grad_norm": 0.00037116726161912084, + "learning_rate": 3.2388158988857475e-05, + "loss": 0.0011, + "step": 77100 + }, + { + "epoch": 9.093058868569072, + "grad_norm": 0.00036074183299206197, + "learning_rate": 3.197239314817895e-05, + "loss": 0.0011, + "step": 77200 + }, + { + "epoch": 9.104838472185412, + "grad_norm": 0.0003990030672866851, + "learning_rate": 3.155662730750041e-05, + "loss": 0.0011, + "step": 77300 + }, + { + "epoch": 9.11661807580175, + "grad_norm": 0.0003688944852910936, + "learning_rate": 3.1140861466821885e-05, + "loss": 0.0011, + "step": 77400 + }, + { + "epoch": 9.128397679418088, + "grad_norm": 0.00038703533937223256, + "learning_rate": 3.072509562614335e-05, + "loss": 0.0011, + "step": 77500 + }, + { + "epoch": 9.140177283034426, + "grad_norm": 0.00038619840051978827, + "learning_rate": 3.0309329785464823e-05, + "loss": 0.0011, + "step": 77600 + }, + { + "epoch": 9.151956886650764, + "grad_norm": 0.00036552129313349724, + "learning_rate": 2.9893563944786292e-05, + "loss": 0.0011, + "step": 77700 + }, + { + "epoch": 9.163736490267102, + "grad_norm": 0.000375434203306213, + "learning_rate": 2.9477798104107765e-05, + "loss": 0.0011, + "step": 77800 + }, + { + "epoch": 9.175516093883441, + "grad_norm": 0.0003523014602251351, + "learning_rate": 2.9062032263429237e-05, + "loss": 0.0011, + "step": 77900 + }, + { + "epoch": 9.18729569749978, + "grad_norm": 0.0003509012167342007, + "learning_rate": 2.8646266422750702e-05, + "loss": 0.0011, + "step": 78000 + }, + { + "epoch": 9.18729569749978, + "eval_en-ja_loss": 0.0011447904398664832, + "eval_en-ja_mean_accuracy": 0.6735229155162894, + "eval_en-ja_negative_mse": -0.17272119224071503, + "eval_en-ja_runtime": 16.3845, + "eval_en-ja_samples_per_second": 442.124, + "eval_en-ja_src2trg_accuracy": 0.6870513528437328, + "eval_en-ja_steps_per_second": 1.77, + "eval_en-ja_trg2src_accuracy": 0.6599944781888459, + "eval_sequential_score": 0.2504008616377872, + "step": 78000 + }, + { + "epoch": 9.199075301116117, + "grad_norm": 0.0003553859132807702, + "learning_rate": 2.8230500582072175e-05, + "loss": 0.0011, + "step": 78100 + }, + { + "epoch": 9.210854904732455, + "grad_norm": 0.0003659150388557464, + "learning_rate": 2.7814734741393644e-05, + "loss": 0.0011, + "step": 78200 + }, + { + "epoch": 9.222634508348794, + "grad_norm": 0.0003826880711130798, + "learning_rate": 2.7398968900715116e-05, + "loss": 0.0011, + "step": 78300 + }, + { + "epoch": 9.234414111965132, + "grad_norm": 0.00036154358531348407, + "learning_rate": 2.698320306003658e-05, + "loss": 0.0011, + "step": 78400 + }, + { + "epoch": 9.246193715581471, + "grad_norm": 0.000358589953975752, + "learning_rate": 2.6567437219358054e-05, + "loss": 0.0011, + "step": 78500 + }, + { + "epoch": 9.25797331919781, + "grad_norm": 0.00037929543759673834, + "learning_rate": 2.6151671378679526e-05, + "loss": 0.0011, + "step": 78600 + }, + { + "epoch": 9.269752922814147, + "grad_norm": 0.0003660687361843884, + "learning_rate": 2.5735905538000995e-05, + "loss": 0.0011, + "step": 78700 + }, + { + "epoch": 9.281532526430485, + "grad_norm": 0.00038304622285068035, + "learning_rate": 2.5320139697322468e-05, + "loss": 0.0011, + "step": 78800 + }, + { + "epoch": 9.293312130046823, + "grad_norm": 0.0003703866677824408, + "learning_rate": 2.4904373856643933e-05, + "loss": 0.0011, + "step": 78900 + }, + { + "epoch": 9.305091733663161, + "grad_norm": 0.0003485914785414934, + "learning_rate": 2.4488608015965405e-05, + "loss": 0.0011, + "step": 79000 + }, + { + "epoch": 9.305091733663161, + "eval_en-ja_loss": 0.0011436532950028777, + "eval_en-ja_mean_accuracy": 0.6729707344008835, + "eval_en-ja_negative_mse": -0.17267149686813354, + "eval_en-ja_runtime": 16.7393, + "eval_en-ja_samples_per_second": 432.754, + "eval_en-ja_src2trg_accuracy": 0.6869133075648812, + "eval_en-ja_steps_per_second": 1.732, + "eval_en-ja_trg2src_accuracy": 0.6590281612368857, + "eval_sequential_score": 0.250149618766375, + "step": 79000 + }, + { + "epoch": 9.316871337279501, + "grad_norm": 0.0003747837617993355, + "learning_rate": 2.4072842175286874e-05, + "loss": 0.0011, + "step": 79100 + }, + { + "epoch": 9.32865094089584, + "grad_norm": 0.0004102491948287934, + "learning_rate": 2.3657076334608347e-05, + "loss": 0.0011, + "step": 79200 + }, + { + "epoch": 9.340430544512177, + "grad_norm": 0.0003647518460638821, + "learning_rate": 2.324131049392982e-05, + "loss": 0.0011, + "step": 79300 + }, + { + "epoch": 9.352210148128515, + "grad_norm": 0.00038676703115925193, + "learning_rate": 2.2825544653251285e-05, + "loss": 0.0011, + "step": 79400 + }, + { + "epoch": 9.363989751744853, + "grad_norm": 0.00037207905552349985, + "learning_rate": 2.2409778812572757e-05, + "loss": 0.0011, + "step": 79500 + }, + { + "epoch": 9.375769355361193, + "grad_norm": 0.00037133367732167244, + "learning_rate": 2.1994012971894226e-05, + "loss": 0.0011, + "step": 79600 + }, + { + "epoch": 9.387548958977531, + "grad_norm": 0.00036941279540769756, + "learning_rate": 2.1578247131215698e-05, + "loss": 0.0011, + "step": 79700 + }, + { + "epoch": 9.399328562593869, + "grad_norm": 0.000359290192136541, + "learning_rate": 2.1162481290537167e-05, + "loss": 0.0011, + "step": 79800 + }, + { + "epoch": 9.411108166210207, + "grad_norm": 0.0003693661419674754, + "learning_rate": 2.074671544985864e-05, + "loss": 0.0011, + "step": 79900 + }, + { + "epoch": 9.422887769826545, + "grad_norm": 0.0003606871759984642, + "learning_rate": 2.033094960918011e-05, + "loss": 0.0011, + "step": 80000 + }, + { + "epoch": 9.422887769826545, + "eval_en-ja_loss": 0.0011423684190958738, + "eval_en-ja_mean_accuracy": 0.6736609607951408, + "eval_en-ja_negative_mse": -0.17252600193023682, + "eval_en-ja_runtime": 16.0765, + "eval_en-ja_samples_per_second": 450.595, + "eval_en-ja_src2trg_accuracy": 0.6867752622860298, + "eval_en-ja_steps_per_second": 1.804, + "eval_en-ja_trg2src_accuracy": 0.6605466593042518, + "eval_sequential_score": 0.250567479432452, + "step": 80000 + }, + { + "epoch": 9.434667373442883, + "grad_norm": 0.0003646311815828085, + "learning_rate": 1.9915183768501577e-05, + "loss": 0.0011, + "step": 80100 + }, + { + "epoch": 9.446446977059223, + "grad_norm": 0.0003598592011258006, + "learning_rate": 1.949941792782305e-05, + "loss": 0.0011, + "step": 80200 + }, + { + "epoch": 9.45822658067556, + "grad_norm": 0.00038207133184187114, + "learning_rate": 1.908365208714452e-05, + "loss": 0.0011, + "step": 80300 + }, + { + "epoch": 9.470006184291899, + "grad_norm": 0.00036295110476203263, + "learning_rate": 1.866788624646599e-05, + "loss": 0.0011, + "step": 80400 + }, + { + "epoch": 9.481785787908237, + "grad_norm": 0.0003741601249203086, + "learning_rate": 1.825212040578746e-05, + "loss": 0.0011, + "step": 80500 + }, + { + "epoch": 9.493565391524575, + "grad_norm": 0.0003623844822868705, + "learning_rate": 1.783635456510893e-05, + "loss": 0.0011, + "step": 80600 + }, + { + "epoch": 9.505344995140913, + "grad_norm": 0.00037168600829318166, + "learning_rate": 1.7420588724430398e-05, + "loss": 0.0011, + "step": 80700 + }, + { + "epoch": 9.517124598757253, + "grad_norm": 0.000367098196875304, + "learning_rate": 1.700482288375187e-05, + "loss": 0.0011, + "step": 80800 + }, + { + "epoch": 9.52890420237359, + "grad_norm": 0.00036603593616746366, + "learning_rate": 1.658905704307334e-05, + "loss": 0.0011, + "step": 80900 + }, + { + "epoch": 9.540683805989929, + "grad_norm": 0.0003636149049270898, + "learning_rate": 1.617329120239481e-05, + "loss": 0.0011, + "step": 81000 + }, + { + "epoch": 9.540683805989929, + "eval_en-ja_loss": 0.0011418904177844524, + "eval_en-ja_mean_accuracy": 0.674903368304804, + "eval_en-ja_negative_mse": -0.17244380712509155, + "eval_en-ja_runtime": 15.9589, + "eval_en-ja_samples_per_second": 453.915, + "eval_en-ja_src2trg_accuracy": 0.6881557150745444, + "eval_en-ja_steps_per_second": 1.817, + "eval_en-ja_trg2src_accuracy": 0.6616510215350635, + "eval_sequential_score": 0.25122978058985623, + "step": 81000 + }, + { + "epoch": 9.552463409606267, + "grad_norm": 0.00036611512769013643, + "learning_rate": 1.575752536171628e-05, + "loss": 0.0011, + "step": 81100 + }, + { + "epoch": 9.564243013222605, + "grad_norm": 0.00036720981006510556, + "learning_rate": 1.534175952103775e-05, + "loss": 0.0011, + "step": 81200 + }, + { + "epoch": 9.576022616838943, + "grad_norm": 0.0003722389228641987, + "learning_rate": 1.492599368035922e-05, + "loss": 0.0011, + "step": 81300 + }, + { + "epoch": 9.587802220455282, + "grad_norm": 0.00035411425051279366, + "learning_rate": 1.451022783968069e-05, + "loss": 0.0011, + "step": 81400 + }, + { + "epoch": 9.59958182407162, + "grad_norm": 0.00037509057438001037, + "learning_rate": 1.409446199900216e-05, + "loss": 0.0011, + "step": 81500 + }, + { + "epoch": 9.611361427687958, + "grad_norm": 0.0003512932453304529, + "learning_rate": 1.367869615832363e-05, + "loss": 0.0011, + "step": 81600 + }, + { + "epoch": 9.623141031304296, + "grad_norm": 0.0003858322452288121, + "learning_rate": 1.3262930317645102e-05, + "loss": 0.0011, + "step": 81700 + }, + { + "epoch": 9.634920634920634, + "grad_norm": 0.0003562423516996205, + "learning_rate": 1.2847164476966571e-05, + "loss": 0.0011, + "step": 81800 + }, + { + "epoch": 9.646700238536972, + "grad_norm": 0.0003401229914743453, + "learning_rate": 1.2431398636288042e-05, + "loss": 0.0011, + "step": 81900 + }, + { + "epoch": 9.658479842153312, + "grad_norm": 0.00036892088246531785, + "learning_rate": 1.2015632795609512e-05, + "loss": 0.0011, + "step": 82000 + }, + { + "epoch": 9.658479842153312, + "eval_en-ja_loss": 0.0011419870425015688, + "eval_en-ja_mean_accuracy": 0.6729017117614577, + "eval_en-ja_negative_mse": -0.1725073754787445, + "eval_en-ja_runtime": 15.9533, + "eval_en-ja_samples_per_second": 454.075, + "eval_en-ja_src2trg_accuracy": 0.6860850358917725, + "eval_en-ja_steps_per_second": 1.818, + "eval_en-ja_trg2src_accuracy": 0.659718387631143, + "eval_sequential_score": 0.2501971681413566, + "step": 82000 + }, + { + "epoch": 9.67025944576965, + "grad_norm": 0.0003424219321459532, + "learning_rate": 1.1599866954930981e-05, + "loss": 0.0011, + "step": 82100 + }, + { + "epoch": 9.682039049385988, + "grad_norm": 0.00039850251050665975, + "learning_rate": 1.1184101114252452e-05, + "loss": 0.0011, + "step": 82200 + }, + { + "epoch": 9.693818653002326, + "grad_norm": 0.0003566779778338969, + "learning_rate": 1.0768335273573921e-05, + "loss": 0.0011, + "step": 82300 + }, + { + "epoch": 9.705598256618664, + "grad_norm": 0.00035444454988464713, + "learning_rate": 1.0352569432895393e-05, + "loss": 0.0011, + "step": 82400 + }, + { + "epoch": 9.717377860235002, + "grad_norm": 0.0003690596204251051, + "learning_rate": 9.936803592216864e-06, + "loss": 0.0011, + "step": 82500 + }, + { + "epoch": 9.729157463851342, + "grad_norm": 0.00034330427297391, + "learning_rate": 9.521037751538333e-06, + "loss": 0.0011, + "step": 82600 + }, + { + "epoch": 9.74093706746768, + "grad_norm": 0.0003629418497439474, + "learning_rate": 9.105271910859804e-06, + "loss": 0.0011, + "step": 82700 + }, + { + "epoch": 9.752716671084018, + "grad_norm": 0.0003738144296221435, + "learning_rate": 8.689506070181272e-06, + "loss": 0.0011, + "step": 82800 + }, + { + "epoch": 9.764496274700356, + "grad_norm": 0.000365712505299598, + "learning_rate": 8.273740229502743e-06, + "loss": 0.0011, + "step": 82900 + }, + { + "epoch": 9.776275878316694, + "grad_norm": 0.00036353274481371045, + "learning_rate": 7.857974388824214e-06, + "loss": 0.0011, + "step": 83000 + }, + { + "epoch": 9.776275878316694, + "eval_en-ja_loss": 0.001140790292993188, + "eval_en-ja_mean_accuracy": 0.6727636664826063, + "eval_en-ja_negative_mse": -0.17234469950199127, + "eval_en-ja_runtime": 15.9338, + "eval_en-ja_samples_per_second": 454.63, + "eval_en-ja_src2trg_accuracy": 0.6863611264494754, + "eval_en-ja_steps_per_second": 1.82, + "eval_en-ja_trg2src_accuracy": 0.6591662065157372, + "eval_sequential_score": 0.2502094834903075, + "step": 83000 + }, + { + "epoch": 9.788055481933032, + "grad_norm": 0.0003632176958490163, + "learning_rate": 7.4422085481456835e-06, + "loss": 0.0011, + "step": 83100 + }, + { + "epoch": 9.799835085549372, + "grad_norm": 0.00040425657061859965, + "learning_rate": 7.026442707467153e-06, + "loss": 0.0011, + "step": 83200 + }, + { + "epoch": 9.81161468916571, + "grad_norm": 0.00036839439417235553, + "learning_rate": 6.610676866788625e-06, + "loss": 0.0011, + "step": 83300 + }, + { + "epoch": 9.823394292782048, + "grad_norm": 0.00035321430186741054, + "learning_rate": 6.1949110261100945e-06, + "loss": 0.0011, + "step": 83400 + }, + { + "epoch": 9.835173896398386, + "grad_norm": 0.0003472540411166847, + "learning_rate": 5.779145185431564e-06, + "loss": 0.0011, + "step": 83500 + }, + { + "epoch": 9.846953500014724, + "grad_norm": 0.0003592240682337433, + "learning_rate": 5.363379344753035e-06, + "loss": 0.0011, + "step": 83600 + }, + { + "epoch": 9.858733103631064, + "grad_norm": 0.00041063467506319284, + "learning_rate": 4.947613504074505e-06, + "loss": 0.0011, + "step": 83700 + }, + { + "epoch": 9.870512707247402, + "grad_norm": 0.0003799698024522513, + "learning_rate": 4.5318476633959745e-06, + "loss": 0.0011, + "step": 83800 + }, + { + "epoch": 9.88229231086374, + "grad_norm": 0.00036392963374964893, + "learning_rate": 4.116081822717445e-06, + "loss": 0.0011, + "step": 83900 + }, + { + "epoch": 9.894071914480078, + "grad_norm": 0.00036928793997503817, + "learning_rate": 3.7003159820389154e-06, + "loss": 0.0011, + "step": 84000 + }, + { + "epoch": 9.894071914480078, + "eval_en-ja_loss": 0.0011400107759982347, + "eval_en-ja_mean_accuracy": 0.6749723909442298, + "eval_en-ja_negative_mse": -0.17230987548828125, + "eval_en-ja_runtime": 15.4994, + "eval_en-ja_samples_per_second": 467.372, + "eval_en-ja_src2trg_accuracy": 0.688293760353396, + "eval_en-ja_steps_per_second": 1.871, + "eval_en-ja_trg2src_accuracy": 0.6616510215350635, + "eval_sequential_score": 0.25133125772797427, + "step": 84000 + }, + { + "epoch": 9.905851518096416, + "grad_norm": 0.00034306125598959625, + "learning_rate": 3.2845501413603856e-06, + "loss": 0.0011, + "step": 84100 + }, + { + "epoch": 9.917631121712754, + "grad_norm": 0.0003568149986676872, + "learning_rate": 2.8687843006818554e-06, + "loss": 0.0011, + "step": 84200 + }, + { + "epoch": 9.929410725329092, + "grad_norm": 0.0003445280890446156, + "learning_rate": 2.453018460003326e-06, + "loss": 0.0011, + "step": 84300 + }, + { + "epoch": 9.941190328945432, + "grad_norm": 0.0003591186832636595, + "learning_rate": 2.037252619324796e-06, + "loss": 0.0011, + "step": 84400 + }, + { + "epoch": 9.95296993256177, + "grad_norm": 0.00035163512802682817, + "learning_rate": 1.6214867786462662e-06, + "loss": 0.0011, + "step": 84500 + }, + { + "epoch": 9.964749536178108, + "grad_norm": 0.00036031630588695407, + "learning_rate": 1.2057209379677366e-06, + "loss": 0.0011, + "step": 84600 + }, + { + "epoch": 9.976529139794446, + "grad_norm": 0.00034320889972150326, + "learning_rate": 7.899550972892066e-07, + "loss": 0.0011, + "step": 84700 + }, + { + "epoch": 9.988308743410784, + "grad_norm": 0.00036468764301389456, + "learning_rate": 3.741892566106768e-07, + "loss": 0.0011, + "step": 84800 + } + ], + "logging_steps": 100, + "max_steps": 84890, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 256, + "trial_name": null, + "trial_params": null +} \ No newline at end of file