{ "best_metric": null, "best_model_checkpoint": null, "epoch": 24.0, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 234.578125, "epoch": 0.008, "grad_norm": 0.5078125, "kl": 0.0, "learning_rate": 8e-08, "loss": -0.0, "reward": -0.07029185444116592, "reward_std": 0.10451331734657288, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.009191176854074001, "rewards/no_repetition_reward_func": -0.0794830285012722, "rewards/verse_reward_func": 0.0, "step": 1 }, { "completion_length": 237.4375, "epoch": 0.016, "grad_norm": 0.50390625, "kl": 0.0, "learning_rate": 1.6e-07, "loss": -0.0, "reward": -0.046800397336483, "reward_std": 0.12188586592674255, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02995642740279436, "rewards/no_repetition_reward_func": -0.07675682753324509, "rewards/verse_reward_func": 0.0, "step": 2 }, { "completion_length": 232.890625, "epoch": 0.024, "grad_norm": 0.53125, "kl": 0.000829962722491473, "learning_rate": 2.4e-07, "loss": 0.0, "reward": -0.12429150566458702, "reward_std": 0.17918991297483444, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.021012932062149048, "rewards/no_repetition_reward_func": -0.13749194517731667, "rewards/verse_reward_func": -0.0078125, "step": 3 }, { "completion_length": 224.25, "epoch": 0.032, "grad_norm": 0.65625, "kl": 0.0008925290603656322, "learning_rate": 3.2e-07, "loss": 0.0, "reward": -0.11034148931503296, "reward_std": 0.13532811403274536, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0074404762126505375, "rewards/no_repetition_reward_func": -0.10215696692466736, "rewards/verse_reward_func": -0.015625, "step": 4 }, { "completion_length": 231.828125, "epoch": 0.04, "grad_norm": 0.55859375, "kl": 0.000950473127886653, "learning_rate": 4.0000000000000003e-07, "loss": 0.0, "reward": -0.10083281621336937, "reward_std": 0.10725041478872299, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.10083281248807907, "rewards/verse_reward_func": 0.0, "step": 5 }, { "completion_length": 221.546875, "epoch": 0.048, "grad_norm": 0.65234375, "kl": 0.0008773832232691348, "learning_rate": 4.8e-07, "loss": 0.0, "reward": -0.1049312986433506, "reward_std": 0.17812475562095642, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0186011902987957, "rewards/no_repetition_reward_func": -0.1079074926674366, "rewards/verse_reward_func": -0.015625, "step": 6 }, { "completion_length": 228.90625, "epoch": 0.056, "grad_norm": 0.86328125, "kl": 0.0008952018979471177, "learning_rate": 5.6e-07, "loss": 0.0, "reward": -0.06702559348195791, "reward_std": 0.2511194050312042, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.06397855281829834, "rewards/no_repetition_reward_func": -0.11537915095686913, "rewards/verse_reward_func": -0.015625, "step": 7 }, { "completion_length": 237.171875, "epoch": 0.064, "grad_norm": 0.41015625, "kl": 0.0009173652215395123, "learning_rate": 6.4e-07, "loss": 0.0, "reward": -0.0834280326962471, "reward_std": 0.08821814879775047, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.0834280364215374, "rewards/verse_reward_func": 0.0, "step": 8 }, { "completion_length": 236.09375, "epoch": 0.072, "grad_norm": 0.87890625, "kl": 0.0009107163641601801, "learning_rate": 7.2e-07, "loss": 0.0, "reward": -0.06266414746642113, "reward_std": 0.2090400755405426, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04830918088555336, "rewards/no_repetition_reward_func": -0.09534832835197449, "rewards/verse_reward_func": -0.015625, "step": 9 }, { "completion_length": 234.765625, "epoch": 0.08, "grad_norm": 0.5, "kl": 0.0008130542992148548, "learning_rate": 8.000000000000001e-07, "loss": 0.0, "reward": -0.12114381045103073, "reward_std": 0.19046685844659805, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02242823038250208, "rewards/no_repetition_reward_func": -0.13575951755046844, "rewards/verse_reward_func": -0.0078125, "step": 10 }, { "completion_length": 225.125, "epoch": 0.088, "grad_norm": 0.60546875, "kl": 0.0009163094509858638, "learning_rate": 8.8e-07, "loss": 0.0, "reward": -0.1142631508409977, "reward_std": 0.15695324540138245, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.016121032182127237, "rewards/no_repetition_reward_func": -0.0991341844201088, "rewards/verse_reward_func": -0.03125, "step": 11 }, { "completion_length": 223.0, "epoch": 0.096, "grad_norm": 0.82421875, "kl": 0.0009066381608135998, "learning_rate": 9.6e-07, "loss": 0.0, "reward": -0.08941932767629623, "reward_std": 0.14067870378494263, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.014542749151587486, "rewards/no_repetition_reward_func": -0.08052457496523857, "rewards/verse_reward_func": -0.0234375, "step": 12 }, { "completion_length": 225.96875, "epoch": 0.104, "grad_norm": 0.71875, "kl": 0.000948868808336556, "learning_rate": 1.04e-06, "loss": 0.0, "reward": -0.06765815615653992, "reward_std": 0.18116378784179688, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03707417845726013, "rewards/no_repetition_reward_func": -0.08910733461380005, "rewards/verse_reward_func": -0.015625, "step": 13 }, { "completion_length": 231.609375, "epoch": 0.112, "grad_norm": 0.671875, "kl": 0.0008257974695879966, "learning_rate": 1.12e-06, "loss": 0.0, "reward": -0.10997654870152473, "reward_std": 0.16079682111740112, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.014204545877873898, "rewards/no_repetition_reward_func": -0.1085560955107212, "rewards/verse_reward_func": -0.015625, "step": 14 }, { "completion_length": 212.359375, "epoch": 0.12, "grad_norm": 0.6015625, "kl": 0.0008982184808701277, "learning_rate": 1.2000000000000002e-06, "loss": 0.0, "reward": -0.08577539958059788, "reward_std": 0.11965971440076828, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.008223684504628181, "rewards/no_repetition_reward_func": -0.09399908408522606, "rewards/verse_reward_func": 0.0, "step": 15 }, { "completion_length": 244.640625, "epoch": 0.128, "grad_norm": 0.423828125, "kl": 0.0008157639822456986, "learning_rate": 1.28e-06, "loss": 0.0, "reward": -0.12014786899089813, "reward_std": 0.20400109514594078, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.028453406877815723, "rewards/no_repetition_reward_func": -0.14860127866268158, "rewards/verse_reward_func": 0.0, "step": 16 }, { "completion_length": 213.0625, "epoch": 0.136, "grad_norm": 0.734375, "kl": 0.0009511272655799985, "learning_rate": 1.36e-06, "loss": 0.0, "reward": -0.10209917649626732, "reward_std": 0.1469094678759575, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.007560483645647764, "rewards/no_repetition_reward_func": -0.08622216060757637, "rewards/verse_reward_func": -0.0234375, "step": 17 }, { "completion_length": 241.0625, "epoch": 0.144, "grad_norm": 0.4765625, "kl": 0.0009110291430260986, "learning_rate": 1.44e-06, "loss": 0.0, "reward": -0.10876219347119331, "reward_std": 0.12227174639701843, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0078125, "rewards/no_repetition_reward_func": -0.11657469347119331, "rewards/verse_reward_func": 0.0, "step": 18 }, { "completion_length": 231.375, "epoch": 0.152, "grad_norm": 0.50390625, "kl": 0.0009210791904479265, "learning_rate": 1.52e-06, "loss": 0.0, "reward": -0.10426163300871849, "reward_std": 0.12610777840018272, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0078125, "rewards/no_repetition_reward_func": -0.11207414418458939, "rewards/verse_reward_func": 0.0, "step": 19 }, { "completion_length": 224.84375, "epoch": 0.16, "grad_norm": 1.0859375, "kl": 0.0010051967692561448, "learning_rate": 1.6000000000000001e-06, "loss": 0.0, "reward": -0.09108131937682629, "reward_std": 0.1131216362118721, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01525297574698925, "rewards/no_repetition_reward_func": -0.08289679884910583, "rewards/verse_reward_func": -0.0234375, "step": 20 }, { "completion_length": 232.84375, "epoch": 0.168, "grad_norm": 0.5390625, "kl": 0.0009506940841674805, "learning_rate": 1.68e-06, "loss": 0.0, "reward": -0.0641196258366108, "reward_std": 0.08292144164443016, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0074404762126505375, "rewards/no_repetition_reward_func": -0.0715601034462452, "rewards/verse_reward_func": 0.0, "step": 21 }, { "completion_length": 230.984375, "epoch": 0.176, "grad_norm": 0.8359375, "kl": 0.0009471351804677397, "learning_rate": 1.76e-06, "loss": 0.0, "reward": -0.05562848970293999, "reward_std": 0.1474185325205326, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0234375, "rewards/no_repetition_reward_func": -0.07125349342823029, "rewards/verse_reward_func": -0.0078125, "step": 22 }, { "completion_length": 243.875, "epoch": 0.184, "grad_norm": 0.51171875, "kl": 0.0008547779580112547, "learning_rate": 1.84e-06, "loss": 0.0, "reward": -0.11299563199281693, "reward_std": 0.13702279329299927, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.09737062454223633, "rewards/verse_reward_func": -0.015625, "step": 23 }, { "completion_length": 233.203125, "epoch": 0.192, "grad_norm": 0.5234375, "kl": 0.0008922955603338778, "learning_rate": 1.92e-06, "loss": 0.0, "reward": -0.10346489027142525, "reward_std": 0.16529236733913422, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01720610074698925, "rewards/no_repetition_reward_func": -0.10504599660634995, "rewards/verse_reward_func": -0.015625, "step": 24 }, { "completion_length": 233.578125, "epoch": 0.2, "grad_norm": 0.61328125, "kl": 0.0008766758255660534, "learning_rate": 2.0000000000000003e-06, "loss": 0.0, "reward": -0.0827447697520256, "reward_std": 0.1924658641219139, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02957993559539318, "rewards/no_repetition_reward_func": -0.10451219975948334, "rewards/verse_reward_func": -0.0078125, "step": 25 }, { "completion_length": 236.15625, "epoch": 0.208, "grad_norm": 0.484375, "kl": 0.0008589337521698326, "learning_rate": 2.08e-06, "loss": 0.0, "reward": -0.09895991161465645, "reward_std": 0.20940428972244263, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02922533079981804, "rewards/no_repetition_reward_func": -0.1281852424144745, "rewards/verse_reward_func": 0.0, "step": 26 }, { "completion_length": 231.078125, "epoch": 0.216, "grad_norm": 0.4765625, "kl": 0.0009160517365671694, "learning_rate": 2.16e-06, "loss": 0.0, "reward": -0.1198582798242569, "reward_std": 0.13756625354290009, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.1120457835495472, "rewards/verse_reward_func": -0.0078125, "step": 27 }, { "completion_length": 229.1875, "epoch": 0.224, "grad_norm": 0.4296875, "kl": 0.0008074616198427975, "learning_rate": 2.24e-06, "loss": 0.0, "reward": -0.12098969519138336, "reward_std": 0.12192413210868835, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.12098968774080276, "rewards/verse_reward_func": 0.0, "step": 28 }, { "completion_length": 230.109375, "epoch": 0.232, "grad_norm": 0.51171875, "kl": 0.0009061071323230863, "learning_rate": 2.32e-06, "loss": 0.0, "reward": -0.08345619216561317, "reward_std": 0.12353258952498436, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0086805559694767, "rewards/no_repetition_reward_func": -0.09213674999773502, "rewards/verse_reward_func": 0.0, "step": 29 }, { "completion_length": 236.703125, "epoch": 0.24, "grad_norm": 0.51171875, "kl": 0.0008841204107739031, "learning_rate": 2.4000000000000003e-06, "loss": 0.0, "reward": -0.0850614532828331, "reward_std": 0.1487605795264244, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.017414861358702183, "rewards/no_repetition_reward_func": -0.09466380998492241, "rewards/verse_reward_func": -0.0078125, "step": 30 }, { "completion_length": 235.234375, "epoch": 0.248, "grad_norm": 0.5625, "kl": 0.0009039035066962242, "learning_rate": 2.48e-06, "loss": 0.0, "reward": -0.1119324266910553, "reward_std": 0.13293934985995293, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.016121032182127237, "rewards/no_repetition_reward_func": -0.1280534639954567, "rewards/verse_reward_func": 0.0, "step": 31 }, { "completion_length": 232.96875, "epoch": 0.256, "grad_norm": 0.65625, "kl": 0.0008624704205431044, "learning_rate": 2.56e-06, "loss": 0.0, "reward": -0.1293131783604622, "reward_std": 0.17691338807344437, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.018382353708148003, "rewards/no_repetition_reward_func": -0.13207053020596504, "rewards/verse_reward_func": -0.015625, "step": 32 }, { "completion_length": 228.828125, "epoch": 0.264, "grad_norm": 0.546875, "kl": 0.0009184274531435221, "learning_rate": 2.64e-06, "loss": 0.0, "reward": -0.08011619932949543, "reward_std": 0.14918120577931404, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.015625, "rewards/no_repetition_reward_func": -0.09574120491743088, "rewards/verse_reward_func": 0.0, "step": 33 }, { "completion_length": 224.0625, "epoch": 0.272, "grad_norm": 0.9140625, "kl": 0.0009085725178010762, "learning_rate": 2.72e-06, "loss": 0.0, "reward": -0.1309002749621868, "reward_std": 0.17097631096839905, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.09965027496218681, "rewards/verse_reward_func": -0.03125, "step": 34 }, { "completion_length": 235.59375, "epoch": 0.28, "grad_norm": 0.5390625, "kl": 0.0009127297962550074, "learning_rate": 2.8000000000000003e-06, "loss": 0.0, "reward": -0.1411154381930828, "reward_std": 0.17660044133663177, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.13330292701721191, "rewards/verse_reward_func": -0.0078125, "step": 35 }, { "completion_length": 231.984375, "epoch": 0.288, "grad_norm": 0.5078125, "kl": 0.0009109513775911182, "learning_rate": 2.88e-06, "loss": 0.0, "reward": -0.08548308163881302, "reward_std": 0.1076219268143177, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.013804041780531406, "rewards/no_repetition_reward_func": -0.09928712248802185, "rewards/verse_reward_func": 0.0, "step": 36 }, { "completion_length": 228.484375, "epoch": 0.296, "grad_norm": 0.6171875, "kl": 0.0009116008295677602, "learning_rate": 2.9600000000000005e-06, "loss": 0.0, "reward": -0.09420940279960632, "reward_std": 0.10294660925865173, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.09420940279960632, "rewards/verse_reward_func": 0.0, "step": 37 }, { "completion_length": 227.09375, "epoch": 0.304, "grad_norm": 0.52734375, "kl": 0.0009482204332016408, "learning_rate": 3.04e-06, "loss": 0.0, "reward": -0.11299112066626549, "reward_std": 0.11639419943094254, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.11299112066626549, "rewards/verse_reward_func": 0.0, "step": 38 }, { "completion_length": 221.28125, "epoch": 0.312, "grad_norm": 0.6484375, "kl": 0.0008380573708564043, "learning_rate": 3.12e-06, "loss": 0.0, "reward": -0.10707683116197586, "reward_std": 0.13801930844783783, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0074404762126505375, "rewards/no_repetition_reward_func": -0.10670480504631996, "rewards/verse_reward_func": -0.0078125, "step": 39 }, { "completion_length": 237.265625, "epoch": 0.32, "grad_norm": 0.65234375, "kl": 0.0009151692502200603, "learning_rate": 3.2000000000000003e-06, "loss": 0.0, "reward": -0.09380726888775826, "reward_std": 0.08578159287571907, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.08599477261304855, "rewards/verse_reward_func": -0.0078125, "step": 40 }, { "completion_length": 232.640625, "epoch": 0.328, "grad_norm": 0.55859375, "kl": 0.0009369859762955457, "learning_rate": 3.2800000000000004e-06, "loss": 0.0, "reward": -0.10363754257559776, "reward_std": 0.21328559517860413, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03493499429896474, "rewards/no_repetition_reward_func": -0.12294753640890121, "rewards/verse_reward_func": -0.015625, "step": 41 }, { "completion_length": 232.515625, "epoch": 0.336, "grad_norm": 0.80859375, "kl": 0.0009049727232195437, "learning_rate": 3.36e-06, "loss": 0.0, "reward": -0.11345454677939415, "reward_std": 0.10152676701545715, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.10564204305410385, "rewards/verse_reward_func": -0.0078125, "step": 42 }, { "completion_length": 227.046875, "epoch": 0.344, "grad_norm": 0.65625, "kl": 0.0009055770060513169, "learning_rate": 3.44e-06, "loss": 0.0, "reward": -0.06363708339631557, "reward_std": 0.17262688279151917, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02682487480342388, "rewards/no_repetition_reward_func": -0.09046195819973946, "rewards/verse_reward_func": 0.0, "step": 43 }, { "completion_length": 223.4375, "epoch": 0.352, "grad_norm": 0.486328125, "kl": 0.0009291177266277373, "learning_rate": 3.52e-06, "loss": 0.0, "reward": -0.08017076924443245, "reward_std": 0.17380152456462383, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.024671053513884544, "rewards/no_repetition_reward_func": -0.104841822758317, "rewards/verse_reward_func": 0.0, "step": 44 }, { "completion_length": 234.359375, "epoch": 0.36, "grad_norm": 0.68359375, "kl": 0.0009367179591208696, "learning_rate": 3.6e-06, "loss": 0.0, "reward": -0.070566950365901, "reward_std": 0.103252362459898, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0086805559694767, "rewards/no_repetition_reward_func": -0.07143500447273254, "rewards/verse_reward_func": -0.0078125, "step": 45 }, { "completion_length": 230.6875, "epoch": 0.368, "grad_norm": 0.72265625, "kl": 0.0009074956469703466, "learning_rate": 3.68e-06, "loss": 0.0, "reward": -0.09840996563434601, "reward_std": 0.11776120960712433, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.006793478038161993, "rewards/no_repetition_reward_func": -0.09739093855023384, "rewards/verse_reward_func": -0.0078125, "step": 46 }, { "completion_length": 227.546875, "epoch": 0.376, "grad_norm": 0.58203125, "kl": 0.000938474026042968, "learning_rate": 3.7600000000000004e-06, "loss": 0.0, "reward": -0.08063029125332832, "reward_std": 0.16702739894390106, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.018382353708148003, "rewards/no_repetition_reward_func": -0.09901264682412148, "rewards/verse_reward_func": 0.0, "step": 47 }, { "completion_length": 238.71875, "epoch": 0.384, "grad_norm": 0.5, "kl": 0.0009473034297116101, "learning_rate": 3.84e-06, "loss": 0.0, "reward": -0.056341368705034256, "reward_std": 0.1851743832230568, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.037109375, "rewards/no_repetition_reward_func": -0.09345073997974396, "rewards/verse_reward_func": 0.0, "step": 48 }, { "completion_length": 211.390625, "epoch": 0.392, "grad_norm": 1.1875, "kl": 0.000885247573023662, "learning_rate": 3.92e-06, "loss": 0.0, "reward": -0.1284841112792492, "reward_std": 0.14908602833747864, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.007102272938936949, "rewards/no_repetition_reward_func": -0.10433638095855713, "rewards/verse_reward_func": -0.03125, "step": 49 }, { "completion_length": 224.46875, "epoch": 0.4, "grad_norm": 0.95703125, "kl": 0.0008466793806292117, "learning_rate": 4.000000000000001e-06, "loss": 0.0, "reward": -0.10290900617837906, "reward_std": 0.14872103184461594, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.014880952425301075, "rewards/no_repetition_reward_func": -0.10216495767235756, "rewards/verse_reward_func": -0.015625, "step": 50 }, { "completion_length": 224.3125, "epoch": 0.408, "grad_norm": 0.7421875, "kl": 0.0009395062224939466, "learning_rate": 4.080000000000001e-06, "loss": 0.0, "reward": -0.1067863367497921, "reward_std": 0.1559562385082245, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01864035055041313, "rewards/no_repetition_reward_func": -0.11761419475078583, "rewards/verse_reward_func": -0.0078125, "step": 51 }, { "completion_length": 233.328125, "epoch": 0.416, "grad_norm": 3.21875, "kl": 0.0008471752516925335, "learning_rate": 4.16e-06, "loss": 0.0, "reward": -0.07651101611554623, "reward_std": 0.1582777351140976, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0217013880610466, "rewards/no_repetition_reward_func": -0.08258740231394768, "rewards/verse_reward_func": -0.015625, "step": 52 }, { "completion_length": 226.8125, "epoch": 0.424, "grad_norm": 0.77734375, "kl": 0.0009691714076325297, "learning_rate": 4.24e-06, "loss": 0.0, "reward": -0.05908671393990517, "reward_std": 0.1914033740758896, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04081058315932751, "rewards/no_repetition_reward_func": -0.08427229896187782, "rewards/verse_reward_func": -0.015625, "step": 53 }, { "completion_length": 232.390625, "epoch": 0.432, "grad_norm": 0.486328125, "kl": 0.0008469471649732441, "learning_rate": 4.32e-06, "loss": 0.0, "reward": -0.07930327206850052, "reward_std": 0.13239499181509018, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.017578125, "rewards/no_repetition_reward_func": -0.09688139706850052, "rewards/verse_reward_func": 0.0, "step": 54 }, { "completion_length": 236.171875, "epoch": 0.44, "grad_norm": 0.478515625, "kl": 0.0009291544556617737, "learning_rate": 4.4e-06, "loss": 0.0, "reward": -0.08240247517824173, "reward_std": 0.16815150529146194, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02616747049614787, "rewards/no_repetition_reward_func": -0.10856993868947029, "rewards/verse_reward_func": 0.0, "step": 55 }, { "completion_length": 236.21875, "epoch": 0.448, "grad_norm": 0.4765625, "kl": 0.0009327279985882342, "learning_rate": 4.48e-06, "loss": 0.0, "reward": -0.083309855312109, "reward_std": 0.14545220881700516, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.021053165663033724, "rewards/no_repetition_reward_func": -0.10436301305890083, "rewards/verse_reward_func": 0.0, "step": 56 }, { "completion_length": 229.171875, "epoch": 0.456, "grad_norm": 1.09375, "kl": 0.0009404254960827529, "learning_rate": 4.56e-06, "loss": 0.0, "reward": -0.09063693881034851, "reward_std": 0.1346384808421135, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0086805559694767, "rewards/no_repetition_reward_func": -0.07587999850511551, "rewards/verse_reward_func": -0.0234375, "step": 57 }, { "completion_length": 227.0625, "epoch": 0.464, "grad_norm": 0.6953125, "kl": 0.0009185484668705612, "learning_rate": 4.64e-06, "loss": 0.0, "reward": -0.12275362759828568, "reward_std": 0.15785391628742218, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.11494112759828568, "rewards/verse_reward_func": -0.0078125, "step": 58 }, { "completion_length": 219.8125, "epoch": 0.472, "grad_norm": 0.57421875, "kl": 0.000917254772502929, "learning_rate": 4.72e-06, "loss": 0.0, "reward": -0.0648726224899292, "reward_std": 0.2277185432612896, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04531250149011612, "rewards/no_repetition_reward_func": -0.08674762398004532, "rewards/verse_reward_func": -0.0234375, "step": 59 }, { "completion_length": 225.078125, "epoch": 0.48, "grad_norm": 0.4921875, "kl": 0.0008833970641717315, "learning_rate": 4.800000000000001e-06, "loss": 0.0, "reward": -0.09633488953113556, "reward_std": 0.11032947897911072, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.08852238208055496, "rewards/verse_reward_func": -0.0078125, "step": 60 }, { "completion_length": 223.453125, "epoch": 0.488, "grad_norm": 0.8046875, "kl": 0.0009509058436378837, "learning_rate": 4.880000000000001e-06, "loss": 0.0, "reward": -0.0631482508033514, "reward_std": 0.12232452630996704, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01460597850382328, "rewards/no_repetition_reward_func": -0.06994172558188438, "rewards/verse_reward_func": -0.0078125, "step": 61 }, { "completion_length": 223.953125, "epoch": 0.496, "grad_norm": 0.9453125, "kl": 0.0009646493708714843, "learning_rate": 4.96e-06, "loss": 0.0, "reward": -0.10367628931999207, "reward_std": 0.13329732045531273, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01759958826005459, "rewards/no_repetition_reward_func": -0.1134633831679821, "rewards/verse_reward_func": -0.0078125, "step": 62 }, { "completion_length": 225.46875, "epoch": 0.504, "grad_norm": 0.87109375, "kl": 0.000900004553841427, "learning_rate": 5.04e-06, "loss": 0.0, "reward": -0.0993577390909195, "reward_std": 0.11302126944065094, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.0915452390909195, "rewards/verse_reward_func": -0.0078125, "step": 63 }, { "completion_length": 232.109375, "epoch": 0.512, "grad_norm": 0.44921875, "kl": 0.0008504063007421792, "learning_rate": 5.12e-06, "loss": 0.0, "reward": -0.12935913354158401, "reward_std": 0.12859037145972252, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.12935911864042282, "rewards/verse_reward_func": 0.0, "step": 64 }, { "completion_length": 217.046875, "epoch": 0.52, "grad_norm": 0.9921875, "kl": 0.0009185618255287409, "learning_rate": 5.2e-06, "loss": 0.0, "reward": -0.09801210835576057, "reward_std": 0.1046023741364479, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.06676211021840572, "rewards/verse_reward_func": -0.03125, "step": 65 }, { "completion_length": 223.375, "epoch": 0.528, "grad_norm": 0.71484375, "kl": 0.0009222006483469158, "learning_rate": 5.28e-06, "loss": 0.0, "reward": -0.0882854089140892, "reward_std": 0.12013217061758041, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.008223684504628181, "rewards/no_repetition_reward_func": -0.08869659155607224, "rewards/verse_reward_func": -0.0078125, "step": 66 }, { "completion_length": 222.328125, "epoch": 0.536, "grad_norm": 1.0703125, "kl": 0.0009764206188265234, "learning_rate": 5.36e-06, "loss": 0.0, "reward": -0.07260116934776306, "reward_std": 0.10710709542036057, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.007102272938936949, "rewards/no_repetition_reward_func": -0.06407843716442585, "rewards/verse_reward_func": -0.015625, "step": 67 }, { "completion_length": 225.21875, "epoch": 0.544, "grad_norm": 0.59375, "kl": 0.0008670419338159263, "learning_rate": 5.44e-06, "loss": 0.0, "reward": -0.11209752410650253, "reward_std": 0.1401442177593708, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.10428501665592194, "rewards/verse_reward_func": -0.0078125, "step": 68 }, { "completion_length": 230.421875, "epoch": 0.552, "grad_norm": 0.5859375, "kl": 0.0009191063873004168, "learning_rate": 5.5200000000000005e-06, "loss": 0.0, "reward": -0.06064521707594395, "reward_std": 0.16238445043563843, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.026486672461032867, "rewards/no_repetition_reward_func": -0.07931939139962196, "rewards/verse_reward_func": -0.0078125, "step": 69 }, { "completion_length": 229.78125, "epoch": 0.56, "grad_norm": 0.74609375, "kl": 0.0009369188046548516, "learning_rate": 5.600000000000001e-06, "loss": 0.0, "reward": -0.08487707376480103, "reward_std": 0.11812922358512878, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.06925207749009132, "rewards/verse_reward_func": -0.015625, "step": 70 }, { "completion_length": 232.90625, "epoch": 0.568, "grad_norm": 0.68359375, "kl": 0.0009167685930151492, "learning_rate": 5.680000000000001e-06, "loss": 0.0, "reward": -0.09084771201014519, "reward_std": 0.13058960437774658, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0086805559694767, "rewards/no_repetition_reward_func": -0.09171576797962189, "rewards/verse_reward_func": -0.0078125, "step": 71 }, { "completion_length": 233.828125, "epoch": 0.576, "grad_norm": 0.92578125, "kl": 0.0008477937954012305, "learning_rate": 5.76e-06, "loss": 0.0, "reward": -0.1375313699245453, "reward_std": 0.11720774695277214, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.12190636992454529, "rewards/verse_reward_func": -0.015625, "step": 72 }, { "completion_length": 217.296875, "epoch": 0.584, "grad_norm": 1.0703125, "kl": 0.0009375098161399364, "learning_rate": 5.84e-06, "loss": 0.0, "reward": -0.12238041684031487, "reward_std": 0.16847951710224152, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01460597850382328, "rewards/no_repetition_reward_func": -0.105736393481493, "rewards/verse_reward_func": -0.03125, "step": 73 }, { "completion_length": 236.3125, "epoch": 0.592, "grad_norm": 0.8984375, "kl": 0.0009294936899095774, "learning_rate": 5.920000000000001e-06, "loss": 0.0, "reward": -0.08543004468083382, "reward_std": 0.1495388224720955, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01938439905643463, "rewards/no_repetition_reward_func": -0.08918944001197815, "rewards/verse_reward_func": -0.015625, "step": 74 }, { "completion_length": 237.328125, "epoch": 0.6, "grad_norm": 0.50390625, "kl": 0.0009592815476935357, "learning_rate": 6e-06, "loss": 0.0, "reward": -0.07669084891676903, "reward_std": 0.10719931125640869, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.009191176854074001, "rewards/no_repetition_reward_func": -0.0858820229768753, "rewards/verse_reward_func": 0.0, "step": 75 }, { "completion_length": 230.234375, "epoch": 0.608, "grad_norm": 1.1328125, "kl": 0.000914647476747632, "learning_rate": 6.08e-06, "loss": 0.0, "reward": -0.10938768088817596, "reward_std": 0.17462372407317162, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01690424047410488, "rewards/no_repetition_reward_func": -0.1028544194996357, "rewards/verse_reward_func": -0.0234375, "step": 76 }, { "completion_length": 229.828125, "epoch": 0.616, "grad_norm": 0.53125, "kl": 0.0009276027558371425, "learning_rate": 6.16e-06, "loss": 0.0, "reward": -0.09114603698253632, "reward_std": 0.09186182916164398, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.09114603698253632, "rewards/verse_reward_func": 0.0, "step": 77 }, { "completion_length": 225.75, "epoch": 0.624, "grad_norm": 0.5, "kl": 0.0008824745018500835, "learning_rate": 6.24e-06, "loss": 0.0, "reward": -0.09547640383243561, "reward_std": 0.1430344358086586, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.008223684504628181, "rewards/no_repetition_reward_func": -0.09588759019970894, "rewards/verse_reward_func": -0.0078125, "step": 78 }, { "completion_length": 233.234375, "epoch": 0.632, "grad_norm": 0.8984375, "kl": 0.0008006822899915278, "learning_rate": 6.320000000000001e-06, "loss": 0.0, "reward": -0.14296218380331993, "reward_std": 0.15777891874313354, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01304347813129425, "rewards/no_repetition_reward_func": -0.14819316193461418, "rewards/verse_reward_func": -0.0078125, "step": 79 }, { "completion_length": 227.453125, "epoch": 0.64, "grad_norm": 0.53125, "kl": 0.0008722004131413996, "learning_rate": 6.4000000000000006e-06, "loss": 0.0, "reward": -0.08674908056855202, "reward_std": 0.10596892610192299, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0062500000931322575, "rewards/no_repetition_reward_func": -0.08518658205866814, "rewards/verse_reward_func": -0.0078125, "step": 80 }, { "completion_length": 219.609375, "epoch": 0.648, "grad_norm": 1.0625, "kl": 0.0008951073396019638, "learning_rate": 6.48e-06, "loss": 0.0, "reward": -0.11094655469059944, "reward_std": 0.18442078679800034, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.020089285913854837, "rewards/no_repetition_reward_func": -0.12322334200143814, "rewards/verse_reward_func": -0.0078125, "step": 81 }, { "completion_length": 236.53125, "epoch": 0.656, "grad_norm": 0.86328125, "kl": 0.0008985979366116226, "learning_rate": 6.560000000000001e-06, "loss": 0.0, "reward": -0.06619586795568466, "reward_std": 0.16633900627493858, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.028722427785396576, "rewards/no_repetition_reward_func": -0.08710579946637154, "rewards/verse_reward_func": -0.0078125, "step": 82 }, { "completion_length": 236.28125, "epoch": 0.664, "grad_norm": 0.63671875, "kl": 0.0009353983332403004, "learning_rate": 6.640000000000001e-06, "loss": 0.0, "reward": -0.07310020923614502, "reward_std": 0.11749500781297684, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.015441176947206259, "rewards/no_repetition_reward_func": -0.08854138478636742, "rewards/verse_reward_func": 0.0, "step": 83 }, { "completion_length": 227.265625, "epoch": 0.672, "grad_norm": 0.546875, "kl": 0.0009250672010239214, "learning_rate": 6.72e-06, "loss": 0.0, "reward": -0.06355707719922066, "reward_std": 0.19093193486332893, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0416666679084301, "rewards/no_repetition_reward_func": -0.10522374138236046, "rewards/verse_reward_func": 0.0, "step": 84 }, { "completion_length": 234.109375, "epoch": 0.68, "grad_norm": 0.7109375, "kl": 0.0009462277521379292, "learning_rate": 6.800000000000001e-06, "loss": 0.0, "reward": -0.1116388738155365, "reward_std": 0.1485554464161396, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.1038263700902462, "rewards/verse_reward_func": -0.0078125, "step": 85 }, { "completion_length": 235.3125, "epoch": 0.688, "grad_norm": 1.21875, "kl": 0.0007775500416755676, "learning_rate": 6.88e-06, "loss": 0.0, "reward": -0.13516101241111755, "reward_std": 0.1931028515100479, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.022017045877873898, "rewards/no_repetition_reward_func": -0.14155305176973343, "rewards/verse_reward_func": -0.015625, "step": 86 }, { "completion_length": 229.953125, "epoch": 0.696, "grad_norm": 0.63671875, "kl": 0.0009285025880672038, "learning_rate": 6.9599999999999994e-06, "loss": 0.0, "reward": -0.10425681620836258, "reward_std": 0.10534143820405006, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.08081931248307228, "rewards/verse_reward_func": -0.0234375, "step": 87 }, { "completion_length": 235.4375, "epoch": 0.704, "grad_norm": 0.484375, "kl": 0.000856903032399714, "learning_rate": 7.04e-06, "loss": 0.0, "reward": -0.06883471831679344, "reward_std": 0.1821139007806778, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0387687967158854, "rewards/no_repetition_reward_func": -0.107603520154953, "rewards/verse_reward_func": 0.0, "step": 88 }, { "completion_length": 228.375, "epoch": 0.712, "grad_norm": 0.5390625, "kl": 0.0009344764985144138, "learning_rate": 7.1200000000000004e-06, "loss": 0.0, "reward": -0.0736040361225605, "reward_std": 0.11564646661281586, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.012019230984151363, "rewards/no_repetition_reward_func": -0.08562326431274414, "rewards/verse_reward_func": 0.0, "step": 89 }, { "completion_length": 235.96875, "epoch": 0.72, "grad_norm": 0.6484375, "kl": 0.0009017071570269763, "learning_rate": 7.2e-06, "loss": 0.0, "reward": -0.10645043477416039, "reward_std": 0.14924214035272598, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.015441176947206259, "rewards/no_repetition_reward_func": -0.11407911032438278, "rewards/verse_reward_func": -0.0078125, "step": 90 }, { "completion_length": 234.703125, "epoch": 0.728, "grad_norm": 0.51171875, "kl": 0.0009558142628520727, "learning_rate": 7.280000000000001e-06, "loss": 0.0, "reward": -0.06551615335047245, "reward_std": 0.1749710515141487, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03744822181761265, "rewards/no_repetition_reward_func": -0.1029643714427948, "rewards/verse_reward_func": 0.0, "step": 91 }, { "completion_length": 240.46875, "epoch": 0.736, "grad_norm": 0.8984375, "kl": 0.0010150729794986546, "learning_rate": 7.36e-06, "loss": 0.0, "reward": -0.09910141304135323, "reward_std": 0.12177912518382072, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.006009615492075682, "rewards/no_repetition_reward_func": -0.0972985327243805, "rewards/verse_reward_func": -0.0078125, "step": 92 }, { "completion_length": 247.75, "epoch": 0.744, "grad_norm": 0.390625, "kl": 0.0008480488322675228, "learning_rate": 7.44e-06, "loss": 0.0, "reward": -0.11823654919862747, "reward_std": 0.10747197270393372, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.11042404919862747, "rewards/verse_reward_func": -0.0078125, "step": 93 }, { "completion_length": 220.859375, "epoch": 0.752, "grad_norm": 0.921875, "kl": 0.000897552294190973, "learning_rate": 7.520000000000001e-06, "loss": 0.0, "reward": -0.0846867710351944, "reward_std": 0.1478750929236412, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01519097201526165, "rewards/no_repetition_reward_func": -0.0842527523636818, "rewards/verse_reward_func": -0.015625, "step": 94 }, { "completion_length": 232.703125, "epoch": 0.76, "grad_norm": 0.455078125, "kl": 0.0009288189758080989, "learning_rate": 7.6e-06, "loss": 0.0, "reward": -0.09670411422848701, "reward_std": 0.15406665951013565, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01717033004388213, "rewards/no_repetition_reward_func": -0.11387444660067558, "rewards/verse_reward_func": 0.0, "step": 95 }, { "completion_length": 214.515625, "epoch": 0.768, "grad_norm": 0.578125, "kl": 0.0009084817138500512, "learning_rate": 7.68e-06, "loss": 0.0, "reward": -0.1570199318230152, "reward_std": 0.1730286180973053, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.1257699429988861, "rewards/verse_reward_func": -0.03125, "step": 96 }, { "completion_length": 227.234375, "epoch": 0.776, "grad_norm": 0.9453125, "kl": 0.0009650715219322592, "learning_rate": 7.76e-06, "loss": 0.0, "reward": -0.12628470361232758, "reward_std": 0.11259858682751656, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.10284719616174698, "rewards/verse_reward_func": -0.0234375, "step": 97 }, { "completion_length": 235.109375, "epoch": 0.784, "grad_norm": 0.52734375, "kl": 0.000914206902962178, "learning_rate": 7.84e-06, "loss": 0.0, "reward": -0.11900394782423973, "reward_std": 0.1304343193769455, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.10337893664836884, "rewards/verse_reward_func": -0.015625, "step": 98 }, { "completion_length": 218.265625, "epoch": 0.792, "grad_norm": 1.3515625, "kl": 0.0009186547831632197, "learning_rate": 7.92e-06, "loss": 0.0, "reward": -0.10051256790757179, "reward_std": 0.11499280482530594, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.09270006790757179, "rewards/verse_reward_func": -0.0078125, "step": 99 }, { "completion_length": 228.875, "epoch": 0.8, "grad_norm": 0.9453125, "kl": 0.0009272133465856314, "learning_rate": 8.000000000000001e-06, "loss": 0.0, "reward": -0.08849799633026123, "reward_std": 0.12408233806490898, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.015701593831181526, "rewards/no_repetition_reward_func": -0.09638708457350731, "rewards/verse_reward_func": -0.0078125, "step": 100 }, { "completion_length": 227.171875, "epoch": 0.808, "grad_norm": 1.1875, "kl": 0.000911645736778155, "learning_rate": 8.08e-06, "loss": 0.0, "reward": -0.08093632012605667, "reward_std": 0.1279943659901619, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01532595744356513, "rewards/no_repetition_reward_func": -0.08063727989792824, "rewards/verse_reward_func": -0.015625, "step": 101 }, { "completion_length": 234.046875, "epoch": 0.816, "grad_norm": 0.484375, "kl": 0.0008847687277011573, "learning_rate": 8.160000000000001e-06, "loss": 0.0, "reward": -0.12249046564102173, "reward_std": 0.1900215446949005, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01603618450462818, "rewards/no_repetition_reward_func": -0.13071414455771446, "rewards/verse_reward_func": -0.0078125, "step": 102 }, { "completion_length": 219.296875, "epoch": 0.824, "grad_norm": 0.79296875, "kl": 0.0009696218767203391, "learning_rate": 8.24e-06, "loss": 0.0, "reward": -0.09280933812260628, "reward_std": 0.12301011011004448, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01311188843101263, "rewards/no_repetition_reward_func": -0.09029622375965118, "rewards/verse_reward_func": -0.015625, "step": 103 }, { "completion_length": 232.984375, "epoch": 0.832, "grad_norm": 0.625, "kl": 0.0009296589705627412, "learning_rate": 8.32e-06, "loss": 0.0, "reward": -0.07492734491825104, "reward_std": 0.09040962904691696, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.010416666977107525, "rewards/no_repetition_reward_func": -0.08534401282668114, "rewards/verse_reward_func": 0.0, "step": 104 }, { "completion_length": 229.1875, "epoch": 0.84, "grad_norm": 1.2421875, "kl": 0.0008937256061471999, "learning_rate": 8.400000000000001e-06, "loss": 0.0, "reward": -0.11696134135127068, "reward_std": 0.14167778193950653, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.006009615492075682, "rewards/no_repetition_reward_func": -0.09953345358371735, "rewards/verse_reward_func": -0.0234375, "step": 105 }, { "completion_length": 232.3125, "epoch": 0.848, "grad_norm": 0.5234375, "kl": 0.0010223403223790228, "learning_rate": 8.48e-06, "loss": 0.0, "reward": -0.06904798001050949, "reward_std": 0.18861844390630722, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03632398135960102, "rewards/no_repetition_reward_func": -0.08974695950746536, "rewards/verse_reward_func": -0.015625, "step": 106 }, { "completion_length": 232.0625, "epoch": 0.856, "grad_norm": 0.5, "kl": 0.000962819205597043, "learning_rate": 8.56e-06, "loss": 0.0, "reward": -0.0645495094358921, "reward_std": 0.09728751704096794, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0074404762126505375, "rewards/no_repetition_reward_func": -0.0719899870455265, "rewards/verse_reward_func": 0.0, "step": 107 }, { "completion_length": 225.953125, "epoch": 0.864, "grad_norm": 0.46875, "kl": 0.0009057191491592675, "learning_rate": 8.64e-06, "loss": 0.0, "reward": -0.07919973321259022, "reward_std": 0.20255421102046967, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02756358217447996, "rewards/no_repetition_reward_func": -0.0989508107304573, "rewards/verse_reward_func": -0.0078125, "step": 108 }, { "completion_length": 207.796875, "epoch": 0.872, "grad_norm": 1.125, "kl": 0.0009713373146951199, "learning_rate": 8.720000000000001e-06, "loss": 0.0, "reward": -0.07574327662587166, "reward_std": 0.11393077671527863, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.010416666977107525, "rewards/no_repetition_reward_func": -0.07053494453430176, "rewards/verse_reward_func": -0.015625, "step": 109 }, { "completion_length": 231.96875, "epoch": 0.88, "grad_norm": 0.57421875, "kl": 0.0009075571724679321, "learning_rate": 8.8e-06, "loss": 0.0, "reward": -0.036662960425019264, "reward_std": 0.21780648455023766, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.054764095693826675, "rewards/no_repetition_reward_func": -0.08361455425620079, "rewards/verse_reward_func": -0.0078125, "step": 110 }, { "completion_length": 216.515625, "epoch": 0.888, "grad_norm": 0.52734375, "kl": 0.0009229624993167818, "learning_rate": 8.880000000000001e-06, "loss": 0.0, "reward": -0.08506099134683609, "reward_std": 0.0783744752407074, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.07724848762154579, "rewards/verse_reward_func": -0.0078125, "step": 111 }, { "completion_length": 216.90625, "epoch": 0.896, "grad_norm": 0.61328125, "kl": 0.0009516240679658949, "learning_rate": 8.96e-06, "loss": 0.0, "reward": -0.08930078148841858, "reward_std": 0.11291036382317543, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.007102272938936949, "rewards/no_repetition_reward_func": -0.09640305489301682, "rewards/verse_reward_func": 0.0, "step": 112 }, { "completion_length": 225.328125, "epoch": 0.904, "grad_norm": 0.5390625, "kl": 0.0009350940817967057, "learning_rate": 9.04e-06, "loss": 0.0, "reward": -0.06717121973633766, "reward_std": 0.17519910633563995, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03810409037396312, "rewards/no_repetition_reward_func": -0.09746281057596207, "rewards/verse_reward_func": -0.0078125, "step": 113 }, { "completion_length": 233.609375, "epoch": 0.912, "grad_norm": 0.470703125, "kl": 0.0009594852745067328, "learning_rate": 9.12e-06, "loss": 0.0, "reward": -0.06004701554775238, "reward_std": 0.12479560449719429, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.024594906717538834, "rewards/no_repetition_reward_func": -0.08464191854000092, "rewards/verse_reward_func": 0.0, "step": 114 }, { "completion_length": 245.671875, "epoch": 0.92, "grad_norm": 0.52734375, "kl": 0.0008760744822211564, "learning_rate": 9.2e-06, "loss": 0.0, "reward": -0.12967995926737785, "reward_std": 0.1694769337773323, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.006793478038161993, "rewards/no_repetition_reward_func": -0.12084844708442688, "rewards/verse_reward_func": -0.015625, "step": 115 }, { "completion_length": 232.703125, "epoch": 0.928, "grad_norm": 1.40625, "kl": 0.0009645233221817762, "learning_rate": 9.28e-06, "loss": 0.0, "reward": -0.06748884171247482, "reward_std": 0.0962454304099083, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.008223684504628181, "rewards/no_repetition_reward_func": -0.0679000299423933, "rewards/verse_reward_func": -0.0078125, "step": 116 }, { "completion_length": 214.53125, "epoch": 0.936, "grad_norm": 1.296875, "kl": 0.0009821033745538443, "learning_rate": 9.36e-06, "loss": 0.0, "reward": -0.10846579447388649, "reward_std": 0.15414345264434814, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02384868450462818, "rewards/no_repetition_reward_func": -0.10887697711586952, "rewards/verse_reward_func": -0.0234375, "step": 117 }, { "completion_length": 227.078125, "epoch": 0.944, "grad_norm": 0.72265625, "kl": 0.00091310910647735, "learning_rate": 9.44e-06, "loss": 0.0, "reward": -0.11495331302285194, "reward_std": 0.12077228724956512, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.09151580929756165, "rewards/verse_reward_func": -0.0234375, "step": 118 }, { "completion_length": 221.15625, "epoch": 0.952, "grad_norm": 0.81640625, "kl": 0.0009164984803646803, "learning_rate": 9.52e-06, "loss": 0.0, "reward": -0.10485706105828285, "reward_std": 0.17042756080627441, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.017414860427379608, "rewards/no_repetition_reward_func": -0.09102192148566246, "rewards/verse_reward_func": -0.03125, "step": 119 }, { "completion_length": 225.140625, "epoch": 0.96, "grad_norm": 0.82421875, "kl": 0.0009031399386003613, "learning_rate": 9.600000000000001e-06, "loss": 0.0, "reward": -0.10434608533978462, "reward_std": 0.1362314410507679, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0078125, "rewards/no_repetition_reward_func": -0.09653358533978462, "rewards/verse_reward_func": -0.015625, "step": 120 }, { "completion_length": 231.59375, "epoch": 0.968, "grad_norm": 0.90234375, "kl": 0.0009503560431767255, "learning_rate": 9.68e-06, "loss": 0.0, "reward": -0.09547318518161774, "reward_std": 0.1317606195807457, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.008223684504628181, "rewards/no_repetition_reward_func": -0.09588436782360077, "rewards/verse_reward_func": -0.0078125, "step": 121 }, { "completion_length": 238.9375, "epoch": 0.976, "grad_norm": 0.52734375, "kl": 0.0008562441798858345, "learning_rate": 9.760000000000001e-06, "loss": 0.0, "reward": -0.028793351724743843, "reward_std": 0.28765882551670074, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.09007394686341286, "rewards/no_repetition_reward_func": -0.11105479300022125, "rewards/verse_reward_func": -0.0078125, "step": 122 }, { "completion_length": 239.09375, "epoch": 0.984, "grad_norm": 0.5078125, "kl": 0.0009293340553995222, "learning_rate": 9.84e-06, "loss": 0.0, "reward": -0.07063544541597366, "reward_std": 0.07468729466199875, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.07063544541597366, "rewards/verse_reward_func": 0.0, "step": 123 }, { "completion_length": 225.125, "epoch": 0.992, "grad_norm": 0.51171875, "kl": 0.0009349569445475936, "learning_rate": 9.92e-06, "loss": 0.0, "reward": -0.09636906534433365, "reward_std": 0.14432670921087265, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01566416071727872, "rewards/no_repetition_reward_func": -0.11203322559595108, "rewards/verse_reward_func": 0.0, "step": 124 }, { "completion_length": 252.3125, "epoch": 1.0, "grad_norm": 0.51171875, "kl": 0.0009056414710357785, "learning_rate": 1e-05, "loss": 0.0, "reward": -0.11089875921607018, "reward_std": 0.12123321741819382, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.11089875549077988, "rewards/verse_reward_func": 0.0, "step": 125 }, { "completion_length": 226.859375, "epoch": 1.008, "grad_norm": 0.8203125, "kl": 0.0008826199627947062, "learning_rate": 1.008e-05, "loss": 0.0, "reward": -0.09451838955283165, "reward_std": 0.136559896171093, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.015252976212650537, "rewards/no_repetition_reward_func": -0.09414637088775635, "rewards/verse_reward_func": -0.015625, "step": 126 }, { "completion_length": 230.484375, "epoch": 1.016, "grad_norm": 0.5390625, "kl": 0.0009819735423661768, "learning_rate": 1.016e-05, "loss": 0.0, "reward": -0.09354683384299278, "reward_std": 0.0975259467959404, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.08573432639241219, "rewards/verse_reward_func": -0.0078125, "step": 127 }, { "completion_length": 244.578125, "epoch": 1.024, "grad_norm": 0.66796875, "kl": 0.0010077067418023944, "learning_rate": 1.024e-05, "loss": 0.0, "reward": -0.08196036890149117, "reward_std": 0.09506024420261383, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0078125, "rewards/no_repetition_reward_func": -0.08977286517620087, "rewards/verse_reward_func": 0.0, "step": 128 }, { "completion_length": 241.671875, "epoch": 1.032, "grad_norm": 0.734375, "kl": 0.0009059908043127507, "learning_rate": 1.0320000000000001e-05, "loss": 0.0, "reward": -0.11128167808055878, "reward_std": 0.10646028816699982, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.09565667435526848, "rewards/verse_reward_func": -0.015625, "step": 129 }, { "completion_length": 238.953125, "epoch": 1.04, "grad_norm": 0.60546875, "kl": 0.0009497147984802723, "learning_rate": 1.04e-05, "loss": 0.0, "reward": -0.11473039537668228, "reward_std": 0.10996419563889503, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.10691789165139198, "rewards/verse_reward_func": -0.0078125, "step": 130 }, { "completion_length": 239.078125, "epoch": 1.048, "grad_norm": 0.671875, "kl": 0.0009714066691230983, "learning_rate": 1.0480000000000001e-05, "loss": 0.0, "reward": -0.07778122276067734, "reward_std": 0.11016635224223137, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.013227513059973717, "rewards/no_repetition_reward_func": -0.0910087376832962, "rewards/verse_reward_func": 0.0, "step": 131 }, { "completion_length": 231.953125, "epoch": 1.056, "grad_norm": 0.54296875, "kl": 0.0009650475694797933, "learning_rate": 1.056e-05, "loss": 0.0, "reward": -0.1207541897892952, "reward_std": 0.1323094740509987, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.1051291860640049, "rewards/verse_reward_func": -0.015625, "step": 132 }, { "completion_length": 240.625, "epoch": 1.064, "grad_norm": 0.734375, "kl": 0.0009036169794853777, "learning_rate": 1.064e-05, "loss": 0.0, "reward": -0.07731092721223831, "reward_std": 0.21987367421388626, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0434027761220932, "rewards/no_repetition_reward_func": -0.10508871451020241, "rewards/verse_reward_func": -0.015625, "step": 133 }, { "completion_length": 237.0, "epoch": 1.072, "grad_norm": 0.46484375, "kl": 0.0010101073421537876, "learning_rate": 1.072e-05, "loss": 0.0, "reward": -0.07436153292655945, "reward_std": 0.07278687506914139, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.07436153292655945, "rewards/verse_reward_func": 0.0, "step": 134 }, { "completion_length": 225.71875, "epoch": 1.08, "grad_norm": 0.609375, "kl": 0.0009602317295502871, "learning_rate": 1.08e-05, "loss": 0.0, "reward": -0.07882250845432281, "reward_std": 0.16665278747677803, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02130681835114956, "rewards/no_repetition_reward_func": -0.09231682866811752, "rewards/verse_reward_func": -0.0078125, "step": 135 }, { "completion_length": 228.453125, "epoch": 1.088, "grad_norm": 0.486328125, "kl": 0.0009719938971102238, "learning_rate": 1.088e-05, "loss": 0.0, "reward": -0.09162363782525063, "reward_std": 0.10296135395765305, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.09162363782525063, "rewards/verse_reward_func": 0.0, "step": 136 }, { "completion_length": 228.671875, "epoch": 1.096, "grad_norm": 0.578125, "kl": 0.0010191774345003068, "learning_rate": 1.096e-05, "loss": 0.0, "reward": -0.09341166540980339, "reward_std": 0.1734352931380272, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02384868450462818, "rewards/no_repetition_reward_func": -0.10163535177707672, "rewards/verse_reward_func": -0.015625, "step": 137 }, { "completion_length": 228.59375, "epoch": 1.104, "grad_norm": 0.498046875, "kl": 0.0009971531690098345, "learning_rate": 1.1040000000000001e-05, "loss": 0.0, "reward": -0.0631587766110897, "reward_std": 0.11596111953258514, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.015625, "rewards/no_repetition_reward_func": -0.07097127288579941, "rewards/verse_reward_func": -0.0078125, "step": 138 }, { "completion_length": 221.640625, "epoch": 1.112, "grad_norm": 0.75390625, "kl": 0.0009975798311643302, "learning_rate": 1.112e-05, "loss": 0.0, "reward": -0.09898503124713898, "reward_std": 0.1850246638059616, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.023065476212650537, "rewards/no_repetition_reward_func": -0.11423800885677338, "rewards/verse_reward_func": -0.0078125, "step": 139 }, { "completion_length": 235.203125, "epoch": 1.12, "grad_norm": 0.8125, "kl": 0.0009761290275491774, "learning_rate": 1.1200000000000001e-05, "loss": 0.0, "reward": -0.10496794432401657, "reward_std": 0.10986243560910225, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.09715544432401657, "rewards/verse_reward_func": -0.0078125, "step": 140 }, { "completion_length": 222.609375, "epoch": 1.1280000000000001, "grad_norm": 0.6953125, "kl": 0.0009398605907335877, "learning_rate": 1.128e-05, "loss": 0.0, "reward": -0.1127433069050312, "reward_std": 0.13825493305921555, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.0971183106303215, "rewards/verse_reward_func": -0.015625, "step": 141 }, { "completion_length": 232.375, "epoch": 1.1360000000000001, "grad_norm": 0.56640625, "kl": 0.0009225225076079369, "learning_rate": 1.1360000000000001e-05, "loss": 0.0, "reward": -0.09707474708557129, "reward_std": 0.19106215238571167, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0265522887930274, "rewards/no_repetition_reward_func": -0.11581454053521156, "rewards/verse_reward_func": -0.0078125, "step": 142 }, { "completion_length": 228.59375, "epoch": 1.144, "grad_norm": 0.494140625, "kl": 0.0010526655241847038, "learning_rate": 1.144e-05, "loss": 0.0, "reward": -0.04885440971702337, "reward_std": 0.15387418493628502, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03427618695423007, "rewards/no_repetition_reward_func": -0.0831305980682373, "rewards/verse_reward_func": 0.0, "step": 143 }, { "completion_length": 228.21875, "epoch": 1.152, "grad_norm": 1.171875, "kl": 0.0010413251584395766, "learning_rate": 1.152e-05, "loss": 0.0, "reward": -0.09335272200405598, "reward_std": 0.21751710772514343, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03376520425081253, "rewards/no_repetition_reward_func": -0.09586792625486851, "rewards/verse_reward_func": -0.03125, "step": 144 }, { "completion_length": 219.875, "epoch": 1.16, "grad_norm": 0.515625, "kl": 0.001026755548082292, "learning_rate": 1.16e-05, "loss": 0.0, "reward": -0.10019876435399055, "reward_std": 0.13528341427445412, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.09238626807928085, "rewards/verse_reward_func": -0.0078125, "step": 145 }, { "completion_length": 245.40625, "epoch": 1.168, "grad_norm": 1.078125, "kl": 0.000992421293631196, "learning_rate": 1.168e-05, "loss": 0.0, "reward": -0.12389492988586426, "reward_std": 0.11891792714595795, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.11608243361115456, "rewards/verse_reward_func": -0.0078125, "step": 146 }, { "completion_length": 242.515625, "epoch": 1.176, "grad_norm": 0.484375, "kl": 0.0010139758232980967, "learning_rate": 1.1760000000000001e-05, "loss": 0.0, "reward": -0.09761396795511246, "reward_std": 0.11721779406070709, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.09761397168040276, "rewards/verse_reward_func": 0.0, "step": 147 }, { "completion_length": 220.859375, "epoch": 1.184, "grad_norm": 0.54296875, "kl": 0.001036174362525344, "learning_rate": 1.1840000000000002e-05, "loss": 0.0, "reward": -0.08542966097593307, "reward_std": 0.1307716928422451, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.012181408703327179, "rewards/no_repetition_reward_func": -0.09761107712984085, "rewards/verse_reward_func": 0.0, "step": 148 }, { "completion_length": 245.9375, "epoch": 1.192, "grad_norm": 0.45703125, "kl": 0.0009866940090432763, "learning_rate": 1.1920000000000001e-05, "loss": 0.0, "reward": -0.09207209199666977, "reward_std": 0.11661773920059204, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.009765625, "rewards/no_repetition_reward_func": -0.10183771699666977, "rewards/verse_reward_func": 0.0, "step": 149 }, { "completion_length": 234.703125, "epoch": 1.2, "grad_norm": 0.486328125, "kl": 0.0010041765635833144, "learning_rate": 1.2e-05, "loss": 0.0, "reward": -0.08135378733277321, "reward_std": 0.12183233350515366, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01629344979301095, "rewards/no_repetition_reward_func": -0.09764724224805832, "rewards/verse_reward_func": 0.0, "step": 150 }, { "completion_length": 231.875, "epoch": 1.208, "grad_norm": 0.86328125, "kl": 0.001173751661553979, "learning_rate": 1.2080000000000001e-05, "loss": 0.0, "reward": -0.056069184094667435, "reward_std": 0.17241568863391876, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.029468202963471413, "rewards/no_repetition_reward_func": -0.0699123851954937, "rewards/verse_reward_func": -0.015625, "step": 151 }, { "completion_length": 238.75, "epoch": 1.216, "grad_norm": 0.73046875, "kl": 0.0010443408973515034, "learning_rate": 1.216e-05, "loss": 0.0, "reward": -0.06567678228020668, "reward_std": 0.1872636303305626, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0315126059576869, "rewards/no_repetition_reward_func": -0.08937688916921616, "rewards/verse_reward_func": -0.0078125, "step": 152 }, { "completion_length": 214.015625, "epoch": 1.224, "grad_norm": 0.7734375, "kl": 0.0010671618510968983, "learning_rate": 1.224e-05, "loss": 0.0, "reward": -0.08186103031039238, "reward_std": 0.10873320326209068, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.006793478038161993, "rewards/no_repetition_reward_func": -0.08084200695157051, "rewards/verse_reward_func": -0.0078125, "step": 153 }, { "completion_length": 221.46875, "epoch": 1.232, "grad_norm": 0.7890625, "kl": 0.0010688489419408143, "learning_rate": 1.232e-05, "loss": 0.0, "reward": -0.09862119890749454, "reward_std": 0.1351141482591629, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.006793478038161993, "rewards/no_repetition_reward_func": -0.09760217368602753, "rewards/verse_reward_func": -0.0078125, "step": 154 }, { "completion_length": 231.3125, "epoch": 1.24, "grad_norm": 0.6328125, "kl": 0.0010459377081133425, "learning_rate": 1.24e-05, "loss": 0.0, "reward": -0.07467435300350189, "reward_std": 0.21781045943498611, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0466806311160326, "rewards/no_repetition_reward_func": -0.11354248225688934, "rewards/verse_reward_func": -0.0078125, "step": 155 }, { "completion_length": 237.515625, "epoch": 1.248, "grad_norm": 0.6328125, "kl": 0.00106938456883654, "learning_rate": 1.248e-05, "loss": 0.0, "reward": -0.05984549596905708, "reward_std": 0.1556803286075592, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.030230978038161993, "rewards/no_repetition_reward_func": -0.09007647633552551, "rewards/verse_reward_func": 0.0, "step": 156 }, { "completion_length": 228.421875, "epoch": 1.256, "grad_norm": 0.8515625, "kl": 0.00117484072688967, "learning_rate": 1.256e-05, "loss": 0.0, "reward": -0.025189975276589394, "reward_std": 0.1884380355477333, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.05628685560077429, "rewards/no_repetition_reward_func": -0.07366432994604111, "rewards/verse_reward_func": -0.0078125, "step": 157 }, { "completion_length": 237.953125, "epoch": 1.264, "grad_norm": 0.490234375, "kl": 0.0011019533849321306, "learning_rate": 1.2640000000000003e-05, "loss": 0.0, "reward": -0.04384150542318821, "reward_std": 0.13314421847462654, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.025091582909226418, "rewards/no_repetition_reward_func": -0.06893308833241463, "rewards/verse_reward_func": 0.0, "step": 158 }, { "completion_length": 237.5, "epoch": 1.272, "grad_norm": 0.55859375, "kl": 0.0010649362811818719, "learning_rate": 1.2720000000000002e-05, "loss": 0.0, "reward": -0.08817163109779358, "reward_std": 0.08343742787837982, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.08817163482308388, "rewards/verse_reward_func": 0.0, "step": 159 }, { "completion_length": 221.28125, "epoch": 1.28, "grad_norm": 0.4921875, "kl": 0.0010974335600621998, "learning_rate": 1.2800000000000001e-05, "loss": 0.0, "reward": -0.07976796105504036, "reward_std": 0.14786510914564133, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.022786458488553762, "rewards/no_repetition_reward_func": -0.08692942187190056, "rewards/verse_reward_func": -0.015625, "step": 160 }, { "completion_length": 228.078125, "epoch": 1.288, "grad_norm": 0.5859375, "kl": 0.0010233705397695303, "learning_rate": 1.288e-05, "loss": 0.0, "reward": -0.12897229194641113, "reward_std": 0.17166408896446228, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0065104165114462376, "rewards/no_repetition_reward_func": -0.12767020985484123, "rewards/verse_reward_func": -0.0078125, "step": 161 }, { "completion_length": 226.21875, "epoch": 1.296, "grad_norm": 0.57421875, "kl": 0.0011856920318678021, "learning_rate": 1.296e-05, "loss": 0.0, "reward": -0.08636101335287094, "reward_std": 0.14799706265330315, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.014690171927213669, "rewards/no_repetition_reward_func": -0.10105118155479431, "rewards/verse_reward_func": 0.0, "step": 162 }, { "completion_length": 221.34375, "epoch": 1.304, "grad_norm": 0.8828125, "kl": 0.001230894005857408, "learning_rate": 1.3039999999999999e-05, "loss": 0.0, "reward": -0.04063412547111511, "reward_std": 0.23765011504292488, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.05461516231298447, "rewards/no_repetition_reward_func": -0.07181178964674473, "rewards/verse_reward_func": -0.0234375, "step": 163 }, { "completion_length": 225.671875, "epoch": 1.312, "grad_norm": 2.375, "kl": 0.0028623025864362717, "learning_rate": 1.3120000000000001e-05, "loss": 0.0001, "reward": -0.05923652462661266, "reward_std": 0.09846559539437294, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0078125, "rewards/no_repetition_reward_func": -0.05923652462661266, "rewards/verse_reward_func": -0.0078125, "step": 164 }, { "completion_length": 231.25, "epoch": 1.32, "grad_norm": 0.99609375, "kl": 0.0012457778793759644, "learning_rate": 1.32e-05, "loss": 0.0, "reward": -0.08726520836353302, "reward_std": 0.12036626413464546, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0078125, "rewards/no_repetition_reward_func": -0.07164020836353302, "rewards/verse_reward_func": -0.0234375, "step": 165 }, { "completion_length": 218.4375, "epoch": 1.328, "grad_norm": 0.74609375, "kl": 0.0012068189680576324, "learning_rate": 1.3280000000000002e-05, "loss": 0.0, "reward": -0.09236433357000351, "reward_std": 0.17130208015441895, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.015625, "rewards/no_repetition_reward_func": -0.09236433357000351, "rewards/verse_reward_func": -0.015625, "step": 166 }, { "completion_length": 220.78125, "epoch": 1.336, "grad_norm": 0.5625, "kl": 0.0012252767919562757, "learning_rate": 1.336e-05, "loss": 0.0, "reward": -0.0686846412718296, "reward_std": 0.13090770691633224, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01690424047410488, "rewards/no_repetition_reward_func": -0.08558888360857964, "rewards/verse_reward_func": 0.0, "step": 167 }, { "completion_length": 225.09375, "epoch": 1.3439999999999999, "grad_norm": 0.765625, "kl": 0.0010955975158140063, "learning_rate": 1.344e-05, "loss": 0.0, "reward": -0.09006313607096672, "reward_std": 0.08349091559648514, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.08225063793361187, "rewards/verse_reward_func": -0.0078125, "step": 168 }, { "completion_length": 226.828125, "epoch": 1.3519999999999999, "grad_norm": 0.46875, "kl": 0.0011617571581155062, "learning_rate": 1.352e-05, "loss": 0.0, "reward": -0.11815259233117104, "reward_std": 0.12507855147123337, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.11815259605646133, "rewards/verse_reward_func": 0.0, "step": 169 }, { "completion_length": 223.171875, "epoch": 1.3599999999999999, "grad_norm": 0.55859375, "kl": 0.0012259799987077713, "learning_rate": 1.3600000000000002e-05, "loss": 0.0, "reward": -0.07549700699746609, "reward_std": 0.1506355106830597, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.021645022090524435, "rewards/no_repetition_reward_func": -0.08932952955365181, "rewards/verse_reward_func": -0.0078125, "step": 170 }, { "completion_length": 235.34375, "epoch": 1.3679999999999999, "grad_norm": 0.98828125, "kl": 0.0012528767110779881, "learning_rate": 1.3680000000000001e-05, "loss": 0.0001, "reward": -0.10926545038819313, "reward_std": 0.1457849256694317, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0074404762126505375, "rewards/no_repetition_reward_func": -0.09326842427253723, "rewards/verse_reward_func": -0.0234375, "step": 171 }, { "completion_length": 234.078125, "epoch": 1.376, "grad_norm": 0.59765625, "kl": 0.0013281747815199196, "learning_rate": 1.376e-05, "loss": 0.0001, "reward": -0.06594929471611977, "reward_std": 0.13678889721632004, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.014880952425301075, "rewards/no_repetition_reward_func": -0.08083024993538857, "rewards/verse_reward_func": 0.0, "step": 172 }, { "completion_length": 234.671875, "epoch": 1.384, "grad_norm": 0.8515625, "kl": 0.0012164909858256578, "learning_rate": 1.384e-05, "loss": 0.0, "reward": -0.043765537440776825, "reward_std": 0.14428378641605377, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02517583128064871, "rewards/no_repetition_reward_func": -0.06112887151539326, "rewards/verse_reward_func": -0.0078125, "step": 173 }, { "completion_length": 242.8125, "epoch": 1.392, "grad_norm": 0.6171875, "kl": 0.001197631238028407, "learning_rate": 1.3919999999999999e-05, "loss": 0.0, "reward": -0.0848824679851532, "reward_std": 0.18079277873039246, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.030269484966993332, "rewards/no_repetition_reward_func": -0.11515195667743683, "rewards/verse_reward_func": 0.0, "step": 174 }, { "completion_length": 223.140625, "epoch": 1.4, "grad_norm": 0.66015625, "kl": 0.0013319969293661416, "learning_rate": 1.4000000000000001e-05, "loss": 0.0001, "reward": -0.06108429096639156, "reward_std": 0.1944085955619812, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03651556745171547, "rewards/no_repetition_reward_func": -0.08197485841810703, "rewards/verse_reward_func": -0.015625, "step": 175 }, { "completion_length": 227.203125, "epoch": 1.408, "grad_norm": 0.609375, "kl": 0.0012923061731271446, "learning_rate": 1.408e-05, "loss": 0.0001, "reward": -0.034936342388391495, "reward_std": 0.25394658744335175, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.06874222215265036, "rewards/no_repetition_reward_func": -0.09586606547236443, "rewards/verse_reward_func": -0.0078125, "step": 176 }, { "completion_length": 222.828125, "epoch": 1.416, "grad_norm": 0.6875, "kl": 0.001278650015592575, "learning_rate": 1.4160000000000002e-05, "loss": 0.0001, "reward": -0.05875513143837452, "reward_std": 0.15186850726604462, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02994791604578495, "rewards/no_repetition_reward_func": -0.08089054375886917, "rewards/verse_reward_func": -0.0078125, "step": 177 }, { "completion_length": 226.515625, "epoch": 1.424, "grad_norm": 0.82421875, "kl": 0.0013883021310903132, "learning_rate": 1.4240000000000001e-05, "loss": 0.0001, "reward": -0.10252957418560982, "reward_std": 0.13616624474525452, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.006793478038161993, "rewards/no_repetition_reward_func": -0.09369805827736855, "rewards/verse_reward_func": -0.015625, "step": 178 }, { "completion_length": 238.609375, "epoch": 1.432, "grad_norm": 0.498046875, "kl": 0.0013281579595059156, "learning_rate": 1.432e-05, "loss": 0.0001, "reward": -0.1059655249118805, "reward_std": 0.10343372449278831, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.1059655211865902, "rewards/verse_reward_func": 0.0, "step": 179 }, { "completion_length": 235.40625, "epoch": 1.44, "grad_norm": 0.4453125, "kl": 0.0013788872165605426, "learning_rate": 1.44e-05, "loss": 0.0001, "reward": -0.05832286272197962, "reward_std": 0.16571523994207382, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03222730755805969, "rewards/no_repetition_reward_func": -0.09055017307400703, "rewards/verse_reward_func": 0.0, "step": 180 }, { "completion_length": 226.0625, "epoch": 1.448, "grad_norm": 0.92578125, "kl": 0.0014654535334557295, "learning_rate": 1.4480000000000002e-05, "loss": 0.0001, "reward": 0.02028096467256546, "reward_std": 0.4557221755385399, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.13703208323568106, "rewards/no_repetition_reward_func": -0.10112613067030907, "rewards/verse_reward_func": -0.015625, "step": 181 }, { "completion_length": 215.140625, "epoch": 1.456, "grad_norm": 0.6328125, "kl": 0.0014458707300946116, "learning_rate": 1.4560000000000001e-05, "loss": 0.0001, "reward": -0.07558359950780869, "reward_std": 0.1052810475230217, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.005787036847323179, "rewards/no_repetition_reward_func": -0.07355813682079315, "rewards/verse_reward_func": -0.0078125, "step": 182 }, { "completion_length": 222.453125, "epoch": 1.464, "grad_norm": 0.93359375, "kl": 0.001408717012964189, "learning_rate": 1.464e-05, "loss": 0.0001, "reward": -0.07697185128927231, "reward_std": 0.11877438426017761, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0086805559694767, "rewards/no_repetition_reward_func": -0.07002740353345871, "rewards/verse_reward_func": -0.015625, "step": 183 }, { "completion_length": 241.171875, "epoch": 1.472, "grad_norm": 0.498046875, "kl": 0.001281659584492445, "learning_rate": 1.472e-05, "loss": 0.0001, "reward": -0.09598147124052048, "reward_std": 0.12714635580778122, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.009191176854074001, "rewards/no_repetition_reward_func": -0.08954765647649765, "rewards/verse_reward_func": -0.015625, "step": 184 }, { "completion_length": 238.09375, "epoch": 1.48, "grad_norm": 0.51171875, "kl": 0.0013379572774283588, "learning_rate": 1.48e-05, "loss": 0.0001, "reward": -0.1140379086136818, "reward_std": 0.12429556995630264, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.1140379086136818, "rewards/verse_reward_func": 0.0, "step": 185 }, { "completion_length": 237.1875, "epoch": 1.488, "grad_norm": 0.5703125, "kl": 0.0013217166997492313, "learning_rate": 1.488e-05, "loss": 0.0001, "reward": -0.13353200629353523, "reward_std": 0.12724728882312775, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.12571951001882553, "rewards/verse_reward_func": -0.0078125, "step": 186 }, { "completion_length": 240.25, "epoch": 1.496, "grad_norm": 0.447265625, "kl": 0.0013371336390264332, "learning_rate": 1.4960000000000002e-05, "loss": 0.0001, "reward": -0.09627507254481316, "reward_std": 0.14540337771177292, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01603618450462818, "rewards/no_repetition_reward_func": -0.11231125891208649, "rewards/verse_reward_func": 0.0, "step": 187 }, { "completion_length": 238.28125, "epoch": 1.504, "grad_norm": 0.46875, "kl": 0.001424064626917243, "learning_rate": 1.5040000000000002e-05, "loss": 0.0001, "reward": -0.0930992029607296, "reward_std": 0.12443128600716591, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.009765625, "rewards/no_repetition_reward_func": -0.0950523279607296, "rewards/verse_reward_func": -0.0078125, "step": 188 }, { "completion_length": 239.90625, "epoch": 1.512, "grad_norm": 0.5234375, "kl": 0.0014646631316281855, "learning_rate": 1.5120000000000001e-05, "loss": 0.0001, "reward": -0.0662461668252945, "reward_std": 0.10939930006861687, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.013020833022892475, "rewards/no_repetition_reward_func": -0.07926700636744499, "rewards/verse_reward_func": 0.0, "step": 189 }, { "completion_length": 224.015625, "epoch": 1.52, "grad_norm": 0.6796875, "kl": 0.0015595549484714866, "learning_rate": 1.52e-05, "loss": 0.0001, "reward": -0.05768076702952385, "reward_std": 0.1396203115582466, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.027733649592846632, "rewards/no_repetition_reward_func": -0.07760191708803177, "rewards/verse_reward_func": -0.0078125, "step": 190 }, { "completion_length": 225.46875, "epoch": 1.528, "grad_norm": 0.6484375, "kl": 0.0015949244843795896, "learning_rate": 1.528e-05, "loss": 0.0001, "reward": -0.0972504299134016, "reward_std": 0.20609425008296967, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0231584832072258, "rewards/no_repetition_reward_func": -0.09697141125798225, "rewards/verse_reward_func": -0.0234375, "step": 191 }, { "completion_length": 238.140625, "epoch": 1.536, "grad_norm": 0.5234375, "kl": 0.0013081540819257498, "learning_rate": 1.536e-05, "loss": 0.0001, "reward": -0.10008811205625534, "reward_std": 0.15643932670354843, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.014880952425301075, "rewards/no_repetition_reward_func": -0.09934407100081444, "rewards/verse_reward_func": -0.015625, "step": 192 }, { "completion_length": 234.734375, "epoch": 1.544, "grad_norm": 0.470703125, "kl": 0.0014590376522392035, "learning_rate": 1.544e-05, "loss": 0.0001, "reward": -0.06860917247831821, "reward_std": 0.14759846031665802, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.023104636929929256, "rewards/no_repetition_reward_func": -0.0917138084769249, "rewards/verse_reward_func": 0.0, "step": 193 }, { "completion_length": 227.8125, "epoch": 1.552, "grad_norm": 0.63671875, "kl": 0.0016707320464774966, "learning_rate": 1.552e-05, "loss": 0.0001, "reward": -0.08563121780753136, "reward_std": 0.126017514616251, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.008223684504628181, "rewards/no_repetition_reward_func": -0.08604240044951439, "rewards/verse_reward_func": -0.0078125, "step": 194 }, { "completion_length": 232.9375, "epoch": 1.56, "grad_norm": 0.515625, "kl": 0.0015315399505198002, "learning_rate": 1.56e-05, "loss": 0.0001, "reward": -0.06762224063277245, "reward_std": 0.17968397587537766, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03491950733587146, "rewards/no_repetition_reward_func": -0.10254175588488579, "rewards/verse_reward_func": 0.0, "step": 195 }, { "completion_length": 244.203125, "epoch": 1.568, "grad_norm": 0.44921875, "kl": 0.001529368688352406, "learning_rate": 1.568e-05, "loss": 0.0001, "reward": -0.0804674793034792, "reward_std": 0.1211073026061058, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01864035055041313, "rewards/no_repetition_reward_func": -0.09910783171653748, "rewards/verse_reward_func": 0.0, "step": 196 }, { "completion_length": 228.453125, "epoch": 1.576, "grad_norm": 0.96484375, "kl": 0.0016282657161355019, "learning_rate": 1.5759999999999998e-05, "loss": 0.0001, "reward": -0.07252757996320724, "reward_std": 0.1535002589225769, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0320728300139308, "rewards/no_repetition_reward_func": -0.08897540718317032, "rewards/verse_reward_func": -0.015625, "step": 197 }, { "completion_length": 233.765625, "epoch": 1.584, "grad_norm": 0.60546875, "kl": 0.0016328762285411358, "learning_rate": 1.584e-05, "loss": 0.0001, "reward": -0.08456896245479584, "reward_std": 0.1391274891793728, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.015984654426574707, "rewards/no_repetition_reward_func": -0.09274111688137054, "rewards/verse_reward_func": -0.0078125, "step": 198 }, { "completion_length": 232.546875, "epoch": 1.592, "grad_norm": 0.546875, "kl": 0.0017947786836884916, "learning_rate": 1.592e-05, "loss": 0.0001, "reward": -0.07668091543018818, "reward_std": 0.14250988513231277, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.016947751864790916, "rewards/no_repetition_reward_func": -0.0936286672949791, "rewards/verse_reward_func": 0.0, "step": 199 }, { "completion_length": 240.796875, "epoch": 1.6, "grad_norm": 0.494140625, "kl": 0.0016833654954098165, "learning_rate": 1.6000000000000003e-05, "loss": 0.0001, "reward": -0.0353618860244751, "reward_std": 0.1296749785542488, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0326507268473506, "rewards/no_repetition_reward_func": -0.06801261007785797, "rewards/verse_reward_func": 0.0, "step": 200 }, { "completion_length": 221.671875, "epoch": 1.608, "grad_norm": 0.46875, "kl": 0.0016492042341269553, "learning_rate": 1.6080000000000002e-05, "loss": 0.0001, "reward": -0.054715532809495926, "reward_std": 0.13639206811785698, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02354964194819331, "rewards/no_repetition_reward_func": -0.07826517522335052, "rewards/verse_reward_func": 0.0, "step": 201 }, { "completion_length": 234.59375, "epoch": 1.616, "grad_norm": 0.62109375, "kl": 0.001671292760875076, "learning_rate": 1.616e-05, "loss": 0.0001, "reward": -0.035621643997728825, "reward_std": 0.14181481674313545, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02775493450462818, "rewards/no_repetition_reward_func": -0.05556407570838928, "rewards/verse_reward_func": -0.0078125, "step": 202 }, { "completion_length": 232.65625, "epoch": 1.624, "grad_norm": 0.5, "kl": 0.0018911936203949153, "learning_rate": 1.624e-05, "loss": 0.0001, "reward": -0.08518489450216293, "reward_std": 0.11578556895256042, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.009191176854074001, "rewards/no_repetition_reward_func": -0.09437607601284981, "rewards/verse_reward_func": 0.0, "step": 203 }, { "completion_length": 217.671875, "epoch": 1.6320000000000001, "grad_norm": 0.58203125, "kl": 0.0017347463290207088, "learning_rate": 1.6320000000000003e-05, "loss": 0.0001, "reward": -0.07767417095601559, "reward_std": 0.10600043088197708, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.006793478038161993, "rewards/no_repetition_reward_func": -0.06103014945983887, "rewards/verse_reward_func": -0.0234375, "step": 204 }, { "completion_length": 232.0625, "epoch": 1.6400000000000001, "grad_norm": 0.48046875, "kl": 0.0016477849567309022, "learning_rate": 1.6400000000000002e-05, "loss": 0.0001, "reward": -0.09944234788417816, "reward_std": 0.10641930624842644, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.09944234043359756, "rewards/verse_reward_func": 0.0, "step": 205 }, { "completion_length": 225.1875, "epoch": 1.6480000000000001, "grad_norm": 0.9375, "kl": 0.0017070864560082555, "learning_rate": 1.648e-05, "loss": 0.0001, "reward": -0.11022284254431725, "reward_std": 0.3012443482875824, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0524553582072258, "rewards/no_repetition_reward_func": -0.13142820447683334, "rewards/verse_reward_func": -0.03125, "step": 206 }, { "completion_length": 232.984375, "epoch": 1.6560000000000001, "grad_norm": 1.4453125, "kl": 0.0019166275160387158, "learning_rate": 1.656e-05, "loss": 0.0001, "reward": -0.11848941817879677, "reward_std": 0.12484071776270866, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.08723941445350647, "rewards/verse_reward_func": -0.03125, "step": 207 }, { "completion_length": 230.515625, "epoch": 1.6640000000000001, "grad_norm": 0.91796875, "kl": 0.002084849402308464, "learning_rate": 1.664e-05, "loss": 0.0001, "reward": -0.09387412667274475, "reward_std": 0.1591917797923088, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02576669305562973, "rewards/no_repetition_reward_func": -0.09620331972837448, "rewards/verse_reward_func": -0.0234375, "step": 208 }, { "completion_length": 237.921875, "epoch": 1.6720000000000002, "grad_norm": 0.45703125, "kl": 0.001617906498722732, "learning_rate": 1.672e-05, "loss": 0.0001, "reward": -0.11068761348724365, "reward_std": 0.1216357871890068, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0074404762126505375, "rewards/no_repetition_reward_func": -0.11812809109687805, "rewards/verse_reward_func": 0.0, "step": 209 }, { "completion_length": 218.359375, "epoch": 1.6800000000000002, "grad_norm": 0.95703125, "kl": 0.0018935074331238866, "learning_rate": 1.6800000000000002e-05, "loss": 0.0001, "reward": -0.11004349961876869, "reward_std": 0.11512648314237595, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.0944184958934784, "rewards/verse_reward_func": -0.015625, "step": 210 }, { "completion_length": 224.640625, "epoch": 1.688, "grad_norm": 1.0, "kl": 0.0018973069381900132, "learning_rate": 1.688e-05, "loss": 0.0001, "reward": -0.08843932673335075, "reward_std": 0.19538921862840652, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03178267180919647, "rewards/no_repetition_reward_func": -0.09678449854254723, "rewards/verse_reward_func": -0.0234375, "step": 211 }, { "completion_length": 234.265625, "epoch": 1.696, "grad_norm": 0.55859375, "kl": 0.0020933954510837793, "learning_rate": 1.696e-05, "loss": 0.0001, "reward": -0.07755101099610329, "reward_std": 0.1760719269514084, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03212491795420647, "rewards/no_repetition_reward_func": -0.09405092895030975, "rewards/verse_reward_func": -0.015625, "step": 212 }, { "completion_length": 231.34375, "epoch": 1.704, "grad_norm": 0.4921875, "kl": 0.001728408969938755, "learning_rate": 1.704e-05, "loss": 0.0001, "reward": -0.07344069331884384, "reward_std": 0.08292994648218155, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.07344069890677929, "rewards/verse_reward_func": 0.0, "step": 213 }, { "completion_length": 239.34375, "epoch": 1.712, "grad_norm": 0.478515625, "kl": 0.002014331752434373, "learning_rate": 1.712e-05, "loss": 0.0001, "reward": -0.0678497962653637, "reward_std": 0.09451267495751381, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.009191176854074001, "rewards/no_repetition_reward_func": -0.07704097032546997, "rewards/verse_reward_func": 0.0, "step": 214 }, { "completion_length": 221.140625, "epoch": 1.72, "grad_norm": 0.5390625, "kl": 0.0017779492773115635, "learning_rate": 1.7199999999999998e-05, "loss": 0.0001, "reward": -0.10677186027169228, "reward_std": 0.18621215224266052, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.014204545877873898, "rewards/no_repetition_reward_func": -0.10535140708088875, "rewards/verse_reward_func": -0.015625, "step": 215 }, { "completion_length": 235.421875, "epoch": 1.728, "grad_norm": 0.6796875, "kl": 0.0019389757653698325, "learning_rate": 1.728e-05, "loss": 0.0001, "reward": -0.08646563813090324, "reward_std": 0.1650192327797413, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02228565700352192, "rewards/no_repetition_reward_func": -0.10093880072236061, "rewards/verse_reward_func": -0.0078125, "step": 216 }, { "completion_length": 234.640625, "epoch": 1.736, "grad_norm": 0.447265625, "kl": 0.00193762534763664, "learning_rate": 1.736e-05, "loss": 0.0001, "reward": -0.05335468426346779, "reward_std": 0.2298300489783287, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.051388829946517944, "rewards/no_repetition_reward_func": -0.09693101048469543, "rewards/verse_reward_func": -0.0078125, "step": 217 }, { "completion_length": 242.859375, "epoch": 1.744, "grad_norm": 0.466796875, "kl": 0.0021059830905869603, "learning_rate": 1.7440000000000002e-05, "loss": 0.0001, "reward": -0.08343884348869324, "reward_std": 0.07496767863631248, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.08343884348869324, "rewards/verse_reward_func": 0.0, "step": 218 }, { "completion_length": 239.84375, "epoch": 1.752, "grad_norm": 0.462890625, "kl": 0.0022248076274991035, "learning_rate": 1.752e-05, "loss": 0.0001, "reward": -0.0802541933953762, "reward_std": 0.08681922778487206, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.0802541933953762, "rewards/verse_reward_func": 0.0, "step": 219 }, { "completion_length": 228.75, "epoch": 1.76, "grad_norm": 0.498046875, "kl": 0.0022150056902319193, "learning_rate": 1.76e-05, "loss": 0.0001, "reward": -0.09582371637225151, "reward_std": 0.1113310195505619, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.09582372009754181, "rewards/verse_reward_func": 0.0, "step": 220 }, { "completion_length": 227.1875, "epoch": 1.768, "grad_norm": 0.5390625, "kl": 0.002047770074568689, "learning_rate": 1.7680000000000004e-05, "loss": 0.0001, "reward": -0.06059158593416214, "reward_std": 0.17836104333400726, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0359157994389534, "rewards/no_repetition_reward_func": -0.09650738537311554, "rewards/verse_reward_func": 0.0, "step": 221 }, { "completion_length": 239.6875, "epoch": 1.776, "grad_norm": 0.6171875, "kl": 0.0021655824966728687, "learning_rate": 1.7760000000000003e-05, "loss": 0.0001, "reward": -0.04824200738221407, "reward_std": 0.1744939684867859, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03125, "rewards/no_repetition_reward_func": -0.0794920101761818, "rewards/verse_reward_func": 0.0, "step": 222 }, { "completion_length": 235.75, "epoch": 1.784, "grad_norm": 0.462890625, "kl": 0.0019855506252497435, "learning_rate": 1.7840000000000002e-05, "loss": 0.0001, "reward": -0.09778543561697006, "reward_std": 0.19650399684906006, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.027225378900766373, "rewards/no_repetition_reward_func": -0.12501081451773643, "rewards/verse_reward_func": 0.0, "step": 223 }, { "completion_length": 236.46875, "epoch": 1.792, "grad_norm": 0.53125, "kl": 0.0021805629367008805, "learning_rate": 1.792e-05, "loss": 0.0001, "reward": -0.067544249817729, "reward_std": 0.13770674914121628, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.026500282809138298, "rewards/no_repetition_reward_func": -0.0940445326268673, "rewards/verse_reward_func": 0.0, "step": 224 }, { "completion_length": 231.59375, "epoch": 1.8, "grad_norm": 0.55859375, "kl": 0.002285638125613332, "learning_rate": 1.8e-05, "loss": 0.0001, "reward": -0.06534324958920479, "reward_std": 0.18344224244356155, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.024671053513884544, "rewards/no_repetition_reward_func": -0.08220180496573448, "rewards/verse_reward_func": -0.0078125, "step": 225 }, { "completion_length": 230.578125, "epoch": 1.808, "grad_norm": 0.74609375, "kl": 0.0023708419175818563, "learning_rate": 1.808e-05, "loss": 0.0001, "reward": -0.07941694557666779, "reward_std": 0.10120367631316185, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0065104165114462376, "rewards/no_repetition_reward_func": -0.07811485975980759, "rewards/verse_reward_func": -0.0078125, "step": 226 }, { "completion_length": 239.75, "epoch": 1.8159999999999998, "grad_norm": 0.59375, "kl": 0.0021976070711389184, "learning_rate": 1.8160000000000002e-05, "loss": 0.0001, "reward": -0.08708159625530243, "reward_std": 0.13025571405887604, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.010416666977107525, "rewards/no_repetition_reward_func": -0.08968575671315193, "rewards/verse_reward_func": -0.0078125, "step": 227 }, { "completion_length": 234.1875, "epoch": 1.8239999999999998, "grad_norm": 0.8984375, "kl": 0.00210936542134732, "learning_rate": 1.824e-05, "loss": 0.0001, "reward": -0.07672691717743874, "reward_std": 0.09323399141430855, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.005787036847323179, "rewards/no_repetition_reward_func": -0.0747014544904232, "rewards/verse_reward_func": -0.0078125, "step": 228 }, { "completion_length": 220.828125, "epoch": 1.8319999999999999, "grad_norm": 0.6328125, "kl": 0.0025486204540356994, "learning_rate": 1.832e-05, "loss": 0.0001, "reward": -0.09974836185574532, "reward_std": 0.09746643155813217, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.08412336185574532, "rewards/verse_reward_func": -0.015625, "step": 229 }, { "completion_length": 240.640625, "epoch": 1.8399999999999999, "grad_norm": 0.443359375, "kl": 0.001965251984074712, "learning_rate": 1.84e-05, "loss": 0.0001, "reward": -0.08333280123770237, "reward_std": 0.11425788328051567, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01532595744356513, "rewards/no_repetition_reward_func": -0.09865875542163849, "rewards/verse_reward_func": 0.0, "step": 230 }, { "completion_length": 234.21875, "epoch": 1.8479999999999999, "grad_norm": 0.75, "kl": 0.0022318714763969183, "learning_rate": 1.848e-05, "loss": 0.0001, "reward": -0.08904822170734406, "reward_std": 0.11062756180763245, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.08123572170734406, "rewards/verse_reward_func": -0.0078125, "step": 231 }, { "completion_length": 244.109375, "epoch": 1.8559999999999999, "grad_norm": 0.4765625, "kl": 0.0025593278696760535, "learning_rate": 1.856e-05, "loss": 0.0001, "reward": -0.04527693334966898, "reward_std": 0.12370739504694939, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.021983225364238024, "rewards/no_repetition_reward_func": -0.06726015359163284, "rewards/verse_reward_func": 0.0, "step": 232 }, { "completion_length": 228.515625, "epoch": 1.8639999999999999, "grad_norm": 0.52734375, "kl": 0.002744038007222116, "learning_rate": 1.864e-05, "loss": 0.0001, "reward": -0.017841722816228867, "reward_std": 0.17696692049503326, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.05398261174559593, "rewards/no_repetition_reward_func": -0.0718243271112442, "rewards/verse_reward_func": 0.0, "step": 233 }, { "completion_length": 244.09375, "epoch": 1.8719999999999999, "grad_norm": 0.46875, "kl": 0.002400025026872754, "learning_rate": 1.872e-05, "loss": 0.0001, "reward": -0.04988584667444229, "reward_std": 0.16287477687001228, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0418753232806921, "rewards/no_repetition_reward_func": -0.09176117554306984, "rewards/verse_reward_func": 0.0, "step": 234 }, { "completion_length": 230.859375, "epoch": 1.88, "grad_norm": 0.51953125, "kl": 0.002506029326468706, "learning_rate": 1.88e-05, "loss": 0.0001, "reward": -0.02773375529795885, "reward_std": 0.11407701298594475, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.027079109102487564, "rewards/no_repetition_reward_func": -0.05481286346912384, "rewards/verse_reward_func": 0.0, "step": 235 }, { "completion_length": 240.15625, "epoch": 1.888, "grad_norm": 0.52734375, "kl": 0.0024591261753812432, "learning_rate": 1.888e-05, "loss": 0.0001, "reward": -0.08745010942220688, "reward_std": 0.08442100510001183, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.08745010569691658, "rewards/verse_reward_func": 0.0, "step": 236 }, { "completion_length": 223.484375, "epoch": 1.896, "grad_norm": 1.0546875, "kl": 0.00271437456831336, "learning_rate": 1.896e-05, "loss": 0.0001, "reward": -0.07905127480626106, "reward_std": 0.16970539838075638, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.028437845408916473, "rewards/no_repetition_reward_func": -0.08405161648988724, "rewards/verse_reward_func": -0.0234375, "step": 237 }, { "completion_length": 219.1875, "epoch": 1.904, "grad_norm": 1.515625, "kl": 0.002771681407466531, "learning_rate": 1.904e-05, "loss": 0.0001, "reward": -0.10876382887363434, "reward_std": 0.18513648957014084, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.015625, "rewards/no_repetition_reward_func": -0.10876383259892464, "rewards/verse_reward_func": -0.015625, "step": 238 }, { "completion_length": 238.796875, "epoch": 1.912, "grad_norm": 0.453125, "kl": 0.0024822709383443, "learning_rate": 1.9120000000000003e-05, "loss": 0.0001, "reward": -0.10951904207468033, "reward_std": 0.131969653069973, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.10170654952526093, "rewards/verse_reward_func": -0.0078125, "step": 239 }, { "completion_length": 231.59375, "epoch": 1.92, "grad_norm": 0.58203125, "kl": 0.002792819286696613, "learning_rate": 1.9200000000000003e-05, "loss": 0.0001, "reward": -0.06629946455359459, "reward_std": 0.12912384793162346, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01663165306672454, "rewards/no_repetition_reward_func": -0.07511861622333527, "rewards/verse_reward_func": -0.0078125, "step": 240 }, { "completion_length": 228.96875, "epoch": 1.928, "grad_norm": 0.59375, "kl": 0.003476368263363838, "learning_rate": 1.9280000000000002e-05, "loss": 0.0001, "reward": -0.061927299946546555, "reward_std": 0.15825669467449188, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03370831161737442, "rewards/no_repetition_reward_func": -0.09563561156392097, "rewards/verse_reward_func": 0.0, "step": 241 }, { "completion_length": 232.25, "epoch": 1.936, "grad_norm": 0.5390625, "kl": 0.0028298513498157263, "learning_rate": 1.936e-05, "loss": 0.0001, "reward": -0.04863723739981651, "reward_std": 0.14713557809591293, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03476138412952423, "rewards/no_repetition_reward_func": -0.08339862152934074, "rewards/verse_reward_func": 0.0, "step": 242 }, { "completion_length": 238.5, "epoch": 1.944, "grad_norm": 0.462890625, "kl": 0.002950546913780272, "learning_rate": 1.944e-05, "loss": 0.0001, "reward": -0.059866541996598244, "reward_std": 0.09976981580257416, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0065104165114462376, "rewards/no_repetition_reward_func": -0.0663769580423832, "rewards/verse_reward_func": 0.0, "step": 243 }, { "completion_length": 236.25, "epoch": 1.952, "grad_norm": 1.1171875, "kl": 0.0027014181250706315, "learning_rate": 1.9520000000000003e-05, "loss": 0.0001, "reward": -0.08645040541887283, "reward_std": 0.10438906028866768, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.06301291100680828, "rewards/verse_reward_func": -0.0234375, "step": 244 }, { "completion_length": 231.953125, "epoch": 1.96, "grad_norm": 0.498046875, "kl": 0.0027985426131635904, "learning_rate": 1.9600000000000002e-05, "loss": 0.0001, "reward": -0.04179947078227997, "reward_std": 0.1503365095704794, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02877792902290821, "rewards/no_repetition_reward_func": -0.06276489421725273, "rewards/verse_reward_func": -0.0078125, "step": 245 }, { "completion_length": 210.25, "epoch": 1.968, "grad_norm": 1.546875, "kl": 0.0036955412942916155, "learning_rate": 1.968e-05, "loss": 0.0001, "reward": -0.08131086081266403, "reward_std": 0.10414950922131538, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.06568586081266403, "rewards/verse_reward_func": -0.015625, "step": 246 }, { "completion_length": 237.203125, "epoch": 1.976, "grad_norm": 0.75, "kl": 0.003456969279795885, "learning_rate": 1.976e-05, "loss": 0.0001, "reward": -0.08153868466615677, "reward_std": 0.11300131678581238, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.009765625, "rewards/no_repetition_reward_func": -0.07567931339144707, "rewards/verse_reward_func": -0.015625, "step": 247 }, { "completion_length": 238.390625, "epoch": 1.984, "grad_norm": 1.265625, "kl": 0.0029719755984842777, "learning_rate": 1.984e-05, "loss": 0.0001, "reward": -0.05561628472059965, "reward_std": 0.11807091534137726, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02310463786125183, "rewards/no_repetition_reward_func": -0.07090842351317406, "rewards/verse_reward_func": -0.0078125, "step": 248 }, { "completion_length": 244.703125, "epoch": 1.992, "grad_norm": 0.42578125, "kl": 0.0030864758882671595, "learning_rate": 1.992e-05, "loss": 0.0001, "reward": -0.04801687132567167, "reward_std": 0.12363987788558006, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02033253200352192, "rewards/no_repetition_reward_func": -0.06834940239787102, "rewards/verse_reward_func": 0.0, "step": 249 }, { "completion_length": 229.3125, "epoch": 2.0, "grad_norm": 0.5078125, "kl": 0.0031127692200243473, "learning_rate": 2e-05, "loss": 0.0001, "reward": -0.06943036988377571, "reward_std": 0.11323832347989082, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.013020833022892475, "rewards/no_repetition_reward_func": -0.08245120197534561, "rewards/verse_reward_func": 0.0, "step": 250 }, { "completion_length": 250.296875, "epoch": 2.008, "grad_norm": 0.48828125, "kl": 0.0029879827052354813, "learning_rate": 2.008e-05, "loss": 0.0001, "reward": -0.05632927082479, "reward_std": 0.11913696676492691, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.018028846010565758, "rewards/no_repetition_reward_func": -0.07435811683535576, "rewards/verse_reward_func": 0.0, "step": 251 }, { "completion_length": 238.4375, "epoch": 2.016, "grad_norm": 0.625, "kl": 0.003554737661033869, "learning_rate": 2.016e-05, "loss": 0.0001, "reward": -0.08148816600441933, "reward_std": 0.13335837051272392, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01566416025161743, "rewards/no_repetition_reward_func": -0.08152732998132706, "rewards/verse_reward_func": -0.015625, "step": 252 }, { "completion_length": 244.515625, "epoch": 2.024, "grad_norm": 0.5625, "kl": 0.0030362242832779884, "learning_rate": 2.024e-05, "loss": 0.0001, "reward": -0.08090429194271564, "reward_std": 0.09926065430045128, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0062500000931322575, "rewards/no_repetition_reward_func": -0.07934179157018661, "rewards/verse_reward_func": -0.0078125, "step": 253 }, { "completion_length": 234.796875, "epoch": 2.032, "grad_norm": 0.51953125, "kl": 0.0036576417041942477, "learning_rate": 2.032e-05, "loss": 0.0001, "reward": -0.10980137437582016, "reward_std": 0.17939479649066925, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.010416666977107525, "rewards/no_repetition_reward_func": -0.10459304228425026, "rewards/verse_reward_func": -0.015625, "step": 254 }, { "completion_length": 251.453125, "epoch": 2.04, "grad_norm": 0.474609375, "kl": 0.0029017204651609063, "learning_rate": 2.04e-05, "loss": 0.0001, "reward": -0.07650923728942871, "reward_std": 0.1346660479903221, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0173611119389534, "rewards/no_repetition_reward_func": -0.09387035295367241, "rewards/verse_reward_func": 0.0, "step": 255 }, { "completion_length": 233.984375, "epoch": 2.048, "grad_norm": 0.53125, "kl": 0.0037072813138365746, "learning_rate": 2.048e-05, "loss": 0.0001, "reward": -0.04467508662492037, "reward_std": 0.16527284681797028, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03791360557079315, "rewards/no_repetition_reward_func": -0.07477618753910065, "rewards/verse_reward_func": -0.0078125, "step": 256 }, { "completion_length": 245.65625, "epoch": 2.056, "grad_norm": 0.41015625, "kl": 0.0030885885935276747, "learning_rate": 2.0560000000000003e-05, "loss": 0.0001, "reward": -0.05995970033109188, "reward_std": 0.18319451063871384, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0324569046497345, "rewards/no_repetition_reward_func": -0.08460410311818123, "rewards/verse_reward_func": -0.0078125, "step": 257 }, { "completion_length": 244.0, "epoch": 2.064, "grad_norm": 0.484375, "kl": 0.0035122495610266924, "learning_rate": 2.0640000000000002e-05, "loss": 0.0001, "reward": -0.046705189161002636, "reward_std": 0.13365914672613144, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.028631010558456182, "rewards/no_repetition_reward_func": -0.07533620670437813, "rewards/verse_reward_func": 0.0, "step": 258 }, { "completion_length": 225.359375, "epoch": 2.072, "grad_norm": 0.7265625, "kl": 0.004174498375505209, "learning_rate": 2.072e-05, "loss": 0.0002, "reward": -0.05662533454596996, "reward_std": 0.1325223371386528, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.021569866687059402, "rewards/no_repetition_reward_func": -0.07038270309567451, "rewards/verse_reward_func": -0.0078125, "step": 259 }, { "completion_length": 230.796875, "epoch": 2.08, "grad_norm": 0.734375, "kl": 0.004041245905682445, "learning_rate": 2.08e-05, "loss": 0.0002, "reward": -0.027458790689706802, "reward_std": 0.16342619061470032, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04226277768611908, "rewards/no_repetition_reward_func": -0.06190907210111618, "rewards/verse_reward_func": -0.0078125, "step": 260 }, { "completion_length": 233.15625, "epoch": 2.088, "grad_norm": 0.578125, "kl": 0.003988734213635325, "learning_rate": 2.0880000000000003e-05, "loss": 0.0002, "reward": -0.05640886351466179, "reward_std": 0.15537385642528534, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.024671053513884544, "rewards/no_repetition_reward_func": -0.07326741702854633, "rewards/verse_reward_func": -0.0078125, "step": 261 }, { "completion_length": 236.578125, "epoch": 2.096, "grad_norm": 0.68359375, "kl": 0.0033725841203704476, "learning_rate": 2.0960000000000003e-05, "loss": 0.0001, "reward": -0.06995265185832977, "reward_std": 0.1425425074994564, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02250267146155238, "rewards/no_repetition_reward_func": -0.08464282006025314, "rewards/verse_reward_func": -0.0078125, "step": 262 }, { "completion_length": 233.265625, "epoch": 2.104, "grad_norm": 1.0078125, "kl": 0.004290988203138113, "learning_rate": 2.1040000000000002e-05, "loss": 0.0002, "reward": -0.07625529170036316, "reward_std": 0.13432425260543823, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.014880952425301075, "rewards/no_repetition_reward_func": -0.06769874691963196, "rewards/verse_reward_func": -0.0234375, "step": 263 }, { "completion_length": 232.640625, "epoch": 2.112, "grad_norm": 0.515625, "kl": 0.003470710013061762, "learning_rate": 2.112e-05, "loss": 0.0001, "reward": -0.07243519648909569, "reward_std": 0.13522030599415302, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.016293449327349663, "rewards/no_repetition_reward_func": -0.0887286439538002, "rewards/verse_reward_func": 0.0, "step": 264 }, { "completion_length": 225.328125, "epoch": 2.12, "grad_norm": 3.84375, "kl": 0.0076830885373055935, "learning_rate": 2.12e-05, "loss": 0.0003, "reward": -0.084867674857378, "reward_std": 0.12966392189264297, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0078125, "rewards/no_repetition_reward_func": -0.08486766554415226, "rewards/verse_reward_func": -0.0078125, "step": 265 }, { "completion_length": 228.875, "epoch": 2.128, "grad_norm": 0.56640625, "kl": 0.003921155468560755, "learning_rate": 2.128e-05, "loss": 0.0002, "reward": -0.036508625373244286, "reward_std": 0.1839357689023018, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.045048702508211136, "rewards/no_repetition_reward_func": -0.08155732974410057, "rewards/verse_reward_func": 0.0, "step": 266 }, { "completion_length": 232.171875, "epoch": 2.136, "grad_norm": 0.93359375, "kl": 0.004300195723772049, "learning_rate": 2.1360000000000002e-05, "loss": 0.0002, "reward": -0.08056743815541267, "reward_std": 0.10348787158727646, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.06494243629276752, "rewards/verse_reward_func": -0.015625, "step": 267 }, { "completion_length": 223.265625, "epoch": 2.144, "grad_norm": 0.640625, "kl": 0.004817612934857607, "learning_rate": 2.144e-05, "loss": 0.0002, "reward": -0.08473814278841019, "reward_std": 0.23167073726654053, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.039144737645983696, "rewards/no_repetition_reward_func": -0.10044537857174873, "rewards/verse_reward_func": -0.0234375, "step": 268 }, { "completion_length": 237.015625, "epoch": 2.152, "grad_norm": 0.7421875, "kl": 0.004980408353731036, "learning_rate": 2.152e-05, "loss": 0.0002, "reward": 0.012621529400348663, "reward_std": 0.2719830088317394, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.07581845670938492, "rewards/no_repetition_reward_func": -0.05538441799581051, "rewards/verse_reward_func": -0.0078125, "step": 269 }, { "completion_length": 243.203125, "epoch": 2.16, "grad_norm": 0.439453125, "kl": 0.004581427201628685, "learning_rate": 2.16e-05, "loss": 0.0002, "reward": -0.0772817600518465, "reward_std": 0.11025489866733551, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0078125, "rewards/no_repetition_reward_func": -0.07728175446391106, "rewards/verse_reward_func": -0.0078125, "step": 270 }, { "completion_length": 242.015625, "epoch": 2.168, "grad_norm": 0.515625, "kl": 0.004535979591310024, "learning_rate": 2.168e-05, "loss": 0.0002, "reward": -0.06230304762721062, "reward_std": 0.08008710667490959, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.008223684504628181, "rewards/no_repetition_reward_func": -0.07052673026919365, "rewards/verse_reward_func": 0.0, "step": 271 }, { "completion_length": 227.359375, "epoch": 2.176, "grad_norm": 0.5078125, "kl": 0.005439299391582608, "learning_rate": 2.176e-05, "loss": 0.0002, "reward": -0.019914996810257435, "reward_std": 0.18856079503893852, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0484006735496223, "rewards/no_repetition_reward_func": -0.06831567361950874, "rewards/verse_reward_func": 0.0, "step": 272 }, { "completion_length": 242.5, "epoch": 2.184, "grad_norm": 0.478515625, "kl": 0.004828153410926461, "learning_rate": 2.184e-05, "loss": 0.0002, "reward": -0.07365656644105911, "reward_std": 0.07607416063547134, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.07365657016634941, "rewards/verse_reward_func": 0.0, "step": 273 }, { "completion_length": 237.734375, "epoch": 2.192, "grad_norm": 0.6171875, "kl": 0.004853398771956563, "learning_rate": 2.192e-05, "loss": 0.0002, "reward": -0.0625002570450306, "reward_std": 0.15285128727555275, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.022017046809196472, "rewards/no_repetition_reward_func": -0.06889230385422707, "rewards/verse_reward_func": -0.015625, "step": 274 }, { "completion_length": 223.03125, "epoch": 2.2, "grad_norm": 0.9609375, "kl": 0.005187596660107374, "learning_rate": 2.2000000000000003e-05, "loss": 0.0002, "reward": -0.01681758090853691, "reward_std": 0.20550133287906647, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.05254160054028034, "rewards/no_repetition_reward_func": -0.04592168144881725, "rewards/verse_reward_func": -0.0234375, "step": 275 }, { "completion_length": 233.375, "epoch": 2.208, "grad_norm": 0.625, "kl": 0.006131743546575308, "learning_rate": 2.2080000000000002e-05, "loss": 0.0002, "reward": -0.05350736901164055, "reward_std": 0.1286502741277218, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02038043551146984, "rewards/no_repetition_reward_func": -0.07388780638575554, "rewards/verse_reward_func": 0.0, "step": 276 }, { "completion_length": 238.671875, "epoch": 2.216, "grad_norm": 0.486328125, "kl": 0.005390022415667772, "learning_rate": 2.216e-05, "loss": 0.0002, "reward": -0.05471794493496418, "reward_std": 0.14561660960316658, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02997342310845852, "rewards/no_repetition_reward_func": -0.069066371768713, "rewards/verse_reward_func": -0.015625, "step": 277 }, { "completion_length": 228.953125, "epoch": 2.224, "grad_norm": 0.74609375, "kl": 0.005803634412586689, "learning_rate": 2.224e-05, "loss": 0.0002, "reward": -0.0760074257850647, "reward_std": 0.1090971939265728, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.007102272938936949, "rewards/no_repetition_reward_func": -0.06748469546437263, "rewards/verse_reward_func": -0.015625, "step": 278 }, { "completion_length": 237.234375, "epoch": 2.232, "grad_norm": 0.51953125, "kl": 0.0053972171153873205, "learning_rate": 2.2320000000000003e-05, "loss": 0.0002, "reward": 0.012312987819314003, "reward_std": 0.3380180448293686, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.10455964133143425, "rewards/no_repetition_reward_func": -0.0766216516494751, "rewards/verse_reward_func": -0.015625, "step": 279 }, { "completion_length": 238.296875, "epoch": 2.24, "grad_norm": 0.494140625, "kl": 0.005415135761722922, "learning_rate": 2.2400000000000002e-05, "loss": 0.0002, "reward": -0.03469303622841835, "reward_std": 0.14302489906549454, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03183479607105255, "rewards/no_repetition_reward_func": -0.0665278322994709, "rewards/verse_reward_func": 0.0, "step": 280 }, { "completion_length": 235.65625, "epoch": 2.248, "grad_norm": 0.46875, "kl": 0.00529671274125576, "learning_rate": 2.248e-05, "loss": 0.0002, "reward": -0.058354055508971214, "reward_std": 0.13246193900704384, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.020833332557231188, "rewards/no_repetition_reward_func": -0.07918738201260567, "rewards/verse_reward_func": 0.0, "step": 281 }, { "completion_length": 234.75, "epoch": 2.2560000000000002, "grad_norm": 0.53515625, "kl": 0.005237654782831669, "learning_rate": 2.256e-05, "loss": 0.0002, "reward": -0.0221202471293509, "reward_std": 0.12244434282183647, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.028825432062149048, "rewards/no_repetition_reward_func": -0.05094567686319351, "rewards/verse_reward_func": 0.0, "step": 282 }, { "completion_length": 235.859375, "epoch": 2.2640000000000002, "grad_norm": 0.625, "kl": 0.006987065076828003, "learning_rate": 2.264e-05, "loss": 0.0003, "reward": -0.08162225596606731, "reward_std": 0.1679166480898857, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0271267369389534, "rewards/no_repetition_reward_func": -0.10093648359179497, "rewards/verse_reward_func": -0.0078125, "step": 283 }, { "completion_length": 250.0, "epoch": 2.2720000000000002, "grad_norm": 0.498046875, "kl": 0.007001107092946768, "learning_rate": 2.2720000000000003e-05, "loss": 0.0003, "reward": 0.02368918899446726, "reward_std": 0.24485184997320175, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.08553335070610046, "rewards/no_repetition_reward_func": -0.06184415705502033, "rewards/verse_reward_func": 0.0, "step": 284 }, { "completion_length": 235.75, "epoch": 2.2800000000000002, "grad_norm": 0.52734375, "kl": 0.006975970230996609, "learning_rate": 2.2800000000000002e-05, "loss": 0.0003, "reward": -0.05558047629892826, "reward_std": 0.10127467289566994, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.013612689450383186, "rewards/no_repetition_reward_func": -0.06919316202402115, "rewards/verse_reward_func": 0.0, "step": 285 }, { "completion_length": 241.015625, "epoch": 2.288, "grad_norm": 0.5, "kl": 0.006126968655735254, "learning_rate": 2.288e-05, "loss": 0.0002, "reward": -0.033850546926259995, "reward_std": 0.12177097052335739, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.030024509876966476, "rewards/no_repetition_reward_func": -0.06387505307793617, "rewards/verse_reward_func": 0.0, "step": 286 }, { "completion_length": 234.28125, "epoch": 2.296, "grad_norm": 0.7734375, "kl": 0.006108261412009597, "learning_rate": 2.296e-05, "loss": 0.0002, "reward": -0.0674353800714016, "reward_std": 0.09418784454464912, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.007102272938936949, "rewards/no_repetition_reward_func": -0.06672515347599983, "rewards/verse_reward_func": -0.0078125, "step": 287 }, { "completion_length": 230.28125, "epoch": 2.304, "grad_norm": 0.51953125, "kl": 0.006996115669608116, "learning_rate": 2.304e-05, "loss": 0.0003, "reward": -0.054842349141836166, "reward_std": 0.1350337117910385, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02272727293893695, "rewards/no_repetition_reward_func": -0.0775696150958538, "rewards/verse_reward_func": 0.0, "step": 288 }, { "completion_length": 221.84375, "epoch": 2.312, "grad_norm": 0.796875, "kl": 0.008953278884291649, "learning_rate": 2.312e-05, "loss": 0.0004, "reward": -0.07409230060875416, "reward_std": 0.12033498287200928, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.016447369009256363, "rewards/no_repetition_reward_func": -0.06710217148065567, "rewards/verse_reward_func": -0.0234375, "step": 289 }, { "completion_length": 252.046875, "epoch": 2.32, "grad_norm": 0.490234375, "kl": 0.0075039907824248075, "learning_rate": 2.32e-05, "loss": 0.0003, "reward": -0.056784817948937416, "reward_std": 0.0876019075512886, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0086805559694767, "rewards/no_repetition_reward_func": -0.06546537578105927, "rewards/verse_reward_func": 0.0, "step": 290 }, { "completion_length": 219.5625, "epoch": 2.328, "grad_norm": 0.99609375, "kl": 0.009035517927259207, "learning_rate": 2.328e-05, "loss": 0.0004, "reward": -0.08352427557110786, "reward_std": 0.0914892926812172, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.06789927184581757, "rewards/verse_reward_func": -0.015625, "step": 291 }, { "completion_length": 232.59375, "epoch": 2.336, "grad_norm": 0.486328125, "kl": 0.006946749519556761, "learning_rate": 2.336e-05, "loss": 0.0003, "reward": 0.01033009635284543, "reward_std": 0.21241962164640427, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.06639638915657997, "rewards/no_repetition_reward_func": -0.05606629326939583, "rewards/verse_reward_func": 0.0, "step": 292 }, { "completion_length": 219.859375, "epoch": 2.344, "grad_norm": 0.7734375, "kl": 0.00893594860099256, "learning_rate": 2.344e-05, "loss": 0.0004, "reward": -0.061482148244977, "reward_std": 0.16855042427778244, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01953125, "rewards/no_repetition_reward_func": -0.06538839638233185, "rewards/verse_reward_func": -0.015625, "step": 293 }, { "completion_length": 232.625, "epoch": 2.352, "grad_norm": 1.015625, "kl": 0.008183754049241543, "learning_rate": 2.3520000000000002e-05, "loss": 0.0003, "reward": -0.08570479229092598, "reward_std": 0.13833484053611755, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.005387931130826473, "rewards/no_repetition_reward_func": -0.07546772435307503, "rewards/verse_reward_func": -0.015625, "step": 294 }, { "completion_length": 238.125, "epoch": 2.36, "grad_norm": 0.52734375, "kl": 0.008390391245484352, "learning_rate": 2.36e-05, "loss": 0.0003, "reward": -0.08877097815275192, "reward_std": 0.1285780929028988, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.013352273032069206, "rewards/no_repetition_reward_func": -0.09431074932217598, "rewards/verse_reward_func": -0.0078125, "step": 295 }, { "completion_length": 236.328125, "epoch": 2.368, "grad_norm": 0.458984375, "kl": 0.008149172645062208, "learning_rate": 2.3680000000000004e-05, "loss": 0.0003, "reward": -0.06581801548600197, "reward_std": 0.17215793579816818, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.028776703402400017, "rewards/no_repetition_reward_func": -0.08678221702575684, "rewards/verse_reward_func": -0.0078125, "step": 296 }, { "completion_length": 240.453125, "epoch": 2.376, "grad_norm": 0.494140625, "kl": 0.007105192402377725, "learning_rate": 2.3760000000000003e-05, "loss": 0.0003, "reward": -0.019828658550977707, "reward_std": 0.17039917409420013, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04940856620669365, "rewards/no_repetition_reward_func": -0.06923722289502621, "rewards/verse_reward_func": 0.0, "step": 297 }, { "completion_length": 226.1875, "epoch": 2.384, "grad_norm": 0.63671875, "kl": 0.007211246062070131, "learning_rate": 2.3840000000000002e-05, "loss": 0.0003, "reward": -0.10056837275624275, "reward_std": 0.186808243393898, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02126259170472622, "rewards/no_repetition_reward_func": -0.09058096632361412, "rewards/verse_reward_func": -0.03125, "step": 298 }, { "completion_length": 220.5625, "epoch": 2.392, "grad_norm": 0.828125, "kl": 0.008723886217921972, "learning_rate": 2.392e-05, "loss": 0.0003, "reward": -0.04202473163604736, "reward_std": 0.11143985390663147, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02085597813129425, "rewards/no_repetition_reward_func": -0.055068209767341614, "rewards/verse_reward_func": -0.0078125, "step": 299 }, { "completion_length": 246.828125, "epoch": 2.4, "grad_norm": 0.4921875, "kl": 0.007217486388981342, "learning_rate": 2.4e-05, "loss": 0.0003, "reward": -0.04902455396950245, "reward_std": 0.07893005385994911, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0074404762126505375, "rewards/no_repetition_reward_func": -0.0564650297164917, "rewards/verse_reward_func": 0.0, "step": 300 }, { "completion_length": 235.828125, "epoch": 2.408, "grad_norm": 0.66796875, "kl": 0.00940213305875659, "learning_rate": 2.408e-05, "loss": 0.0004, "reward": -0.04300826694816351, "reward_std": 0.1263992078602314, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01953125, "rewards/no_repetition_reward_func": -0.06253951787948608, "rewards/verse_reward_func": 0.0, "step": 301 }, { "completion_length": 233.390625, "epoch": 2.416, "grad_norm": 0.53125, "kl": 0.007242408115416765, "learning_rate": 2.4160000000000002e-05, "loss": 0.0003, "reward": -0.08689058944582939, "reward_std": 0.09678603336215019, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.07907808572053909, "rewards/verse_reward_func": -0.0078125, "step": 302 }, { "completion_length": 229.59375, "epoch": 2.424, "grad_norm": 0.75390625, "kl": 0.007734660059213638, "learning_rate": 2.4240000000000002e-05, "loss": 0.0003, "reward": -0.03839912544935942, "reward_std": 0.1530543901026249, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.026470924727618694, "rewards/no_repetition_reward_func": -0.05705755203962326, "rewards/verse_reward_func": -0.0078125, "step": 303 }, { "completion_length": 227.1875, "epoch": 2.432, "grad_norm": 0.99609375, "kl": 0.00899244612082839, "learning_rate": 2.432e-05, "loss": 0.0004, "reward": -0.0764115508645773, "reward_std": 0.13001444935798645, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.015325957909226418, "rewards/no_repetition_reward_func": -0.07611250504851341, "rewards/verse_reward_func": -0.015625, "step": 304 }, { "completion_length": 239.34375, "epoch": 2.44, "grad_norm": 0.50390625, "kl": 0.008692041970789433, "learning_rate": 2.44e-05, "loss": 0.0003, "reward": -0.05139782279729843, "reward_std": 0.11594870314002037, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.025822829455137253, "rewards/no_repetition_reward_func": -0.07722065411508083, "rewards/verse_reward_func": 0.0, "step": 305 }, { "completion_length": 232.703125, "epoch": 2.448, "grad_norm": 0.54296875, "kl": 0.009106512181460857, "learning_rate": 2.448e-05, "loss": 0.0004, "reward": -0.034036110155284405, "reward_std": 0.15066510811448097, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.030539773404598236, "rewards/no_repetition_reward_func": -0.056763384491205215, "rewards/verse_reward_func": -0.0078125, "step": 306 }, { "completion_length": 231.28125, "epoch": 2.456, "grad_norm": 0.90625, "kl": 0.006938913371413946, "learning_rate": 2.4560000000000002e-05, "loss": 0.0003, "reward": -0.04137927945703268, "reward_std": 0.13129496201872826, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.022916667629033327, "rewards/no_repetition_reward_func": -0.056483449414372444, "rewards/verse_reward_func": -0.0078125, "step": 307 }, { "completion_length": 237.34375, "epoch": 2.464, "grad_norm": 0.64453125, "kl": 0.008360553998500109, "learning_rate": 2.464e-05, "loss": 0.0003, "reward": -0.029219623189419508, "reward_std": 0.2073918730020523, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0523486640304327, "rewards/no_repetition_reward_func": -0.06594328582286835, "rewards/verse_reward_func": -0.015625, "step": 308 }, { "completion_length": 241.703125, "epoch": 2.472, "grad_norm": 1.015625, "kl": 0.009037187788635492, "learning_rate": 2.472e-05, "loss": 0.0004, "reward": -0.037280415184795856, "reward_std": 0.18699373304843903, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04530013166368008, "rewards/no_repetition_reward_func": -0.07476804777979851, "rewards/verse_reward_func": -0.0078125, "step": 309 }, { "completion_length": 234.21875, "epoch": 2.48, "grad_norm": 0.5078125, "kl": 0.007282366510480642, "learning_rate": 2.48e-05, "loss": 0.0003, "reward": -0.10273251309990883, "reward_std": 0.08867935091257095, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.10273251309990883, "rewards/verse_reward_func": 0.0, "step": 310 }, { "completion_length": 230.921875, "epoch": 2.488, "grad_norm": 0.87109375, "kl": 0.009164524730294943, "learning_rate": 2.488e-05, "loss": 0.0004, "reward": -0.012931713834404945, "reward_std": 0.193217895925045, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.05426033399999142, "rewards/no_repetition_reward_func": -0.05156704969704151, "rewards/verse_reward_func": -0.015625, "step": 311 }, { "completion_length": 232.890625, "epoch": 2.496, "grad_norm": 0.55859375, "kl": 0.008676509838551283, "learning_rate": 2.496e-05, "loss": 0.0003, "reward": -0.06056796945631504, "reward_std": 0.08284469321370125, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.06056797318160534, "rewards/verse_reward_func": 0.0, "step": 312 }, { "completion_length": 230.125, "epoch": 2.504, "grad_norm": 1.109375, "kl": 0.008246017154306173, "learning_rate": 2.504e-05, "loss": 0.0003, "reward": -0.06922408752143383, "reward_std": 0.1454462930560112, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.020689229015260935, "rewards/no_repetition_reward_func": -0.06647581607103348, "rewards/verse_reward_func": -0.0234375, "step": 313 }, { "completion_length": 231.75, "epoch": 2.512, "grad_norm": 0.59375, "kl": 0.008787957951426506, "learning_rate": 2.512e-05, "loss": 0.0004, "reward": -0.030463173054158688, "reward_std": 0.17231833562254906, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.036371663212776184, "rewards/no_repetition_reward_func": -0.059022340923547745, "rewards/verse_reward_func": -0.0078125, "step": 314 }, { "completion_length": 240.4375, "epoch": 2.52, "grad_norm": 0.77734375, "kl": 0.008813782595098019, "learning_rate": 2.5200000000000003e-05, "loss": 0.0004, "reward": -0.04547702427953482, "reward_std": 0.10106652602553368, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.013020833022892475, "rewards/no_repetition_reward_func": -0.05068535916507244, "rewards/verse_reward_func": -0.0078125, "step": 315 }, { "completion_length": 238.03125, "epoch": 2.528, "grad_norm": 0.78515625, "kl": 0.007975250016897917, "learning_rate": 2.5280000000000005e-05, "loss": 0.0003, "reward": -0.04276014491915703, "reward_std": 0.14108967036008835, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02526041679084301, "rewards/no_repetition_reward_func": -0.06020806543529034, "rewards/verse_reward_func": -0.0078125, "step": 316 }, { "completion_length": 234.078125, "epoch": 2.536, "grad_norm": 0.5078125, "kl": 0.00855105696246028, "learning_rate": 2.536e-05, "loss": 0.0003, "reward": -0.05222897604107857, "reward_std": 0.1463838443160057, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02727743238210678, "rewards/no_repetition_reward_func": -0.07950640469789505, "rewards/verse_reward_func": 0.0, "step": 317 }, { "completion_length": 227.25, "epoch": 2.544, "grad_norm": 0.88671875, "kl": 0.010262751951813698, "learning_rate": 2.5440000000000004e-05, "loss": 0.0004, "reward": -0.05165157373994589, "reward_std": 0.09544248133897781, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0086805559694767, "rewards/no_repetition_reward_func": -0.052519626915454865, "rewards/verse_reward_func": -0.0078125, "step": 318 }, { "completion_length": 246.234375, "epoch": 2.552, "grad_norm": 0.46484375, "kl": 0.007941093295812607, "learning_rate": 2.552e-05, "loss": 0.0003, "reward": -0.02247403794899583, "reward_std": 0.14771982841193676, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03554077446460724, "rewards/no_repetition_reward_func": -0.05801480449736118, "rewards/verse_reward_func": 0.0, "step": 319 }, { "completion_length": 214.3125, "epoch": 2.56, "grad_norm": 0.6484375, "kl": 0.01016224455088377, "learning_rate": 2.5600000000000002e-05, "loss": 0.0004, "reward": -0.06471347622573376, "reward_std": 0.09151896834373474, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.06471347063779831, "rewards/verse_reward_func": 0.0, "step": 320 }, { "completion_length": 236.03125, "epoch": 2.568, "grad_norm": 0.47265625, "kl": 0.008751638233661652, "learning_rate": 2.5679999999999998e-05, "loss": 0.0004, "reward": -0.06076962500810623, "reward_std": 0.09071650356054306, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.006793478038161993, "rewards/no_repetition_reward_func": -0.06756310909986496, "rewards/verse_reward_func": 0.0, "step": 321 }, { "completion_length": 229.78125, "epoch": 2.576, "grad_norm": 0.94921875, "kl": 0.010321576613932848, "learning_rate": 2.576e-05, "loss": 0.0004, "reward": -0.08136460557579994, "reward_std": 0.10926430486142635, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.07355210557579994, "rewards/verse_reward_func": -0.0078125, "step": 322 }, { "completion_length": 239.328125, "epoch": 2.584, "grad_norm": 0.828125, "kl": 0.010588406585156918, "learning_rate": 2.5840000000000003e-05, "loss": 0.0004, "reward": -0.04290631227195263, "reward_std": 0.14716289192438126, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.029658565297722816, "rewards/no_repetition_reward_func": -0.056939875707030296, "rewards/verse_reward_func": -0.015625, "step": 323 }, { "completion_length": 243.953125, "epoch": 2.592, "grad_norm": 0.484375, "kl": 0.008580120746046305, "learning_rate": 2.592e-05, "loss": 0.0003, "reward": -0.02842307137325406, "reward_std": 0.17417649179697037, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.047346262726932764, "rewards/no_repetition_reward_func": -0.0679568350315094, "rewards/verse_reward_func": -0.0078125, "step": 324 }, { "completion_length": 242.59375, "epoch": 2.6, "grad_norm": 0.4609375, "kl": 0.008636913262307644, "learning_rate": 2.6000000000000002e-05, "loss": 0.0003, "reward": -0.023578068241477013, "reward_std": 0.13753096759319305, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03280228842049837, "rewards/no_repetition_reward_func": -0.05638035759329796, "rewards/verse_reward_func": 0.0, "step": 325 }, { "completion_length": 241.578125, "epoch": 2.608, "grad_norm": 0.458984375, "kl": 0.008214535657316446, "learning_rate": 2.6079999999999998e-05, "loss": 0.0003, "reward": -0.004234565421938896, "reward_std": 0.19363371282815933, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.05971790850162506, "rewards/no_repetition_reward_func": -0.06395247764885426, "rewards/verse_reward_func": 0.0, "step": 326 }, { "completion_length": 229.640625, "epoch": 2.616, "grad_norm": 0.58203125, "kl": 0.010582291055470705, "learning_rate": 2.616e-05, "loss": 0.0004, "reward": -0.039548033848404884, "reward_std": 0.14804556965827942, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03065191488713026, "rewards/no_repetition_reward_func": -0.06238744407892227, "rewards/verse_reward_func": -0.0078125, "step": 327 }, { "completion_length": 227.46875, "epoch": 2.624, "grad_norm": 1.3984375, "kl": 0.009037589654326439, "learning_rate": 2.6240000000000003e-05, "loss": 0.0004, "reward": -0.05487391725182533, "reward_std": 0.115098986774683, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01406249962747097, "rewards/no_repetition_reward_func": -0.0533114168792963, "rewards/verse_reward_func": -0.015625, "step": 328 }, { "completion_length": 241.21875, "epoch": 2.632, "grad_norm": 0.5625, "kl": 0.008731396868824959, "learning_rate": 2.632e-05, "loss": 0.0003, "reward": -0.05701139569282532, "reward_std": 0.0812770240008831, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.008223684504628181, "rewards/no_repetition_reward_func": -0.06523507833480835, "rewards/verse_reward_func": 0.0, "step": 329 }, { "completion_length": 228.796875, "epoch": 2.64, "grad_norm": 0.625, "kl": 0.011560476385056973, "learning_rate": 2.64e-05, "loss": 0.0005, "reward": -0.01121446955949068, "reward_std": 0.17047807574272156, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.05198894999921322, "rewards/no_repetition_reward_func": -0.055390918627381325, "rewards/verse_reward_func": -0.0078125, "step": 330 }, { "completion_length": 225.25, "epoch": 2.648, "grad_norm": 0.6328125, "kl": 0.012911614961922169, "learning_rate": 2.648e-05, "loss": 0.0005, "reward": -0.037497272714972496, "reward_std": 0.1391441598534584, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03164816228672862, "rewards/no_repetition_reward_func": -0.06133293733000755, "rewards/verse_reward_func": -0.0078125, "step": 331 }, { "completion_length": 237.875, "epoch": 2.656, "grad_norm": 0.466796875, "kl": 0.009786697570234537, "learning_rate": 2.6560000000000003e-05, "loss": 0.0004, "reward": -0.04706587828695774, "reward_std": 0.11158376559615135, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.019991582725197077, "rewards/no_repetition_reward_func": -0.06705745682120323, "rewards/verse_reward_func": 0.0, "step": 332 }, { "completion_length": 222.125, "epoch": 2.664, "grad_norm": 0.5703125, "kl": 0.011324240826070309, "learning_rate": 2.6640000000000002e-05, "loss": 0.0005, "reward": -0.037813253700733185, "reward_std": 0.1582675501704216, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03983018547296524, "rewards/no_repetition_reward_func": -0.062018442898988724, "rewards/verse_reward_func": -0.015625, "step": 333 }, { "completion_length": 235.71875, "epoch": 2.672, "grad_norm": 0.765625, "kl": 0.009744621813297272, "learning_rate": 2.672e-05, "loss": 0.0004, "reward": -0.03563865553587675, "reward_std": 0.21912197023630142, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.05463160201907158, "rewards/no_repetition_reward_func": -0.0824577584862709, "rewards/verse_reward_func": -0.0078125, "step": 334 }, { "completion_length": 239.359375, "epoch": 2.68, "grad_norm": 0.443359375, "kl": 0.009721199050545692, "learning_rate": 2.6800000000000004e-05, "loss": 0.0004, "reward": -0.04432407394051552, "reward_std": 0.15145293623209, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03255208348855376, "rewards/no_repetition_reward_func": -0.06906365603208542, "rewards/verse_reward_func": -0.0078125, "step": 335 }, { "completion_length": 237.796875, "epoch": 2.6879999999999997, "grad_norm": 0.94140625, "kl": 0.010722438804805279, "learning_rate": 2.688e-05, "loss": 0.0004, "reward": -0.05764684081077576, "reward_std": 0.14341941848397255, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.022727273404598236, "rewards/no_repetition_reward_func": -0.07256161421537399, "rewards/verse_reward_func": -0.0078125, "step": 336 }, { "completion_length": 238.46875, "epoch": 2.6959999999999997, "grad_norm": 0.46875, "kl": 0.009795757941901684, "learning_rate": 2.6960000000000003e-05, "loss": 0.0004, "reward": 0.01410132646560669, "reward_std": 0.22052322328090668, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.07516512274742126, "rewards/no_repetition_reward_func": -0.06106380745768547, "rewards/verse_reward_func": 0.0, "step": 337 }, { "completion_length": 243.1875, "epoch": 2.7039999999999997, "grad_norm": 0.53515625, "kl": 0.00955283734947443, "learning_rate": 2.704e-05, "loss": 0.0004, "reward": -0.030458178371191025, "reward_std": 0.17566930502653122, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.05082757165655494, "rewards/no_repetition_reward_func": -0.07347325794398785, "rewards/verse_reward_func": -0.0078125, "step": 338 }, { "completion_length": 232.546875, "epoch": 2.7119999999999997, "grad_norm": 0.46875, "kl": 0.011000760830938816, "learning_rate": 2.712e-05, "loss": 0.0004, "reward": -0.048142015002667904, "reward_std": 0.14783094078302383, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02660778909921646, "rewards/no_repetition_reward_func": -0.07474980503320694, "rewards/verse_reward_func": 0.0, "step": 339 }, { "completion_length": 238.859375, "epoch": 2.7199999999999998, "grad_norm": 0.51953125, "kl": 0.011155461426824331, "learning_rate": 2.7200000000000004e-05, "loss": 0.0004, "reward": -0.012415225617587566, "reward_std": 0.1335914060473442, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03872963786125183, "rewards/no_repetition_reward_func": -0.05114486254751682, "rewards/verse_reward_func": 0.0, "step": 340 }, { "completion_length": 230.03125, "epoch": 2.7279999999999998, "grad_norm": 0.52734375, "kl": 0.012291349936276674, "learning_rate": 2.728e-05, "loss": 0.0005, "reward": -0.04963056556880474, "reward_std": 0.16898885741829872, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.031700342893600464, "rewards/no_repetition_reward_func": -0.07351841405034065, "rewards/verse_reward_func": -0.0078125, "step": 341 }, { "completion_length": 228.984375, "epoch": 2.7359999999999998, "grad_norm": 0.95703125, "kl": 0.011105780955404043, "learning_rate": 2.7360000000000002e-05, "loss": 0.0004, "reward": -0.05559682659804821, "reward_std": 0.08153890632092953, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.03997182659804821, "rewards/verse_reward_func": -0.015625, "step": 342 }, { "completion_length": 234.671875, "epoch": 2.7439999999999998, "grad_norm": 0.48828125, "kl": 0.010773263406008482, "learning_rate": 2.7439999999999998e-05, "loss": 0.0004, "reward": 0.004505883902311325, "reward_std": 0.18114853650331497, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.056018101051449776, "rewards/no_repetition_reward_func": -0.0515122190117836, "rewards/verse_reward_func": 0.0, "step": 343 }, { "completion_length": 235.125, "epoch": 2.752, "grad_norm": 0.56640625, "kl": 0.011072130873799324, "learning_rate": 2.752e-05, "loss": 0.0004, "reward": -0.0319189727306366, "reward_std": 0.12802115082740784, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0348440520465374, "rewards/no_repetition_reward_func": -0.06676302663981915, "rewards/verse_reward_func": 0.0, "step": 344 }, { "completion_length": 241.328125, "epoch": 2.76, "grad_norm": 0.69921875, "kl": 0.011116509791463614, "learning_rate": 2.7600000000000003e-05, "loss": 0.0004, "reward": 0.029919429682195187, "reward_std": 0.23586387187242508, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.08870406076312065, "rewards/no_repetition_reward_func": -0.05097212828695774, "rewards/verse_reward_func": -0.0078125, "step": 345 }, { "completion_length": 236.53125, "epoch": 2.768, "grad_norm": 0.490234375, "kl": 0.010139886289834976, "learning_rate": 2.768e-05, "loss": 0.0004, "reward": -0.033147094771265984, "reward_std": 0.07280692271888256, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.009191176854074001, "rewards/no_repetition_reward_func": -0.04233827255666256, "rewards/verse_reward_func": 0.0, "step": 346 }, { "completion_length": 240.234375, "epoch": 2.776, "grad_norm": 1.2578125, "kl": 0.010852721519768238, "learning_rate": 2.7760000000000002e-05, "loss": 0.0004, "reward": -0.05623111501336098, "reward_std": 0.16298049688339233, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02660778909921646, "rewards/no_repetition_reward_func": -0.06721390597522259, "rewards/verse_reward_func": -0.015625, "step": 347 }, { "completion_length": 236.078125, "epoch": 2.784, "grad_norm": 0.4609375, "kl": 0.009764721151441336, "learning_rate": 2.7839999999999998e-05, "loss": 0.0004, "reward": -0.05516189709305763, "reward_std": 0.09781555086374283, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.006009615492075682, "rewards/no_repetition_reward_func": -0.0611715130507946, "rewards/verse_reward_func": 0.0, "step": 348 }, { "completion_length": 245.046875, "epoch": 2.792, "grad_norm": 0.56640625, "kl": 0.01156250387430191, "learning_rate": 2.792e-05, "loss": 0.0005, "reward": -0.041289196349680424, "reward_std": 0.138705775141716, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02935463795438409, "rewards/no_repetition_reward_func": -0.06283133290708065, "rewards/verse_reward_func": -0.0078125, "step": 349 }, { "completion_length": 234.671875, "epoch": 2.8, "grad_norm": 0.84375, "kl": 0.01344511704519391, "learning_rate": 2.8000000000000003e-05, "loss": 0.0005, "reward": -0.03395612724125385, "reward_std": 0.12124654278159142, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.023664860986173153, "rewards/no_repetition_reward_func": -0.04980848915874958, "rewards/verse_reward_func": -0.0078125, "step": 350 }, { "completion_length": 247.484375, "epoch": 2.808, "grad_norm": 0.453125, "kl": 0.01129910908639431, "learning_rate": 2.8080000000000002e-05, "loss": 0.0005, "reward": -0.05953957140445709, "reward_std": 0.07155771180987358, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.05953957140445709, "rewards/verse_reward_func": 0.0, "step": 351 }, { "completion_length": 234.953125, "epoch": 2.816, "grad_norm": 0.470703125, "kl": 0.011360213626176119, "learning_rate": 2.816e-05, "loss": 0.0005, "reward": -0.031167125329375267, "reward_std": 0.09097147732973099, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.015625, "rewards/no_repetition_reward_func": -0.046792127192020416, "rewards/verse_reward_func": 0.0, "step": 352 }, { "completion_length": 232.6875, "epoch": 2.824, "grad_norm": 0.6015625, "kl": 0.011513453908264637, "learning_rate": 2.824e-05, "loss": 0.0005, "reward": -0.04906431958079338, "reward_std": 0.11778809130191803, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.014233299996703863, "rewards/no_repetition_reward_func": -0.05548512190580368, "rewards/verse_reward_func": -0.0078125, "step": 353 }, { "completion_length": 244.4375, "epoch": 2.832, "grad_norm": 0.53515625, "kl": 0.01286635035648942, "learning_rate": 2.8320000000000003e-05, "loss": 0.0005, "reward": -0.0014553982764482498, "reward_std": 0.1761842668056488, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.05333561450242996, "rewards/no_repetition_reward_func": -0.05479101091623306, "rewards/verse_reward_func": 0.0, "step": 354 }, { "completion_length": 249.578125, "epoch": 2.84, "grad_norm": 0.482421875, "kl": 0.010188809130340815, "learning_rate": 2.84e-05, "loss": 0.0004, "reward": -0.05334261432290077, "reward_std": 0.11322798579931259, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.014880952425301075, "rewards/no_repetition_reward_func": -0.06822356581687927, "rewards/verse_reward_func": 0.0, "step": 355 }, { "completion_length": 250.6875, "epoch": 2.848, "grad_norm": 0.453125, "kl": 0.011324257589876652, "learning_rate": 2.8480000000000002e-05, "loss": 0.0005, "reward": -0.03283315896987915, "reward_std": 0.08946863561868668, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01603618450462818, "rewards/no_repetition_reward_func": -0.04886934347450733, "rewards/verse_reward_func": 0.0, "step": 356 }, { "completion_length": 238.78125, "epoch": 2.856, "grad_norm": 0.51953125, "kl": 0.01349584013223648, "learning_rate": 2.8560000000000004e-05, "loss": 0.0005, "reward": 0.01162075437605381, "reward_std": 0.18128911405801773, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.06428286712616682, "rewards/no_repetition_reward_func": -0.052662115544080734, "rewards/verse_reward_func": 0.0, "step": 357 }, { "completion_length": 245.03125, "epoch": 2.864, "grad_norm": 0.81640625, "kl": 0.013129050843417645, "learning_rate": 2.864e-05, "loss": 0.0005, "reward": -0.03384148329496384, "reward_std": 0.1608506143093109, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.034229010343551636, "rewards/no_repetition_reward_func": -0.06025799550116062, "rewards/verse_reward_func": -0.0078125, "step": 358 }, { "completion_length": 249.890625, "epoch": 2.872, "grad_norm": 0.455078125, "kl": 0.01144699938595295, "learning_rate": 2.8720000000000003e-05, "loss": 0.0005, "reward": -0.03325829841196537, "reward_std": 0.10874250531196594, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.019836956169456244, "rewards/no_repetition_reward_func": -0.05309525318443775, "rewards/verse_reward_func": 0.0, "step": 359 }, { "completion_length": 238.875, "epoch": 2.88, "grad_norm": 0.71875, "kl": 0.013876235112547874, "learning_rate": 2.88e-05, "loss": 0.0006, "reward": -0.015215843915939331, "reward_std": 0.18013138696551323, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0500788907520473, "rewards/no_repetition_reward_func": -0.05748223327100277, "rewards/verse_reward_func": -0.0078125, "step": 360 }, { "completion_length": 241.59375, "epoch": 2.888, "grad_norm": 0.4375, "kl": 0.013495533727109432, "learning_rate": 2.888e-05, "loss": 0.0005, "reward": -0.018838297110050917, "reward_std": 0.11159836873412132, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.031010141596198082, "rewards/no_repetition_reward_func": -0.049848439171910286, "rewards/verse_reward_func": 0.0, "step": 361 }, { "completion_length": 230.09375, "epoch": 2.896, "grad_norm": 0.6171875, "kl": 0.012598065193742514, "learning_rate": 2.8960000000000004e-05, "loss": 0.0005, "reward": -0.04897595942020416, "reward_std": 0.11859872937202454, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.013950892724096775, "rewards/no_repetition_reward_func": -0.055114349350333214, "rewards/verse_reward_func": -0.0078125, "step": 362 }, { "completion_length": 245.4375, "epoch": 2.904, "grad_norm": 0.494140625, "kl": 0.013602377381175756, "learning_rate": 2.904e-05, "loss": 0.0005, "reward": -0.012419357895851135, "reward_std": 0.16858398169279099, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.05170019716024399, "rewards/no_repetition_reward_func": -0.06411955319344997, "rewards/verse_reward_func": 0.0, "step": 363 }, { "completion_length": 240.4375, "epoch": 2.912, "grad_norm": 0.50390625, "kl": 0.012021504342556, "learning_rate": 2.9120000000000002e-05, "loss": 0.0005, "reward": -0.010379347018897533, "reward_std": 0.12298992276191711, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04080958850681782, "rewards/no_repetition_reward_func": -0.051188938319683075, "rewards/verse_reward_func": 0.0, "step": 364 }, { "completion_length": 237.046875, "epoch": 2.92, "grad_norm": 0.515625, "kl": 0.015193395782262087, "learning_rate": 2.9199999999999998e-05, "loss": 0.0006, "reward": -0.035467661917209625, "reward_std": 0.15555986016988754, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03890072740614414, "rewards/no_repetition_reward_func": -0.07436838373541832, "rewards/verse_reward_func": 0.0, "step": 365 }, { "completion_length": 239.171875, "epoch": 2.928, "grad_norm": 0.453125, "kl": 0.014583153650164604, "learning_rate": 2.928e-05, "loss": 0.0006, "reward": -0.04361729323863983, "reward_std": 0.16031897068023682, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.029237689916044474, "rewards/no_repetition_reward_func": -0.07285498455166817, "rewards/verse_reward_func": 0.0, "step": 366 }, { "completion_length": 235.375, "epoch": 2.936, "grad_norm": 0.51171875, "kl": 0.012697895523160696, "learning_rate": 2.9360000000000003e-05, "loss": 0.0005, "reward": -0.04568045400083065, "reward_std": 0.08611566200852394, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.008223684504628181, "rewards/no_repetition_reward_func": -0.05390413664281368, "rewards/verse_reward_func": 0.0, "step": 367 }, { "completion_length": 229.609375, "epoch": 2.944, "grad_norm": 1.125, "kl": 0.016071072779595852, "learning_rate": 2.944e-05, "loss": 0.0006, "reward": -0.051461005583405495, "reward_std": 0.1539662629365921, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.023871528450399637, "rewards/no_repetition_reward_func": -0.051895035430788994, "rewards/verse_reward_func": -0.0234375, "step": 368 }, { "completion_length": 241.203125, "epoch": 2.952, "grad_norm": 0.439453125, "kl": 0.01416090875864029, "learning_rate": 2.9520000000000002e-05, "loss": 0.0006, "reward": -0.03628994058817625, "reward_std": 0.13305235654115677, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0357321253977716, "rewards/no_repetition_reward_func": -0.07202206552028656, "rewards/verse_reward_func": 0.0, "step": 369 }, { "completion_length": 244.953125, "epoch": 2.96, "grad_norm": 1.0546875, "kl": 0.015308070927858353, "learning_rate": 2.96e-05, "loss": 0.0006, "reward": -0.08887846022844315, "reward_std": 0.15760249644517899, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.006009615492075682, "rewards/no_repetition_reward_func": -0.07926307618618011, "rewards/verse_reward_func": -0.015625, "step": 370 }, { "completion_length": 242.171875, "epoch": 2.968, "grad_norm": 0.5078125, "kl": 0.012247861362993717, "learning_rate": 2.9680000000000004e-05, "loss": 0.0005, "reward": -0.003845873288810253, "reward_std": 0.19544096291065216, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.055559076368808746, "rewards/no_repetition_reward_func": -0.05940495431423187, "rewards/verse_reward_func": 0.0, "step": 371 }, { "completion_length": 246.421875, "epoch": 2.976, "grad_norm": 0.44921875, "kl": 0.013545302208513021, "learning_rate": 2.976e-05, "loss": 0.0005, "reward": -0.034474088810384274, "reward_std": 0.13363806158304214, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.032963267993181944, "rewards/no_repetition_reward_func": -0.06743735633790493, "rewards/verse_reward_func": 0.0, "step": 372 }, { "completion_length": 230.59375, "epoch": 2.984, "grad_norm": 0.51171875, "kl": 0.01631010416895151, "learning_rate": 2.9840000000000002e-05, "loss": 0.0007, "reward": -0.0236738882958889, "reward_std": 0.15777894854545593, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03550111735239625, "rewards/no_repetition_reward_func": -0.05136250518262386, "rewards/verse_reward_func": -0.0078125, "step": 373 }, { "completion_length": 244.671875, "epoch": 2.992, "grad_norm": 0.5, "kl": 0.014521681237965822, "learning_rate": 2.9920000000000005e-05, "loss": 0.0006, "reward": -0.029303422197699547, "reward_std": 0.08758307807147503, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.015625, "rewards/no_repetition_reward_func": -0.0449284203350544, "rewards/verse_reward_func": 0.0, "step": 374 }, { "completion_length": 256.0, "epoch": 3.0, "grad_norm": 0.453125, "kl": 0.014965548645704985, "learning_rate": 3e-05, "loss": 0.0006, "reward": -0.06690370664000511, "reward_std": 0.130007304251194, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0065104165114462376, "rewards/no_repetition_reward_func": -0.06560162454843521, "rewards/verse_reward_func": -0.0078125, "step": 375 }, { "completion_length": 240.5625, "epoch": 3.008, "grad_norm": 0.5703125, "kl": 0.015707400627434254, "learning_rate": 3.0080000000000003e-05, "loss": 0.0006, "reward": -0.05780006945133209, "reward_std": 0.14183755964040756, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.016293449327349663, "rewards/no_repetition_reward_func": -0.0662810206413269, "rewards/verse_reward_func": -0.0078125, "step": 376 }, { "completion_length": 234.328125, "epoch": 3.016, "grad_norm": 1.03125, "kl": 0.01532835979014635, "learning_rate": 3.016e-05, "loss": 0.0006, "reward": -0.03837151452898979, "reward_std": 0.13769876211881638, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.022293245419859886, "rewards/no_repetition_reward_func": -0.04503976181149483, "rewards/verse_reward_func": -0.015625, "step": 377 }, { "completion_length": 234.953125, "epoch": 3.024, "grad_norm": 0.66015625, "kl": 0.014521858189255, "learning_rate": 3.0240000000000002e-05, "loss": 0.0006, "reward": -0.023602099157869816, "reward_std": 0.18821649253368378, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0390625, "rewards/no_repetition_reward_func": -0.05485209450125694, "rewards/verse_reward_func": -0.0078125, "step": 378 }, { "completion_length": 237.375, "epoch": 3.032, "grad_norm": 0.478515625, "kl": 0.016269493848085403, "learning_rate": 3.0320000000000004e-05, "loss": 0.0007, "reward": -0.03862000769004226, "reward_std": 0.15373040735721588, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.029543068259954453, "rewards/no_repetition_reward_func": -0.06816307827830315, "rewards/verse_reward_func": 0.0, "step": 379 }, { "completion_length": 239.421875, "epoch": 3.04, "grad_norm": 0.484375, "kl": 0.016884273383766413, "learning_rate": 3.04e-05, "loss": 0.0007, "reward": 0.010008413344621658, "reward_std": 0.18298637866973877, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.059567904099822044, "rewards/no_repetition_reward_func": -0.049559492617845535, "rewards/verse_reward_func": 0.0, "step": 380 }, { "completion_length": 242.953125, "epoch": 3.048, "grad_norm": 0.458984375, "kl": 0.013702856376767159, "learning_rate": 3.0480000000000003e-05, "loss": 0.0005, "reward": -0.012718127574771643, "reward_std": 0.1084300484508276, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02985895425081253, "rewards/no_repetition_reward_func": -0.04257708229124546, "rewards/verse_reward_func": 0.0, "step": 381 }, { "completion_length": 235.8125, "epoch": 3.056, "grad_norm": 0.5078125, "kl": 0.014615205116569996, "learning_rate": 3.056e-05, "loss": 0.0006, "reward": -0.02634394494816661, "reward_std": 0.1090940535068512, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02038043411448598, "rewards/no_repetition_reward_func": -0.04672437720000744, "rewards/verse_reward_func": 0.0, "step": 382 }, { "completion_length": 234.21875, "epoch": 3.064, "grad_norm": 0.6796875, "kl": 0.017962178215384483, "learning_rate": 3.0640000000000005e-05, "loss": 0.0007, "reward": -0.0717819333076477, "reward_std": 0.09360820800065994, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.056156937032938004, "rewards/verse_reward_func": -0.015625, "step": 383 }, { "completion_length": 242.703125, "epoch": 3.072, "grad_norm": 0.4921875, "kl": 0.017112262547016144, "learning_rate": 3.072e-05, "loss": 0.0007, "reward": -0.036318885162472725, "reward_std": 0.17713246494531631, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04060195945203304, "rewards/no_repetition_reward_func": -0.07692084461450577, "rewards/verse_reward_func": 0.0, "step": 384 }, { "completion_length": 241.15625, "epoch": 3.08, "grad_norm": 0.98828125, "kl": 0.017028262838721275, "learning_rate": 3.08e-05, "loss": 0.0007, "reward": 0.03361378703266382, "reward_std": 0.26399078220129013, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0893044974654913, "rewards/no_repetition_reward_func": -0.040065716952085495, "rewards/verse_reward_func": -0.015625, "step": 385 }, { "completion_length": 240.109375, "epoch": 3.088, "grad_norm": 0.625, "kl": 0.015291132964193821, "learning_rate": 3.088e-05, "loss": 0.0006, "reward": -0.0297950878739357, "reward_std": 0.1799006573855877, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04818213731050491, "rewards/no_repetition_reward_func": -0.07016472890973091, "rewards/verse_reward_func": -0.0078125, "step": 386 }, { "completion_length": 236.0, "epoch": 3.096, "grad_norm": 0.703125, "kl": 0.019207621924579144, "learning_rate": 3.096e-05, "loss": 0.0008, "reward": -0.013380438555032015, "reward_std": 0.1908210664987564, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04868141561746597, "rewards/no_repetition_reward_func": -0.0542493537068367, "rewards/verse_reward_func": -0.0078125, "step": 387 }, { "completion_length": 242.5, "epoch": 3.104, "grad_norm": 0.478515625, "kl": 0.01500282995402813, "learning_rate": 3.104e-05, "loss": 0.0006, "reward": -0.06712588295340538, "reward_std": 0.08325900509953499, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.06712588667869568, "rewards/verse_reward_func": 0.0, "step": 388 }, { "completion_length": 250.046875, "epoch": 3.112, "grad_norm": 0.466796875, "kl": 0.01617514342069626, "learning_rate": 3.112e-05, "loss": 0.0006, "reward": -0.013413841370493174, "reward_std": 0.10887505859136581, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.027062781620770693, "rewards/no_repetition_reward_func": -0.040476622991263866, "rewards/verse_reward_func": 0.0, "step": 389 }, { "completion_length": 240.421875, "epoch": 3.12, "grad_norm": 0.765625, "kl": 0.016393445432186127, "learning_rate": 3.12e-05, "loss": 0.0007, "reward": -0.028848390094935894, "reward_std": 0.14384103566408157, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0329861119389534, "rewards/no_repetition_reward_func": -0.05402200110256672, "rewards/verse_reward_func": -0.0078125, "step": 390 }, { "completion_length": 232.328125, "epoch": 3.128, "grad_norm": 0.62890625, "kl": 0.020986102521419525, "learning_rate": 3.1280000000000005e-05, "loss": 0.0008, "reward": 0.035930952057242393, "reward_std": 0.25870005786418915, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0938301756978035, "rewards/no_repetition_reward_func": -0.05008672550320625, "rewards/verse_reward_func": -0.0078125, "step": 391 }, { "completion_length": 239.6875, "epoch": 3.136, "grad_norm": 0.494140625, "kl": 0.01622056495398283, "learning_rate": 3.136e-05, "loss": 0.0006, "reward": 0.00018683122470974922, "reward_std": 0.18747776001691818, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.050451990216970444, "rewards/no_repetition_reward_func": -0.05026515945792198, "rewards/verse_reward_func": 0.0, "step": 392 }, { "completion_length": 243.4375, "epoch": 3.144, "grad_norm": 0.75, "kl": 0.01890282053500414, "learning_rate": 3.1440000000000004e-05, "loss": 0.0008, "reward": 0.08187306672334671, "reward_std": 0.35313913226127625, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.1337842047214508, "rewards/no_repetition_reward_func": -0.044098636135458946, "rewards/verse_reward_func": -0.0078125, "step": 393 }, { "completion_length": 232.65625, "epoch": 3.152, "grad_norm": 0.91015625, "kl": 0.018001144751906395, "learning_rate": 3.1519999999999996e-05, "loss": 0.0007, "reward": -0.038361312821507454, "reward_std": 0.20000632107257843, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04019923880696297, "rewards/no_repetition_reward_func": -0.05512304790318012, "rewards/verse_reward_func": -0.0234375, "step": 394 }, { "completion_length": 242.671875, "epoch": 3.16, "grad_norm": 0.50390625, "kl": 0.016229936853051186, "learning_rate": 3.16e-05, "loss": 0.0006, "reward": 0.02138965018093586, "reward_std": 0.20283784717321396, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.07572906091809273, "rewards/no_repetition_reward_func": -0.05433941259980202, "rewards/verse_reward_func": 0.0, "step": 395 }, { "completion_length": 237.109375, "epoch": 3.168, "grad_norm": 0.703125, "kl": 0.019789774902164936, "learning_rate": 3.168e-05, "loss": 0.0008, "reward": -0.016139812767505646, "reward_std": 0.1574670560657978, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04356594383716583, "rewards/no_repetition_reward_func": -0.04408075660467148, "rewards/verse_reward_func": -0.015625, "step": 396 }, { "completion_length": 238.921875, "epoch": 3.176, "grad_norm": 0.7734375, "kl": 0.019063200801610947, "learning_rate": 3.176e-05, "loss": 0.0008, "reward": -0.037327468395233154, "reward_std": 0.1290760226547718, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02935878373682499, "rewards/no_repetition_reward_func": -0.05106125399470329, "rewards/verse_reward_func": -0.015625, "step": 397 }, { "completion_length": 229.59375, "epoch": 3.184, "grad_norm": 0.75390625, "kl": 0.02120271883904934, "learning_rate": 3.184e-05, "loss": 0.0008, "reward": -0.0010987929999828339, "reward_std": 0.21165038645267487, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.057554274797439575, "rewards/no_repetition_reward_func": -0.043028063140809536, "rewards/verse_reward_func": -0.015625, "step": 398 }, { "completion_length": 238.890625, "epoch": 3.192, "grad_norm": 0.546875, "kl": 0.018731256015598774, "learning_rate": 3.192e-05, "loss": 0.0007, "reward": -0.04720832780003548, "reward_std": 0.12094857916235924, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.021267162635922432, "rewards/no_repetition_reward_func": -0.06066298857331276, "rewards/verse_reward_func": -0.0078125, "step": 399 }, { "completion_length": 240.859375, "epoch": 3.2, "grad_norm": 0.9921875, "kl": 0.01899643987417221, "learning_rate": 3.2000000000000005e-05, "loss": 0.0008, "reward": -0.03217708505690098, "reward_std": 0.13377967104315758, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.023240847513079643, "rewards/no_repetition_reward_func": -0.04760543256998062, "rewards/verse_reward_func": -0.0078125, "step": 400 }, { "completion_length": 225.921875, "epoch": 3.208, "grad_norm": 1.1484375, "kl": 0.030190441757440567, "learning_rate": 3.208e-05, "loss": 0.0012, "reward": -0.05602763220667839, "reward_std": 0.10498693212866783, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0074404762126505375, "rewards/no_repetition_reward_func": -0.05565560981631279, "rewards/verse_reward_func": -0.0078125, "step": 401 }, { "completion_length": 244.828125, "epoch": 3.216, "grad_norm": 0.53515625, "kl": 0.017653128132224083, "learning_rate": 3.2160000000000004e-05, "loss": 0.0007, "reward": 0.0038253050297498703, "reward_std": 0.16172325611114502, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04849699093028903, "rewards/no_repetition_reward_func": -0.04467168264091015, "rewards/verse_reward_func": 0.0, "step": 402 }, { "completion_length": 240.046875, "epoch": 3.224, "grad_norm": 0.494140625, "kl": 0.019314267672598362, "learning_rate": 3.224e-05, "loss": 0.0008, "reward": 0.043051013723015785, "reward_std": 0.20032405853271484, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.07960063219070435, "rewards/no_repetition_reward_func": -0.03654961660504341, "rewards/verse_reward_func": 0.0, "step": 403 }, { "completion_length": 245.453125, "epoch": 3.232, "grad_norm": 0.5078125, "kl": 0.020115578547120094, "learning_rate": 3.232e-05, "loss": 0.0008, "reward": -0.02376787457615137, "reward_std": 0.1516386792063713, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0352328447625041, "rewards/no_repetition_reward_func": -0.05900072120130062, "rewards/verse_reward_func": 0.0, "step": 404 }, { "completion_length": 240.53125, "epoch": 3.24, "grad_norm": 0.84765625, "kl": 0.01895293779671192, "learning_rate": 3.24e-05, "loss": 0.0008, "reward": -0.031105971429497004, "reward_std": 0.1076810173690319, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.019283682107925415, "rewards/no_repetition_reward_func": -0.03476465307176113, "rewards/verse_reward_func": -0.015625, "step": 405 }, { "completion_length": 242.65625, "epoch": 3.248, "grad_norm": 1.015625, "kl": 0.020879889838397503, "learning_rate": 3.248e-05, "loss": 0.0008, "reward": -0.03362682554870844, "reward_std": 0.1263464279472828, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.030877976212650537, "rewards/no_repetition_reward_func": -0.056692298501729965, "rewards/verse_reward_func": -0.0078125, "step": 406 }, { "completion_length": 222.796875, "epoch": 3.2560000000000002, "grad_norm": 0.5859375, "kl": 0.024170653894543648, "learning_rate": 3.256e-05, "loss": 0.001, "reward": -0.009934714995324612, "reward_std": 0.1299058496952057, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03929612785577774, "rewards/no_repetition_reward_func": -0.049230845645070076, "rewards/verse_reward_func": 0.0, "step": 407 }, { "completion_length": 250.890625, "epoch": 3.2640000000000002, "grad_norm": 0.66796875, "kl": 0.02079705335199833, "learning_rate": 3.2640000000000006e-05, "loss": 0.0008, "reward": -0.026647585444152355, "reward_std": 0.19041452556848526, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.038352275267243385, "rewards/no_repetition_reward_func": -0.049374861642718315, "rewards/verse_reward_func": -0.015625, "step": 408 }, { "completion_length": 245.15625, "epoch": 3.2720000000000002, "grad_norm": 0.49609375, "kl": 0.019563451409339905, "learning_rate": 3.272e-05, "loss": 0.0008, "reward": -0.018649566685780883, "reward_std": 0.10264587588608265, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02688211016356945, "rewards/no_repetition_reward_func": -0.04553167708218098, "rewards/verse_reward_func": 0.0, "step": 409 }, { "completion_length": 243.734375, "epoch": 3.2800000000000002, "grad_norm": 0.435546875, "kl": 0.019890771247446537, "learning_rate": 3.2800000000000004e-05, "loss": 0.0008, "reward": -0.028835387900471687, "reward_std": 0.09791626036167145, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.024484623223543167, "rewards/no_repetition_reward_func": -0.053320007398724556, "rewards/verse_reward_func": 0.0, "step": 410 }, { "completion_length": 237.640625, "epoch": 3.288, "grad_norm": 0.498046875, "kl": 0.021984337829053402, "learning_rate": 3.288e-05, "loss": 0.0009, "reward": -0.014256389811635017, "reward_std": 0.1404847390949726, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03380848094820976, "rewards/no_repetition_reward_func": -0.04806487075984478, "rewards/verse_reward_func": 0.0, "step": 411 }, { "completion_length": 233.0625, "epoch": 3.296, "grad_norm": 0.93359375, "kl": 0.029230687767267227, "learning_rate": 3.296e-05, "loss": 0.0012, "reward": -0.030722773168236017, "reward_std": 0.14950154721736908, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.039512842893600464, "rewards/no_repetition_reward_func": -0.05461061932146549, "rewards/verse_reward_func": -0.015625, "step": 412 }, { "completion_length": 237.09375, "epoch": 3.304, "grad_norm": 0.64453125, "kl": 0.02862715907394886, "learning_rate": 3.304e-05, "loss": 0.0011, "reward": 0.012736026663333178, "reward_std": 0.25329770147800446, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.08088225871324539, "rewards/no_repetition_reward_func": -0.05252123437821865, "rewards/verse_reward_func": -0.015625, "step": 413 }, { "completion_length": 250.84375, "epoch": 3.312, "grad_norm": 0.4921875, "kl": 0.020083222538232803, "learning_rate": 3.312e-05, "loss": 0.0008, "reward": -0.0305123133584857, "reward_std": 0.07188881933689117, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.013020833022892475, "rewards/no_repetition_reward_func": -0.043533144518733025, "rewards/verse_reward_func": 0.0, "step": 414 }, { "completion_length": 242.328125, "epoch": 3.32, "grad_norm": 0.455078125, "kl": 0.022441250272095203, "learning_rate": 3.32e-05, "loss": 0.0009, "reward": 0.03145274519920349, "reward_std": 0.1756446361541748, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.06908800452947617, "rewards/no_repetition_reward_func": -0.037635261192917824, "rewards/verse_reward_func": 0.0, "step": 415 }, { "completion_length": 240.984375, "epoch": 3.328, "grad_norm": 0.490234375, "kl": 0.02248027827590704, "learning_rate": 3.328e-05, "loss": 0.0009, "reward": -0.01135792350396514, "reward_std": 0.14302878826856613, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.040088385343551636, "rewards/no_repetition_reward_func": -0.05144630745053291, "rewards/verse_reward_func": 0.0, "step": 416 }, { "completion_length": 240.25, "epoch": 3.336, "grad_norm": 1.0234375, "kl": 0.022231711074709892, "learning_rate": 3.336e-05, "loss": 0.0009, "reward": 0.010694042779505253, "reward_std": 0.24245372414588928, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0715346410870552, "rewards/no_repetition_reward_func": -0.06084059923887253, "rewards/verse_reward_func": 0.0, "step": 417 }, { "completion_length": 236.71875, "epoch": 3.344, "grad_norm": 0.54296875, "kl": 0.020849433727562428, "learning_rate": 3.344e-05, "loss": 0.0008, "reward": -0.0364662716165185, "reward_std": 0.13892662525177002, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.030829327180981636, "rewards/no_repetition_reward_func": -0.06729559972882271, "rewards/verse_reward_func": 0.0, "step": 418 }, { "completion_length": 244.65625, "epoch": 3.352, "grad_norm": 0.7890625, "kl": 0.02277727983891964, "learning_rate": 3.3520000000000004e-05, "loss": 0.0009, "reward": -0.009109475649893284, "reward_std": 0.1699194610118866, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04916371125727892, "rewards/no_repetition_reward_func": -0.04264818876981735, "rewards/verse_reward_func": -0.015625, "step": 419 }, { "completion_length": 227.34375, "epoch": 3.36, "grad_norm": 0.56640625, "kl": 0.03098505176603794, "learning_rate": 3.3600000000000004e-05, "loss": 0.0012, "reward": 0.02442232146859169, "reward_std": 0.16385630518198013, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.060983122792094946, "rewards/no_repetition_reward_func": -0.03656080458313227, "rewards/verse_reward_func": 0.0, "step": 420 }, { "completion_length": 241.53125, "epoch": 3.368, "grad_norm": 0.466796875, "kl": 0.022340167313814163, "learning_rate": 3.368e-05, "loss": 0.0009, "reward": 0.011761133559048176, "reward_std": 0.19573388993740082, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.06857015565037727, "rewards/no_repetition_reward_func": -0.05680902674794197, "rewards/verse_reward_func": 0.0, "step": 421 }, { "completion_length": 241.53125, "epoch": 3.376, "grad_norm": 0.5234375, "kl": 0.022792726755142212, "learning_rate": 3.376e-05, "loss": 0.0009, "reward": 0.026306890416890383, "reward_std": 0.20395511388778687, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.07103180699050426, "rewards/no_repetition_reward_func": -0.04472492076456547, "rewards/verse_reward_func": 0.0, "step": 422 }, { "completion_length": 233.765625, "epoch": 3.384, "grad_norm": 0.76171875, "kl": 0.02250310219824314, "learning_rate": 3.384e-05, "loss": 0.0009, "reward": 0.021658487617969513, "reward_std": 0.271790511906147, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.084692457690835, "rewards/no_repetition_reward_func": -0.04740896821022034, "rewards/verse_reward_func": -0.015625, "step": 423 }, { "completion_length": 242.953125, "epoch": 3.392, "grad_norm": 0.53515625, "kl": 0.023451777175068855, "learning_rate": 3.392e-05, "loss": 0.0009, "reward": -0.0045844679698348045, "reward_std": 0.1363266035914421, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0361461378633976, "rewards/no_repetition_reward_func": -0.04073060676455498, "rewards/verse_reward_func": 0.0, "step": 424 }, { "completion_length": 248.65625, "epoch": 3.4, "grad_norm": 0.484375, "kl": 0.024371756240725517, "learning_rate": 3.4000000000000007e-05, "loss": 0.001, "reward": 0.03648353926837444, "reward_std": 0.2087213099002838, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.07665486261248589, "rewards/no_repetition_reward_func": -0.04017132706940174, "rewards/verse_reward_func": 0.0, "step": 425 }, { "completion_length": 238.296875, "epoch": 3.408, "grad_norm": 0.54296875, "kl": 0.024519427679479122, "learning_rate": 3.408e-05, "loss": 0.001, "reward": -0.008571111597120762, "reward_std": 0.10699578374624252, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.026710116304457188, "rewards/no_repetition_reward_func": -0.03528122790157795, "rewards/verse_reward_func": 0.0, "step": 426 }, { "completion_length": 248.296875, "epoch": 3.416, "grad_norm": 0.453125, "kl": 0.021393200382590294, "learning_rate": 3.4160000000000005e-05, "loss": 0.0009, "reward": 0.023186320438981056, "reward_std": 0.17081153392791748, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.06876396387815475, "rewards/no_repetition_reward_func": -0.0455776434391737, "rewards/verse_reward_func": 0.0, "step": 427 }, { "completion_length": 242.828125, "epoch": 3.424, "grad_norm": 0.51171875, "kl": 0.02350664883852005, "learning_rate": 3.424e-05, "loss": 0.0009, "reward": -0.013990387786179781, "reward_std": 0.1435392200946808, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0390625, "rewards/no_repetition_reward_func": -0.04524039104580879, "rewards/verse_reward_func": -0.0078125, "step": 428 }, { "completion_length": 243.671875, "epoch": 3.432, "grad_norm": 0.70703125, "kl": 0.02534983493387699, "learning_rate": 3.4320000000000003e-05, "loss": 0.001, "reward": 0.023601185530424118, "reward_std": 0.15404490381479263, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.07370912190526724, "rewards/no_repetition_reward_func": -0.042295437306165695, "rewards/verse_reward_func": -0.0078125, "step": 429 }, { "completion_length": 243.21875, "epoch": 3.44, "grad_norm": 0.546875, "kl": 0.022448722273111343, "learning_rate": 3.4399999999999996e-05, "loss": 0.0009, "reward": 0.022728771436959505, "reward_std": 0.18340525776147842, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.067808723077178, "rewards/no_repetition_reward_func": -0.045079950243234634, "rewards/verse_reward_func": 0.0, "step": 430 }, { "completion_length": 248.578125, "epoch": 3.448, "grad_norm": 0.4609375, "kl": 0.022520788945257664, "learning_rate": 3.448e-05, "loss": 0.0009, "reward": -0.022925328463315964, "reward_std": 0.12567488104104996, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03546195663511753, "rewards/no_repetition_reward_func": -0.058387285098433495, "rewards/verse_reward_func": 0.0, "step": 431 }, { "completion_length": 238.90625, "epoch": 3.456, "grad_norm": 0.453125, "kl": 0.027233799919486046, "learning_rate": 3.456e-05, "loss": 0.0011, "reward": -0.04618854634463787, "reward_std": 0.05428118631243706, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0, "rewards/no_repetition_reward_func": -0.04618854634463787, "rewards/verse_reward_func": 0.0, "step": 432 }, { "completion_length": 233.765625, "epoch": 3.464, "grad_norm": 0.92578125, "kl": 0.034397805109620094, "learning_rate": 3.464e-05, "loss": 0.0014, "reward": -0.03992709703743458, "reward_std": 0.11953939497470856, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.020483954343944788, "rewards/no_repetition_reward_func": -0.05259854719042778, "rewards/verse_reward_func": -0.0078125, "step": 433 }, { "completion_length": 241.71875, "epoch": 3.472, "grad_norm": 0.56640625, "kl": 0.027738087810575962, "learning_rate": 3.472e-05, "loss": 0.0011, "reward": -0.0316309817135334, "reward_std": 0.09328093938529491, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0173611119389534, "rewards/no_repetition_reward_func": -0.0411795973777771, "rewards/verse_reward_func": -0.0078125, "step": 434 }, { "completion_length": 237.0625, "epoch": 3.48, "grad_norm": 0.59765625, "kl": 0.026468138210475445, "learning_rate": 3.48e-05, "loss": 0.0011, "reward": -0.04906369186937809, "reward_std": 0.0980117879807949, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0078125, "rewards/no_repetition_reward_func": -0.04906369559466839, "rewards/verse_reward_func": -0.0078125, "step": 435 }, { "completion_length": 242.125, "epoch": 3.488, "grad_norm": 0.5, "kl": 0.029030085541307926, "learning_rate": 3.4880000000000005e-05, "loss": 0.0012, "reward": -0.018200830090790987, "reward_std": 0.1590312346816063, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0382500309497118, "rewards/no_repetition_reward_func": -0.0486383605748415, "rewards/verse_reward_func": -0.0078125, "step": 436 }, { "completion_length": 233.296875, "epoch": 3.496, "grad_norm": 1.2109375, "kl": 0.046614741906523705, "learning_rate": 3.4960000000000004e-05, "loss": 0.0019, "reward": 0.01641658879816532, "reward_std": 0.19590599089860916, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.06431306153535843, "rewards/no_repetition_reward_func": -0.032271476462483406, "rewards/verse_reward_func": -0.015625, "step": 437 }, { "completion_length": 243.046875, "epoch": 3.504, "grad_norm": 0.9609375, "kl": 0.026037227362394333, "learning_rate": 3.504e-05, "loss": 0.001, "reward": -0.02332550147548318, "reward_std": 0.08707543462514877, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.01311188843101263, "rewards/no_repetition_reward_func": -0.028624888509511948, "rewards/verse_reward_func": -0.0078125, "step": 438 }, { "completion_length": 241.90625, "epoch": 3.512, "grad_norm": 0.90234375, "kl": 0.03035850916057825, "learning_rate": 3.512e-05, "loss": 0.0012, "reward": -0.001972535625100136, "reward_std": 0.14351198449730873, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03924983739852905, "rewards/no_repetition_reward_func": -0.03340987302362919, "rewards/verse_reward_func": -0.0078125, "step": 439 }, { "completion_length": 240.484375, "epoch": 3.52, "grad_norm": 0.50390625, "kl": 0.02591067086905241, "learning_rate": 3.52e-05, "loss": 0.001, "reward": -0.01638489542528987, "reward_std": 0.12097782641649246, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.02766597270965576, "rewards/no_repetition_reward_func": -0.03623836673796177, "rewards/verse_reward_func": -0.0078125, "step": 440 }, { "completion_length": 243.28125, "epoch": 3.528, "grad_norm": 0.69140625, "kl": 0.030955465510487556, "learning_rate": 3.528e-05, "loss": 0.0012, "reward": 0.04333726782351732, "reward_std": 0.30375438928604126, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.09672822058200836, "rewards/no_repetition_reward_func": -0.03776596300303936, "rewards/verse_reward_func": -0.015625, "step": 441 }, { "completion_length": 242.609375, "epoch": 3.536, "grad_norm": 0.51171875, "kl": 0.029352176003158092, "learning_rate": 3.536000000000001e-05, "loss": 0.0012, "reward": 0.06978249736130238, "reward_std": 0.2633589468896389, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.11237665265798569, "rewards/no_repetition_reward_func": -0.04259415157139301, "rewards/verse_reward_func": 0.0, "step": 442 }, { "completion_length": 237.8125, "epoch": 3.544, "grad_norm": 0.6328125, "kl": 0.04270121827721596, "learning_rate": 3.544e-05, "loss": 0.0017, "reward": 0.05512842908501625, "reward_std": 0.22631409764289856, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.09279860183596611, "rewards/no_repetition_reward_func": -0.03767017461359501, "rewards/verse_reward_func": 0.0, "step": 443 }, { "completion_length": 243.75, "epoch": 3.552, "grad_norm": 0.84375, "kl": 0.03977113589644432, "learning_rate": 3.5520000000000006e-05, "loss": 0.0016, "reward": 0.025970839895308018, "reward_std": 0.22308802232146263, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.08024872001260519, "rewards/no_repetition_reward_func": -0.04646538197994232, "rewards/verse_reward_func": -0.0078125, "step": 444 }, { "completion_length": 240.953125, "epoch": 3.56, "grad_norm": 0.4921875, "kl": 0.027992784045636654, "learning_rate": 3.56e-05, "loss": 0.0011, "reward": -0.02867790311574936, "reward_std": 0.10478883981704712, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.022017045877873898, "rewards/no_repetition_reward_func": -0.05069494806230068, "rewards/verse_reward_func": 0.0, "step": 445 }, { "completion_length": 244.84375, "epoch": 3.568, "grad_norm": 0.8359375, "kl": 0.03637537453323603, "learning_rate": 3.5680000000000004e-05, "loss": 0.0015, "reward": -0.05869480408728123, "reward_std": 0.10196885094046593, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.011780754197388887, "rewards/no_repetition_reward_func": -0.06266305595636368, "rewards/verse_reward_func": -0.0078125, "step": 446 }, { "completion_length": 249.53125, "epoch": 3.576, "grad_norm": 0.478515625, "kl": 0.02827051840722561, "learning_rate": 3.5759999999999996e-05, "loss": 0.0011, "reward": 0.0047455355525016785, "reward_std": 0.13350757956504822, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04885912872850895, "rewards/no_repetition_reward_func": -0.04411358945071697, "rewards/verse_reward_func": 0.0, "step": 447 }, { "completion_length": 229.296875, "epoch": 3.584, "grad_norm": 0.83984375, "kl": 0.03348237834870815, "learning_rate": 3.584e-05, "loss": 0.0013, "reward": -0.03715553134679794, "reward_std": 0.1063782162964344, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.012019230984151363, "rewards/no_repetition_reward_func": -0.04136226139962673, "rewards/verse_reward_func": -0.0078125, "step": 448 }, { "completion_length": 253.96875, "epoch": 3.592, "grad_norm": 0.443359375, "kl": 0.027051135897636414, "learning_rate": 3.592e-05, "loss": 0.0011, "reward": 0.0670342855155468, "reward_std": 0.26983700692653656, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.10671132430434227, "rewards/no_repetition_reward_func": -0.03967704251408577, "rewards/verse_reward_func": 0.0, "step": 449 }, { "completion_length": 239.6875, "epoch": 3.6, "grad_norm": 0.49609375, "kl": 0.03040954004973173, "learning_rate": 3.6e-05, "loss": 0.0012, "reward": -0.003717208281159401, "reward_std": 0.1673784777522087, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04635201022028923, "rewards/no_repetition_reward_func": -0.04225671850144863, "rewards/verse_reward_func": -0.0078125, "step": 450 }, { "completion_length": 241.359375, "epoch": 3.608, "grad_norm": 0.63671875, "kl": 0.057053858414292336, "learning_rate": 3.608e-05, "loss": 0.0023, "reward": 0.04743345733731985, "reward_std": 0.2781504914164543, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.10321359895169735, "rewards/no_repetition_reward_func": -0.04796764627099037, "rewards/verse_reward_func": -0.0078125, "step": 451 }, { "completion_length": 236.375, "epoch": 3.616, "grad_norm": 0.7265625, "kl": 0.031686993315815926, "learning_rate": 3.616e-05, "loss": 0.0013, "reward": -0.013426479883491993, "reward_std": 0.18829511106014252, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.05404259543865919, "rewards/no_repetition_reward_func": -0.059656575322151184, "rewards/verse_reward_func": -0.0078125, "step": 452 }, { "completion_length": 244.296875, "epoch": 3.624, "grad_norm": 0.48046875, "kl": 0.02991360891610384, "learning_rate": 3.624e-05, "loss": 0.0012, "reward": -0.03246229887008667, "reward_std": 0.06574996002018452, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.008223684504628181, "rewards/no_repetition_reward_func": -0.0406859815120697, "rewards/verse_reward_func": 0.0, "step": 453 }, { "completion_length": 233.296875, "epoch": 3.632, "grad_norm": 0.5546875, "kl": 0.05198664218187332, "learning_rate": 3.6320000000000005e-05, "loss": 0.0021, "reward": 0.006985250860452652, "reward_std": 0.130693931132555, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04471050063148141, "rewards/no_repetition_reward_func": -0.03772524744272232, "rewards/verse_reward_func": 0.0, "step": 454 }, { "completion_length": 249.171875, "epoch": 3.64, "grad_norm": 0.5078125, "kl": 0.03660042677074671, "learning_rate": 3.6400000000000004e-05, "loss": 0.0015, "reward": -0.024495745543390512, "reward_std": 0.12807487696409225, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.029380342923104763, "rewards/no_repetition_reward_func": -0.05387608893215656, "rewards/verse_reward_func": 0.0, "step": 455 }, { "completion_length": 253.078125, "epoch": 3.648, "grad_norm": 0.486328125, "kl": 0.03213914670050144, "learning_rate": 3.648e-05, "loss": 0.0013, "reward": 0.060078807175159454, "reward_std": 0.21009264886379242, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.09086541831493378, "rewards/no_repetition_reward_func": -0.030786610208451748, "rewards/verse_reward_func": 0.0, "step": 456 }, { "completion_length": 239.765625, "epoch": 3.656, "grad_norm": 0.451171875, "kl": 0.03061517607420683, "learning_rate": 3.656e-05, "loss": 0.0012, "reward": -0.040774807799607515, "reward_std": 0.1550755873322487, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0282118059694767, "rewards/no_repetition_reward_func": -0.06117411330342293, "rewards/verse_reward_func": -0.0078125, "step": 457 }, { "completion_length": 248.953125, "epoch": 3.664, "grad_norm": 0.5234375, "kl": 0.0314072435721755, "learning_rate": 3.664e-05, "loss": 0.0013, "reward": 0.012699034996330738, "reward_std": 0.14346564188599586, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04850646434351802, "rewards/no_repetition_reward_func": -0.03580743260681629, "rewards/verse_reward_func": 0.0, "step": 458 }, { "completion_length": 246.921875, "epoch": 3.672, "grad_norm": 0.4921875, "kl": 0.02861008048057556, "learning_rate": 3.672000000000001e-05, "loss": 0.0011, "reward": 0.013856125995516777, "reward_std": 0.17066562175750732, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.055141756311059, "rewards/no_repetition_reward_func": -0.04128563031554222, "rewards/verse_reward_func": 0.0, "step": 459 }, { "completion_length": 243.84375, "epoch": 3.68, "grad_norm": 0.451171875, "kl": 0.027066603302955627, "learning_rate": 3.68e-05, "loss": 0.0011, "reward": -0.04342888854444027, "reward_std": 0.08955635875463486, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.013020833022892475, "rewards/no_repetition_reward_func": -0.05644972249865532, "rewards/verse_reward_func": 0.0, "step": 460 }, { "completion_length": 249.0625, "epoch": 3.6879999999999997, "grad_norm": 0.484375, "kl": 0.03490014187991619, "learning_rate": 3.6880000000000006e-05, "loss": 0.0014, "reward": 0.0023589441552758217, "reward_std": 0.13792498409748077, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.046016431879252195, "rewards/no_repetition_reward_func": -0.04365748539566994, "rewards/verse_reward_func": 0.0, "step": 461 }, { "completion_length": 238.34375, "epoch": 3.6959999999999997, "grad_norm": 0.56640625, "kl": 0.046821922063827515, "learning_rate": 3.696e-05, "loss": 0.0019, "reward": 0.06736010126769543, "reward_std": 0.27551232278347015, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.10543972998857498, "rewards/no_repetition_reward_func": -0.03807962406426668, "rewards/verse_reward_func": 0.0, "step": 462 }, { "completion_length": 237.515625, "epoch": 3.7039999999999997, "grad_norm": 0.478515625, "kl": 0.030246025882661343, "learning_rate": 3.7040000000000005e-05, "loss": 0.0012, "reward": 0.021595700178295374, "reward_std": 0.1885473094880581, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.06262896163389087, "rewards/no_repetition_reward_func": -0.041033266112208366, "rewards/verse_reward_func": 0.0, "step": 463 }, { "completion_length": 251.875, "epoch": 3.7119999999999997, "grad_norm": 0.455078125, "kl": 0.030631249770522118, "learning_rate": 3.712e-05, "loss": 0.0012, "reward": 0.03151591029018164, "reward_std": 0.25088144838809967, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.07822965085506439, "rewards/no_repetition_reward_func": -0.04671374894678593, "rewards/verse_reward_func": 0.0, "step": 464 }, { "completion_length": 244.625, "epoch": 3.7199999999999998, "grad_norm": 0.46875, "kl": 0.03252090513706207, "learning_rate": 3.72e-05, "loss": 0.0013, "reward": 0.022434046491980553, "reward_std": 0.19705982506275177, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0606666412204504, "rewards/no_repetition_reward_func": -0.0382325928658247, "rewards/verse_reward_func": 0.0, "step": 465 }, { "completion_length": 248.25, "epoch": 3.7279999999999998, "grad_norm": 0.478515625, "kl": 0.02860953565686941, "learning_rate": 3.728e-05, "loss": 0.0011, "reward": -0.018416729755699635, "reward_std": 0.138840701431036, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03690905449911952, "rewards/no_repetition_reward_func": -0.05532578378915787, "rewards/verse_reward_func": 0.0, "step": 466 }, { "completion_length": 250.34375, "epoch": 3.7359999999999998, "grad_norm": 0.52734375, "kl": 0.03555143252015114, "learning_rate": 3.736e-05, "loss": 0.0014, "reward": 0.040693528950214386, "reward_std": 0.2256314903497696, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.08702114596962929, "rewards/no_repetition_reward_func": -0.0463276170194149, "rewards/verse_reward_func": 0.0, "step": 467 }, { "completion_length": 247.296875, "epoch": 3.7439999999999998, "grad_norm": 0.482421875, "kl": 0.03155149146914482, "learning_rate": 3.744e-05, "loss": 0.0013, "reward": -0.01786798983812332, "reward_std": 0.14981414377689362, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.030497994273900986, "rewards/no_repetition_reward_func": -0.048365985974669456, "rewards/verse_reward_func": 0.0, "step": 468 }, { "completion_length": 250.75, "epoch": 3.752, "grad_norm": 0.45703125, "kl": 0.031155558302998543, "learning_rate": 3.752e-05, "loss": 0.0012, "reward": -0.015851656906306744, "reward_std": 0.16256694495677948, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.044188471511006355, "rewards/no_repetition_reward_func": -0.060040125623345375, "rewards/verse_reward_func": 0.0, "step": 469 }, { "completion_length": 250.71875, "epoch": 3.76, "grad_norm": 0.4609375, "kl": 0.02751152776181698, "learning_rate": 3.76e-05, "loss": 0.0011, "reward": 0.003079892136156559, "reward_std": 0.12066692486405373, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03407804435119033, "rewards/no_repetition_reward_func": -0.030998149886727333, "rewards/verse_reward_func": 0.0, "step": 470 }, { "completion_length": 244.546875, "epoch": 3.768, "grad_norm": 0.5, "kl": 0.03137166891247034, "learning_rate": 3.7680000000000005e-05, "loss": 0.0013, "reward": -0.013079248368740082, "reward_std": 0.13828442990779877, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.028034856542944908, "rewards/no_repetition_reward_func": -0.04111410304903984, "rewards/verse_reward_func": 0.0, "step": 471 }, { "completion_length": 253.265625, "epoch": 3.776, "grad_norm": 0.48046875, "kl": 0.030893877148628235, "learning_rate": 3.776e-05, "loss": 0.0012, "reward": 0.020838232710957527, "reward_std": 0.19720080494880676, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.06232244521379471, "rewards/no_repetition_reward_func": -0.04148421250283718, "rewards/verse_reward_func": 0.0, "step": 472 }, { "completion_length": 247.484375, "epoch": 3.784, "grad_norm": 0.451171875, "kl": 0.035093311220407486, "learning_rate": 3.7840000000000004e-05, "loss": 0.0014, "reward": 0.0733255036175251, "reward_std": 0.26990167796611786, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.11399689316749573, "rewards/no_repetition_reward_func": -0.04067138768732548, "rewards/verse_reward_func": 0.0, "step": 473 }, { "completion_length": 231.8125, "epoch": 3.792, "grad_norm": 0.6171875, "kl": 0.04094773903489113, "learning_rate": 3.792e-05, "loss": 0.0016, "reward": 0.015420694835484028, "reward_std": 0.17936459183692932, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.06329505518078804, "rewards/no_repetition_reward_func": -0.04006185755133629, "rewards/verse_reward_func": -0.0078125, "step": 474 }, { "completion_length": 243.46875, "epoch": 3.8, "grad_norm": 0.48828125, "kl": 0.03654756769537926, "learning_rate": 3.8e-05, "loss": 0.0015, "reward": 0.006172303110361099, "reward_std": 0.15505491942167282, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04876135289669037, "rewards/no_repetition_reward_func": -0.04258905164897442, "rewards/verse_reward_func": 0.0, "step": 475 }, { "completion_length": 251.046875, "epoch": 3.808, "grad_norm": 0.46875, "kl": 0.03095207829028368, "learning_rate": 3.808e-05, "loss": 0.0012, "reward": 0.02417832612991333, "reward_std": 0.19049294292926788, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.06985086016356945, "rewards/no_repetition_reward_func": -0.04567253403365612, "rewards/verse_reward_func": 0.0, "step": 476 }, { "completion_length": 239.3125, "epoch": 3.816, "grad_norm": 0.4765625, "kl": 0.034040018916130066, "learning_rate": 3.816e-05, "loss": 0.0014, "reward": 0.06871397234499454, "reward_std": 0.18267083913087845, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.09562961012125015, "rewards/no_repetition_reward_func": -0.02691563405096531, "rewards/verse_reward_func": 0.0, "step": 477 }, { "completion_length": 243.390625, "epoch": 3.824, "grad_norm": 0.462890625, "kl": 0.03272627107799053, "learning_rate": 3.8240000000000007e-05, "loss": 0.0013, "reward": -0.03360329568386078, "reward_std": 0.08190182596445084, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.012828407809138298, "rewards/no_repetition_reward_func": -0.03861920349299908, "rewards/verse_reward_func": -0.0078125, "step": 478 }, { "completion_length": 239.59375, "epoch": 3.832, "grad_norm": 0.625, "kl": 0.06898735091090202, "learning_rate": 3.832e-05, "loss": 0.0028, "reward": 0.008612965699285269, "reward_std": 0.14468061178922653, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04203607328236103, "rewards/no_repetition_reward_func": -0.033423107117414474, "rewards/verse_reward_func": 0.0, "step": 479 }, { "completion_length": 234.921875, "epoch": 3.84, "grad_norm": 0.59375, "kl": 0.03874201700091362, "learning_rate": 3.8400000000000005e-05, "loss": 0.0015, "reward": -0.012950667180120945, "reward_std": 0.12432153150439262, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03299708850681782, "rewards/no_repetition_reward_func": -0.03813525475561619, "rewards/verse_reward_func": -0.0078125, "step": 480 }, { "completion_length": 253.34375, "epoch": 3.848, "grad_norm": 0.49609375, "kl": 0.03738209791481495, "learning_rate": 3.848e-05, "loss": 0.0015, "reward": 0.01918705180287361, "reward_std": 0.1942702829837799, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.06325278338044882, "rewards/no_repetition_reward_func": -0.04406573250889778, "rewards/verse_reward_func": 0.0, "step": 481 }, { "completion_length": 241.796875, "epoch": 3.856, "grad_norm": 0.484375, "kl": 0.03831005468964577, "learning_rate": 3.8560000000000004e-05, "loss": 0.0015, "reward": 0.020677795633673668, "reward_std": 0.19063696637749672, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.06736797466874123, "rewards/no_repetition_reward_func": -0.04669017717242241, "rewards/verse_reward_func": 0.0, "step": 482 }, { "completion_length": 250.75, "epoch": 3.864, "grad_norm": 0.474609375, "kl": 0.031092967838048935, "learning_rate": 3.864e-05, "loss": 0.0012, "reward": 0.06771814357489347, "reward_std": 0.2640887051820755, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.11210381239652634, "rewards/no_repetition_reward_func": -0.04438566416501999, "rewards/verse_reward_func": 0.0, "step": 483 }, { "completion_length": 242.515625, "epoch": 3.872, "grad_norm": 0.498046875, "kl": 0.03696655109524727, "learning_rate": 3.872e-05, "loss": 0.0015, "reward": 0.04239062638953328, "reward_std": 0.21585527807474136, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.07219271175563335, "rewards/no_repetition_reward_func": -0.02980208583176136, "rewards/verse_reward_func": 0.0, "step": 484 }, { "completion_length": 247.078125, "epoch": 3.88, "grad_norm": 0.46875, "kl": 0.03354097530245781, "learning_rate": 3.88e-05, "loss": 0.0013, "reward": 0.12232698500156403, "reward_std": 0.3383868485689163, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.15273047238588333, "rewards/no_repetition_reward_func": -0.030403485521674156, "rewards/verse_reward_func": 0.0, "step": 485 }, { "completion_length": 240.578125, "epoch": 3.888, "grad_norm": 0.87109375, "kl": 0.04256278648972511, "learning_rate": 3.888e-05, "loss": 0.0017, "reward": 0.050419691018760204, "reward_std": 0.3210597485303879, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.10866209119558334, "rewards/no_repetition_reward_func": -0.042617397382855415, "rewards/verse_reward_func": -0.015625, "step": 486 }, { "completion_length": 243.90625, "epoch": 3.896, "grad_norm": 0.53125, "kl": 0.03662655130028725, "learning_rate": 3.896e-05, "loss": 0.0015, "reward": 0.04829729534685612, "reward_std": 0.259415365755558, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.105563934892416, "rewards/no_repetition_reward_func": -0.041641637682914734, "rewards/verse_reward_func": -0.015625, "step": 487 }, { "completion_length": 237.984375, "epoch": 3.904, "grad_norm": 0.494140625, "kl": 0.0394320972263813, "learning_rate": 3.9040000000000006e-05, "loss": 0.0016, "reward": 0.048824653029441833, "reward_std": 0.24345767125487328, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.09372962079942226, "rewards/no_repetition_reward_func": -0.04490497149527073, "rewards/verse_reward_func": 0.0, "step": 488 }, { "completion_length": 242.46875, "epoch": 3.912, "grad_norm": 0.5546875, "kl": 0.036301784217357635, "learning_rate": 3.912e-05, "loss": 0.0015, "reward": 0.042970918118953705, "reward_std": 0.23882311582565308, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.09069685265421867, "rewards/no_repetition_reward_func": -0.032100933603942394, "rewards/verse_reward_func": -0.015625, "step": 489 }, { "completion_length": 246.53125, "epoch": 3.92, "grad_norm": 0.6875, "kl": 0.041871290653944016, "learning_rate": 3.9200000000000004e-05, "loss": 0.0017, "reward": 0.04859830066561699, "reward_std": 0.2501128166913986, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.08978711813688278, "rewards/no_repetition_reward_func": -0.025563813745975494, "rewards/verse_reward_func": -0.015625, "step": 490 }, { "completion_length": 244.046875, "epoch": 3.928, "grad_norm": 0.51953125, "kl": 0.03754214756190777, "learning_rate": 3.9280000000000003e-05, "loss": 0.0015, "reward": 0.0350759644061327, "reward_std": 0.1970890909433365, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.07509705051779747, "rewards/no_repetition_reward_func": -0.04002108797430992, "rewards/verse_reward_func": 0.0, "step": 491 }, { "completion_length": 243.109375, "epoch": 3.936, "grad_norm": 0.57421875, "kl": 0.04054178111255169, "learning_rate": 3.936e-05, "loss": 0.0016, "reward": 0.024980800226330757, "reward_std": 0.14916175603866577, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.05783415213227272, "rewards/no_repetition_reward_func": -0.03285335376858711, "rewards/verse_reward_func": 0.0, "step": 492 }, { "completion_length": 239.90625, "epoch": 3.944, "grad_norm": 0.52734375, "kl": 0.047671666368842125, "learning_rate": 3.944e-05, "loss": 0.0019, "reward": 0.04220822174102068, "reward_std": 0.256586030125618, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0922556146979332, "rewards/no_repetition_reward_func": -0.05004739388823509, "rewards/verse_reward_func": 0.0, "step": 493 }, { "completion_length": 242.359375, "epoch": 3.952, "grad_norm": 0.462890625, "kl": 0.03648048825562, "learning_rate": 3.952e-05, "loss": 0.0015, "reward": -0.03875821363180876, "reward_std": 0.126910001039505, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.026095418259501457, "rewards/no_repetition_reward_func": -0.06485363282263279, "rewards/verse_reward_func": 0.0, "step": 494 }, { "completion_length": 240.625, "epoch": 3.96, "grad_norm": 0.88671875, "kl": 0.044688284397125244, "learning_rate": 3.960000000000001e-05, "loss": 0.0018, "reward": 0.013294420205056667, "reward_std": 0.15973186492919922, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04752531601116061, "rewards/no_repetition_reward_func": -0.03423089347779751, "rewards/verse_reward_func": 0.0, "step": 495 }, { "completion_length": 247.6875, "epoch": 3.968, "grad_norm": 0.5546875, "kl": 0.03640627861022949, "learning_rate": 3.968e-05, "loss": 0.0015, "reward": 0.05928905867040157, "reward_std": 0.23196136951446533, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.09506436064839363, "rewards/no_repetition_reward_func": -0.03577530011534691, "rewards/verse_reward_func": 0.0, "step": 496 }, { "completion_length": 245.96875, "epoch": 3.976, "grad_norm": 0.78515625, "kl": 0.038454994559288025, "learning_rate": 3.9760000000000006e-05, "loss": 0.0015, "reward": 0.02157383505254984, "reward_std": 0.16150665283203125, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.055384550243616104, "rewards/no_repetition_reward_func": -0.02599821425974369, "rewards/verse_reward_func": -0.0078125, "step": 497 }, { "completion_length": 241.671875, "epoch": 3.984, "grad_norm": 0.455078125, "kl": 0.036561851389706135, "learning_rate": 3.984e-05, "loss": 0.0015, "reward": -0.00990798557177186, "reward_std": 0.1111481748521328, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.03571820259094238, "rewards/no_repetition_reward_func": -0.045626189559698105, "rewards/verse_reward_func": 0.0, "step": 498 }, { "completion_length": 250.15625, "epoch": 3.992, "grad_norm": 0.453125, "kl": 0.03258994780480862, "learning_rate": 3.9920000000000004e-05, "loss": 0.0013, "reward": 0.08689446374773979, "reward_std": 0.26910199224948883, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.11205463856458664, "rewards/no_repetition_reward_func": -0.02516017109155655, "rewards/verse_reward_func": 0.0, "step": 499 }, { "completion_length": 256.0, "epoch": 4.0, "grad_norm": 0.478515625, "kl": 0.03490675054490566, "learning_rate": 4e-05, "loss": 0.0014, "reward": 0.07686950825154781, "reward_std": 0.26050467789173126, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.11608392372727394, "rewards/no_repetition_reward_func": -0.03921441454440355, "rewards/verse_reward_func": 0.0, "step": 500 }, { "completion_length": 247.078125, "epoch": 4.008, "grad_norm": 0.478515625, "kl": 0.03617750480771065, "learning_rate": 4.008e-05, "loss": 0.0014, "reward": 0.002400541678071022, "reward_std": 0.187410369515419, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.05514470860362053, "rewards/no_repetition_reward_func": -0.044931668788194656, "rewards/verse_reward_func": -0.0078125, "step": 501 }, { "completion_length": 248.28125, "epoch": 4.016, "grad_norm": 0.50390625, "kl": 0.03500938601791859, "learning_rate": 4.016e-05, "loss": 0.0014, "reward": -0.00233541801571846, "reward_std": 0.1281067207455635, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04314621491357684, "rewards/no_repetition_reward_func": -0.04548163525760174, "rewards/verse_reward_func": 0.0, "step": 502 }, { "completion_length": 251.125, "epoch": 4.024, "grad_norm": 0.47265625, "kl": 0.03837059997022152, "learning_rate": 4.024e-05, "loss": 0.0015, "reward": 0.02464674785733223, "reward_std": 0.202419713139534, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.05882268771529198, "rewards/no_repetition_reward_func": -0.03417593985795975, "rewards/verse_reward_func": 0.0, "step": 503 }, { "completion_length": 250.5, "epoch": 4.032, "grad_norm": 0.5234375, "kl": 0.037834588438272476, "learning_rate": 4.032e-05, "loss": 0.0015, "reward": 0.04815376549959183, "reward_std": 0.24824197590351105, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.09896630793809891, "rewards/no_repetition_reward_func": -0.05081254243850708, "rewards/verse_reward_func": 0.0, "step": 504 }, { "completion_length": 248.53125, "epoch": 4.04, "grad_norm": 0.6953125, "kl": 0.05056304112076759, "learning_rate": 4.0400000000000006e-05, "loss": 0.002, "reward": 0.03720803698524833, "reward_std": 0.22494254261255264, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.08332587219774723, "rewards/no_repetition_reward_func": -0.038305334746837616, "rewards/verse_reward_func": -0.0078125, "step": 505 }, { "completion_length": 242.59375, "epoch": 4.048, "grad_norm": 0.486328125, "kl": 0.03293121047317982, "learning_rate": 4.048e-05, "loss": 0.0013, "reward": 0.05595713108778, "reward_std": 0.2068497259169817, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.09048688411712646, "rewards/no_repetition_reward_func": -0.026717256754636765, "rewards/verse_reward_func": -0.0078125, "step": 506 }, { "completion_length": 245.9375, "epoch": 4.056, "grad_norm": 0.4921875, "kl": 0.036115922033786774, "learning_rate": 4.0560000000000005e-05, "loss": 0.0014, "reward": 0.030498466454446316, "reward_std": 0.15923146903514862, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.06743213534355164, "rewards/no_repetition_reward_func": -0.036933666095137596, "rewards/verse_reward_func": 0.0, "step": 507 }, { "completion_length": 249.90625, "epoch": 4.064, "grad_norm": 0.4453125, "kl": 0.03816046938300133, "learning_rate": 4.064e-05, "loss": 0.0015, "reward": 0.08826545253396034, "reward_std": 0.2731017917394638, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.12205060198903084, "rewards/no_repetition_reward_func": -0.03378515690565109, "rewards/verse_reward_func": 0.0, "step": 508 }, { "completion_length": 247.15625, "epoch": 4.072, "grad_norm": 0.53515625, "kl": 0.03408598154783249, "learning_rate": 4.072e-05, "loss": 0.0014, "reward": 0.033551509026438, "reward_std": 0.21655159443616867, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.07886924967169762, "rewards/no_repetition_reward_func": -0.045317742973566055, "rewards/verse_reward_func": 0.0, "step": 509 }, { "completion_length": 241.8125, "epoch": 4.08, "grad_norm": 1.1328125, "kl": 0.08159074187278748, "learning_rate": 4.08e-05, "loss": 0.0033, "reward": 0.13130883872509003, "reward_std": 0.35344722121953964, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.17818306013941765, "rewards/no_repetition_reward_func": -0.03906171768903732, "rewards/verse_reward_func": -0.0078125, "step": 510 }, { "completion_length": 246.453125, "epoch": 4.088, "grad_norm": 0.5078125, "kl": 0.0426520723849535, "learning_rate": 4.088e-05, "loss": 0.0017, "reward": 0.055216044187545776, "reward_std": 0.24470873177051544, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.10218168795108795, "rewards/no_repetition_reward_func": -0.039153143763542175, "rewards/verse_reward_func": -0.0078125, "step": 511 }, { "completion_length": 243.875, "epoch": 4.096, "grad_norm": 0.5, "kl": 0.048624031245708466, "learning_rate": 4.096e-05, "loss": 0.0019, "reward": 0.07524730544537306, "reward_std": 0.31555792689323425, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.1254706121981144, "rewards/no_repetition_reward_func": -0.05022330768406391, "rewards/verse_reward_func": 0.0, "step": 512 }, { "completion_length": 247.90625, "epoch": 4.104, "grad_norm": 0.44140625, "kl": 0.04376434534788132, "learning_rate": 4.104e-05, "loss": 0.0018, "reward": 0.05632601911202073, "reward_std": 0.22115834057331085, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.10965451970696449, "rewards/no_repetition_reward_func": -0.053328489884734154, "rewards/verse_reward_func": 0.0, "step": 513 }, { "completion_length": 243.390625, "epoch": 4.112, "grad_norm": 0.45703125, "kl": 0.041905054822564125, "learning_rate": 4.1120000000000006e-05, "loss": 0.0017, "reward": 0.04047232959419489, "reward_std": 0.2913016676902771, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.09461010992527008, "rewards/no_repetition_reward_func": -0.05413777753710747, "rewards/verse_reward_func": 0.0, "step": 514 }, { "completion_length": 245.953125, "epoch": 4.12, "grad_norm": 0.51171875, "kl": 0.04370741359889507, "learning_rate": 4.12e-05, "loss": 0.0017, "reward": 0.028422784060239792, "reward_std": 0.1432550624012947, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.061536021530628204, "rewards/no_repetition_reward_func": -0.03311323560774326, "rewards/verse_reward_func": 0.0, "step": 515 }, { "completion_length": 240.921875, "epoch": 4.128, "grad_norm": 0.50390625, "kl": 0.047720909118652344, "learning_rate": 4.1280000000000005e-05, "loss": 0.0019, "reward": 0.08046257868409157, "reward_std": 0.2691767066717148, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.11472206190228462, "rewards/no_repetition_reward_func": -0.0342594850808382, "rewards/verse_reward_func": 0.0, "step": 516 }, { "completion_length": 243.390625, "epoch": 4.136, "grad_norm": 0.734375, "kl": 0.048062046989798546, "learning_rate": 4.1360000000000004e-05, "loss": 0.0019, "reward": 0.01825772225856781, "reward_std": 0.1740879938006401, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.052227872889488935, "rewards/no_repetition_reward_func": -0.026157650165259838, "rewards/verse_reward_func": -0.0078125, "step": 517 }, { "completion_length": 239.9375, "epoch": 4.144, "grad_norm": 0.47265625, "kl": 0.04331754148006439, "learning_rate": 4.144e-05, "loss": 0.0017, "reward": 0.03235548548400402, "reward_std": 0.1798233836889267, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0733298733830452, "rewards/no_repetition_reward_func": -0.02534938883036375, "rewards/verse_reward_func": -0.015625, "step": 518 }, { "completion_length": 235.765625, "epoch": 4.152, "grad_norm": 0.51171875, "kl": 0.04420698434114456, "learning_rate": 4.152e-05, "loss": 0.0018, "reward": 0.1029331423342228, "reward_std": 0.3948013484477997, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.1485285460948944, "rewards/no_repetition_reward_func": -0.037782901898026466, "rewards/verse_reward_func": -0.0078125, "step": 519 }, { "completion_length": 243.234375, "epoch": 4.16, "grad_norm": 0.69140625, "kl": 0.08565759472548962, "learning_rate": 4.16e-05, "loss": 0.0034, "reward": 0.0483834445476532, "reward_std": 0.20004472136497498, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.08025164157152176, "rewards/no_repetition_reward_func": -0.03186819329857826, "rewards/verse_reward_func": 0.0, "step": 520 }, { "completion_length": 249.0, "epoch": 4.168, "grad_norm": 0.490234375, "kl": 0.04132683947682381, "learning_rate": 4.168e-05, "loss": 0.0017, "reward": 0.061979436315596104, "reward_std": 0.2025185078382492, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.08786268159747124, "rewards/no_repetition_reward_func": -0.025883245281875134, "rewards/verse_reward_func": 0.0, "step": 521 }, { "completion_length": 242.265625, "epoch": 4.176, "grad_norm": 0.51171875, "kl": 0.050365060567855835, "learning_rate": 4.176000000000001e-05, "loss": 0.002, "reward": 0.031935323029756546, "reward_std": 0.2899533808231354, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.09761428087949753, "rewards/no_repetition_reward_func": -0.06567895784974098, "rewards/verse_reward_func": 0.0, "step": 522 }, { "completion_length": 250.015625, "epoch": 4.184, "grad_norm": 0.5546875, "kl": 0.04014075919985771, "learning_rate": 4.184e-05, "loss": 0.0016, "reward": 0.13955779373645782, "reward_std": 0.3742830753326416, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.16645560413599014, "rewards/no_repetition_reward_func": -0.026897807605564594, "rewards/verse_reward_func": 0.0, "step": 523 }, { "completion_length": 242.59375, "epoch": 4.192, "grad_norm": 0.494140625, "kl": 0.05799682438373566, "learning_rate": 4.1920000000000005e-05, "loss": 0.0023, "reward": 0.046036661602556705, "reward_std": 0.2762855663895607, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.09085780940949917, "rewards/no_repetition_reward_func": -0.044821152463555336, "rewards/verse_reward_func": 0.0, "step": 524 }, { "completion_length": 245.0625, "epoch": 4.2, "grad_norm": 0.453125, "kl": 0.047679780051112175, "learning_rate": 4.2e-05, "loss": 0.0019, "reward": 0.04559140093624592, "reward_std": 0.22712309658527374, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.08189282938838005, "rewards/no_repetition_reward_func": -0.03630142845213413, "rewards/verse_reward_func": 0.0, "step": 525 }, { "completion_length": 251.015625, "epoch": 4.208, "grad_norm": 0.484375, "kl": 0.04380698502063751, "learning_rate": 4.2080000000000004e-05, "loss": 0.0018, "reward": 0.12736070342361927, "reward_std": 0.3534109741449356, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.16497960686683655, "rewards/no_repetition_reward_func": -0.037618898786604404, "rewards/verse_reward_func": 0.0, "step": 526 }, { "completion_length": 248.0625, "epoch": 4.216, "grad_norm": 0.455078125, "kl": 0.04736827500164509, "learning_rate": 4.2159999999999996e-05, "loss": 0.0019, "reward": 0.01083382498472929, "reward_std": 0.14864041283726692, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04958062805235386, "rewards/no_repetition_reward_func": -0.038746802136301994, "rewards/verse_reward_func": 0.0, "step": 527 }, { "completion_length": 241.0, "epoch": 4.224, "grad_norm": 0.47265625, "kl": 0.04235109128057957, "learning_rate": 4.224e-05, "loss": 0.0017, "reward": 0.02667120285332203, "reward_std": 0.19131657108664513, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.06579862534999847, "rewards/no_repetition_reward_func": -0.039127420634031296, "rewards/verse_reward_func": 0.0, "step": 528 }, { "completion_length": 251.828125, "epoch": 4.232, "grad_norm": 1.1015625, "kl": 0.048691730946302414, "learning_rate": 4.232e-05, "loss": 0.0019, "reward": -0.001687031239271164, "reward_std": 0.1433941200375557, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04184877872467041, "rewards/no_repetition_reward_func": -0.035723308101296425, "rewards/verse_reward_func": -0.0078125, "step": 529 }, { "completion_length": 246.390625, "epoch": 4.24, "grad_norm": 0.5234375, "kl": 0.04241489991545677, "learning_rate": 4.24e-05, "loss": 0.0017, "reward": 0.07577682286500931, "reward_std": 0.2161310389637947, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.106600321829319, "rewards/no_repetition_reward_func": -0.03082349058240652, "rewards/verse_reward_func": 0.0, "step": 530 }, { "completion_length": 246.078125, "epoch": 4.248, "grad_norm": 0.474609375, "kl": 0.041017793118953705, "learning_rate": 4.248e-05, "loss": 0.0016, "reward": 0.08190727606415749, "reward_std": 0.3085913807153702, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.11464395374059677, "rewards/no_repetition_reward_func": -0.03273667395114899, "rewards/verse_reward_func": 0.0, "step": 531 }, { "completion_length": 249.84375, "epoch": 4.256, "grad_norm": 0.45703125, "kl": 0.03803389519453049, "learning_rate": 4.256e-05, "loss": 0.0015, "reward": 0.07613223977386951, "reward_std": 0.2131497710943222, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.10495045781135559, "rewards/no_repetition_reward_func": -0.028818216174840927, "rewards/verse_reward_func": 0.0, "step": 532 }, { "completion_length": 249.5, "epoch": 4.264, "grad_norm": 0.65625, "kl": 0.04174601659178734, "learning_rate": 4.2640000000000005e-05, "loss": 0.0017, "reward": 0.0037264679558575153, "reward_std": 0.1608051210641861, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0482892245054245, "rewards/no_repetition_reward_func": -0.03675025701522827, "rewards/verse_reward_func": -0.0078125, "step": 533 }, { "completion_length": 234.578125, "epoch": 4.272, "grad_norm": 0.88671875, "kl": 0.12749138846993446, "learning_rate": 4.2720000000000004e-05, "loss": 0.0051, "reward": 0.0806343387812376, "reward_std": 0.34762144088745117, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.14144007489085197, "rewards/no_repetition_reward_func": -0.04518073424696922, "rewards/verse_reward_func": -0.015625, "step": 534 }, { "completion_length": 248.234375, "epoch": 4.28, "grad_norm": 0.51953125, "kl": 0.04425676353275776, "learning_rate": 4.2800000000000004e-05, "loss": 0.0018, "reward": 0.00241730734705925, "reward_std": 0.1554540991783142, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04908722825348377, "rewards/no_repetition_reward_func": -0.04666991904377937, "rewards/verse_reward_func": 0.0, "step": 535 }, { "completion_length": 250.0625, "epoch": 4.288, "grad_norm": 0.5546875, "kl": 0.06320760399103165, "learning_rate": 4.288e-05, "loss": 0.0025, "reward": 0.0670737074688077, "reward_std": 0.3580111041665077, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.12472381070256233, "rewards/no_repetition_reward_func": -0.04983759485185146, "rewards/verse_reward_func": -0.0078125, "step": 536 }, { "completion_length": 247.5625, "epoch": 4.296, "grad_norm": 0.46875, "kl": 0.03986784443259239, "learning_rate": 4.296e-05, "loss": 0.0016, "reward": -0.014083240181207657, "reward_std": 0.13906611874699593, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04260770417749882, "rewards/no_repetition_reward_func": -0.056690940633416176, "rewards/verse_reward_func": 0.0, "step": 537 }, { "completion_length": 248.96875, "epoch": 4.304, "grad_norm": 0.53125, "kl": 0.05683181248605251, "learning_rate": 4.304e-05, "loss": 0.0023, "reward": 0.10736945271492004, "reward_std": 0.2876706123352051, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.1545494794845581, "rewards/no_repetition_reward_func": -0.04718002676963806, "rewards/verse_reward_func": 0.0, "step": 538 }, { "completion_length": 249.828125, "epoch": 4.312, "grad_norm": 0.490234375, "kl": 0.0468257050961256, "learning_rate": 4.312000000000001e-05, "loss": 0.0019, "reward": 0.07389931753277779, "reward_std": 0.21816373616456985, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.10927407816052437, "rewards/no_repetition_reward_func": -0.03537475876510143, "rewards/verse_reward_func": 0.0, "step": 539 }, { "completion_length": 234.015625, "epoch": 4.32, "grad_norm": 0.53515625, "kl": 0.05536257289350033, "learning_rate": 4.32e-05, "loss": 0.0022, "reward": 0.05253050522878766, "reward_std": 0.24075300991535187, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0927741788327694, "rewards/no_repetition_reward_func": -0.04024367779493332, "rewards/verse_reward_func": 0.0, "step": 540 }, { "completion_length": 249.078125, "epoch": 4.328, "grad_norm": 0.51171875, "kl": 0.047778815031051636, "learning_rate": 4.3280000000000006e-05, "loss": 0.0019, "reward": 0.025220834650099277, "reward_std": 0.12356575578451157, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.04645469784736633, "rewards/no_repetition_reward_func": -0.021233857609331608, "rewards/verse_reward_func": 0.0, "step": 541 }, { "completion_length": 236.859375, "epoch": 4.336, "grad_norm": 0.5703125, "kl": 0.05690736509859562, "learning_rate": 4.336e-05, "loss": 0.0023, "reward": 0.08625203371047974, "reward_std": 0.24640309065580368, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.11826812475919724, "rewards/no_repetition_reward_func": -0.03201609943062067, "rewards/verse_reward_func": 0.0, "step": 542 }, { "completion_length": 252.96875, "epoch": 4.344, "grad_norm": 0.478515625, "kl": 0.048884790390729904, "learning_rate": 4.3440000000000004e-05, "loss": 0.002, "reward": 0.07617694139480591, "reward_std": 0.22511498630046844, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.11309164017438889, "rewards/no_repetition_reward_func": -0.03691469691693783, "rewards/verse_reward_func": 0.0, "step": 543 }, { "completion_length": 241.359375, "epoch": 4.352, "grad_norm": 0.46875, "kl": 0.05425545386970043, "learning_rate": 4.352e-05, "loss": 0.0022, "reward": 0.03616620413959026, "reward_std": 0.2580196410417557, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.09934832155704498, "rewards/no_repetition_reward_func": -0.06318212486803532, "rewards/verse_reward_func": 0.0, "step": 544 }, { "completion_length": 248.375, "epoch": 4.36, "grad_norm": 0.470703125, "kl": 0.04959530010819435, "learning_rate": 4.36e-05, "loss": 0.002, "reward": 0.12298946641385555, "reward_std": 0.37573856115341187, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.18120631203055382, "rewards/no_repetition_reward_func": -0.05821684189140797, "rewards/verse_reward_func": 0.0, "step": 545 }, { "completion_length": 237.046875, "epoch": 4.368, "grad_norm": 0.96875, "kl": 0.062186360359191895, "learning_rate": 4.368e-05, "loss": 0.0025, "reward": -0.005920643452554941, "reward_std": 0.09748044610023499, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.0272767785936594, "rewards/no_repetition_reward_func": -0.02538492064923048, "rewards/verse_reward_func": -0.0078125, "step": 546 }, { "completion_length": 240.953125, "epoch": 4.376, "grad_norm": 0.7109375, "kl": 0.06237427890300751, "learning_rate": 4.376e-05, "loss": 0.0025, "reward": 0.1198659110814333, "reward_std": 0.3272487074136734, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.17820806056261063, "rewards/no_repetition_reward_func": -0.05052964948117733, "rewards/verse_reward_func": -0.0078125, "step": 547 }, { "completion_length": 247.46875, "epoch": 4.384, "grad_norm": 0.50390625, "kl": 0.0546291284263134, "learning_rate": 4.384e-05, "loss": 0.0022, "reward": 0.04565959144383669, "reward_std": 0.24974250048398972, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.09571967273950577, "rewards/no_repetition_reward_func": -0.042247576639056206, "rewards/verse_reward_func": -0.0078125, "step": 548 }, { "completion_length": 242.140625, "epoch": 4.392, "grad_norm": 0.498046875, "kl": 0.04805105738341808, "learning_rate": 4.392e-05, "loss": 0.0019, "reward": 0.06431073509156704, "reward_std": 0.27588097751140594, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.11002950370311737, "rewards/no_repetition_reward_func": -0.04571876488626003, "rewards/verse_reward_func": 0.0, "step": 549 }, { "completion_length": 243.125, "epoch": 4.4, "grad_norm": 0.48828125, "kl": 0.05232583358883858, "learning_rate": 4.4000000000000006e-05, "loss": 0.0021, "reward": 0.14153464883565903, "reward_std": 0.3203430622816086, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.18896307051181793, "rewards/no_repetition_reward_func": -0.047428421676158905, "rewards/verse_reward_func": 0.0, "step": 550 }, { "completion_length": 247.96875, "epoch": 4.408, "grad_norm": 0.5703125, "kl": 0.056814925745129585, "learning_rate": 4.4080000000000005e-05, "loss": 0.0023, "reward": 0.17044785618782043, "reward_std": 0.4316002428531647, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.21150408685207367, "rewards/no_repetition_reward_func": -0.04105621948838234, "rewards/verse_reward_func": 0.0, "step": 551 }, { "completion_length": 243.5625, "epoch": 4.416, "grad_norm": 0.7734375, "kl": 0.09733781777322292, "learning_rate": 4.4160000000000004e-05, "loss": 0.0039, "reward": 0.042824333533644676, "reward_std": 0.26197968423366547, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.09418383613228798, "rewards/no_repetition_reward_func": -0.0357345100492239, "rewards/verse_reward_func": -0.015625, "step": 552 }, { "completion_length": 250.5625, "epoch": 4.424, "grad_norm": 0.47265625, "kl": 0.048773301765322685, "learning_rate": 4.424e-05, "loss": 0.002, "reward": 0.08212753012776375, "reward_std": 0.37508055567741394, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.1396712139248848, "rewards/no_repetition_reward_func": -0.0497311782091856, "rewards/verse_reward_func": -0.0078125, "step": 553 }, { "completion_length": 227.84375, "epoch": 4.432, "grad_norm": 0.5859375, "kl": 0.08658717200160027, "learning_rate": 4.432e-05, "loss": 0.0035, "reward": 0.12664780765771866, "reward_std": 0.34159883856773376, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.1693839654326439, "rewards/no_repetition_reward_func": -0.03492367081344128, "rewards/verse_reward_func": -0.0078125, "step": 554 }, { "completion_length": 245.78125, "epoch": 4.44, "grad_norm": 0.4609375, "kl": 0.060531578958034515, "learning_rate": 4.44e-05, "loss": 0.0024, "reward": 0.17303282022476196, "reward_std": 0.47223803400993347, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.2316705882549286, "rewards/no_repetition_reward_func": -0.05863776430487633, "rewards/verse_reward_func": 0.0, "step": 555 }, { "completion_length": 247.765625, "epoch": 4.448, "grad_norm": 0.546875, "kl": 0.05408805422484875, "learning_rate": 4.448e-05, "loss": 0.0022, "reward": 0.10261064395308495, "reward_std": 0.31902290880680084, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.14669084548950195, "rewards/no_repetition_reward_func": -0.04408019967377186, "rewards/verse_reward_func": 0.0, "step": 556 }, { "completion_length": 245.9375, "epoch": 4.456, "grad_norm": 0.46875, "kl": 0.0578437726944685, "learning_rate": 4.456e-05, "loss": 0.0023, "reward": 0.11045874655246735, "reward_std": 0.31655558943748474, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.15424561500549316, "rewards/no_repetition_reward_func": -0.043786875903606415, "rewards/verse_reward_func": 0.0, "step": 557 }, { "completion_length": 250.703125, "epoch": 4.464, "grad_norm": 0.4921875, "kl": 0.04751048423349857, "learning_rate": 4.4640000000000006e-05, "loss": 0.0019, "reward": 0.1276624035090208, "reward_std": 0.3324805125594139, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.1677662394940853, "rewards/no_repetition_reward_func": -0.04010382853448391, "rewards/verse_reward_func": 0.0, "step": 558 }, { "completion_length": 250.796875, "epoch": 4.4719999999999995, "grad_norm": 0.4609375, "kl": 0.05287078395485878, "learning_rate": 4.472e-05, "loss": 0.0021, "reward": 0.23626679927110672, "reward_std": 0.4682071805000305, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.2648310214281082, "rewards/no_repetition_reward_func": -0.028564222157001495, "rewards/verse_reward_func": 0.0, "step": 559 }, { "completion_length": 245.328125, "epoch": 4.48, "grad_norm": 0.4921875, "kl": 0.05308397859334946, "learning_rate": 4.4800000000000005e-05, "loss": 0.0021, "reward": 0.1163865402340889, "reward_std": 0.2657028138637543, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.1616678722202778, "rewards/no_repetition_reward_func": -0.03746882453560829, "rewards/verse_reward_func": -0.0078125, "step": 560 }, { "completion_length": 245.78125, "epoch": 4.4879999999999995, "grad_norm": 0.56640625, "kl": 0.058876533061265945, "learning_rate": 4.488e-05, "loss": 0.0024, "reward": 0.24676192551851273, "reward_std": 0.516466960310936, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.2931524068117142, "rewards/no_repetition_reward_func": -0.04639047756791115, "rewards/verse_reward_func": 0.0, "step": 561 }, { "completion_length": 252.0, "epoch": 4.496, "grad_norm": 0.5, "kl": 0.06451750174164772, "learning_rate": 4.496e-05, "loss": 0.0026, "reward": 0.09079867601394653, "reward_std": 0.264019638299942, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.1355454996228218, "rewards/no_repetition_reward_func": -0.044746821746230125, "rewards/verse_reward_func": 0.0, "step": 562 }, { "completion_length": 242.859375, "epoch": 4.504, "grad_norm": 0.498046875, "kl": 0.06682083010673523, "learning_rate": 4.504e-05, "loss": 0.0027, "reward": 0.043545521795749664, "reward_std": 0.21909944713115692, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.09199747815728188, "rewards/no_repetition_reward_func": -0.04063945636153221, "rewards/verse_reward_func": -0.0078125, "step": 563 }, { "completion_length": 240.6875, "epoch": 4.5120000000000005, "grad_norm": 0.470703125, "kl": 0.061537209898233414, "learning_rate": 4.512e-05, "loss": 0.0025, "reward": 0.0722761326469481, "reward_std": 0.2555946409702301, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.11172854341566563, "rewards/no_repetition_reward_func": -0.039452409371733665, "rewards/verse_reward_func": 0.0, "step": 564 }, { "completion_length": 250.828125, "epoch": 4.52, "grad_norm": 0.52734375, "kl": 0.06111456640064716, "learning_rate": 4.52e-05, "loss": 0.0024, "reward": 0.11499336641281843, "reward_std": 0.2983535975217819, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.16004596929997206, "rewards/no_repetition_reward_func": -0.04505259357392788, "rewards/verse_reward_func": 0.0, "step": 565 }, { "completion_length": 248.90625, "epoch": 4.5280000000000005, "grad_norm": 0.443359375, "kl": 0.06014810502529144, "learning_rate": 4.528e-05, "loss": 0.0024, "reward": 0.22362623363733292, "reward_std": 0.5010137856006622, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.27347005903720856, "rewards/no_repetition_reward_func": -0.04984383098781109, "rewards/verse_reward_func": 0.0, "step": 566 }, { "completion_length": 250.25, "epoch": 4.536, "grad_norm": 0.51171875, "kl": 0.057553939521312714, "learning_rate": 4.536e-05, "loss": 0.0023, "reward": 0.08286923915147781, "reward_std": 0.25638245046138763, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.1389603465795517, "rewards/no_repetition_reward_func": -0.05609111487865448, "rewards/verse_reward_func": 0.0, "step": 567 }, { "completion_length": 244.859375, "epoch": 4.5440000000000005, "grad_norm": 0.515625, "kl": 0.06487394124269485, "learning_rate": 4.5440000000000005e-05, "loss": 0.0026, "reward": 0.1491219848394394, "reward_std": 0.376513734459877, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.1980365365743637, "rewards/no_repetition_reward_func": -0.048914551734924316, "rewards/verse_reward_func": 0.0, "step": 568 }, { "completion_length": 252.3125, "epoch": 4.552, "grad_norm": 0.482421875, "kl": 0.05561799183487892, "learning_rate": 4.5520000000000005e-05, "loss": 0.0022, "reward": 0.14109763596206903, "reward_std": 0.34867730736732483, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.19208068400621414, "rewards/no_repetition_reward_func": -0.05098305642604828, "rewards/verse_reward_func": 0.0, "step": 569 }, { "completion_length": 241.953125, "epoch": 4.5600000000000005, "grad_norm": 0.94140625, "kl": 0.06289758160710335, "learning_rate": 4.5600000000000004e-05, "loss": 0.0025, "reward": 0.06308556534349918, "reward_std": 0.26933879405260086, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.11393633484840393, "rewards/no_repetition_reward_func": -0.04303826950490475, "rewards/verse_reward_func": -0.0078125, "step": 570 }, { "completion_length": 248.421875, "epoch": 4.568, "grad_norm": 0.6015625, "kl": 0.06465929001569748, "learning_rate": 4.568e-05, "loss": 0.0026, "reward": 0.18917784094810486, "reward_std": 0.5661404430866241, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.24328748881816864, "rewards/no_repetition_reward_func": -0.05410964414477348, "rewards/verse_reward_func": 0.0, "step": 571 }, { "completion_length": 242.65625, "epoch": 4.576, "grad_norm": 0.498046875, "kl": 0.05894271470606327, "learning_rate": 4.576e-05, "loss": 0.0024, "reward": 0.12052318826317787, "reward_std": 0.29892750829458237, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.17412101477384567, "rewards/no_repetition_reward_func": -0.0535978227853775, "rewards/verse_reward_func": 0.0, "step": 572 }, { "completion_length": 240.734375, "epoch": 4.584, "grad_norm": 0.484375, "kl": 0.06942283362150192, "learning_rate": 4.584e-05, "loss": 0.0028, "reward": 0.08919051103293896, "reward_std": 0.2817714065313339, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.13201912119984627, "rewards/no_repetition_reward_func": -0.04282860644161701, "rewards/verse_reward_func": 0.0, "step": 573 }, { "completion_length": 249.46875, "epoch": 4.592, "grad_norm": 0.462890625, "kl": 0.07049747556447983, "learning_rate": 4.592e-05, "loss": 0.0028, "reward": 0.20653299242258072, "reward_std": 0.40490999817848206, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.25737326592206955, "rewards/no_repetition_reward_func": -0.05084027163684368, "rewards/verse_reward_func": 0.0, "step": 574 }, { "completion_length": 252.875, "epoch": 4.6, "grad_norm": 0.484375, "kl": 0.0594073049724102, "learning_rate": 4.600000000000001e-05, "loss": 0.0024, "reward": 0.12491483613848686, "reward_std": 0.36827419698238373, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.1589195355772972, "rewards/no_repetition_reward_func": -0.03400469198822975, "rewards/verse_reward_func": 0.0, "step": 575 }, { "completion_length": 246.828125, "epoch": 4.608, "grad_norm": 0.50390625, "kl": 0.06552321463823318, "learning_rate": 4.608e-05, "loss": 0.0026, "reward": 0.06298316456377506, "reward_std": 0.2517784982919693, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.10681592673063278, "rewards/no_repetition_reward_func": -0.04383276589214802, "rewards/verse_reward_func": 0.0, "step": 576 }, { "completion_length": 253.828125, "epoch": 4.616, "grad_norm": 0.50390625, "kl": 0.070371113717556, "learning_rate": 4.6160000000000005e-05, "loss": 0.0028, "reward": 0.17651895433664322, "reward_std": 0.41989557445049286, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.2309720367193222, "rewards/no_repetition_reward_func": -0.05445307120680809, "rewards/verse_reward_func": 0.0, "step": 577 }, { "completion_length": 238.109375, "epoch": 4.624, "grad_norm": 0.5625, "kl": 0.06294895149767399, "learning_rate": 4.624e-05, "loss": 0.0025, "reward": 0.12121243216097355, "reward_std": 0.3120894432067871, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.15295271202921867, "rewards/no_repetition_reward_func": -0.03174027241766453, "rewards/verse_reward_func": 0.0, "step": 578 }, { "completion_length": 250.921875, "epoch": 4.632, "grad_norm": 0.59375, "kl": 0.06734685227274895, "learning_rate": 4.6320000000000004e-05, "loss": 0.0027, "reward": 0.14007411524653435, "reward_std": 0.3392748236656189, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.18415579944849014, "rewards/no_repetition_reward_func": -0.044081687927246094, "rewards/verse_reward_func": 0.0, "step": 579 }, { "completion_length": 251.546875, "epoch": 4.64, "grad_norm": 0.47265625, "kl": 0.0642537958920002, "learning_rate": 4.64e-05, "loss": 0.0026, "reward": 0.11000140383839607, "reward_std": 0.3341681659221649, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.1549411155283451, "rewards/no_repetition_reward_func": -0.044939711689949036, "rewards/verse_reward_func": 0.0, "step": 580 }, { "completion_length": 251.546875, "epoch": 4.648, "grad_norm": 0.5234375, "kl": 0.0692688375711441, "learning_rate": 4.648e-05, "loss": 0.0028, "reward": 0.22366990894079208, "reward_std": 0.468620702624321, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.26344625651836395, "rewards/no_repetition_reward_func": -0.03977634385228157, "rewards/verse_reward_func": 0.0, "step": 581 }, { "completion_length": 246.90625, "epoch": 4.656, "grad_norm": 0.51171875, "kl": 0.07631659507751465, "learning_rate": 4.656e-05, "loss": 0.0031, "reward": 0.16867996007204056, "reward_std": 0.38201436400413513, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.21505135297775269, "rewards/no_repetition_reward_func": -0.04637140966951847, "rewards/verse_reward_func": 0.0, "step": 582 }, { "completion_length": 249.046875, "epoch": 4.664, "grad_norm": 0.546875, "kl": 0.07898609712719917, "learning_rate": 4.664e-05, "loss": 0.0032, "reward": 0.20392465591430664, "reward_std": 0.5326818227767944, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.2516888976097107, "rewards/no_repetition_reward_func": -0.03995174169540405, "rewards/verse_reward_func": -0.0078125, "step": 583 }, { "completion_length": 241.25, "epoch": 4.672, "grad_norm": 0.49609375, "kl": 0.07729285210371017, "learning_rate": 4.672e-05, "loss": 0.0031, "reward": 0.17351197451353073, "reward_std": 0.4081237018108368, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.22510148584842682, "rewards/no_repetition_reward_func": -0.05158952437341213, "rewards/verse_reward_func": 0.0, "step": 584 }, { "completion_length": 246.15625, "epoch": 4.68, "grad_norm": 0.9453125, "kl": 0.10916192457079887, "learning_rate": 4.6800000000000006e-05, "loss": 0.0044, "reward": 0.19504687935113907, "reward_std": 0.5123040825128555, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.2752591446042061, "rewards/no_repetition_reward_func": -0.05677477829158306, "rewards/verse_reward_func": -0.0234375, "step": 585 }, { "completion_length": 236.421875, "epoch": 4.688, "grad_norm": 0.478515625, "kl": 0.10340984165668488, "learning_rate": 4.688e-05, "loss": 0.0041, "reward": 0.11900520324707031, "reward_std": 0.3547206073999405, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.1705191805958748, "rewards/no_repetition_reward_func": -0.051513971760869026, "rewards/verse_reward_func": 0.0, "step": 586 }, { "completion_length": 253.5, "epoch": 4.696, "grad_norm": 0.470703125, "kl": 0.06603807583451271, "learning_rate": 4.6960000000000004e-05, "loss": 0.0026, "reward": 0.09853224828839302, "reward_std": 0.3008718267083168, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.14943327754735947, "rewards/no_repetition_reward_func": -0.050901031121611595, "rewards/verse_reward_func": 0.0, "step": 587 }, { "completion_length": 247.5625, "epoch": 4.704, "grad_norm": 0.5078125, "kl": 0.07290445268154144, "learning_rate": 4.7040000000000004e-05, "loss": 0.0029, "reward": 0.18103326112031937, "reward_std": 0.3929567188024521, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.21724969148635864, "rewards/no_repetition_reward_func": -0.036216454580426216, "rewards/verse_reward_func": 0.0, "step": 588 }, { "completion_length": 247.15625, "epoch": 4.712, "grad_norm": 0.5234375, "kl": 0.06967240571975708, "learning_rate": 4.712e-05, "loss": 0.0028, "reward": 0.14617520943284035, "reward_std": 0.37008528411388397, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.19580107927322388, "rewards/no_repetition_reward_func": -0.04962587542831898, "rewards/verse_reward_func": 0.0, "step": 589 }, { "completion_length": 240.8125, "epoch": 4.72, "grad_norm": 0.6171875, "kl": 0.06725095584988594, "learning_rate": 4.72e-05, "loss": 0.0027, "reward": 0.13513822108507156, "reward_std": 0.4048299640417099, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.21014130115509033, "rewards/no_repetition_reward_func": -0.05937807261943817, "rewards/verse_reward_func": -0.015625, "step": 590 }, { "completion_length": 251.375, "epoch": 4.728, "grad_norm": 0.51171875, "kl": 0.0789010226726532, "learning_rate": 4.728e-05, "loss": 0.0032, "reward": 0.23419994488358498, "reward_std": 0.4861316680908203, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.275875061750412, "rewards/no_repetition_reward_func": -0.04167512618005276, "rewards/verse_reward_func": 0.0, "step": 591 }, { "completion_length": 245.984375, "epoch": 4.736, "grad_norm": 0.51953125, "kl": 0.07335853576660156, "learning_rate": 4.736000000000001e-05, "loss": 0.0029, "reward": 0.18885136395692825, "reward_std": 0.48193663358688354, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.25340161472558975, "rewards/no_repetition_reward_func": -0.0645502433180809, "rewards/verse_reward_func": 0.0, "step": 592 }, { "completion_length": 243.09375, "epoch": 4.744, "grad_norm": 0.474609375, "kl": 0.08893422037363052, "learning_rate": 4.744e-05, "loss": 0.0036, "reward": 0.16562946885824203, "reward_std": 0.37279781699180603, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.22220224142074585, "rewards/no_repetition_reward_func": -0.05657275952398777, "rewards/verse_reward_func": 0.0, "step": 593 }, { "completion_length": 249.359375, "epoch": 4.752, "grad_norm": 0.50390625, "kl": 0.07243499532341957, "learning_rate": 4.7520000000000006e-05, "loss": 0.0029, "reward": 0.2762882634997368, "reward_std": 0.531866729259491, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.31926538050174713, "rewards/no_repetition_reward_func": -0.04297712817788124, "rewards/verse_reward_func": 0.0, "step": 594 }, { "completion_length": 248.21875, "epoch": 4.76, "grad_norm": 0.5078125, "kl": 0.0910349190235138, "learning_rate": 4.76e-05, "loss": 0.0036, "reward": 0.3335821256041527, "reward_std": 0.5663000643253326, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.42326126992702484, "rewards/no_repetition_reward_func": -0.08967911824584007, "rewards/verse_reward_func": 0.0, "step": 595 }, { "completion_length": 246.703125, "epoch": 4.768, "grad_norm": 0.48046875, "kl": 0.0753243900835514, "learning_rate": 4.7680000000000004e-05, "loss": 0.003, "reward": 0.17703241854906082, "reward_std": 0.41054315865039825, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.2313927263021469, "rewards/no_repetition_reward_func": -0.05436030402779579, "rewards/verse_reward_func": 0.0, "step": 596 }, { "completion_length": 248.765625, "epoch": 4.776, "grad_norm": 0.498046875, "kl": 0.08113643527030945, "learning_rate": 4.7760000000000004e-05, "loss": 0.0032, "reward": 0.2205743044614792, "reward_std": 0.5182320922613144, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.2780863419175148, "rewards/no_repetition_reward_func": -0.05751203000545502, "rewards/verse_reward_func": 0.0, "step": 597 }, { "completion_length": 247.453125, "epoch": 4.784, "grad_norm": 0.5, "kl": 0.07604524493217468, "learning_rate": 4.784e-05, "loss": 0.003, "reward": 0.15586494654417038, "reward_std": 0.4280734211206436, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.2155994474887848, "rewards/no_repetition_reward_func": -0.05973450094461441, "rewards/verse_reward_func": 0.0, "step": 598 }, { "completion_length": 236.59375, "epoch": 4.792, "grad_norm": 0.5078125, "kl": 0.08659068122506142, "learning_rate": 4.792e-05, "loss": 0.0035, "reward": 0.22756921127438545, "reward_std": 0.40509286522865295, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.28538284823298454, "rewards/no_repetition_reward_func": -0.057813649997115135, "rewards/verse_reward_func": 0.0, "step": 599 }, { "completion_length": 253.453125, "epoch": 4.8, "grad_norm": 0.46484375, "kl": 0.07629616186022758, "learning_rate": 4.8e-05, "loss": 0.0031, "reward": 0.1744313519448042, "reward_std": 0.4477880150079727, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.242221862077713, "rewards/no_repetition_reward_func": -0.06779049150645733, "rewards/verse_reward_func": 0.0, "step": 600 }, { "completion_length": 248.21875, "epoch": 4.808, "grad_norm": 0.66015625, "kl": 0.08337443321943283, "learning_rate": 4.808e-05, "loss": 0.0033, "reward": 0.21022942662239075, "reward_std": 0.5453375577926636, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.2882983535528183, "rewards/no_repetition_reward_func": -0.07025642320513725, "rewards/verse_reward_func": -0.0078125, "step": 601 }, { "completion_length": 251.46875, "epoch": 4.816, "grad_norm": 0.478515625, "kl": 0.08959319815039635, "learning_rate": 4.816e-05, "loss": 0.0036, "reward": 0.2928263023495674, "reward_std": 0.6384672820568085, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.35908061265945435, "rewards/no_repetition_reward_func": -0.06625429913401604, "rewards/verse_reward_func": 0.0, "step": 602 }, { "completion_length": 247.890625, "epoch": 4.824, "grad_norm": 0.69140625, "kl": 0.09710849076509476, "learning_rate": 4.824e-05, "loss": 0.0039, "reward": 0.3341042101383209, "reward_std": 0.6910581290721893, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.3982357978820801, "rewards/no_repetition_reward_func": -0.06413161009550095, "rewards/verse_reward_func": 0.0, "step": 603 }, { "completion_length": 252.09375, "epoch": 4.832, "grad_norm": 0.60546875, "kl": 0.0944628044962883, "learning_rate": 4.8320000000000005e-05, "loss": 0.0038, "reward": 0.2592407763004303, "reward_std": 0.5019288063049316, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.3305037096142769, "rewards/no_repetition_reward_func": -0.07126292586326599, "rewards/verse_reward_func": 0.0, "step": 604 }, { "completion_length": 248.25, "epoch": 4.84, "grad_norm": 0.515625, "kl": 0.09675934910774231, "learning_rate": 4.8400000000000004e-05, "loss": 0.0039, "reward": 0.1080695241689682, "reward_std": 0.34185443073511124, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.169479139149189, "rewards/no_repetition_reward_func": -0.06140962429344654, "rewards/verse_reward_func": 0.0, "step": 605 }, { "completion_length": 244.9375, "epoch": 4.848, "grad_norm": 0.6484375, "kl": 0.1339133232831955, "learning_rate": 4.8480000000000003e-05, "loss": 0.0054, "reward": 0.2529808580875397, "reward_std": 0.6144249439239502, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.3406665772199631, "rewards/no_repetition_reward_func": -0.079873226583004, "rewards/verse_reward_func": -0.0078125, "step": 606 }, { "completion_length": 247.640625, "epoch": 4.856, "grad_norm": 0.49609375, "kl": 0.10442933440208435, "learning_rate": 4.856e-05, "loss": 0.0042, "reward": 0.3255202993750572, "reward_std": 0.6259570121765137, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.41073091328144073, "rewards/no_repetition_reward_func": -0.08521062135696411, "rewards/verse_reward_func": 0.0, "step": 607 }, { "completion_length": 252.390625, "epoch": 4.864, "grad_norm": 0.5078125, "kl": 0.08998768031597137, "learning_rate": 4.864e-05, "loss": 0.0036, "reward": 0.31819823384284973, "reward_std": 0.7106236219406128, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.3650926649570465, "rewards/no_repetition_reward_func": -0.04689442738890648, "rewards/verse_reward_func": 0.0, "step": 608 }, { "completion_length": 237.765625, "epoch": 4.872, "grad_norm": 0.5703125, "kl": 0.09412843734025955, "learning_rate": 4.872000000000001e-05, "loss": 0.0038, "reward": 0.30225516855716705, "reward_std": 0.5493065863847733, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.38312481343746185, "rewards/no_repetition_reward_func": -0.08086967468261719, "rewards/verse_reward_func": 0.0, "step": 609 }, { "completion_length": 246.609375, "epoch": 4.88, "grad_norm": 0.46875, "kl": 0.1129220649600029, "learning_rate": 4.88e-05, "loss": 0.0045, "reward": 0.13578515499830246, "reward_std": 0.33328405022621155, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.19100195169448853, "rewards/no_repetition_reward_func": -0.055216800421476364, "rewards/verse_reward_func": 0.0, "step": 610 }, { "completion_length": 238.84375, "epoch": 4.888, "grad_norm": 0.546875, "kl": 0.1045229434967041, "learning_rate": 4.8880000000000006e-05, "loss": 0.0042, "reward": 0.2549683004617691, "reward_std": 0.4534274786710739, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.31245770305395126, "rewards/no_repetition_reward_func": -0.05748940259218216, "rewards/verse_reward_func": 0.0, "step": 611 }, { "completion_length": 242.375, "epoch": 4.896, "grad_norm": 2.5, "kl": 0.16610446572303772, "learning_rate": 4.896e-05, "loss": 0.0066, "reward": 0.04496264085173607, "reward_std": 0.231070876121521, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.11814423277974129, "rewards/no_repetition_reward_func": -0.06536909006536007, "rewards/verse_reward_func": -0.0078125, "step": 612 }, { "completion_length": 240.109375, "epoch": 4.904, "grad_norm": 0.58984375, "kl": 0.15176084637641907, "learning_rate": 4.9040000000000005e-05, "loss": 0.0061, "reward": 0.2035682424902916, "reward_std": 0.51158607006073, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.2977846711874008, "rewards/no_repetition_reward_func": -0.08640392124652863, "rewards/verse_reward_func": -0.0078125, "step": 613 }, { "completion_length": 243.390625, "epoch": 4.912, "grad_norm": 0.59375, "kl": 0.14970503002405167, "learning_rate": 4.9120000000000004e-05, "loss": 0.006, "reward": 0.4014853686094284, "reward_std": 0.6771009564399719, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.4803149253129959, "rewards/no_repetition_reward_func": -0.07101703062653542, "rewards/verse_reward_func": -0.0078125, "step": 614 }, { "completion_length": 249.9375, "epoch": 4.92, "grad_norm": 0.515625, "kl": 0.10031254962086678, "learning_rate": 4.92e-05, "loss": 0.004, "reward": 0.3008745163679123, "reward_std": 0.5557428896427155, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.394342377781868, "rewards/no_repetition_reward_func": -0.08565537258982658, "rewards/verse_reward_func": -0.0078125, "step": 615 }, { "completion_length": 240.90625, "epoch": 4.928, "grad_norm": 0.515625, "kl": 0.1589900478720665, "learning_rate": 4.928e-05, "loss": 0.0064, "reward": 0.19885680824518204, "reward_std": 0.502071738243103, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.2651725560426712, "rewards/no_repetition_reward_func": -0.06631575524806976, "rewards/verse_reward_func": 0.0, "step": 616 }, { "completion_length": 250.203125, "epoch": 4.936, "grad_norm": 0.486328125, "kl": 0.10171061009168625, "learning_rate": 4.936e-05, "loss": 0.0041, "reward": 0.19686460494995117, "reward_std": 0.4519178420305252, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.28136055171489716, "rewards/no_repetition_reward_func": -0.08449595049023628, "rewards/verse_reward_func": 0.0, "step": 617 }, { "completion_length": 248.59375, "epoch": 4.944, "grad_norm": 0.54296875, "kl": 0.09560269862413406, "learning_rate": 4.944e-05, "loss": 0.0038, "reward": 0.26918403059244156, "reward_std": 0.5991262197494507, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.34583303332328796, "rewards/no_repetition_reward_func": -0.07664897292852402, "rewards/verse_reward_func": 0.0, "step": 618 }, { "completion_length": 256.0, "epoch": 4.952, "grad_norm": 0.46875, "kl": 0.09957870841026306, "learning_rate": 4.952e-05, "loss": 0.004, "reward": 0.40699057281017303, "reward_std": 0.6853319108486176, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5069164931774139, "rewards/no_repetition_reward_func": -0.09211339801549911, "rewards/verse_reward_func": -0.0078125, "step": 619 }, { "completion_length": 243.4375, "epoch": 4.96, "grad_norm": 0.486328125, "kl": 0.1106242686510086, "learning_rate": 4.96e-05, "loss": 0.0044, "reward": 0.2448071539402008, "reward_std": 0.5312705039978027, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.3106445297598839, "rewards/no_repetition_reward_func": -0.06583736836910248, "rewards/verse_reward_func": 0.0, "step": 620 }, { "completion_length": 250.203125, "epoch": 4.968, "grad_norm": 0.55859375, "kl": 0.1022007130086422, "learning_rate": 4.9680000000000005e-05, "loss": 0.0041, "reward": 0.28312238305807114, "reward_std": 0.4877689480781555, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.34327971935272217, "rewards/no_repetition_reward_func": -0.060157328844070435, "rewards/verse_reward_func": 0.0, "step": 621 }, { "completion_length": 241.6875, "epoch": 4.976, "grad_norm": 0.515625, "kl": 0.13872842863202095, "learning_rate": 4.976e-05, "loss": 0.0055, "reward": 0.4026813507080078, "reward_std": 0.7721879780292511, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.49762773513793945, "rewards/no_repetition_reward_func": -0.09494641795754433, "rewards/verse_reward_func": 0.0, "step": 622 }, { "completion_length": 251.078125, "epoch": 4.984, "grad_norm": 0.50390625, "kl": 0.09314364194869995, "learning_rate": 4.9840000000000004e-05, "loss": 0.0037, "reward": 0.31646864116191864, "reward_std": 0.5770153105258942, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.38593439757823944, "rewards/no_repetition_reward_func": -0.06946573406457901, "rewards/verse_reward_func": 0.0, "step": 623 }, { "completion_length": 247.875, "epoch": 4.992, "grad_norm": 0.50390625, "kl": 0.10386937484145164, "learning_rate": 4.992e-05, "loss": 0.0042, "reward": 0.2517606168985367, "reward_std": 0.5387460589408875, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.3291439712047577, "rewards/no_repetition_reward_func": -0.07738335430622101, "rewards/verse_reward_func": 0.0, "step": 624 }, { "completion_length": 256.0, "epoch": 5.0, "grad_norm": 0.51171875, "kl": 0.09986187890172005, "learning_rate": 5e-05, "loss": 0.004, "reward": 0.39339618384838104, "reward_std": 0.6682592928409576, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.483162984251976, "rewards/no_repetition_reward_func": -0.08976681903004646, "rewards/verse_reward_func": 0.0, "step": 625 }, { "completion_length": 248.109375, "epoch": 5.008, "grad_norm": 0.47265625, "kl": 0.10701540112495422, "learning_rate": 4.9999996100897126e-05, "loss": 0.0043, "reward": 0.2133556306362152, "reward_std": 0.45655883848667145, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.2812703549861908, "rewards/no_repetition_reward_func": -0.06791472434997559, "rewards/verse_reward_func": 0.0, "step": 626 }, { "completion_length": 250.6875, "epoch": 5.016, "grad_norm": 0.53125, "kl": 0.10232346504926682, "learning_rate": 4.999998440358973e-05, "loss": 0.0041, "reward": 0.29579395055770874, "reward_std": 0.6751673817634583, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.369898185133934, "rewards/no_repetition_reward_func": -0.07410424575209618, "rewards/verse_reward_func": 0.0, "step": 627 }, { "completion_length": 245.59375, "epoch": 5.024, "grad_norm": 0.50390625, "kl": 0.12306656315922737, "learning_rate": 4.9999964908081455e-05, "loss": 0.0049, "reward": 0.2991075962781906, "reward_std": 0.7553718686103821, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.40697433054447174, "rewards/no_repetition_reward_func": -0.07661674916744232, "rewards/verse_reward_func": -0.03125, "step": 628 }, { "completion_length": 242.328125, "epoch": 5.032, "grad_norm": 0.54296875, "kl": 0.14630481600761414, "learning_rate": 4.999993761437838e-05, "loss": 0.0059, "reward": 0.3525944724678993, "reward_std": 0.6620518416166306, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.449413537979126, "rewards/no_repetition_reward_func": -0.08900660648941994, "rewards/verse_reward_func": -0.0078125, "step": 629 }, { "completion_length": 246.859375, "epoch": 5.04, "grad_norm": 0.515625, "kl": 0.10001561418175697, "learning_rate": 4.9999902522489015e-05, "loss": 0.004, "reward": 0.17926031909883022, "reward_std": 0.46991148591041565, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.24129701405763626, "rewards/no_repetition_reward_func": -0.062036702409386635, "rewards/verse_reward_func": 0.0, "step": 630 }, { "completion_length": 241.671875, "epoch": 5.048, "grad_norm": 0.69140625, "kl": 0.11016019061207771, "learning_rate": 4.999985963242432e-05, "loss": 0.0044, "reward": 0.3123588301241398, "reward_std": 0.5659860968589783, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.3785107880830765, "rewards/no_repetition_reward_func": -0.058339452371001244, "rewards/verse_reward_func": -0.0078125, "step": 631 }, { "completion_length": 253.203125, "epoch": 5.056, "grad_norm": 0.5078125, "kl": 0.1335635744035244, "learning_rate": 4.9999808944197666e-05, "loss": 0.0053, "reward": 0.3049922585487366, "reward_std": 0.7226390540599823, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.3897777944803238, "rewards/no_repetition_reward_func": -0.08478553220629692, "rewards/verse_reward_func": 0.0, "step": 632 }, { "completion_length": 251.796875, "epoch": 5.064, "grad_norm": 0.46484375, "kl": 0.11302272975444794, "learning_rate": 4.999975045782486e-05, "loss": 0.0045, "reward": 0.25280340015888214, "reward_std": 0.633881151676178, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.33503320813179016, "rewards/no_repetition_reward_func": -0.08222980052232742, "rewards/verse_reward_func": 0.0, "step": 633 }, { "completion_length": 246.4375, "epoch": 5.072, "grad_norm": 1.875, "kl": 0.30635254457592964, "learning_rate": 4.999968417332415e-05, "loss": 0.0123, "reward": 0.3728778213262558, "reward_std": 0.6740923523902893, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.46109867095947266, "rewards/no_repetition_reward_func": -0.08040833100676537, "rewards/verse_reward_func": -0.0078125, "step": 634 }, { "completion_length": 244.15625, "epoch": 5.08, "grad_norm": 0.5078125, "kl": 0.11850904673337936, "learning_rate": 4.999961009071621e-05, "loss": 0.0047, "reward": 0.38646478950977325, "reward_std": 0.6769618391990662, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.46459659934043884, "rewards/no_repetition_reward_func": -0.07813180983066559, "rewards/verse_reward_func": 0.0, "step": 635 }, { "completion_length": 230.59375, "epoch": 5.088, "grad_norm": 0.6328125, "kl": 0.1527973674237728, "learning_rate": 4.999952821002415e-05, "loss": 0.0061, "reward": 0.36407271027565, "reward_std": 0.8414862751960754, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.45755907893180847, "rewards/no_repetition_reward_func": -0.07786133885383606, "rewards/verse_reward_func": -0.015625, "step": 636 }, { "completion_length": 248.734375, "epoch": 5.096, "grad_norm": 0.546875, "kl": 0.11745407059788704, "learning_rate": 4.999943853127351e-05, "loss": 0.0047, "reward": 0.40544579923152924, "reward_std": 0.8066418170928955, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5170612931251526, "rewards/no_repetition_reward_func": -0.11161550506949425, "rewards/verse_reward_func": 0.0, "step": 637 }, { "completion_length": 247.15625, "epoch": 5.104, "grad_norm": 0.5234375, "kl": 0.13256538659334183, "learning_rate": 4.9999341054492265e-05, "loss": 0.0053, "reward": 0.33550435677170753, "reward_std": 0.6524185091257095, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.4198118895292282, "rewards/no_repetition_reward_func": -0.08430751040577888, "rewards/verse_reward_func": 0.0, "step": 638 }, { "completion_length": 249.859375, "epoch": 5.112, "grad_norm": 0.93359375, "kl": 0.24010591208934784, "learning_rate": 4.9999235779710826e-05, "loss": 0.0096, "reward": 0.3309745788574219, "reward_std": 0.6208302676677704, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.43026456236839294, "rewards/no_repetition_reward_func": -0.09928997606039047, "rewards/verse_reward_func": 0.0, "step": 639 }, { "completion_length": 251.53125, "epoch": 5.12, "grad_norm": 54.25, "kl": 1.669109657406807, "learning_rate": 4.999912270696202e-05, "loss": 0.0668, "reward": 0.47512732446193695, "reward_std": 0.8353016376495361, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5713015645742416, "rewards/no_repetition_reward_func": -0.09617426246404648, "rewards/verse_reward_func": 0.0, "step": 640 }, { "completion_length": 246.34375, "epoch": 5.128, "grad_norm": 0.50390625, "kl": 0.11164813488721848, "learning_rate": 4.999900183628112e-05, "loss": 0.0045, "reward": 0.45047521591186523, "reward_std": 0.7869266122579575, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5524164736270905, "rewards/no_repetition_reward_func": -0.10194121301174164, "rewards/verse_reward_func": 0.0, "step": 641 }, { "completion_length": 249.109375, "epoch": 5.136, "grad_norm": 0.494140625, "kl": 0.12122597172856331, "learning_rate": 4.999887316770584e-05, "loss": 0.0048, "reward": 0.3594793379306793, "reward_std": 0.545298159122467, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.4564395248889923, "rewards/no_repetition_reward_func": -0.09696022048592567, "rewards/verse_reward_func": 0.0, "step": 642 }, { "completion_length": 249.90625, "epoch": 5.144, "grad_norm": 0.51171875, "kl": 0.11259621009230614, "learning_rate": 4.9998736701276295e-05, "loss": 0.0045, "reward": 0.30456364154815674, "reward_std": 0.6497884690761566, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.4008449763059616, "rewards/no_repetition_reward_func": -0.08846884593367577, "rewards/verse_reward_func": -0.0078125, "step": 643 }, { "completion_length": 256.0, "epoch": 5.152, "grad_norm": 0.53125, "kl": 0.12345957010984421, "learning_rate": 4.9998592437035076e-05, "loss": 0.0049, "reward": 0.3114955872297287, "reward_std": 0.595977857708931, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.4089653789997101, "rewards/no_repetition_reward_func": -0.09746980294585228, "rewards/verse_reward_func": 0.0, "step": 644 }, { "completion_length": 251.421875, "epoch": 5.16, "grad_norm": 0.455078125, "kl": 0.11941179633140564, "learning_rate": 4.9998440375027166e-05, "loss": 0.0048, "reward": 0.31190137565135956, "reward_std": 0.7205438613891602, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.43679986894130707, "rewards/no_repetition_reward_func": -0.1248985081911087, "rewards/verse_reward_func": 0.0, "step": 645 }, { "completion_length": 248.046875, "epoch": 5.168, "grad_norm": 0.51953125, "kl": 0.13833057135343552, "learning_rate": 4.99982805153e-05, "loss": 0.0055, "reward": 0.2767088860273361, "reward_std": 0.664696216583252, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.3830173462629318, "rewards/no_repetition_reward_func": -0.1063084527850151, "rewards/verse_reward_func": 0.0, "step": 646 }, { "completion_length": 252.078125, "epoch": 5.176, "grad_norm": 0.4375, "kl": 0.11104291677474976, "learning_rate": 4.9998112857903454e-05, "loss": 0.0044, "reward": 0.38983583450317383, "reward_std": 0.7454847097396851, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.50460484623909, "rewards/no_repetition_reward_func": -0.10695652291178703, "rewards/verse_reward_func": -0.0078125, "step": 647 }, { "completion_length": 243.34375, "epoch": 5.184, "grad_norm": 0.54296875, "kl": 0.12240078300237656, "learning_rate": 4.999793740288982e-05, "loss": 0.0049, "reward": 0.4561731368303299, "reward_std": 0.9152504801750183, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5617558360099792, "rewards/no_repetition_reward_func": -0.10558267310261726, "rewards/verse_reward_func": 0.0, "step": 648 }, { "completion_length": 254.171875, "epoch": 5.192, "grad_norm": 0.455078125, "kl": 0.128336101770401, "learning_rate": 4.9997754150313815e-05, "loss": 0.0051, "reward": 0.39917364716529846, "reward_std": 0.9331632554531097, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.4955682158470154, "rewards/no_repetition_reward_func": -0.08858205750584602, "rewards/verse_reward_func": -0.0078125, "step": 649 }, { "completion_length": 243.140625, "epoch": 5.2, "grad_norm": 0.478515625, "kl": 0.13098017871379852, "learning_rate": 4.999756310023261e-05, "loss": 0.0052, "reward": 0.4007929861545563, "reward_std": 0.7436605393886566, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5267364233732224, "rewards/no_repetition_reward_func": -0.11813091486692429, "rewards/verse_reward_func": -0.0078125, "step": 650 }, { "completion_length": 246.265625, "epoch": 5.208, "grad_norm": 0.46875, "kl": 0.1445467323064804, "learning_rate": 4.99973642527058e-05, "loss": 0.0058, "reward": 0.40201570093631744, "reward_std": 0.7815326452255249, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5376110672950745, "rewards/no_repetition_reward_func": -0.13559535145759583, "rewards/verse_reward_func": 0.0, "step": 651 }, { "completion_length": 249.140625, "epoch": 5.216, "grad_norm": 0.5390625, "kl": 0.11913314089179039, "learning_rate": 4.999715760779541e-05, "loss": 0.0048, "reward": 0.39222583174705505, "reward_std": 0.7636627852916718, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5041632652282715, "rewards/no_repetition_reward_func": -0.10412491112947464, "rewards/verse_reward_func": -0.0078125, "step": 652 }, { "completion_length": 243.53125, "epoch": 5.224, "grad_norm": 0.46875, "kl": 0.11703946813941002, "learning_rate": 4.9996943165565905e-05, "loss": 0.0047, "reward": 0.44306911528110504, "reward_std": 0.9623886942863464, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5805923640727997, "rewards/no_repetition_reward_func": -0.12971074879169464, "rewards/verse_reward_func": -0.0078125, "step": 653 }, { "completion_length": 247.890625, "epoch": 5.232, "grad_norm": 0.484375, "kl": 0.1395590454339981, "learning_rate": 4.9996720926084164e-05, "loss": 0.0056, "reward": 0.46885932981967926, "reward_std": 0.7528263926506042, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5812180042266846, "rewards/no_repetition_reward_func": -0.1123587042093277, "rewards/verse_reward_func": 0.0, "step": 654 }, { "completion_length": 249.90625, "epoch": 5.24, "grad_norm": 0.51171875, "kl": 0.12654933333396912, "learning_rate": 4.9996490889419514e-05, "loss": 0.0051, "reward": 0.4136147275567055, "reward_std": 0.69993457198143, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5312722474336624, "rewards/no_repetition_reward_func": -0.10203253477811813, "rewards/verse_reward_func": -0.015625, "step": 655 }, { "completion_length": 249.625, "epoch": 5.248, "grad_norm": 0.5078125, "kl": 0.13581901788711548, "learning_rate": 4.999625305564371e-05, "loss": 0.0054, "reward": 0.503404438495636, "reward_std": 0.9802872538566589, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.6158753335475922, "rewards/no_repetition_reward_func": -0.11247089877724648, "rewards/verse_reward_func": 0.0, "step": 656 }, { "completion_length": 244.515625, "epoch": 5.256, "grad_norm": 0.79296875, "kl": 0.1455385833978653, "learning_rate": 4.999600742483094e-05, "loss": 0.0058, "reward": 0.27414485812187195, "reward_std": 0.539372593164444, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.37582769989967346, "rewards/no_repetition_reward_func": -0.09387034922838211, "rewards/verse_reward_func": -0.0078125, "step": 657 }, { "completion_length": 247.34375, "epoch": 5.264, "grad_norm": 0.640625, "kl": 0.1820712760090828, "learning_rate": 4.999575399705783e-05, "loss": 0.0073, "reward": 0.3789956122636795, "reward_std": 0.7400930821895599, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5091849118471146, "rewards/no_repetition_reward_func": -0.12237682938575745, "rewards/verse_reward_func": -0.0078125, "step": 658 }, { "completion_length": 246.265625, "epoch": 5.272, "grad_norm": 0.54296875, "kl": 0.12969781830906868, "learning_rate": 4.999549277240342e-05, "loss": 0.0052, "reward": 0.08780116401612759, "reward_std": 0.44848786294460297, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.1938406564295292, "rewards/no_repetition_reward_func": -0.09041449055075645, "rewards/verse_reward_func": -0.015625, "step": 659 }, { "completion_length": 247.6875, "epoch": 5.28, "grad_norm": 0.46875, "kl": 0.1340378224849701, "learning_rate": 4.999522375094919e-05, "loss": 0.0054, "reward": 0.5041311681270599, "reward_std": 1.0117403864860535, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.6252117156982422, "rewards/no_repetition_reward_func": -0.11326809599995613, "rewards/verse_reward_func": -0.0078125, "step": 660 }, { "completion_length": 252.390625, "epoch": 5.288, "grad_norm": 0.490234375, "kl": 0.1409493312239647, "learning_rate": 4.999494693277907e-05, "loss": 0.0056, "reward": 0.4231944978237152, "reward_std": 0.9550087153911591, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5419183820486069, "rewards/no_repetition_reward_func": -0.11091138422489166, "rewards/verse_reward_func": -0.0078125, "step": 661 }, { "completion_length": 250.90625, "epoch": 5.296, "grad_norm": 0.470703125, "kl": 0.13993193209171295, "learning_rate": 4.999466231797941e-05, "loss": 0.0056, "reward": 0.28351063281297684, "reward_std": 0.7013645470142365, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.399834543466568, "rewards/no_repetition_reward_func": -0.11632390320301056, "rewards/verse_reward_func": 0.0, "step": 662 }, { "completion_length": 248.765625, "epoch": 5.304, "grad_norm": 0.58203125, "kl": 0.13995571434497833, "learning_rate": 4.999436990663897e-05, "loss": 0.0056, "reward": 0.5342452824115753, "reward_std": 0.9100777506828308, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.6605062484741211, "rewards/no_repetition_reward_func": -0.11844848468899727, "rewards/verse_reward_func": -0.0078125, "step": 663 }, { "completion_length": 241.5625, "epoch": 5.312, "grad_norm": 0.6171875, "kl": 0.1295226365327835, "learning_rate": 4.999406969884897e-05, "loss": 0.0052, "reward": 0.1783992052078247, "reward_std": 0.540722206234932, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.30272285640239716, "rewards/no_repetition_reward_func": -0.12432365491986275, "rewards/verse_reward_func": 0.0, "step": 664 }, { "completion_length": 248.53125, "epoch": 5.32, "grad_norm": 0.515625, "kl": 0.12663038820028305, "learning_rate": 4.999376169470306e-05, "loss": 0.0051, "reward": 0.2642257511615753, "reward_std": 0.6418294608592987, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.3918743282556534, "rewards/no_repetition_reward_func": -0.11983609199523926, "rewards/verse_reward_func": -0.0078125, "step": 665 }, { "completion_length": 254.9375, "epoch": 5.328, "grad_norm": 0.4375, "kl": 0.14321079850196838, "learning_rate": 4.99934458942973e-05, "loss": 0.0057, "reward": 0.4180586338043213, "reward_std": 0.7224173992872238, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5418678373098373, "rewards/no_repetition_reward_func": -0.12380922585725784, "rewards/verse_reward_func": 0.0, "step": 666 }, { "completion_length": 249.546875, "epoch": 5.336, "grad_norm": 0.451171875, "kl": 0.1310047209262848, "learning_rate": 4.999312229773022e-05, "loss": 0.0052, "reward": 0.5690373778343201, "reward_std": 1.0140226483345032, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.6862686574459076, "rewards/no_repetition_reward_func": -0.11723127588629723, "rewards/verse_reward_func": 0.0, "step": 667 }, { "completion_length": 252.6875, "epoch": 5.344, "grad_norm": 0.44140625, "kl": 0.13601497560739517, "learning_rate": 4.9992790905102734e-05, "loss": 0.0054, "reward": 0.35459624230861664, "reward_std": 0.6595225632190704, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.4783027917146683, "rewards/no_repetition_reward_func": -0.12370655685663223, "rewards/verse_reward_func": 0.0, "step": 668 }, { "completion_length": 253.890625, "epoch": 5.352, "grad_norm": 0.466796875, "kl": 0.13507455587387085, "learning_rate": 4.999245171651823e-05, "loss": 0.0054, "reward": 0.37831754982471466, "reward_std": 0.8082159161567688, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.4737679064273834, "rewards/no_repetition_reward_func": -0.09545035660266876, "rewards/verse_reward_func": 0.0, "step": 669 }, { "completion_length": 244.171875, "epoch": 5.36, "grad_norm": 0.59375, "kl": 0.1265162006020546, "learning_rate": 4.99921047320825e-05, "loss": 0.0051, "reward": 0.27883175015449524, "reward_std": 0.6507651507854462, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.3699118345975876, "rewards/no_repetition_reward_func": -0.09108009189367294, "rewards/verse_reward_func": 0.0, "step": 670 }, { "completion_length": 248.09375, "epoch": 5.368, "grad_norm": 0.53515625, "kl": 0.1375051662325859, "learning_rate": 4.999174995190379e-05, "loss": 0.0055, "reward": 0.5170107632875443, "reward_std": 0.984917014837265, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.6361339092254639, "rewards/no_repetition_reward_func": -0.11912310495972633, "rewards/verse_reward_func": 0.0, "step": 671 }, { "completion_length": 251.53125, "epoch": 5.376, "grad_norm": 0.45703125, "kl": 0.14494258165359497, "learning_rate": 4.999138737609276e-05, "loss": 0.0058, "reward": 0.6156136393547058, "reward_std": 1.3068419098854065, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.7640573978424072, "rewards/no_repetition_reward_func": -0.14844374358654022, "rewards/verse_reward_func": 0.0, "step": 672 }, { "completion_length": 249.546875, "epoch": 5.384, "grad_norm": 0.4609375, "kl": 0.14483436942100525, "learning_rate": 4.9991017004762496e-05, "loss": 0.0058, "reward": 0.34533610939979553, "reward_std": 0.7228029668331146, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.4619716703891754, "rewards/no_repetition_reward_func": -0.10882304236292839, "rewards/verse_reward_func": -0.0078125, "step": 673 }, { "completion_length": 250.140625, "epoch": 5.392, "grad_norm": 0.4453125, "kl": 0.15210238099098206, "learning_rate": 4.9990638838028546e-05, "loss": 0.0061, "reward": 0.38983848690986633, "reward_std": 0.7219408452510834, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5183121711015701, "rewards/no_repetition_reward_func": -0.1284737065434456, "rewards/verse_reward_func": 0.0, "step": 674 }, { "completion_length": 249.671875, "epoch": 5.4, "grad_norm": 0.4765625, "kl": 0.15239562094211578, "learning_rate": 4.999025287600886e-05, "loss": 0.0061, "reward": 0.38928496092557907, "reward_std": 0.7205881178379059, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.49898236989974976, "rewards/no_repetition_reward_func": -0.10969742387533188, "rewards/verse_reward_func": 0.0, "step": 675 }, { "completion_length": 247.640625, "epoch": 5.408, "grad_norm": 0.462890625, "kl": 0.15584347397089005, "learning_rate": 4.998985911882384e-05, "loss": 0.0062, "reward": 0.3636281341314316, "reward_std": 0.8058235943317413, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.504246860742569, "rewards/no_repetition_reward_func": -0.13280624151229858, "rewards/verse_reward_func": -0.0078125, "step": 676 }, { "completion_length": 255.4375, "epoch": 5.416, "grad_norm": 0.44140625, "kl": 0.1491314098238945, "learning_rate": 4.99894575665963e-05, "loss": 0.006, "reward": 0.5286481976509094, "reward_std": 1.1412528157234192, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.6491147577762604, "rewards/no_repetition_reward_func": -0.12046653777360916, "rewards/verse_reward_func": 0.0, "step": 677 }, { "completion_length": 256.0, "epoch": 5.424, "grad_norm": 0.474609375, "kl": 0.1559154912829399, "learning_rate": 4.9989048219451495e-05, "loss": 0.0062, "reward": 0.3995189815759659, "reward_std": 0.7596677541732788, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5369147658348083, "rewards/no_repetition_reward_func": -0.1373957321047783, "rewards/verse_reward_func": 0.0, "step": 678 }, { "completion_length": 251.5, "epoch": 5.432, "grad_norm": 0.5078125, "kl": 0.1582529917359352, "learning_rate": 4.998863107751711e-05, "loss": 0.0063, "reward": 0.4750232398509979, "reward_std": 0.9802969098091125, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.6337597370147705, "rewards/no_repetition_reward_func": -0.1587364748120308, "rewards/verse_reward_func": 0.0, "step": 679 }, { "completion_length": 255.78125, "epoch": 5.44, "grad_norm": 0.486328125, "kl": 0.18000592291355133, "learning_rate": 4.998820614092328e-05, "loss": 0.0072, "reward": 0.2971479296684265, "reward_std": 0.6778858304023743, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.4005459100008011, "rewards/no_repetition_reward_func": -0.10339797288179398, "rewards/verse_reward_func": 0.0, "step": 680 }, { "completion_length": 249.078125, "epoch": 5.448, "grad_norm": 0.453125, "kl": 0.1542302668094635, "learning_rate": 4.998777340980254e-05, "loss": 0.0062, "reward": 0.49287815392017365, "reward_std": 1.0549920797348022, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.6098714172840118, "rewards/no_repetition_reward_func": -0.1091807633638382, "rewards/verse_reward_func": -0.0078125, "step": 681 }, { "completion_length": 248.8125, "epoch": 5.456, "grad_norm": 0.70703125, "kl": 0.20026244223117828, "learning_rate": 4.998733288428987e-05, "loss": 0.008, "reward": 0.5548458695411682, "reward_std": 0.871604859828949, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.6985208094120026, "rewards/no_repetition_reward_func": -0.13586243987083435, "rewards/verse_reward_func": -0.0078125, "step": 682 }, { "completion_length": 249.640625, "epoch": 5.464, "grad_norm": 0.5078125, "kl": 0.14475535601377487, "learning_rate": 4.9986884564522696e-05, "loss": 0.0058, "reward": 0.22187060117721558, "reward_std": 0.6292142868041992, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.3622288703918457, "rewards/no_repetition_reward_func": -0.14035826921463013, "rewards/verse_reward_func": 0.0, "step": 683 }, { "completion_length": 250.53125, "epoch": 5.4719999999999995, "grad_norm": 0.5546875, "kl": 0.18535170704126358, "learning_rate": 4.998642845064086e-05, "loss": 0.0074, "reward": 0.6845551133155823, "reward_std": 1.1221315264701843, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.8421261012554169, "rewards/no_repetition_reward_func": -0.15757101029157639, "rewards/verse_reward_func": 0.0, "step": 684 }, { "completion_length": 252.671875, "epoch": 5.48, "grad_norm": 0.482421875, "kl": 0.17272599041461945, "learning_rate": 4.9985964542786614e-05, "loss": 0.0069, "reward": 0.45751863718032837, "reward_std": 0.8148531913757324, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5854408740997314, "rewards/no_repetition_reward_func": -0.12792222201824188, "rewards/verse_reward_func": 0.0, "step": 685 }, { "completion_length": 253.25, "epoch": 5.4879999999999995, "grad_norm": 0.42578125, "kl": 0.16551405936479568, "learning_rate": 4.998549284110468e-05, "loss": 0.0066, "reward": 0.5293207615613937, "reward_std": 1.015488624572754, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.711938887834549, "rewards/no_repetition_reward_func": -0.1826181709766388, "rewards/verse_reward_func": 0.0, "step": 686 }, { "completion_length": 246.046875, "epoch": 5.496, "grad_norm": 0.484375, "kl": 0.18930746614933014, "learning_rate": 4.99850133457422e-05, "loss": 0.0076, "reward": 0.558516725897789, "reward_std": 1.0941195487976074, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.712071418762207, "rewards/no_repetition_reward_func": -0.14574220031499863, "rewards/verse_reward_func": -0.0078125, "step": 687 }, { "completion_length": 253.078125, "epoch": 5.504, "grad_norm": 0.515625, "kl": 0.14487101882696152, "learning_rate": 4.998452605684874e-05, "loss": 0.0058, "reward": 0.5334058254957199, "reward_std": 0.8256373703479767, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.6645531058311462, "rewards/no_repetition_reward_func": -0.13114727661013603, "rewards/verse_reward_func": 0.0, "step": 688 }, { "completion_length": 242.3125, "epoch": 5.5120000000000005, "grad_norm": 0.494140625, "kl": 0.17619455605745316, "learning_rate": 4.9984030974576285e-05, "loss": 0.007, "reward": 0.4992864727973938, "reward_std": 0.9501466453075409, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.668072372674942, "rewards/no_repetition_reward_func": -0.16878590732812881, "rewards/verse_reward_func": 0.0, "step": 689 }, { "completion_length": 251.34375, "epoch": 5.52, "grad_norm": 0.51171875, "kl": 0.17270071059465408, "learning_rate": 4.998352809907928e-05, "loss": 0.0069, "reward": 0.35439471900463104, "reward_std": 0.7605064809322357, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5070485025644302, "rewards/no_repetition_reward_func": -0.152653768658638, "rewards/verse_reward_func": 0.0, "step": 690 }, { "completion_length": 255.125, "epoch": 5.5280000000000005, "grad_norm": 0.494140625, "kl": 0.16953882575035095, "learning_rate": 4.998301743051459e-05, "loss": 0.0068, "reward": 0.5025836825370789, "reward_std": 0.9486787915229797, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.6369436979293823, "rewards/no_repetition_reward_func": -0.13436000049114227, "rewards/verse_reward_func": 0.0, "step": 691 }, { "completion_length": 248.8125, "epoch": 5.536, "grad_norm": 0.49609375, "kl": 0.1897086501121521, "learning_rate": 4.998249896904149e-05, "loss": 0.0076, "reward": 0.38673415780067444, "reward_std": 0.8600207567214966, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.49977873265743256, "rewards/no_repetition_reward_func": -0.11304455623030663, "rewards/verse_reward_func": 0.0, "step": 692 }, { "completion_length": 254.03125, "epoch": 5.5440000000000005, "grad_norm": 0.478515625, "kl": 0.15600059181451797, "learning_rate": 4.998197271482171e-05, "loss": 0.0062, "reward": 0.3364868685603142, "reward_std": 0.823783665895462, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.4426930248737335, "rewards/no_repetition_reward_func": -0.10620617121458054, "rewards/verse_reward_func": 0.0, "step": 693 }, { "completion_length": 252.28125, "epoch": 5.552, "grad_norm": 0.5078125, "kl": 0.17891667038202286, "learning_rate": 4.998143866801942e-05, "loss": 0.0072, "reward": 0.5494273900985718, "reward_std": 0.8924142122268677, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.6988071203231812, "rewards/no_repetition_reward_func": -0.149379700422287, "rewards/verse_reward_func": 0.0, "step": 694 }, { "completion_length": 247.171875, "epoch": 5.5600000000000005, "grad_norm": 0.7734375, "kl": 0.20747724175453186, "learning_rate": 4.998089682880117e-05, "loss": 0.0083, "reward": 0.43480245769023895, "reward_std": 0.8035052120685577, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5705868750810623, "rewards/no_repetition_reward_func": -0.12797194346785545, "rewards/verse_reward_func": -0.0078125, "step": 695 }, { "completion_length": 242.25, "epoch": 5.568, "grad_norm": 0.8203125, "kl": 0.2464146912097931, "learning_rate": 4.9980347197336005e-05, "loss": 0.0099, "reward": 0.32615747302770615, "reward_std": 0.7327385172247887, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.45756763219833374, "rewards/no_repetition_reward_func": -0.10797268152236938, "rewards/verse_reward_func": -0.0234375, "step": 696 }, { "completion_length": 255.890625, "epoch": 5.576, "grad_norm": 0.451171875, "kl": 0.17807751148939133, "learning_rate": 4.997978977379536e-05, "loss": 0.0071, "reward": 0.7817997634410858, "reward_std": 1.2075615525245667, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.9552492499351501, "rewards/no_repetition_reward_func": -0.17344945669174194, "rewards/verse_reward_func": 0.0, "step": 697 }, { "completion_length": 254.625, "epoch": 5.584, "grad_norm": 0.53125, "kl": 0.17059578001499176, "learning_rate": 4.997922455835311e-05, "loss": 0.0068, "reward": 0.5995948016643524, "reward_std": 1.103611320257187, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.7629223167896271, "rewards/no_repetition_reward_func": -0.16332751512527466, "rewards/verse_reward_func": 0.0, "step": 698 }, { "completion_length": 253.8125, "epoch": 5.592, "grad_norm": 0.44140625, "kl": 0.16337041556835175, "learning_rate": 4.997865155118557e-05, "loss": 0.0065, "reward": 0.6446380615234375, "reward_std": 1.3242871165275574, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.8243588209152222, "rewards/no_repetition_reward_func": -0.17972075194120407, "rewards/verse_reward_func": 0.0, "step": 699 }, { "completion_length": 250.984375, "epoch": 5.6, "grad_norm": 0.51953125, "kl": 0.16224903613328934, "learning_rate": 4.997807075247146e-05, "loss": 0.0065, "reward": 0.42602070420980453, "reward_std": 0.8488226532936096, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5866514816880226, "rewards/no_repetition_reward_func": -0.1606307253241539, "rewards/verse_reward_func": 0.0, "step": 700 }, { "completion_length": 249.390625, "epoch": 5.608, "grad_norm": 0.59765625, "kl": 0.2014477476477623, "learning_rate": 4.997748216239196e-05, "loss": 0.0081, "reward": 0.8515930473804474, "reward_std": 1.3228483200073242, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.0221942365169525, "rewards/no_repetition_reward_func": -0.1627887487411499, "rewards/verse_reward_func": -0.0078125, "step": 701 }, { "completion_length": 247.75, "epoch": 5.616, "grad_norm": 0.515625, "kl": 0.20237335562705994, "learning_rate": 4.9976885781130665e-05, "loss": 0.0081, "reward": 0.752217561006546, "reward_std": 1.0292336344718933, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.9115981459617615, "rewards/no_repetition_reward_func": -0.15938057750463486, "rewards/verse_reward_func": 0.0, "step": 702 }, { "completion_length": 245.4375, "epoch": 5.624, "grad_norm": 0.4609375, "kl": 0.17530740052461624, "learning_rate": 4.997628160887361e-05, "loss": 0.007, "reward": 0.5094923973083496, "reward_std": 0.974005401134491, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.6455775499343872, "rewards/no_repetition_reward_func": -0.1360851377248764, "rewards/verse_reward_func": 0.0, "step": 703 }, { "completion_length": 248.890625, "epoch": 5.632, "grad_norm": 0.4765625, "kl": 0.18039298802614212, "learning_rate": 4.9975669645809244e-05, "loss": 0.0072, "reward": 0.5087114572525024, "reward_std": 1.193378746509552, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.6680203974246979, "rewards/no_repetition_reward_func": -0.15149643272161484, "rewards/verse_reward_func": -0.0078125, "step": 704 }, { "completion_length": 249.21875, "epoch": 5.64, "grad_norm": 0.41015625, "kl": 0.1772746592760086, "learning_rate": 4.9975049892128455e-05, "loss": 0.0071, "reward": 0.9059257805347443, "reward_std": 1.502757728099823, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.0723873376846313, "rewards/no_repetition_reward_func": -0.16646156460046768, "rewards/verse_reward_func": 0.0, "step": 705 }, { "completion_length": 242.5, "epoch": 5.648, "grad_norm": 0.4765625, "kl": 0.23317763209342957, "learning_rate": 4.997442234802456e-05, "loss": 0.0093, "reward": 0.6872413158416748, "reward_std": 1.0807230472564697, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.82759690284729, "rewards/no_repetition_reward_func": -0.14035557210445404, "rewards/verse_reward_func": 0.0, "step": 706 }, { "completion_length": 253.40625, "epoch": 5.656, "grad_norm": 0.470703125, "kl": 0.17160408198833466, "learning_rate": 4.997378701369332e-05, "loss": 0.0069, "reward": 0.7719415724277496, "reward_std": 1.0859298408031464, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.9412103593349457, "rewards/no_repetition_reward_func": -0.16926878690719604, "rewards/verse_reward_func": 0.0, "step": 707 }, { "completion_length": 254.3125, "epoch": 5.664, "grad_norm": 0.484375, "kl": 0.17148134112358093, "learning_rate": 4.997314388933291e-05, "loss": 0.0069, "reward": 0.45283710956573486, "reward_std": 0.9366581737995148, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.6371408402919769, "rewards/no_repetition_reward_func": -0.18430374562740326, "rewards/verse_reward_func": 0.0, "step": 708 }, { "completion_length": 252.203125, "epoch": 5.672, "grad_norm": 0.439453125, "kl": 0.15853429585695267, "learning_rate": 4.997249297514394e-05, "loss": 0.0063, "reward": 0.5117930471897125, "reward_std": 1.1053049564361572, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.688453733921051, "rewards/no_repetition_reward_func": -0.1766606941819191, "rewards/verse_reward_func": 0.0, "step": 709 }, { "completion_length": 245.03125, "epoch": 5.68, "grad_norm": 0.93359375, "kl": 0.3620625138282776, "learning_rate": 4.997183427132943e-05, "loss": 0.0145, "reward": 0.567943163216114, "reward_std": 1.005713015794754, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.7208413332700729, "rewards/no_repetition_reward_func": -0.1450856700539589, "rewards/verse_reward_func": -0.0078125, "step": 710 }, { "completion_length": 250.453125, "epoch": 5.688, "grad_norm": 0.4140625, "kl": 0.18372776359319687, "learning_rate": 4.9971167778094863e-05, "loss": 0.0073, "reward": 0.8247154355049133, "reward_std": 1.4822508692741394, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.9922906458377838, "rewards/no_repetition_reward_func": -0.1675751954317093, "rewards/verse_reward_func": 0.0, "step": 711 }, { "completion_length": 251.953125, "epoch": 5.696, "grad_norm": 0.470703125, "kl": 0.17487269639968872, "learning_rate": 4.997049349564814e-05, "loss": 0.007, "reward": 0.42038509249687195, "reward_std": 0.9565409123897552, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5735895484685898, "rewards/no_repetition_reward_func": -0.14539194107055664, "rewards/verse_reward_func": -0.0078125, "step": 712 }, { "completion_length": 253.609375, "epoch": 5.704, "grad_norm": 0.515625, "kl": 0.20144179463386536, "learning_rate": 4.996981142419959e-05, "loss": 0.0081, "reward": 0.3787817806005478, "reward_std": 0.7478663921356201, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5247930586338043, "rewards/no_repetition_reward_func": -0.13038625568151474, "rewards/verse_reward_func": -0.015625, "step": 713 }, { "completion_length": 249.0625, "epoch": 5.712, "grad_norm": 0.5078125, "kl": 0.20867335051298141, "learning_rate": 4.9969121563961956e-05, "loss": 0.0083, "reward": 0.3678915351629257, "reward_std": 0.8032286763191223, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5153140425682068, "rewards/no_repetition_reward_func": -0.14742248505353928, "rewards/verse_reward_func": 0.0, "step": 714 }, { "completion_length": 249.46875, "epoch": 5.72, "grad_norm": 0.44140625, "kl": 0.202217236161232, "learning_rate": 4.996842391515044e-05, "loss": 0.0081, "reward": 0.6162835359573364, "reward_std": 1.1908870935440063, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.7522868067026138, "rewards/no_repetition_reward_func": -0.13600321114063263, "rewards/verse_reward_func": 0.0, "step": 715 }, { "completion_length": 250.5625, "epoch": 5.728, "grad_norm": 0.4296875, "kl": 0.19037244468927383, "learning_rate": 4.996771847798265e-05, "loss": 0.0076, "reward": 0.47363385558128357, "reward_std": 1.0081435441970825, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.6279376745223999, "rewards/no_repetition_reward_func": -0.15430379658937454, "rewards/verse_reward_func": 0.0, "step": 716 }, { "completion_length": 252.546875, "epoch": 5.736, "grad_norm": 1.046875, "kl": 0.17706604301929474, "learning_rate": 4.9967005252678634e-05, "loss": 0.0071, "reward": 0.759191632270813, "reward_std": 1.7419695854187012, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.9455537796020508, "rewards/no_repetition_reward_func": -0.17073708772659302, "rewards/verse_reward_func": -0.015625, "step": 717 }, { "completion_length": 249.59375, "epoch": 5.744, "grad_norm": 0.490234375, "kl": 0.1838879957795143, "learning_rate": 4.996628423946087e-05, "loss": 0.0074, "reward": 0.6231353282928467, "reward_std": 1.0833888947963715, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.7773598432540894, "rewards/no_repetition_reward_func": -0.15422453731298447, "rewards/verse_reward_func": 0.0, "step": 718 }, { "completion_length": 249.328125, "epoch": 5.752, "grad_norm": 0.490234375, "kl": 0.18641816079616547, "learning_rate": 4.9965555438554254e-05, "loss": 0.0075, "reward": 0.26994359493255615, "reward_std": 0.720516711473465, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.42678749561309814, "rewards/no_repetition_reward_func": -0.15684393048286438, "rewards/verse_reward_func": 0.0, "step": 719 }, { "completion_length": 255.90625, "epoch": 5.76, "grad_norm": 0.466796875, "kl": 0.1756124198436737, "learning_rate": 4.9964818850186135e-05, "loss": 0.007, "reward": 0.4292195588350296, "reward_std": 0.9144133925437927, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5513154864311218, "rewards/no_repetition_reward_func": -0.12209596484899521, "rewards/verse_reward_func": 0.0, "step": 720 }, { "completion_length": 256.0, "epoch": 5.768, "grad_norm": 0.44140625, "kl": 0.1669309362769127, "learning_rate": 4.996407447458626e-05, "loss": 0.0067, "reward": 0.6341388523578644, "reward_std": 1.0590577125549316, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.7934570610523224, "rewards/no_repetition_reward_func": -0.15931819379329681, "rewards/verse_reward_func": 0.0, "step": 721 }, { "completion_length": 252.703125, "epoch": 5.776, "grad_norm": 0.482421875, "kl": 0.17769019305706024, "learning_rate": 4.996332231198683e-05, "loss": 0.0071, "reward": 0.6892836093902588, "reward_std": 1.2828429341316223, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.8471010625362396, "rewards/no_repetition_reward_func": -0.15781735628843307, "rewards/verse_reward_func": 0.0, "step": 722 }, { "completion_length": 250.78125, "epoch": 5.784, "grad_norm": 0.51953125, "kl": 0.1676122024655342, "learning_rate": 4.996256236262245e-05, "loss": 0.0067, "reward": 0.9357976317405701, "reward_std": 1.621370255947113, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1130141615867615, "rewards/no_repetition_reward_func": -0.17721650004386902, "rewards/verse_reward_func": 0.0, "step": 723 }, { "completion_length": 250.578125, "epoch": 5.792, "grad_norm": 0.44140625, "kl": 0.1795780435204506, "learning_rate": 4.99617946267302e-05, "loss": 0.0072, "reward": 0.49104416370391846, "reward_std": 1.0676467418670654, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.6104935109615326, "rewards/no_repetition_reward_func": -0.11944938451051712, "rewards/verse_reward_func": 0.0, "step": 724 }, { "completion_length": 253.171875, "epoch": 5.8, "grad_norm": 0.5234375, "kl": 0.21098432689905167, "learning_rate": 4.996101910454953e-05, "loss": 0.0084, "reward": 0.6606670618057251, "reward_std": 1.065425455570221, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.8139347732067108, "rewards/no_repetition_reward_func": -0.15326770395040512, "rewards/verse_reward_func": 0.0, "step": 725 }, { "completion_length": 246.4375, "epoch": 5.808, "grad_norm": 0.53515625, "kl": 0.17637160420417786, "learning_rate": 4.996023579632236e-05, "loss": 0.0071, "reward": 0.38333237171173096, "reward_std": 0.7253147214651108, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5330064445734024, "rewards/no_repetition_reward_func": -0.14967404305934906, "rewards/verse_reward_func": 0.0, "step": 726 }, { "completion_length": 247.078125, "epoch": 5.816, "grad_norm": 0.5625, "kl": 0.1929822489619255, "learning_rate": 4.995944470229302e-05, "loss": 0.0077, "reward": 0.6325154602527618, "reward_std": 1.224732756614685, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.8153691291809082, "rewards/no_repetition_reward_func": -0.17504116892814636, "rewards/verse_reward_func": -0.0078125, "step": 727 }, { "completion_length": 245.90625, "epoch": 5.824, "grad_norm": 0.51171875, "kl": 0.17781709879636765, "learning_rate": 4.9958645822708285e-05, "loss": 0.0071, "reward": 0.7113149762153625, "reward_std": 0.983566403388977, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.8803516626358032, "rewards/no_repetition_reward_func": -0.16903671622276306, "rewards/verse_reward_func": 0.0, "step": 728 }, { "completion_length": 251.8125, "epoch": 5.832, "grad_norm": 0.484375, "kl": 0.1729966327548027, "learning_rate": 4.995783915781734e-05, "loss": 0.0069, "reward": 0.7331031560897827, "reward_std": 1.1198042929172516, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.8828424215316772, "rewards/no_repetition_reward_func": -0.14973920583724976, "rewards/verse_reward_func": 0.0, "step": 729 }, { "completion_length": 254.59375, "epoch": 5.84, "grad_norm": 0.40234375, "kl": 0.1477445363998413, "learning_rate": 4.9957024707871806e-05, "loss": 0.0059, "reward": 0.9414709210395813, "reward_std": 1.5973269939422607, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1520546674728394, "rewards/no_repetition_reward_func": -0.20277121663093567, "rewards/verse_reward_func": -0.0078125, "step": 730 }, { "completion_length": 253.828125, "epoch": 5.848, "grad_norm": 0.412109375, "kl": 0.1701807975769043, "learning_rate": 4.9956202473125736e-05, "loss": 0.0068, "reward": 0.6099298968911171, "reward_std": 1.185562014579773, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.7820281386375427, "rewards/no_repetition_reward_func": -0.15647327527403831, "rewards/verse_reward_func": -0.015625, "step": 731 }, { "completion_length": 256.0, "epoch": 5.856, "grad_norm": 0.44140625, "kl": 0.17476321756839752, "learning_rate": 4.99553724538356e-05, "loss": 0.007, "reward": 0.5888758599758148, "reward_std": 1.0157241821289062, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.7430480122566223, "rewards/no_repetition_reward_func": -0.1541721597313881, "rewards/verse_reward_func": 0.0, "step": 732 }, { "completion_length": 251.21875, "epoch": 5.864, "grad_norm": 0.466796875, "kl": 0.1807871088385582, "learning_rate": 4.995453465026032e-05, "loss": 0.0072, "reward": 1.083580732345581, "reward_std": 1.6971482038497925, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2491846084594727, "rewards/no_repetition_reward_func": -0.1499788612127304, "rewards/verse_reward_func": -0.015625, "step": 733 }, { "completion_length": 245.359375, "epoch": 5.872, "grad_norm": 0.42578125, "kl": 0.1793564185500145, "learning_rate": 4.9953689062661226e-05, "loss": 0.0072, "reward": 0.5322750806808472, "reward_std": 1.0826443433761597, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.7104762643575668, "rewards/no_repetition_reward_func": -0.17820115387439728, "rewards/verse_reward_func": 0.0, "step": 734 }, { "completion_length": 247.796875, "epoch": 5.88, "grad_norm": 0.466796875, "kl": 0.17884110659360886, "learning_rate": 4.995283569130207e-05, "loss": 0.0072, "reward": 0.5381377339363098, "reward_std": 1.0299710631370544, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.7113795578479767, "rewards/no_repetition_reward_func": -0.17324179410934448, "rewards/verse_reward_func": 0.0, "step": 735 }, { "completion_length": 252.9375, "epoch": 5.888, "grad_norm": 0.4609375, "kl": 0.16854141652584076, "learning_rate": 4.995197453644905e-05, "loss": 0.0067, "reward": 0.6616383641958237, "reward_std": 1.1686644852161407, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.8252454698085785, "rewards/no_repetition_reward_func": -0.16360709071159363, "rewards/verse_reward_func": 0.0, "step": 736 }, { "completion_length": 251.40625, "epoch": 5.896, "grad_norm": 0.45703125, "kl": 0.16073782742023468, "learning_rate": 4.995110559837078e-05, "loss": 0.0064, "reward": 0.6238666772842407, "reward_std": 1.1097452342510223, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.7793495059013367, "rewards/no_repetition_reward_func": -0.15548285841941833, "rewards/verse_reward_func": 0.0, "step": 737 }, { "completion_length": 252.0, "epoch": 5.904, "grad_norm": 0.42578125, "kl": 0.17342502623796463, "learning_rate": 4.995022887733832e-05, "loss": 0.0069, "reward": 0.7196073830127716, "reward_std": 1.2898732423782349, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.8822051286697388, "rewards/no_repetition_reward_func": -0.16259765625, "rewards/verse_reward_func": 0.0, "step": 738 }, { "completion_length": 242.59375, "epoch": 5.912, "grad_norm": 0.4140625, "kl": 0.18480830639600754, "learning_rate": 4.994934437362513e-05, "loss": 0.0074, "reward": 0.49986208975315094, "reward_std": 0.8842047452926636, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.645337700843811, "rewards/no_repetition_reward_func": -0.1454755961894989, "rewards/verse_reward_func": 0.0, "step": 739 }, { "completion_length": 244.125, "epoch": 5.92, "grad_norm": 0.46875, "kl": 0.1928974837064743, "learning_rate": 4.9948452087507116e-05, "loss": 0.0077, "reward": 1.1182240843772888, "reward_std": 1.6451348066329956, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2996585965156555, "rewards/no_repetition_reward_func": -0.1736220195889473, "rewards/verse_reward_func": -0.0078125, "step": 740 }, { "completion_length": 245.75, "epoch": 5.928, "grad_norm": 0.5234375, "kl": 0.21645213663578033, "learning_rate": 4.9947552019262605e-05, "loss": 0.0087, "reward": 0.5598815083503723, "reward_std": 1.2801120281219482, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.7146241366863251, "rewards/no_repetition_reward_func": -0.14693015068769455, "rewards/verse_reward_func": -0.0078125, "step": 741 }, { "completion_length": 252.515625, "epoch": 5.936, "grad_norm": 0.51953125, "kl": 0.20362242311239243, "learning_rate": 4.9946644169172355e-05, "loss": 0.0081, "reward": 0.8707688748836517, "reward_std": 1.5787217617034912, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.027410089969635, "rewards/no_repetition_reward_func": -0.1566411852836609, "rewards/verse_reward_func": 0.0, "step": 742 }, { "completion_length": 248.046875, "epoch": 5.944, "grad_norm": 0.427734375, "kl": 0.1864793747663498, "learning_rate": 4.9945728537519555e-05, "loss": 0.0075, "reward": 0.5066643804311752, "reward_std": 0.9509263038635254, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.669570654630661, "rewards/no_repetition_reward_func": -0.1629062443971634, "rewards/verse_reward_func": 0.0, "step": 743 }, { "completion_length": 250.21875, "epoch": 5.952, "grad_norm": 0.4453125, "kl": 0.18732674419879913, "learning_rate": 4.994480512458981e-05, "loss": 0.0075, "reward": 0.9596996307373047, "reward_std": 1.7034090757369995, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.144407033920288, "rewards/no_repetition_reward_func": -0.17689498513936996, "rewards/verse_reward_func": -0.0078125, "step": 744 }, { "completion_length": 245.984375, "epoch": 5.96, "grad_norm": 0.56640625, "kl": 0.22450395673513412, "learning_rate": 4.994387393067117e-05, "loss": 0.009, "reward": 0.8454991579055786, "reward_std": 1.2231281399726868, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.0351555049419403, "rewards/no_repetition_reward_func": -0.1818438619375229, "rewards/verse_reward_func": -0.0078125, "step": 745 }, { "completion_length": 250.390625, "epoch": 5.968, "grad_norm": 0.453125, "kl": 0.24765124171972275, "learning_rate": 4.9942934956054076e-05, "loss": 0.0099, "reward": 0.6870687007904053, "reward_std": 1.284661054611206, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.820877879858017, "rewards/no_repetition_reward_func": -0.1338091604411602, "rewards/verse_reward_func": 0.0, "step": 746 }, { "completion_length": 244.5625, "epoch": 5.976, "grad_norm": 0.5625, "kl": 0.20249785482883453, "learning_rate": 4.994198820103145e-05, "loss": 0.0081, "reward": 0.7598668932914734, "reward_std": 1.2258395552635193, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.9541262686252594, "rewards/no_repetition_reward_func": -0.18644685298204422, "rewards/verse_reward_func": -0.0078125, "step": 747 }, { "completion_length": 252.515625, "epoch": 5.984, "grad_norm": 0.494140625, "kl": 0.24950803816318512, "learning_rate": 4.994103366589859e-05, "loss": 0.01, "reward": 0.8975111842155457, "reward_std": 1.3828513622283936, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.054736077785492, "rewards/no_repetition_reward_func": -0.15722490102052689, "rewards/verse_reward_func": 0.0, "step": 748 }, { "completion_length": 248.453125, "epoch": 5.992, "grad_norm": 0.462890625, "kl": 0.21591290831565857, "learning_rate": 4.9940071350953255e-05, "loss": 0.0086, "reward": 0.7907741665840149, "reward_std": 1.5540838241577148, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.0088366568088531, "rewards/no_repetition_reward_func": -0.21024999022483826, "rewards/verse_reward_func": -0.0078125, "step": 749 }, { "completion_length": 252.9375, "epoch": 6.0, "grad_norm": 0.4765625, "kl": 0.26324689388275146, "learning_rate": 4.993910125649561e-05, "loss": 0.0105, "reward": 0.8475098013877869, "reward_std": 1.3263379037380219, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.033380925655365, "rewards/no_repetition_reward_func": -0.18587110191583633, "rewards/verse_reward_func": 0.0, "step": 750 }, { "completion_length": 247.921875, "epoch": 6.008, "grad_norm": 0.447265625, "kl": 0.2119169607758522, "learning_rate": 4.993812338282826e-05, "loss": 0.0085, "reward": 0.8253217339515686, "reward_std": 1.24218288064003, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.023111253976822, "rewards/no_repetition_reward_func": -0.18216446042060852, "rewards/verse_reward_func": -0.015625, "step": 751 }, { "completion_length": 249.640625, "epoch": 6.016, "grad_norm": 0.5625, "kl": 0.23002339899539948, "learning_rate": 4.993713773025623e-05, "loss": 0.0092, "reward": 0.4298057556152344, "reward_std": 1.0520544052124023, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.5983999371528625, "rewards/no_repetition_reward_func": -0.1685941442847252, "rewards/verse_reward_func": 0.0, "step": 752 }, { "completion_length": 245.796875, "epoch": 6.024, "grad_norm": 2.34375, "kl": 0.5441725552082062, "learning_rate": 4.993614429908697e-05, "loss": 0.0218, "reward": 0.6200930774211884, "reward_std": 1.219594120979309, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.8279628753662109, "rewards/no_repetition_reward_func": -0.20005732774734497, "rewards/verse_reward_func": -0.0078125, "step": 753 }, { "completion_length": 247.546875, "epoch": 6.032, "grad_norm": 0.5234375, "kl": 0.24957288056612015, "learning_rate": 4.993514308963036e-05, "loss": 0.01, "reward": 0.868014007806778, "reward_std": 1.4657291173934937, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.0423423051834106, "rewards/no_repetition_reward_func": -0.16651584953069687, "rewards/verse_reward_func": -0.0078125, "step": 754 }, { "completion_length": 246.09375, "epoch": 6.04, "grad_norm": 0.484375, "kl": 0.20087869465351105, "learning_rate": 4.993413410219871e-05, "loss": 0.008, "reward": 0.8438056707382202, "reward_std": 1.5437666773796082, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.9954124689102173, "rewards/no_repetition_reward_func": -0.15160678327083588, "rewards/verse_reward_func": 0.0, "step": 755 }, { "completion_length": 251.625, "epoch": 6.048, "grad_norm": 0.52734375, "kl": 0.278443306684494, "learning_rate": 4.993311733710676e-05, "loss": 0.0111, "reward": 0.6003769934177399, "reward_std": 1.0470578074455261, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.7374023199081421, "rewards/no_repetition_reward_func": -0.1370253562927246, "rewards/verse_reward_func": 0.0, "step": 756 }, { "completion_length": 242.25, "epoch": 6.056, "grad_norm": 0.5390625, "kl": 0.257014736533165, "learning_rate": 4.993209279467164e-05, "loss": 0.0103, "reward": 0.8530294299125671, "reward_std": 1.5250130891799927, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.0212843120098114, "rewards/no_repetition_reward_func": -0.16044241935014725, "rewards/verse_reward_func": -0.0078125, "step": 757 }, { "completion_length": 246.546875, "epoch": 6.064, "grad_norm": 0.59375, "kl": 0.3399955928325653, "learning_rate": 4.993106047521296e-05, "loss": 0.0136, "reward": 0.7285540997982025, "reward_std": 1.278739333152771, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.9236523509025574, "rewards/no_repetition_reward_func": -0.19509834051132202, "rewards/verse_reward_func": 0.0, "step": 758 }, { "completion_length": 251.53125, "epoch": 6.072, "grad_norm": 0.478515625, "kl": 0.2875756621360779, "learning_rate": 4.993002037905272e-05, "loss": 0.0115, "reward": 0.6592495441436768, "reward_std": 1.0799922943115234, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.81866055727005, "rewards/no_repetition_reward_func": -0.1594109609723091, "rewards/verse_reward_func": 0.0, "step": 759 }, { "completion_length": 251.828125, "epoch": 6.08, "grad_norm": 0.50390625, "kl": 0.2525947540998459, "learning_rate": 4.992897250651535e-05, "loss": 0.0101, "reward": 0.7010206952691078, "reward_std": 1.177333950996399, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.8709747493267059, "rewards/no_repetition_reward_func": -0.16214154660701752, "rewards/verse_reward_func": -0.0078125, "step": 760 }, { "completion_length": 249.796875, "epoch": 6.088, "grad_norm": 0.65234375, "kl": 0.279588520526886, "learning_rate": 4.992791685792772e-05, "loss": 0.0112, "reward": 0.7331270277500153, "reward_std": 1.2634209394454956, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.8788368403911591, "rewards/no_repetition_reward_func": -0.13789734989404678, "rewards/verse_reward_func": -0.0078125, "step": 761 }, { "completion_length": 249.4375, "epoch": 6.096, "grad_norm": 0.47265625, "kl": 0.31277647614479065, "learning_rate": 4.992685343361911e-05, "loss": 0.0125, "reward": 0.7086526453495026, "reward_std": 1.306367039680481, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.8753103017807007, "rewards/no_repetition_reward_func": -0.1666577085852623, "rewards/verse_reward_func": 0.0, "step": 762 }, { "completion_length": 252.828125, "epoch": 6.104, "grad_norm": 0.4765625, "kl": 0.3144738972187042, "learning_rate": 4.992578223392124e-05, "loss": 0.0126, "reward": 0.7186440825462341, "reward_std": 1.3360933661460876, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.8828155398368835, "rewards/no_repetition_reward_func": -0.1641715168952942, "rewards/verse_reward_func": 0.0, "step": 763 }, { "completion_length": 247.578125, "epoch": 6.112, "grad_norm": 2.1875, "kl": 0.5477823913097382, "learning_rate": 4.9924703259168244e-05, "loss": 0.0219, "reward": 1.0598164200782776, "reward_std": 1.4781323671340942, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2452403903007507, "rewards/no_repetition_reward_func": -0.17761147767305374, "rewards/verse_reward_func": -0.0078125, "step": 764 }, { "completion_length": 248.671875, "epoch": 6.12, "grad_norm": 0.51171875, "kl": 0.2861440181732178, "learning_rate": 4.9923616509696683e-05, "loss": 0.0114, "reward": 0.7842584550380707, "reward_std": 1.3636627197265625, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.9461895227432251, "rewards/no_repetition_reward_func": -0.16193103790283203, "rewards/verse_reward_func": 0.0, "step": 765 }, { "completion_length": 247.875, "epoch": 6.128, "grad_norm": 0.421875, "kl": 0.3011786937713623, "learning_rate": 4.992252198584554e-05, "loss": 0.012, "reward": 1.0210082828998566, "reward_std": 1.7178089618682861, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.231904298067093, "rewards/no_repetition_reward_func": -0.2108960896730423, "rewards/verse_reward_func": 0.0, "step": 766 }, { "completion_length": 252.8125, "epoch": 6.136, "grad_norm": 0.458984375, "kl": 0.32319408655166626, "learning_rate": 4.992141968795623e-05, "loss": 0.0129, "reward": 0.9209352731704712, "reward_std": 1.5130998492240906, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1008894741535187, "rewards/no_repetition_reward_func": -0.1799541413784027, "rewards/verse_reward_func": 0.0, "step": 767 }, { "completion_length": 249.21875, "epoch": 6.144, "grad_norm": 0.53515625, "kl": 0.32361043989658356, "learning_rate": 4.9920309616372596e-05, "loss": 0.0129, "reward": 1.0652855038642883, "reward_std": 1.7408231496810913, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2594280242919922, "rewards/no_repetition_reward_func": -0.18633001297712326, "rewards/verse_reward_func": -0.0078125, "step": 768 }, { "completion_length": 256.0, "epoch": 6.152, "grad_norm": 0.53515625, "kl": 0.35369619727134705, "learning_rate": 4.9919191771440905e-05, "loss": 0.0141, "reward": 0.8741940259933472, "reward_std": 1.1013104617595673, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.0698899328708649, "rewards/no_repetition_reward_func": -0.19569583982229233, "rewards/verse_reward_func": 0.0, "step": 769 }, { "completion_length": 253.09375, "epoch": 6.16, "grad_norm": 0.54296875, "kl": 0.37957851588726044, "learning_rate": 4.9918066153509834e-05, "loss": 0.0152, "reward": 0.5565506815910339, "reward_std": 1.149831235408783, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.7271957993507385, "rewards/no_repetition_reward_func": -0.1706451252102852, "rewards/verse_reward_func": 0.0, "step": 770 }, { "completion_length": 240.40625, "epoch": 6.168, "grad_norm": 0.5, "kl": 0.38696321845054626, "learning_rate": 4.99169327629305e-05, "loss": 0.0155, "reward": 0.8711968660354614, "reward_std": 1.4749884605407715, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.0546056926250458, "rewards/no_repetition_reward_func": -0.17559635639190674, "rewards/verse_reward_func": -0.0078125, "step": 771 }, { "completion_length": 244.203125, "epoch": 6.176, "grad_norm": 0.546875, "kl": 0.3767038434743881, "learning_rate": 4.991579160005644e-05, "loss": 0.0151, "reward": 1.0626643896102905, "reward_std": 1.5581230521202087, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.232874870300293, "rewards/no_repetition_reward_func": -0.17021045088768005, "rewards/verse_reward_func": 0.0, "step": 772 }, { "completion_length": 254.765625, "epoch": 6.184, "grad_norm": 0.419921875, "kl": 0.3381216675043106, "learning_rate": 4.99146426652436e-05, "loss": 0.0135, "reward": 1.206312656402588, "reward_std": 2.09549218416214, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4475255012512207, "rewards/no_repetition_reward_func": -0.24121282994747162, "rewards/verse_reward_func": 0.0, "step": 773 }, { "completion_length": 248.59375, "epoch": 6.192, "grad_norm": 1.140625, "kl": 0.4660876542329788, "learning_rate": 4.991348595885039e-05, "loss": 0.0186, "reward": 0.8550926148891449, "reward_std": 1.3759466409683228, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.0213705897331238, "rewards/no_repetition_reward_func": -0.1662779524922371, "rewards/verse_reward_func": 0.0, "step": 774 }, { "completion_length": 250.328125, "epoch": 6.2, "grad_norm": 0.5078125, "kl": 0.45688214898109436, "learning_rate": 4.991232148123761e-05, "loss": 0.0183, "reward": 0.651453047990799, "reward_std": 1.2515479922294617, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.8475837409496307, "rewards/no_repetition_reward_func": -0.1726931855082512, "rewards/verse_reward_func": -0.0234375, "step": 775 }, { "completion_length": 252.375, "epoch": 6.208, "grad_norm": 0.515625, "kl": 0.3181757926940918, "learning_rate": 4.991114923276849e-05, "loss": 0.0127, "reward": 1.237304002046585, "reward_std": 1.7258180379867554, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4545321464538574, "rewards/no_repetition_reward_func": -0.20941563695669174, "rewards/verse_reward_func": -0.0078125, "step": 776 }, { "completion_length": 252.609375, "epoch": 6.216, "grad_norm": 0.4765625, "kl": 0.3535824567079544, "learning_rate": 4.9909969213808683e-05, "loss": 0.0141, "reward": 0.9251421093940735, "reward_std": 1.6103251576423645, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1311315894126892, "rewards/no_repetition_reward_func": -0.2059895098209381, "rewards/verse_reward_func": 0.0, "step": 777 }, { "completion_length": 244.28125, "epoch": 6.224, "grad_norm": 0.439453125, "kl": 0.37442249059677124, "learning_rate": 4.990878142472628e-05, "loss": 0.015, "reward": 0.7166319489479065, "reward_std": 1.3274941444396973, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.9142144322395325, "rewards/no_repetition_reward_func": -0.1897699534893036, "rewards/verse_reward_func": -0.0078125, "step": 778 }, { "completion_length": 251.65625, "epoch": 6.232, "grad_norm": 0.51171875, "kl": 0.38697585463523865, "learning_rate": 4.990758586589178e-05, "loss": 0.0155, "reward": 1.0726556181907654, "reward_std": 1.6902648210525513, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2878784537315369, "rewards/no_repetition_reward_func": -0.21522283554077148, "rewards/verse_reward_func": 0.0, "step": 779 }, { "completion_length": 253.453125, "epoch": 6.24, "grad_norm": 1.171875, "kl": 0.4760042428970337, "learning_rate": 4.990638253767812e-05, "loss": 0.019, "reward": 1.064941793680191, "reward_std": 1.7218309044837952, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.326147437095642, "rewards/no_repetition_reward_func": -0.23776813596487045, "rewards/verse_reward_func": -0.0234375, "step": 780 }, { "completion_length": 251.90625, "epoch": 6.248, "grad_norm": 0.60546875, "kl": 0.4568624496459961, "learning_rate": 4.990517144046064e-05, "loss": 0.0183, "reward": 0.9618114531040192, "reward_std": 1.539563775062561, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.154867023229599, "rewards/no_repetition_reward_func": -0.1852431297302246, "rewards/verse_reward_func": -0.0078125, "step": 781 }, { "completion_length": 245.421875, "epoch": 6.256, "grad_norm": 0.6953125, "kl": 0.39405301213264465, "learning_rate": 4.990395257461712e-05, "loss": 0.0158, "reward": 0.7695044279098511, "reward_std": 1.696589469909668, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.9993433952331543, "rewards/no_repetition_reward_func": -0.19858895242214203, "rewards/verse_reward_func": -0.03125, "step": 782 }, { "completion_length": 252.96875, "epoch": 6.264, "grad_norm": 0.51953125, "kl": 0.4403918981552124, "learning_rate": 4.990272594052776e-05, "loss": 0.0176, "reward": 0.7344247102737427, "reward_std": 1.2985727787017822, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.9226807951927185, "rewards/no_repetition_reward_func": -0.18825605511665344, "rewards/verse_reward_func": 0.0, "step": 783 }, { "completion_length": 248.453125, "epoch": 6.272, "grad_norm": 0.703125, "kl": 0.5383300483226776, "learning_rate": 4.9901491538575185e-05, "loss": 0.0215, "reward": 0.9601553082466125, "reward_std": 1.500553011894226, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1627925634384155, "rewards/no_repetition_reward_func": -0.19482476264238358, "rewards/verse_reward_func": -0.0078125, "step": 784 }, { "completion_length": 250.921875, "epoch": 6.28, "grad_norm": 0.5, "kl": 0.49294741451740265, "learning_rate": 4.9900249369144434e-05, "loss": 0.0197, "reward": 1.0667186975479126, "reward_std": 1.6515134572982788, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2723859548568726, "rewards/no_repetition_reward_func": -0.19004222005605698, "rewards/verse_reward_func": -0.015625, "step": 785 }, { "completion_length": 245.640625, "epoch": 6.288, "grad_norm": 0.5625, "kl": 0.4439554810523987, "learning_rate": 4.9898999432622974e-05, "loss": 0.0178, "reward": 1.001437485218048, "reward_std": 1.6385822296142578, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1881939768791199, "rewards/no_repetition_reward_func": -0.17894405126571655, "rewards/verse_reward_func": -0.0078125, "step": 786 }, { "completion_length": 252.90625, "epoch": 6.296, "grad_norm": 0.4375, "kl": 0.3969019800424576, "learning_rate": 4.9897741729400705e-05, "loss": 0.0159, "reward": 0.7361856698989868, "reward_std": 1.4977558255195618, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.9776726365089417, "rewards/no_repetition_reward_func": -0.23367450386285782, "rewards/verse_reward_func": -0.0078125, "step": 787 }, { "completion_length": 253.765625, "epoch": 6.304, "grad_norm": 0.5390625, "kl": 0.3890359103679657, "learning_rate": 4.989647625986993e-05, "loss": 0.0156, "reward": 0.9723772406578064, "reward_std": 2.0299644470214844, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.19352787733078, "rewards/no_repetition_reward_func": -0.21333814412355423, "rewards/verse_reward_func": -0.0078125, "step": 788 }, { "completion_length": 243.203125, "epoch": 6.312, "grad_norm": 0.66015625, "kl": 0.4765145182609558, "learning_rate": 4.9895203024425385e-05, "loss": 0.0191, "reward": 0.7806112468242645, "reward_std": 1.5411900281906128, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.9949837923049927, "rewards/no_repetition_reward_func": -0.19874755293130875, "rewards/verse_reward_func": -0.015625, "step": 789 }, { "completion_length": 254.171875, "epoch": 6.32, "grad_norm": 0.80859375, "kl": 0.40745678544044495, "learning_rate": 4.9893922023464236e-05, "loss": 0.0163, "reward": 1.1173654794692993, "reward_std": 1.6330668926239014, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.385489583015442, "rewards/no_repetition_reward_func": -0.260311558842659, "rewards/verse_reward_func": -0.0078125, "step": 790 }, { "completion_length": 236.9375, "epoch": 6.328, "grad_norm": 0.59765625, "kl": 0.523798018693924, "learning_rate": 4.989263325738605e-05, "loss": 0.021, "reward": 0.879092663526535, "reward_std": 1.6622341871261597, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.0618743896484375, "rewards/no_repetition_reward_func": -0.17496921122074127, "rewards/verse_reward_func": -0.0078125, "step": 791 }, { "completion_length": 252.828125, "epoch": 6.336, "grad_norm": 0.67578125, "kl": 0.4038451015949249, "learning_rate": 4.9891336726592844e-05, "loss": 0.0162, "reward": 1.0272591710090637, "reward_std": 1.651059865951538, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2453936338424683, "rewards/no_repetition_reward_func": -0.21032197028398514, "rewards/verse_reward_func": -0.0078125, "step": 792 }, { "completion_length": 247.296875, "epoch": 6.344, "grad_norm": 0.5, "kl": 0.4697895348072052, "learning_rate": 4.989003243148904e-05, "loss": 0.0188, "reward": 0.9415053725242615, "reward_std": 1.6839424967765808, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1949013471603394, "rewards/no_repetition_reward_func": -0.22214601933956146, "rewards/verse_reward_func": -0.03125, "step": 793 }, { "completion_length": 250.296875, "epoch": 6.352, "grad_norm": 0.6484375, "kl": 0.5029455870389938, "learning_rate": 4.988872037248148e-05, "loss": 0.0201, "reward": 0.8641510009765625, "reward_std": 1.5465880036354065, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.0637620985507965, "rewards/no_repetition_reward_func": -0.191798597574234, "rewards/verse_reward_func": -0.0078125, "step": 794 }, { "completion_length": 247.0625, "epoch": 6.36, "grad_norm": 0.96875, "kl": 0.5325969159603119, "learning_rate": 4.988740054997943e-05, "loss": 0.0213, "reward": 0.7389090359210968, "reward_std": 1.6957417130470276, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.934080183506012, "rewards/no_repetition_reward_func": -0.18735860288143158, "rewards/verse_reward_func": -0.0078125, "step": 795 }, { "completion_length": 246.375, "epoch": 6.368, "grad_norm": 0.54296875, "kl": 0.6878438591957092, "learning_rate": 4.988607296439458e-05, "loss": 0.0275, "reward": 0.8911568522453308, "reward_std": 1.659566879272461, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1043224930763245, "rewards/no_repetition_reward_func": -0.20535314828157425, "rewards/verse_reward_func": -0.0078125, "step": 796 }, { "completion_length": 238.6875, "epoch": 6.376, "grad_norm": 0.9296875, "kl": 0.7458266913890839, "learning_rate": 4.988473761614105e-05, "loss": 0.0298, "reward": 1.0567657351493835, "reward_std": 1.965587079524994, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3167980909347534, "rewards/no_repetition_reward_func": -0.23659487068653107, "rewards/verse_reward_func": -0.0234375, "step": 797 }, { "completion_length": 248.796875, "epoch": 6.384, "grad_norm": 0.82421875, "kl": 0.956406980752945, "learning_rate": 4.9883394505635364e-05, "loss": 0.0383, "reward": 1.0273758471012115, "reward_std": 1.7875309586524963, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.25430828332901, "rewards/no_repetition_reward_func": -0.22693240642547607, "rewards/verse_reward_func": 0.0, "step": 798 }, { "completion_length": 244.25, "epoch": 6.392, "grad_norm": 1.5703125, "kl": 1.0887822806835175, "learning_rate": 4.988204363329648e-05, "loss": 0.0436, "reward": 1.008812665939331, "reward_std": 1.601359486579895, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2645281553268433, "rewards/no_repetition_reward_func": -0.240090474486351, "rewards/verse_reward_func": -0.015625, "step": 799 }, { "completion_length": 252.9375, "epoch": 6.4, "grad_norm": 1.1875, "kl": 0.9744823276996613, "learning_rate": 4.988068499954578e-05, "loss": 0.039, "reward": 1.1392996907234192, "reward_std": 1.952589988708496, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4079615473747253, "rewards/no_repetition_reward_func": -0.2452244684100151, "rewards/verse_reward_func": -0.0234375, "step": 800 }, { "completion_length": 255.265625, "epoch": 6.408, "grad_norm": 1.953125, "kl": 0.9746177196502686, "learning_rate": 4.987931860480705e-05, "loss": 0.039, "reward": 0.8526526093482971, "reward_std": 1.4870636463165283, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.0891418159008026, "rewards/no_repetition_reward_func": -0.22867662459611893, "rewards/verse_reward_func": -0.0078125, "step": 801 }, { "completion_length": 253.1875, "epoch": 6.416, "grad_norm": 2.390625, "kl": 1.32183039188385, "learning_rate": 4.987794444950651e-05, "loss": 0.0529, "reward": 1.065685212612152, "reward_std": 1.775512158870697, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3299841284751892, "rewards/no_repetition_reward_func": -0.24086131155490875, "rewards/verse_reward_func": -0.0234375, "step": 802 }, { "completion_length": 252.15625, "epoch": 6.424, "grad_norm": 19.5, "kl": 1.0203782767057419, "learning_rate": 4.98765625340728e-05, "loss": 0.0408, "reward": 0.712509959936142, "reward_std": 1.1722198128700256, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.963377445936203, "rewards/no_repetition_reward_func": -0.24305498600006104, "rewards/verse_reward_func": -0.0078125, "step": 803 }, { "completion_length": 256.0, "epoch": 6.432, "grad_norm": 1.078125, "kl": 0.6949650645256042, "learning_rate": 4.987517285893697e-05, "loss": 0.0278, "reward": 1.3077793717384338, "reward_std": 2.092375636100769, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5810709595680237, "rewards/no_repetition_reward_func": -0.24985402822494507, "rewards/verse_reward_func": -0.0234375, "step": 804 }, { "completion_length": 241.453125, "epoch": 6.44, "grad_norm": 0.70703125, "kl": 0.7898573875427246, "learning_rate": 4.987377542453251e-05, "loss": 0.0316, "reward": 0.8245095610618591, "reward_std": 1.3742837607860565, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.0235246419906616, "rewards/no_repetition_reward_func": -0.1912025436758995, "rewards/verse_reward_func": -0.0078125, "step": 805 }, { "completion_length": 253.1875, "epoch": 6.448, "grad_norm": 0.72265625, "kl": 0.6375373005867004, "learning_rate": 4.987237023129531e-05, "loss": 0.0255, "reward": 1.0698119699954987, "reward_std": 1.7375482320785522, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.318746030330658, "rewards/no_repetition_reward_func": -0.24893417954444885, "rewards/verse_reward_func": 0.0, "step": 806 }, { "completion_length": 253.171875, "epoch": 6.456, "grad_norm": 0.52734375, "kl": 0.4162752479314804, "learning_rate": 4.98709572796637e-05, "loss": 0.0167, "reward": 1.5497819781303406, "reward_std": 1.9512532949447632, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8124139904975891, "rewards/no_repetition_reward_func": -0.2626320719718933, "rewards/verse_reward_func": 0.0, "step": 807 }, { "completion_length": 251.625, "epoch": 6.464, "grad_norm": 0.59375, "kl": 0.5052058100700378, "learning_rate": 4.986953657007841e-05, "loss": 0.0202, "reward": 1.0920945405960083, "reward_std": 1.8527244925498962, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3177654147148132, "rewards/no_repetition_reward_func": -0.2256709337234497, "rewards/verse_reward_func": 0.0, "step": 808 }, { "completion_length": 253.40625, "epoch": 6.4719999999999995, "grad_norm": 0.65234375, "kl": 0.38858653604984283, "learning_rate": 4.9868108102982604e-05, "loss": 0.0155, "reward": 1.1660951375961304, "reward_std": 1.8735880851745605, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4179860353469849, "rewards/no_repetition_reward_func": -0.24407842755317688, "rewards/verse_reward_func": -0.0078125, "step": 809 }, { "completion_length": 243.5, "epoch": 6.48, "grad_norm": 0.62109375, "kl": 0.4306005537509918, "learning_rate": 4.986667187882186e-05, "loss": 0.0172, "reward": 0.503172442317009, "reward_std": 1.1123994886875153, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.6966835409402847, "rewards/no_repetition_reward_func": -0.17788607627153397, "rewards/verse_reward_func": -0.015625, "step": 810 }, { "completion_length": 253.984375, "epoch": 6.4879999999999995, "grad_norm": 0.486328125, "kl": 0.3644150048494339, "learning_rate": 4.986522789804417e-05, "loss": 0.0146, "reward": 0.7199172675609589, "reward_std": 1.210126280784607, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.9759665131568909, "rewards/no_repetition_reward_func": -0.24042436480522156, "rewards/verse_reward_func": -0.015625, "step": 811 }, { "completion_length": 252.390625, "epoch": 6.496, "grad_norm": 0.5703125, "kl": 0.32757821679115295, "learning_rate": 4.9863776161099964e-05, "loss": 0.0131, "reward": 1.3177664875984192, "reward_std": 2.083250403404236, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5940025448799133, "rewards/no_repetition_reward_func": -0.2684236243367195, "rewards/verse_reward_func": -0.0078125, "step": 812 }, { "completion_length": 253.609375, "epoch": 6.504, "grad_norm": 0.5078125, "kl": 0.37128907442092896, "learning_rate": 4.986231666844208e-05, "loss": 0.0149, "reward": 0.6161737143993378, "reward_std": 1.586263656616211, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.9068690538406372, "rewards/no_repetition_reward_func": -0.251632884144783, "rewards/verse_reward_func": -0.0390625, "step": 813 }, { "completion_length": 256.0, "epoch": 6.5120000000000005, "grad_norm": 0.55859375, "kl": 0.38991089165210724, "learning_rate": 4.9860849420525766e-05, "loss": 0.0156, "reward": 0.8751431405544281, "reward_std": 1.677483320236206, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1443889737129211, "rewards/no_repetition_reward_func": -0.23799588531255722, "rewards/verse_reward_func": -0.03125, "step": 814 }, { "completion_length": 256.0, "epoch": 6.52, "grad_norm": 0.46875, "kl": 0.32688403129577637, "learning_rate": 4.98593744178087e-05, "loss": 0.0131, "reward": 0.9184901118278503, "reward_std": 1.6495922803878784, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1689593195915222, "rewards/no_repetition_reward_func": -0.23484419286251068, "rewards/verse_reward_func": -0.015625, "step": 815 }, { "completion_length": 248.125, "epoch": 6.5280000000000005, "grad_norm": 0.66796875, "kl": 0.30271007120609283, "learning_rate": 4.9857891660750986e-05, "loss": 0.0121, "reward": 1.2949085235595703, "reward_std": 2.1821446418762207, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5724899172782898, "rewards/no_repetition_reward_func": -0.2619563043117523, "rewards/verse_reward_func": -0.015625, "step": 816 }, { "completion_length": 247.328125, "epoch": 6.536, "grad_norm": 0.6953125, "kl": 0.38435572385787964, "learning_rate": 4.9856401149815126e-05, "loss": 0.0154, "reward": 0.9628596901893616, "reward_std": 1.6563892364501953, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1966753005981445, "rewards/no_repetition_reward_func": -0.22600318491458893, "rewards/verse_reward_func": -0.0078125, "step": 817 }, { "completion_length": 250.625, "epoch": 6.5440000000000005, "grad_norm": 0.84765625, "kl": 0.424517497420311, "learning_rate": 4.985490288546606e-05, "loss": 0.017, "reward": 1.4241613745689392, "reward_std": 2.1980082988739014, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6518131494522095, "rewards/no_repetition_reward_func": -0.21983937174081802, "rewards/verse_reward_func": -0.0078125, "step": 818 }, { "completion_length": 245.421875, "epoch": 6.552, "grad_norm": 0.72265625, "kl": 0.4581110179424286, "learning_rate": 4.985339686817113e-05, "loss": 0.0183, "reward": 1.2568825483322144, "reward_std": 2.02632212638855, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4924226999282837, "rewards/no_repetition_reward_func": -0.2199152261018753, "rewards/verse_reward_func": -0.015625, "step": 819 }, { "completion_length": 249.515625, "epoch": 6.5600000000000005, "grad_norm": 1.25, "kl": 0.5434080958366394, "learning_rate": 4.985188309840012e-05, "loss": 0.0217, "reward": 1.0052413642406464, "reward_std": 1.6848570108413696, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2305690050125122, "rewards/no_repetition_reward_func": -0.2253277227282524, "rewards/verse_reward_func": 0.0, "step": 820 }, { "completion_length": 246.59375, "epoch": 6.568, "grad_norm": 0.875, "kl": 0.5315926969051361, "learning_rate": 4.985036157662521e-05, "loss": 0.0213, "reward": 1.0839297771453857, "reward_std": 1.8398511409759521, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3341925740242004, "rewards/no_repetition_reward_func": -0.23463782668113708, "rewards/verse_reward_func": -0.015625, "step": 821 }, { "completion_length": 245.75, "epoch": 6.576, "grad_norm": 1.1171875, "kl": 0.5049473792314529, "learning_rate": 4.984883230332099e-05, "loss": 0.0202, "reward": 0.9804214537143707, "reward_std": 1.7651756405830383, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2210246920585632, "rewards/no_repetition_reward_func": -0.22497814893722534, "rewards/verse_reward_func": -0.015625, "step": 822 }, { "completion_length": 252.25, "epoch": 6.584, "grad_norm": 0.7109375, "kl": 0.5067265331745148, "learning_rate": 4.9847295278964514e-05, "loss": 0.0203, "reward": 1.1217164993286133, "reward_std": 1.6556967496871948, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3464975357055664, "rewards/no_repetition_reward_func": -0.22478105127811432, "rewards/verse_reward_func": 0.0, "step": 823 }, { "completion_length": 248.875, "epoch": 6.592, "grad_norm": 1.0078125, "kl": 0.7228975892066956, "learning_rate": 4.9845750504035195e-05, "loss": 0.0289, "reward": 1.5003682374954224, "reward_std": 2.287202835083008, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.758164644241333, "rewards/no_repetition_reward_func": -0.24998392164707184, "rewards/verse_reward_func": -0.0078125, "step": 824 }, { "completion_length": 244.765625, "epoch": 6.6, "grad_norm": 0.69921875, "kl": 0.511970192193985, "learning_rate": 4.984419797901491e-05, "loss": 0.0205, "reward": 1.457135021686554, "reward_std": 2.180075168609619, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7080859541893005, "rewards/no_repetition_reward_func": -0.22751345485448837, "rewards/verse_reward_func": -0.0234375, "step": 825 }, { "completion_length": 251.203125, "epoch": 6.608, "grad_norm": 1.3359375, "kl": 0.7686850130558014, "learning_rate": 4.984263770438793e-05, "loss": 0.0307, "reward": 1.6456793546676636, "reward_std": 2.2752955555915833, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.885909378528595, "rewards/no_repetition_reward_func": -0.232417531311512, "rewards/verse_reward_func": -0.0078125, "step": 826 }, { "completion_length": 253.171875, "epoch": 6.616, "grad_norm": 0.94140625, "kl": 0.9129926562309265, "learning_rate": 4.984106968064095e-05, "loss": 0.0365, "reward": 1.0889034271240234, "reward_std": 1.4576642513275146, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2873912453651428, "rewards/no_repetition_reward_func": -0.19848789274692535, "rewards/verse_reward_func": 0.0, "step": 827 }, { "completion_length": 248.0, "epoch": 6.624, "grad_norm": 5.90625, "kl": 2.233668088912964, "learning_rate": 4.983949390826308e-05, "loss": 0.0893, "reward": 1.2918775379657745, "reward_std": 1.6508362293243408, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5186134576797485, "rewards/no_repetition_reward_func": -0.22673596441745758, "rewards/verse_reward_func": 0.0, "step": 828 }, { "completion_length": 249.890625, "epoch": 6.632, "grad_norm": 1.21875, "kl": 1.179654598236084, "learning_rate": 4.9837910387745845e-05, "loss": 0.0472, "reward": 1.2699270248413086, "reward_std": 1.877051591873169, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5028204321861267, "rewards/no_repetition_reward_func": -0.2172684371471405, "rewards/verse_reward_func": -0.015625, "step": 829 }, { "completion_length": 253.0, "epoch": 6.64, "grad_norm": 1.375, "kl": 0.5414243340492249, "learning_rate": 4.983631911958319e-05, "loss": 0.0217, "reward": 1.522219955921173, "reward_std": 2.0880706310272217, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7512183785438538, "rewards/no_repetition_reward_func": -0.2211860716342926, "rewards/verse_reward_func": -0.0078125, "step": 830 }, { "completion_length": 250.046875, "epoch": 6.648, "grad_norm": 1.4140625, "kl": 0.8062189817428589, "learning_rate": 4.9834720104271484e-05, "loss": 0.0322, "reward": 0.9670798778533936, "reward_std": 1.5132344365119934, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1943549513816833, "rewards/no_repetition_reward_func": -0.21165011823177338, "rewards/verse_reward_func": -0.015625, "step": 831 }, { "completion_length": 248.640625, "epoch": 6.656, "grad_norm": 0.84375, "kl": 0.8213241398334503, "learning_rate": 4.98331133423095e-05, "loss": 0.0329, "reward": 1.5873386859893799, "reward_std": 1.8252683877944946, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8253728747367859, "rewards/no_repetition_reward_func": -0.2302217036485672, "rewards/verse_reward_func": -0.0078125, "step": 832 }, { "completion_length": 246.5, "epoch": 6.664, "grad_norm": 1.9609375, "kl": 1.414080560207367, "learning_rate": 4.983149883419842e-05, "loss": 0.0566, "reward": 1.0509822964668274, "reward_std": 1.8196401596069336, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.268084168434143, "rewards/no_repetition_reward_func": -0.20928944647312164, "rewards/verse_reward_func": -0.0078125, "step": 833 }, { "completion_length": 252.671875, "epoch": 6.672, "grad_norm": 0.8671875, "kl": 0.6415015161037445, "learning_rate": 4.982987658044188e-05, "loss": 0.0257, "reward": 0.9936472773551941, "reward_std": 1.6828435063362122, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2323132753372192, "rewards/no_repetition_reward_func": -0.22304102033376694, "rewards/verse_reward_func": -0.015625, "step": 834 }, { "completion_length": 251.921875, "epoch": 6.68, "grad_norm": 1.8359375, "kl": 1.0801241993904114, "learning_rate": 4.982824658154589e-05, "loss": 0.0432, "reward": 1.2009358406066895, "reward_std": 1.7921912670135498, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4483851194381714, "rewards/no_repetition_reward_func": -0.2396368533372879, "rewards/verse_reward_func": -0.0078125, "step": 835 }, { "completion_length": 251.921875, "epoch": 6.688, "grad_norm": 1.75, "kl": 1.30610191822052, "learning_rate": 4.982660883801889e-05, "loss": 0.0522, "reward": 1.0551094114780426, "reward_std": 2.0083119869232178, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3018726110458374, "rewards/no_repetition_reward_func": -0.22332574427127838, "rewards/verse_reward_func": -0.0234375, "step": 836 }, { "completion_length": 255.109375, "epoch": 6.696, "grad_norm": 0.8125, "kl": 1.1719949841499329, "learning_rate": 4.982496335037175e-05, "loss": 0.0469, "reward": 1.4143125414848328, "reward_std": 2.0463242530822754, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6675381064414978, "rewards/no_repetition_reward_func": -0.2532254159450531, "rewards/verse_reward_func": 0.0, "step": 837 }, { "completion_length": 248.78125, "epoch": 6.704, "grad_norm": 1.484375, "kl": 1.4255139231681824, "learning_rate": 4.982331011911774e-05, "loss": 0.057, "reward": 0.758671760559082, "reward_std": 1.449147641658783, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.0236837267875671, "rewards/no_repetition_reward_func": -0.24157442152500153, "rewards/verse_reward_func": -0.0234375, "step": 838 }, { "completion_length": 245.140625, "epoch": 6.712, "grad_norm": 0.92578125, "kl": 0.95753014087677, "learning_rate": 4.9821649144772545e-05, "loss": 0.0383, "reward": 1.254793107509613, "reward_std": 1.9290164709091187, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.487295925617218, "rewards/no_repetition_reward_func": -0.23250287771224976, "rewards/verse_reward_func": 0.0, "step": 839 }, { "completion_length": 255.765625, "epoch": 6.72, "grad_norm": 0.85546875, "kl": 0.8460690379142761, "learning_rate": 4.981998042785427e-05, "loss": 0.0338, "reward": 0.9500227719545364, "reward_std": 1.5798491835594177, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1995253562927246, "rewards/no_repetition_reward_func": -0.24169009923934937, "rewards/verse_reward_func": -0.0078125, "step": 840 }, { "completion_length": 250.75, "epoch": 6.728, "grad_norm": 0.73046875, "kl": 0.8782661259174347, "learning_rate": 4.981830396888344e-05, "loss": 0.0351, "reward": 1.1705176532268524, "reward_std": 1.8916509747505188, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4095433950424194, "rewards/no_repetition_reward_func": -0.23121327906847, "rewards/verse_reward_func": -0.0078125, "step": 841 }, { "completion_length": 250.375, "epoch": 6.736, "grad_norm": 2.140625, "kl": 1.445740520954132, "learning_rate": 4.981661976838299e-05, "loss": 0.0578, "reward": 1.1163818836212158, "reward_std": 1.6247622966766357, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.344318926334381, "rewards/no_repetition_reward_func": -0.2279369905591011, "rewards/verse_reward_func": 0.0, "step": 842 }, { "completion_length": 256.0, "epoch": 6.744, "grad_norm": 1.0703125, "kl": 0.9275459051132202, "learning_rate": 4.9814927826878256e-05, "loss": 0.0371, "reward": 1.4724332690238953, "reward_std": 1.9973928928375244, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7071768045425415, "rewards/no_repetition_reward_func": -0.2347436249256134, "rewards/verse_reward_func": 0.0, "step": 843 }, { "completion_length": 256.0, "epoch": 6.752, "grad_norm": 0.84765625, "kl": 1.116780698299408, "learning_rate": 4.981322814489703e-05, "loss": 0.0447, "reward": 1.1425187587738037, "reward_std": 1.6285641193389893, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3783595561981201, "rewards/no_repetition_reward_func": -0.2358408123254776, "rewards/verse_reward_func": 0.0, "step": 844 }, { "completion_length": 250.5625, "epoch": 6.76, "grad_norm": 0.80078125, "kl": 0.8719207644462585, "learning_rate": 4.9811520722969465e-05, "loss": 0.0349, "reward": 0.8660511672496796, "reward_std": 1.4127416610717773, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.0712374448776245, "rewards/no_repetition_reward_func": -0.20518632978200912, "rewards/verse_reward_func": 0.0, "step": 845 }, { "completion_length": 251.6875, "epoch": 6.768, "grad_norm": 1.6015625, "kl": 0.719853937625885, "learning_rate": 4.980980556162816e-05, "loss": 0.0288, "reward": 1.1964474022388458, "reward_std": 1.9925445318222046, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4317009449005127, "rewards/no_repetition_reward_func": -0.22744101285934448, "rewards/verse_reward_func": -0.0078125, "step": 846 }, { "completion_length": 249.78125, "epoch": 6.776, "grad_norm": 0.8984375, "kl": 1.0947000980377197, "learning_rate": 4.980808266140813e-05, "loss": 0.0438, "reward": 1.1885067820549011, "reward_std": 1.5812971591949463, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4117505550384521, "rewards/no_repetition_reward_func": -0.2232438325881958, "rewards/verse_reward_func": 0.0, "step": 847 }, { "completion_length": 250.828125, "epoch": 6.784, "grad_norm": 0.67578125, "kl": 0.8857119977474213, "learning_rate": 4.980635202284679e-05, "loss": 0.0354, "reward": 1.1033869981765747, "reward_std": 1.6866949796676636, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3399069905281067, "rewards/no_repetition_reward_func": -0.2287074774503708, "rewards/verse_reward_func": -0.0078125, "step": 848 }, { "completion_length": 253.03125, "epoch": 6.792, "grad_norm": 1.109375, "kl": 1.2812073826789856, "learning_rate": 4.980461364648398e-05, "loss": 0.0512, "reward": 0.9673897624015808, "reward_std": 1.6243728399276733, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1874721050262451, "rewards/no_repetition_reward_func": -0.22008243948221207, "rewards/verse_reward_func": 0.0, "step": 849 }, { "completion_length": 255.40625, "epoch": 6.8, "grad_norm": 0.56640625, "kl": 0.7793036103248596, "learning_rate": 4.980286753286195e-05, "loss": 0.0312, "reward": 1.3241742849349976, "reward_std": 2.247538685798645, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5958189964294434, "rewards/no_repetition_reward_func": -0.2560197561979294, "rewards/verse_reward_func": -0.015625, "step": 850 }, { "completion_length": 245.96875, "epoch": 6.808, "grad_norm": 0.68359375, "kl": 1.044660210609436, "learning_rate": 4.980111368252535e-05, "loss": 0.0418, "reward": 0.9439708590507507, "reward_std": 1.6701172590255737, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1662994027137756, "rewards/no_repetition_reward_func": -0.21451610326766968, "rewards/verse_reward_func": -0.0078125, "step": 851 }, { "completion_length": 246.96875, "epoch": 6.816, "grad_norm": 0.93359375, "kl": 1.302802562713623, "learning_rate": 4.9799352096021266e-05, "loss": 0.0521, "reward": 1.1180134415626526, "reward_std": 2.018368124961853, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.355922520160675, "rewards/no_repetition_reward_func": -0.2300964817404747, "rewards/verse_reward_func": -0.0078125, "step": 852 }, { "completion_length": 254.84375, "epoch": 6.824, "grad_norm": 0.8671875, "kl": 0.7558676898479462, "learning_rate": 4.979758277389919e-05, "loss": 0.0302, "reward": 0.7809170782566071, "reward_std": 1.8270381689071655, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.9930982291698456, "rewards/no_repetition_reward_func": -0.2121811807155609, "rewards/verse_reward_func": 0.0, "step": 853 }, { "completion_length": 252.96875, "epoch": 6.832, "grad_norm": 1.234375, "kl": 0.7658964693546295, "learning_rate": 4.9795805716711e-05, "loss": 0.0306, "reward": 0.549546517431736, "reward_std": 1.3604126274585724, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.7595341801643372, "rewards/no_repetition_reward_func": -0.19436265528202057, "rewards/verse_reward_func": -0.015625, "step": 854 }, { "completion_length": 247.1875, "epoch": 6.84, "grad_norm": 0.96484375, "kl": 0.8297922611236572, "learning_rate": 4.9794020925011044e-05, "loss": 0.0332, "reward": 1.1880182027816772, "reward_std": 2.0436679124832153, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4259989261627197, "rewards/no_repetition_reward_func": -0.2379806935787201, "rewards/verse_reward_func": 0.0, "step": 855 }, { "completion_length": 256.0, "epoch": 6.848, "grad_norm": 0.828125, "kl": 0.846998929977417, "learning_rate": 4.979222839935602e-05, "loss": 0.0339, "reward": 0.9442600607872009, "reward_std": 1.356271743774414, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1796774864196777, "rewards/no_repetition_reward_func": -0.2197924479842186, "rewards/verse_reward_func": -0.015625, "step": 856 }, { "completion_length": 241.0, "epoch": 6.856, "grad_norm": 3.375, "kl": 1.7045230865478516, "learning_rate": 4.979042814030509e-05, "loss": 0.0682, "reward": 0.6369239985942841, "reward_std": 1.2585601806640625, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.8590770959854126, "rewards/no_repetition_reward_func": -0.20652809739112854, "rewards/verse_reward_func": -0.015625, "step": 857 }, { "completion_length": 250.84375, "epoch": 6.864, "grad_norm": 0.67578125, "kl": 0.9324458539485931, "learning_rate": 4.978862014841979e-05, "loss": 0.0373, "reward": 1.0029951333999634, "reward_std": 1.7322176694869995, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2319279313087463, "rewards/no_repetition_reward_func": -0.2133077159523964, "rewards/verse_reward_func": -0.015625, "step": 858 }, { "completion_length": 253.203125, "epoch": 6.872, "grad_norm": 1.171875, "kl": 1.550704836845398, "learning_rate": 4.9786804424264085e-05, "loss": 0.062, "reward": 1.5475972294807434, "reward_std": 2.291177272796631, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7792677879333496, "rewards/no_repetition_reward_func": -0.22385801374912262, "rewards/verse_reward_func": -0.0078125, "step": 859 }, { "completion_length": 244.046875, "epoch": 6.88, "grad_norm": 1.671875, "kl": 1.4675114154815674, "learning_rate": 4.978498096840436e-05, "loss": 0.0587, "reward": 0.77946737408638, "reward_std": 1.5320111513137817, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.9866736829280853, "rewards/no_repetition_reward_func": -0.19158131629228592, "rewards/verse_reward_func": -0.015625, "step": 860 }, { "completion_length": 248.78125, "epoch": 6.888, "grad_norm": 2.140625, "kl": 1.5804182291030884, "learning_rate": 4.9783149781409404e-05, "loss": 0.0632, "reward": 0.4385823756456375, "reward_std": 1.199340969324112, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.6482267677783966, "rewards/no_repetition_reward_func": -0.1940193623304367, "rewards/verse_reward_func": -0.015625, "step": 861 }, { "completion_length": 250.890625, "epoch": 6.896, "grad_norm": 2.109375, "kl": 1.1540470719337463, "learning_rate": 4.9781310863850405e-05, "loss": 0.0462, "reward": 1.0669716596603394, "reward_std": 2.0492053627967834, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3130905032157898, "rewards/no_repetition_reward_func": -0.23049384355545044, "rewards/verse_reward_func": -0.015625, "step": 862 }, { "completion_length": 255.34375, "epoch": 6.904, "grad_norm": 0.74609375, "kl": 1.0686950087547302, "learning_rate": 4.977946421630098e-05, "loss": 0.0427, "reward": 1.1877829730510712, "reward_std": 2.079117476940155, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4241054952144623, "rewards/no_repetition_reward_func": -0.2285100668668747, "rewards/verse_reward_func": -0.0078125, "step": 863 }, { "completion_length": 249.78125, "epoch": 6.912, "grad_norm": 1.015625, "kl": 1.2096496224403381, "learning_rate": 4.977760983933714e-05, "loss": 0.0484, "reward": 1.0882200300693512, "reward_std": 1.7214243412017822, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2725467085838318, "rewards/no_repetition_reward_func": -0.1765141785144806, "rewards/verse_reward_func": -0.0078125, "step": 864 }, { "completion_length": 252.6875, "epoch": 6.92, "grad_norm": 1.1171875, "kl": 1.368546485900879, "learning_rate": 4.977574773353732e-05, "loss": 0.0547, "reward": 1.18972647190094, "reward_std": 1.855005145072937, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4289894700050354, "rewards/no_repetition_reward_func": -0.22363808751106262, "rewards/verse_reward_func": -0.015625, "step": 865 }, { "completion_length": 253.40625, "epoch": 6.928, "grad_norm": 1.453125, "kl": 1.8447370529174805, "learning_rate": 4.977387789948238e-05, "loss": 0.0738, "reward": 0.961736798286438, "reward_std": 1.8894940614700317, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2135423421859741, "rewards/no_repetition_reward_func": -0.23618054389953613, "rewards/verse_reward_func": -0.015625, "step": 866 }, { "completion_length": 252.609375, "epoch": 6.936, "grad_norm": 0.9453125, "kl": 1.098572015762329, "learning_rate": 4.977200033775555e-05, "loss": 0.0439, "reward": 1.146183431148529, "reward_std": 2.0423543453216553, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3961946964263916, "rewards/no_repetition_reward_func": -0.2500114440917969, "rewards/verse_reward_func": 0.0, "step": 867 }, { "completion_length": 247.203125, "epoch": 6.944, "grad_norm": 1.453125, "kl": 1.6156167387962341, "learning_rate": 4.977011504894252e-05, "loss": 0.0646, "reward": 0.9837636351585388, "reward_std": 1.597393274307251, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1992380321025848, "rewards/no_repetition_reward_func": -0.21547439694404602, "rewards/verse_reward_func": 0.0, "step": 868 }, { "completion_length": 253.421875, "epoch": 6.952, "grad_norm": 1.7734375, "kl": 0.993777334690094, "learning_rate": 4.976822203363135e-05, "loss": 0.0398, "reward": 1.007406324148178, "reward_std": 1.790173053741455, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2371967732906342, "rewards/no_repetition_reward_func": -0.22197802364826202, "rewards/verse_reward_func": -0.0078125, "step": 869 }, { "completion_length": 251.09375, "epoch": 6.96, "grad_norm": 2.25, "kl": 0.9439399540424347, "learning_rate": 4.976632129241252e-05, "loss": 0.0378, "reward": 0.732579693198204, "reward_std": 1.5813514292240143, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.9572000503540039, "rewards/no_repetition_reward_func": -0.21680789440870285, "rewards/verse_reward_func": -0.0078125, "step": 870 }, { "completion_length": 253.84375, "epoch": 6.968, "grad_norm": 2.203125, "kl": 0.709376871585846, "learning_rate": 4.9764412825878943e-05, "loss": 0.0284, "reward": 1.2630798816680908, "reward_std": 2.3180254697799683, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.51371169090271, "rewards/no_repetition_reward_func": -0.2271943837404251, "rewards/verse_reward_func": -0.0234375, "step": 871 }, { "completion_length": 252.234375, "epoch": 6.976, "grad_norm": 2.6875, "kl": 0.8254943490028381, "learning_rate": 4.97624966346259e-05, "loss": 0.033, "reward": 0.8487787544727325, "reward_std": 1.626845121383667, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.0551021695137024, "rewards/no_repetition_reward_func": -0.20632342994213104, "rewards/verse_reward_func": 0.0, "step": 872 }, { "completion_length": 253.15625, "epoch": 6.984, "grad_norm": 2.65625, "kl": 0.8596548736095428, "learning_rate": 4.976057271925113e-05, "loss": 0.0344, "reward": 1.312869906425476, "reward_std": 2.288983941078186, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.546929121017456, "rewards/no_repetition_reward_func": -0.22624679654836655, "rewards/verse_reward_func": -0.0078125, "step": 873 }, { "completion_length": 249.59375, "epoch": 6.992, "grad_norm": 1.5, "kl": 1.0863572359085083, "learning_rate": 4.975864108035474e-05, "loss": 0.0435, "reward": 1.123713731765747, "reward_std": 2.010913670063019, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3663477301597595, "rewards/no_repetition_reward_func": -0.2348213866353035, "rewards/verse_reward_func": -0.0078125, "step": 874 }, { "completion_length": 242.4375, "epoch": 7.0, "grad_norm": 1.1875, "kl": 1.4190509915351868, "learning_rate": 4.975670171853926e-05, "loss": 0.0568, "reward": 1.0653299689292908, "reward_std": 2.117420017719269, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2892979979515076, "rewards/no_repetition_reward_func": -0.20834298431873322, "rewards/verse_reward_func": -0.015625, "step": 875 }, { "completion_length": 250.859375, "epoch": 7.008, "grad_norm": 1.8828125, "kl": 2.1558337211608887, "learning_rate": 4.975475463440964e-05, "loss": 0.0862, "reward": 1.1871364116668701, "reward_std": 2.0565709471702576, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.418267011642456, "rewards/no_repetition_reward_func": -0.2155056893825531, "rewards/verse_reward_func": -0.015625, "step": 876 }, { "completion_length": 252.578125, "epoch": 7.016, "grad_norm": 6.0625, "kl": 2.9226107597351074, "learning_rate": 4.975279982857324e-05, "loss": 0.1169, "reward": 0.8506084084510803, "reward_std": 1.754819631576538, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.067438542842865, "rewards/no_repetition_reward_func": -0.21683009713888168, "rewards/verse_reward_func": 0.0, "step": 877 }, { "completion_length": 251.53125, "epoch": 7.024, "grad_norm": 1.5859375, "kl": 2.0504040122032166, "learning_rate": 4.9750837301639796e-05, "loss": 0.082, "reward": 1.2499070465564728, "reward_std": 2.2156792879104614, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.467307984828949, "rewards/no_repetition_reward_func": -0.2174009531736374, "rewards/verse_reward_func": 0.0, "step": 878 }, { "completion_length": 251.625, "epoch": 7.032, "grad_norm": 3.1875, "kl": 2.2698689699172974, "learning_rate": 4.974886705422149e-05, "loss": 0.0908, "reward": 0.6620779633522034, "reward_std": 1.4246535301208496, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.8901454210281372, "rewards/no_repetition_reward_func": -0.22025494277477264, "rewards/verse_reward_func": -0.0078125, "step": 879 }, { "completion_length": 248.515625, "epoch": 7.04, "grad_norm": 5.625, "kl": 3.02890682220459, "learning_rate": 4.9746889086932895e-05, "loss": 0.1212, "reward": 0.8775609135627747, "reward_std": 1.4999259114265442, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1100962162017822, "rewards/no_repetition_reward_func": -0.2169102355837822, "rewards/verse_reward_func": -0.015625, "step": 880 }, { "completion_length": 250.46875, "epoch": 7.048, "grad_norm": 1.4765625, "kl": 1.4128172397613525, "learning_rate": 4.9744903400391e-05, "loss": 0.0565, "reward": 1.2914723753929138, "reward_std": 2.1407947540283203, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.520127296447754, "rewards/no_repetition_reward_func": -0.21302999556064606, "rewards/verse_reward_func": -0.015625, "step": 881 }, { "completion_length": 252.390625, "epoch": 7.056, "grad_norm": 4.4375, "kl": 1.7749797701835632, "learning_rate": 4.974290999521519e-05, "loss": 0.071, "reward": 0.6262873411178589, "reward_std": 1.4145514965057373, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.8664114475250244, "rewards/no_repetition_reward_func": -0.22449911385774612, "rewards/verse_reward_func": -0.015625, "step": 882 }, { "completion_length": 247.34375, "epoch": 7.064, "grad_norm": 0.89453125, "kl": 1.0790831446647644, "learning_rate": 4.974090887202726e-05, "loss": 0.0432, "reward": 1.3186776041984558, "reward_std": 1.7905214428901672, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5302578210830688, "rewards/no_repetition_reward_func": -0.20376765727996826, "rewards/verse_reward_func": -0.0078125, "step": 883 }, { "completion_length": 249.625, "epoch": 7.072, "grad_norm": 1.546875, "kl": 1.3132380247116089, "learning_rate": 4.973890003145143e-05, "loss": 0.0525, "reward": 0.8253572881221771, "reward_std": 1.632200002670288, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.0685640573501587, "rewards/no_repetition_reward_func": -0.21195679157972336, "rewards/verse_reward_func": -0.03125, "step": 884 }, { "completion_length": 249.671875, "epoch": 7.08, "grad_norm": 0.91015625, "kl": 0.8426347970962524, "learning_rate": 4.973688347411431e-05, "loss": 0.0337, "reward": 0.617992639541626, "reward_std": 1.3584703207015991, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.832266628742218, "rewards/no_repetition_reward_func": -0.20646146684885025, "rewards/verse_reward_func": -0.0078125, "step": 885 }, { "completion_length": 248.734375, "epoch": 7.088, "grad_norm": 1.078125, "kl": 0.9991362988948822, "learning_rate": 4.9734859200644905e-05, "loss": 0.04, "reward": 1.2016112804412842, "reward_std": 1.8877158761024475, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4303563833236694, "rewards/no_repetition_reward_func": -0.22093259543180466, "rewards/verse_reward_func": -0.0078125, "step": 886 }, { "completion_length": 238.1875, "epoch": 7.096, "grad_norm": 0.94921875, "kl": 1.0616003572940826, "learning_rate": 4.973282721167467e-05, "loss": 0.0425, "reward": 1.0026718080043793, "reward_std": 1.4379112124443054, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2156562805175781, "rewards/no_repetition_reward_func": -0.19735948741436005, "rewards/verse_reward_func": -0.015625, "step": 887 }, { "completion_length": 248.890625, "epoch": 7.104, "grad_norm": 1.5234375, "kl": 1.01621413230896, "learning_rate": 4.973078750783742e-05, "loss": 0.0406, "reward": 0.6018400490283966, "reward_std": 1.4054080843925476, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.8312025964260101, "rewards/no_repetition_reward_func": -0.1824875995516777, "rewards/verse_reward_func": -0.046875, "step": 888 }, { "completion_length": 250.953125, "epoch": 7.112, "grad_norm": 1.5, "kl": 1.283844530582428, "learning_rate": 4.97287400897694e-05, "loss": 0.0514, "reward": 1.1446937322616577, "reward_std": 1.7924619317054749, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3807766437530518, "rewards/no_repetition_reward_func": -0.20483288168907166, "rewards/verse_reward_func": -0.03125, "step": 889 }, { "completion_length": 252.265625, "epoch": 7.12, "grad_norm": 1.25, "kl": 1.3531187176704407, "learning_rate": 4.9726684958109266e-05, "loss": 0.0541, "reward": 0.9714410901069641, "reward_std": 2.0666732788085938, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1799232363700867, "rewards/no_repetition_reward_func": -0.19285719841718674, "rewards/verse_reward_func": -0.015625, "step": 890 }, { "completion_length": 249.171875, "epoch": 7.128, "grad_norm": 1.9296875, "kl": 0.9687939286231995, "learning_rate": 4.972462211349806e-05, "loss": 0.0388, "reward": 0.8063727095723152, "reward_std": 1.4531045407056808, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.0302959606051445, "rewards/no_repetition_reward_func": -0.20048585534095764, "rewards/verse_reward_func": -0.0234375, "step": 891 }, { "completion_length": 252.3125, "epoch": 7.136, "grad_norm": 1.4453125, "kl": 1.5207387804985046, "learning_rate": 4.972255155657925e-05, "loss": 0.0608, "reward": 1.0612455606460571, "reward_std": 1.6690773963928223, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2929446697235107, "rewards/no_repetition_reward_func": -0.2238866314291954, "rewards/verse_reward_func": -0.0078125, "step": 892 }, { "completion_length": 252.125, "epoch": 7.144, "grad_norm": 1.03125, "kl": 1.605215311050415, "learning_rate": 4.9720473287998695e-05, "loss": 0.0642, "reward": 1.1112192869186401, "reward_std": 1.8074179887771606, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3244377374649048, "rewards/no_repetition_reward_func": -0.1897810623049736, "rewards/verse_reward_func": -0.0234375, "step": 893 }, { "completion_length": 253.78125, "epoch": 7.152, "grad_norm": 1.5546875, "kl": 1.5421146750450134, "learning_rate": 4.9718387308404675e-05, "loss": 0.0617, "reward": 1.3390461802482605, "reward_std": 2.342655062675476, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6242722272872925, "rewards/no_repetition_reward_func": -0.24616356194019318, "rewards/verse_reward_func": -0.0390625, "step": 894 }, { "completion_length": 254.625, "epoch": 7.16, "grad_norm": 0.90625, "kl": 1.0768148303031921, "learning_rate": 4.971629361844785e-05, "loss": 0.0431, "reward": 1.2312524914741516, "reward_std": 2.042689859867096, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4586381912231445, "rewards/no_repetition_reward_func": -0.22738569974899292, "rewards/verse_reward_func": 0.0, "step": 895 }, { "completion_length": 251.421875, "epoch": 7.168, "grad_norm": 1.3828125, "kl": 1.0421361923217773, "learning_rate": 4.9714192218781316e-05, "loss": 0.0417, "reward": 1.350337564945221, "reward_std": 2.1678454279899597, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5694657564163208, "rewards/no_repetition_reward_func": -0.21912814676761627, "rewards/verse_reward_func": 0.0, "step": 896 }, { "completion_length": 256.0, "epoch": 7.176, "grad_norm": 1.6875, "kl": 1.6820430755615234, "learning_rate": 4.9712083110060556e-05, "loss": 0.0673, "reward": 1.1812028884887695, "reward_std": 2.0498868823051453, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4465532898902893, "rewards/no_repetition_reward_func": -0.2419128566980362, "rewards/verse_reward_func": -0.0234375, "step": 897 }, { "completion_length": 248.515625, "epoch": 7.184, "grad_norm": 0.98828125, "kl": 1.7426720261573792, "learning_rate": 4.9709966292943455e-05, "loss": 0.0697, "reward": 1.0817338824272156, "reward_std": 1.865894615650177, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3330947756767273, "rewards/no_repetition_reward_func": -0.23573587089776993, "rewards/verse_reward_func": -0.015625, "step": 898 }, { "completion_length": 255.3125, "epoch": 7.192, "grad_norm": 51.25, "kl": 3.0937424898147583, "learning_rate": 4.9707841768090314e-05, "loss": 0.1237, "reward": 0.9867993593215942, "reward_std": 1.9745967388153076, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2503898441791534, "rewards/no_repetition_reward_func": -0.21671545505523682, "rewards/verse_reward_func": -0.046875, "step": 899 }, { "completion_length": 255.859375, "epoch": 7.2, "grad_norm": 1.4375, "kl": 2.1351219415664673, "learning_rate": 4.9705709536163824e-05, "loss": 0.0854, "reward": 1.1173745393753052, "reward_std": 1.9624792337417603, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3164197206497192, "rewards/no_repetition_reward_func": -0.19904520362615585, "rewards/verse_reward_func": 0.0, "step": 900 }, { "completion_length": 255.578125, "epoch": 7.208, "grad_norm": 1.609375, "kl": 1.8043521046638489, "learning_rate": 4.970356959782909e-05, "loss": 0.0722, "reward": 0.793923944234848, "reward_std": 1.8022722601890564, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.020615577697754, "rewards/no_repetition_reward_func": -0.2110666111111641, "rewards/verse_reward_func": -0.015625, "step": 901 }, { "completion_length": 256.0, "epoch": 7.216, "grad_norm": 2.125, "kl": 1.5445780158042908, "learning_rate": 4.970142195375363e-05, "loss": 0.0618, "reward": 0.8362632542848587, "reward_std": 1.5947047472000122, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1047805547714233, "rewards/no_repetition_reward_func": -0.23726734519004822, "rewards/verse_reward_func": -0.03125, "step": 902 }, { "completion_length": 252.015625, "epoch": 7.224, "grad_norm": 1.09375, "kl": 1.5856103301048279, "learning_rate": 4.9699266604607355e-05, "loss": 0.0634, "reward": 0.8616461753845215, "reward_std": 1.6990668773651123, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.092407613992691, "rewards/no_repetition_reward_func": -0.21513642370700836, "rewards/verse_reward_func": -0.015625, "step": 903 }, { "completion_length": 256.0, "epoch": 7.232, "grad_norm": 0.9375, "kl": 1.0569016337394714, "learning_rate": 4.9697103551062556e-05, "loss": 0.0423, "reward": 0.7486406862735748, "reward_std": 1.736048400402069, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.0149555206298828, "rewards/no_repetition_reward_func": -0.2506897896528244, "rewards/verse_reward_func": -0.015625, "step": 904 }, { "completion_length": 247.78125, "epoch": 7.24, "grad_norm": 1.2734375, "kl": 0.8278602957725525, "learning_rate": 4.969493279379398e-05, "loss": 0.0331, "reward": 1.6774433851242065, "reward_std": 2.175652503967285, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.942798376083374, "rewards/no_repetition_reward_func": -0.24973005056381226, "rewards/verse_reward_func": -0.015625, "step": 905 }, { "completion_length": 249.5625, "epoch": 7.248, "grad_norm": 1.8203125, "kl": 0.8024688065052032, "learning_rate": 4.969275433347872e-05, "loss": 0.0321, "reward": 1.0789653062820435, "reward_std": 1.9322351217269897, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2877301573753357, "rewards/no_repetition_reward_func": -0.20876483619213104, "rewards/verse_reward_func": 0.0, "step": 906 }, { "completion_length": 252.734375, "epoch": 7.256, "grad_norm": 1.75, "kl": 0.8559224009513855, "learning_rate": 4.969056817079633e-05, "loss": 0.0342, "reward": 1.2957186698913574, "reward_std": 1.8711894750595093, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5124266147613525, "rewards/no_repetition_reward_func": -0.21670803427696228, "rewards/verse_reward_func": 0.0, "step": 907 }, { "completion_length": 251.734375, "epoch": 7.264, "grad_norm": 1.40625, "kl": 0.9526568353176117, "learning_rate": 4.9688374306428696e-05, "loss": 0.0381, "reward": 1.1317813992500305, "reward_std": 2.1990281343460083, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3443827033042908, "rewards/no_repetition_reward_func": -0.19697627425193787, "rewards/verse_reward_func": -0.015625, "step": 908 }, { "completion_length": 256.0, "epoch": 7.272, "grad_norm": 1.890625, "kl": 0.7367254495620728, "learning_rate": 4.968617274106019e-05, "loss": 0.0295, "reward": 0.855340838432312, "reward_std": 1.9944617748260498, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1222111284732819, "rewards/no_repetition_reward_func": -0.24343270063400269, "rewards/verse_reward_func": -0.0234375, "step": 909 }, { "completion_length": 253.28125, "epoch": 7.28, "grad_norm": 1.03125, "kl": 0.9893233776092529, "learning_rate": 4.968396347537751e-05, "loss": 0.0396, "reward": 1.2158559560775757, "reward_std": 2.09521484375, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4194748401641846, "rewards/no_repetition_reward_func": -0.19580641388893127, "rewards/verse_reward_func": -0.0078125, "step": 910 }, { "completion_length": 252.65625, "epoch": 7.288, "grad_norm": 1.90625, "kl": 1.8227573037147522, "learning_rate": 4.9681746510069805e-05, "loss": 0.0729, "reward": 0.9441046118736267, "reward_std": 1.9342188239097595, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1583767533302307, "rewards/no_repetition_reward_func": -0.2142721563577652, "rewards/verse_reward_func": 0.0, "step": 911 }, { "completion_length": 251.109375, "epoch": 7.296, "grad_norm": 1.390625, "kl": 1.618133544921875, "learning_rate": 4.9679521845828604e-05, "loss": 0.0647, "reward": 1.0648987591266632, "reward_std": 1.9707379341125488, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.302627682685852, "rewards/no_repetition_reward_func": -0.20647889375686646, "rewards/verse_reward_func": -0.03125, "step": 912 }, { "completion_length": 251.796875, "epoch": 7.304, "grad_norm": 1.2890625, "kl": 1.4640384912490845, "learning_rate": 4.967728948334784e-05, "loss": 0.0586, "reward": 0.6170102655887604, "reward_std": 1.321679949760437, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.8481419682502747, "rewards/no_repetition_reward_func": -0.2076941877603531, "rewards/verse_reward_func": -0.0234375, "step": 913 }, { "completion_length": 247.1875, "epoch": 7.312, "grad_norm": 0.7578125, "kl": 1.1338188350200653, "learning_rate": 4.967504942332385e-05, "loss": 0.0454, "reward": 1.1620409488677979, "reward_std": 1.9730304479599, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3945574164390564, "rewards/no_repetition_reward_func": -0.23251652717590332, "rewards/verse_reward_func": 0.0, "step": 914 }, { "completion_length": 256.0, "epoch": 7.32, "grad_norm": 1.109375, "kl": 1.024137020111084, "learning_rate": 4.967280166645538e-05, "loss": 0.041, "reward": 0.9321473240852356, "reward_std": 1.7971391677856445, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1529276967048645, "rewards/no_repetition_reward_func": -0.2207803577184677, "rewards/verse_reward_func": 0.0, "step": 915 }, { "completion_length": 252.3125, "epoch": 7.328, "grad_norm": 1.0703125, "kl": 1.0498508214950562, "learning_rate": 4.967054621344356e-05, "loss": 0.042, "reward": 1.3738905787467957, "reward_std": 1.9364690780639648, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6475068926811218, "rewards/no_repetition_reward_func": -0.2579915151000023, "rewards/verse_reward_func": -0.015625, "step": 916 }, { "completion_length": 247.265625, "epoch": 7.336, "grad_norm": 0.92578125, "kl": 1.0707257986068726, "learning_rate": 4.966828306499193e-05, "loss": 0.0428, "reward": 0.874647319316864, "reward_std": 1.5076842308044434, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.094356656074524, "rewards/no_repetition_reward_func": -0.21970932930707932, "rewards/verse_reward_func": 0.0, "step": 917 }, { "completion_length": 255.375, "epoch": 7.344, "grad_norm": 1.0625, "kl": 1.2022607326507568, "learning_rate": 4.9666012221806434e-05, "loss": 0.0481, "reward": 1.2985407710075378, "reward_std": 2.1313997507095337, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.564548909664154, "rewards/no_repetition_reward_func": -0.2503831684589386, "rewards/verse_reward_func": -0.015625, "step": 918 }, { "completion_length": 248.828125, "epoch": 7.352, "grad_norm": 1.0390625, "kl": 0.9649212956428528, "learning_rate": 4.966373368459541e-05, "loss": 0.0386, "reward": 1.066928505897522, "reward_std": 2.1434061527252197, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3156393468379974, "rewards/no_repetition_reward_func": -0.2252732813358307, "rewards/verse_reward_func": -0.0234375, "step": 919 }, { "completion_length": 249.125, "epoch": 7.36, "grad_norm": 0.8828125, "kl": 1.5497153997421265, "learning_rate": 4.966144745406961e-05, "loss": 0.062, "reward": 0.9348263144493103, "reward_std": 1.8412669897079468, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1581050753593445, "rewards/no_repetition_reward_func": -0.2076537311077118, "rewards/verse_reward_func": -0.015625, "step": 920 }, { "completion_length": 252.734375, "epoch": 7.368, "grad_norm": 2.640625, "kl": 2.3525519371032715, "learning_rate": 4.965915353094215e-05, "loss": 0.0941, "reward": 1.2438867092132568, "reward_std": 2.007694125175476, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4675945043563843, "rewards/no_repetition_reward_func": -0.21589531004428864, "rewards/verse_reward_func": -0.0078125, "step": 921 }, { "completion_length": 253.359375, "epoch": 7.376, "grad_norm": 1.2578125, "kl": 1.7092148065567017, "learning_rate": 4.965685191592859e-05, "loss": 0.0684, "reward": 1.0075548887252808, "reward_std": 1.7558658719062805, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2450904846191406, "rewards/no_repetition_reward_func": -0.22972308099269867, "rewards/verse_reward_func": -0.0078125, "step": 922 }, { "completion_length": 256.0, "epoch": 7.384, "grad_norm": 2.96875, "kl": 2.4633710980415344, "learning_rate": 4.965454260974685e-05, "loss": 0.0985, "reward": 1.2219523787498474, "reward_std": 1.7698557376861572, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4928018450737, "rewards/no_repetition_reward_func": -0.24741198867559433, "rewards/verse_reward_func": -0.0234375, "step": 923 }, { "completion_length": 250.1875, "epoch": 7.392, "grad_norm": 2.296875, "kl": 1.7692094445228577, "learning_rate": 4.9652225613117284e-05, "loss": 0.0708, "reward": 1.0681825876235962, "reward_std": 1.6751796007156372, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2764020562171936, "rewards/no_repetition_reward_func": -0.2004069685935974, "rewards/verse_reward_func": -0.0078125, "step": 924 }, { "completion_length": 252.640625, "epoch": 7.4, "grad_norm": 2.109375, "kl": 1.7070144414901733, "learning_rate": 4.964990092676263e-05, "loss": 0.0683, "reward": 1.284591794013977, "reward_std": 1.7982569932937622, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5527055859565735, "rewards/no_repetition_reward_func": -0.25248880684375763, "rewards/verse_reward_func": -0.015625, "step": 925 }, { "completion_length": 254.453125, "epoch": 7.408, "grad_norm": 0.87890625, "kl": 1.2596924006938934, "learning_rate": 4.964756855140801e-05, "loss": 0.0504, "reward": 1.5267031788825989, "reward_std": 2.5037894248962402, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7684226036071777, "rewards/no_repetition_reward_func": -0.2417193204164505, "rewards/verse_reward_func": 0.0, "step": 926 }, { "completion_length": 253.859375, "epoch": 7.416, "grad_norm": 1.9375, "kl": 1.9057604670524597, "learning_rate": 4.964522848778096e-05, "loss": 0.0762, "reward": 0.8833639919757843, "reward_std": 1.8436721563339233, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1595720052719116, "rewards/no_repetition_reward_func": -0.2683955430984497, "rewards/verse_reward_func": -0.0078125, "step": 927 }, { "completion_length": 252.65625, "epoch": 7.424, "grad_norm": 1.03125, "kl": 1.3426657915115356, "learning_rate": 4.964288073661142e-05, "loss": 0.0537, "reward": 1.040624588727951, "reward_std": 1.8013249039649963, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2657508254051208, "rewards/no_repetition_reward_func": -0.22512617707252502, "rewards/verse_reward_func": 0.0, "step": 928 }, { "completion_length": 253.140625, "epoch": 7.432, "grad_norm": 1.8203125, "kl": 1.3455710411071777, "learning_rate": 4.964052529863171e-05, "loss": 0.0538, "reward": 1.8077496886253357, "reward_std": 2.4333033561706543, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.046558976173401, "rewards/no_repetition_reward_func": -0.2309967651963234, "rewards/verse_reward_func": -0.0078125, "step": 929 }, { "completion_length": 250.78125, "epoch": 7.44, "grad_norm": 0.828125, "kl": 1.558940052986145, "learning_rate": 4.963816217457657e-05, "loss": 0.0624, "reward": 1.1396155953407288, "reward_std": 2.096994161605835, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.369799017906189, "rewards/no_repetition_reward_func": -0.23018334805965424, "rewards/verse_reward_func": 0.0, "step": 930 }, { "completion_length": 252.203125, "epoch": 7.448, "grad_norm": 1.3671875, "kl": 1.6029589176177979, "learning_rate": 4.963579136518312e-05, "loss": 0.0641, "reward": 1.5633243918418884, "reward_std": 2.3883012533187866, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8315709233283997, "rewards/no_repetition_reward_func": -0.25262145698070526, "rewards/verse_reward_func": -0.015625, "step": 931 }, { "completion_length": 251.390625, "epoch": 7.456, "grad_norm": 1.0625, "kl": 1.6558680534362793, "learning_rate": 4.9633412871190873e-05, "loss": 0.0662, "reward": 1.1355789601802826, "reward_std": 1.89902263879776, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3600240349769592, "rewards/no_repetition_reward_func": -0.2244449481368065, "rewards/verse_reward_func": 0.0, "step": 932 }, { "completion_length": 253.75, "epoch": 7.464, "grad_norm": 0.9921875, "kl": 1.6832175254821777, "learning_rate": 4.9631026693341764e-05, "loss": 0.0673, "reward": 1.073776513338089, "reward_std": 1.9382247924804688, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3139886856079102, "rewards/no_repetition_reward_func": -0.24021226912736893, "rewards/verse_reward_func": 0.0, "step": 933 }, { "completion_length": 246.109375, "epoch": 7.4719999999999995, "grad_norm": 2.140625, "kl": 1.888257622718811, "learning_rate": 4.96286328323801e-05, "loss": 0.0755, "reward": 2.0078715682029724, "reward_std": 2.8708481788635254, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.285566568374634, "rewards/no_repetition_reward_func": -0.26988257467746735, "rewards/verse_reward_func": -0.0078125, "step": 934 }, { "completion_length": 253.296875, "epoch": 7.48, "grad_norm": 1.6015625, "kl": 1.8511589169502258, "learning_rate": 4.9626231289052596e-05, "loss": 0.074, "reward": 1.2880651950836182, "reward_std": 2.1595195531845093, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.583446741104126, "rewards/no_repetition_reward_func": -0.2719440460205078, "rewards/verse_reward_func": -0.0234375, "step": 935 }, { "completion_length": 253.28125, "epoch": 7.4879999999999995, "grad_norm": 1.6796875, "kl": 2.2750595808029175, "learning_rate": 4.9623822064108364e-05, "loss": 0.091, "reward": 1.6215291023254395, "reward_std": 2.353622317314148, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8794309496879578, "rewards/no_repetition_reward_func": -0.24227695167064667, "rewards/verse_reward_func": -0.015625, "step": 936 }, { "completion_length": 256.0, "epoch": 7.496, "grad_norm": 1.53125, "kl": 1.9913344383239746, "learning_rate": 4.96214051582989e-05, "loss": 0.0797, "reward": 1.2141138911247253, "reward_std": 1.9108680486679077, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4834462404251099, "rewards/no_repetition_reward_func": -0.25370731949806213, "rewards/verse_reward_func": -0.015625, "step": 937 }, { "completion_length": 251.734375, "epoch": 7.504, "grad_norm": 2.671875, "kl": 1.9286339282989502, "learning_rate": 4.96189805723781e-05, "loss": 0.0771, "reward": 1.284025400876999, "reward_std": 2.4793719053268433, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4891839027404785, "rewards/no_repetition_reward_func": -0.20515847951173782, "rewards/verse_reward_func": 0.0, "step": 938 }, { "completion_length": 252.765625, "epoch": 7.5120000000000005, "grad_norm": 1.9453125, "kl": 2.669082522392273, "learning_rate": 4.961654830710229e-05, "loss": 0.1068, "reward": 0.9572393894195557, "reward_std": 1.8904012441635132, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1526327729225159, "rewards/no_repetition_reward_func": -0.1953933909535408, "rewards/verse_reward_func": 0.0, "step": 939 }, { "completion_length": 253.375, "epoch": 7.52, "grad_norm": 1.0234375, "kl": 1.7103662490844727, "learning_rate": 4.9614108363230135e-05, "loss": 0.0684, "reward": 1.6451933979988098, "reward_std": 2.574695110321045, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9455305337905884, "rewards/no_repetition_reward_func": -0.28471215069293976, "rewards/verse_reward_func": -0.015625, "step": 940 }, { "completion_length": 253.859375, "epoch": 7.5280000000000005, "grad_norm": 1.9609375, "kl": 1.9833675026893616, "learning_rate": 4.961166074152274e-05, "loss": 0.0793, "reward": 1.1353576183319092, "reward_std": 1.7001217603683472, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3399677276611328, "rewards/no_repetition_reward_func": -0.18898503482341766, "rewards/verse_reward_func": -0.015625, "step": 941 }, { "completion_length": 251.234375, "epoch": 7.536, "grad_norm": 1.0390625, "kl": 1.8694941401481628, "learning_rate": 4.9609205442743566e-05, "loss": 0.0748, "reward": 1.4819597005844116, "reward_std": 2.3599607944488525, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.731626570224762, "rewards/no_repetition_reward_func": -0.2262292429804802, "rewards/verse_reward_func": -0.0234375, "step": 942 }, { "completion_length": 251.03125, "epoch": 7.5440000000000005, "grad_norm": 1.1796875, "kl": 1.9319854974746704, "learning_rate": 4.960674246765851e-05, "loss": 0.0773, "reward": 1.7231401205062866, "reward_std": 2.3971134424209595, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9617294073104858, "rewards/no_repetition_reward_func": -0.23858927935361862, "rewards/verse_reward_func": 0.0, "step": 943 }, { "completion_length": 250.328125, "epoch": 7.552, "grad_norm": 1.3125, "kl": 1.3292348980903625, "learning_rate": 4.9604271817035834e-05, "loss": 0.0532, "reward": 0.9299003779888153, "reward_std": 1.7645471692085266, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1474903225898743, "rewards/no_repetition_reward_func": -0.217589870095253, "rewards/verse_reward_func": 0.0, "step": 944 }, { "completion_length": 251.734375, "epoch": 7.5600000000000005, "grad_norm": 1.1484375, "kl": 1.5646055936813354, "learning_rate": 4.960179349164621e-05, "loss": 0.0626, "reward": 1.5465738773345947, "reward_std": 2.11781507730484, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8025093078613281, "rewards/no_repetition_reward_func": -0.255935363471508, "rewards/verse_reward_func": 0.0, "step": 945 }, { "completion_length": 249.328125, "epoch": 7.568, "grad_norm": 1.171875, "kl": 1.7956596612930298, "learning_rate": 4.959930749226269e-05, "loss": 0.0718, "reward": 1.1023371815681458, "reward_std": 1.926612913608551, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3277864456176758, "rewards/no_repetition_reward_func": -0.20982428640127182, "rewards/verse_reward_func": -0.015625, "step": 946 }, { "completion_length": 249.890625, "epoch": 7.576, "grad_norm": 1.3203125, "kl": 1.9088272452354431, "learning_rate": 4.959681381966073e-05, "loss": 0.0764, "reward": 1.0308920443058014, "reward_std": 1.912396788597107, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2716851234436035, "rewards/no_repetition_reward_func": -0.23298054188489914, "rewards/verse_reward_func": -0.0078125, "step": 947 }, { "completion_length": 250.484375, "epoch": 7.584, "grad_norm": 1.1015625, "kl": 1.5014382600784302, "learning_rate": 4.9594312474618175e-05, "loss": 0.0601, "reward": 1.5578564405441284, "reward_std": 2.262612760066986, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8171193599700928, "rewards/no_repetition_reward_func": -0.25926291197538376, "rewards/verse_reward_func": 0.0, "step": 948 }, { "completion_length": 252.9375, "epoch": 7.592, "grad_norm": 1.859375, "kl": 1.6291932463645935, "learning_rate": 4.959180345791528e-05, "loss": 0.0652, "reward": 1.7352071404457092, "reward_std": 2.6478198766708374, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0060993432998657, "rewards/no_repetition_reward_func": -0.2630797326564789, "rewards/verse_reward_func": -0.0078125, "step": 949 }, { "completion_length": 247.375, "epoch": 7.6, "grad_norm": 1.3515625, "kl": 1.9182549715042114, "learning_rate": 4.9589286770334654e-05, "loss": 0.0767, "reward": 1.4209169149398804, "reward_std": 2.5419150590896606, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6906019449234009, "rewards/no_repetition_reward_func": -0.23843510448932648, "rewards/verse_reward_func": -0.03125, "step": 950 }, { "completion_length": 254.203125, "epoch": 7.608, "grad_norm": 0.99609375, "kl": 1.5815243124961853, "learning_rate": 4.9586762412661333e-05, "loss": 0.0633, "reward": 0.9976861774921417, "reward_std": 1.7780057191848755, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2344184517860413, "rewards/no_repetition_reward_func": -0.2289198711514473, "rewards/verse_reward_func": -0.0078125, "step": 951 }, { "completion_length": 247.625, "epoch": 7.616, "grad_norm": 1.28125, "kl": 2.1573779582977295, "learning_rate": 4.958423038568274e-05, "loss": 0.0863, "reward": 1.0946545600891113, "reward_std": 1.7524455785751343, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3233542442321777, "rewards/no_repetition_reward_func": -0.20526223629713058, "rewards/verse_reward_func": -0.0234375, "step": 952 }, { "completion_length": 245.5625, "epoch": 7.624, "grad_norm": 1.2734375, "kl": 2.0097737312316895, "learning_rate": 4.958169069018869e-05, "loss": 0.0804, "reward": 1.240204781293869, "reward_std": 2.095954656600952, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4682055115699768, "rewards/no_repetition_reward_func": -0.21237574517726898, "rewards/verse_reward_func": -0.015625, "step": 953 }, { "completion_length": 250.296875, "epoch": 7.632, "grad_norm": 1.5859375, "kl": 1.4694258570671082, "learning_rate": 4.957914332697137e-05, "loss": 0.0588, "reward": 1.9587507247924805, "reward_std": 2.7254321575164795, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.241810441017151, "rewards/no_repetition_reward_func": -0.2830597311258316, "rewards/verse_reward_func": 0.0, "step": 954 }, { "completion_length": 254.203125, "epoch": 7.64, "grad_norm": 1.2109375, "kl": 2.132238507270813, "learning_rate": 4.9576588296825386e-05, "loss": 0.0853, "reward": 1.2579952776432037, "reward_std": 2.2891998887062073, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5001490712165833, "rewards/no_repetition_reward_func": -0.23434127867221832, "rewards/verse_reward_func": -0.0078125, "step": 955 }, { "completion_length": 251.703125, "epoch": 7.648, "grad_norm": 1.6796875, "kl": 2.290048122406006, "learning_rate": 4.957402560054773e-05, "loss": 0.0916, "reward": 0.9112907946109772, "reward_std": 1.8982796669006348, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.185679852962494, "rewards/no_repetition_reward_func": -0.2743890583515167, "rewards/verse_reward_func": 0.0, "step": 956 }, { "completion_length": 249.59375, "epoch": 7.656, "grad_norm": 1.8671875, "kl": 2.580196976661682, "learning_rate": 4.957145523893776e-05, "loss": 0.1032, "reward": 1.0636639595031738, "reward_std": 1.8405083417892456, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2993430495262146, "rewards/no_repetition_reward_func": -0.21224157512187958, "rewards/verse_reward_func": -0.0234375, "step": 957 }, { "completion_length": 248.546875, "epoch": 7.664, "grad_norm": 2.265625, "kl": 2.3819459676742554, "learning_rate": 4.956887721279726e-05, "loss": 0.0953, "reward": 1.2139822840690613, "reward_std": 1.8791494369506836, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4084305167198181, "rewards/no_repetition_reward_func": -0.17882321774959564, "rewards/verse_reward_func": -0.015625, "step": 958 }, { "completion_length": 244.765625, "epoch": 7.672, "grad_norm": 1.015625, "kl": 1.918041706085205, "learning_rate": 4.9566291522930375e-05, "loss": 0.0767, "reward": 1.3633157014846802, "reward_std": 2.1778979301452637, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6393109261989594, "rewards/no_repetition_reward_func": -0.2447451800107956, "rewards/verse_reward_func": -0.03125, "step": 959 }, { "completion_length": 246.828125, "epoch": 7.68, "grad_norm": 1.53125, "kl": 1.1239044070243835, "learning_rate": 4.9563698170143666e-05, "loss": 0.045, "reward": 1.5356465578079224, "reward_std": 2.2494970560073853, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8397571444511414, "rewards/no_repetition_reward_func": -0.2962980940937996, "rewards/verse_reward_func": -0.0078125, "step": 960 }, { "completion_length": 249.484375, "epoch": 7.688, "grad_norm": 1.21875, "kl": 1.4795981049537659, "learning_rate": 4.956109715524608e-05, "loss": 0.0592, "reward": 1.3513786792755127, "reward_std": 2.1870534420013428, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6059293746948242, "rewards/no_repetition_reward_func": -0.23892565071582794, "rewards/verse_reward_func": -0.015625, "step": 961 }, { "completion_length": 255.015625, "epoch": 7.696, "grad_norm": 1.0625, "kl": 1.6652514338493347, "learning_rate": 4.955848847904894e-05, "loss": 0.0666, "reward": 1.6966251730918884, "reward_std": 2.3639793395996094, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9387180805206299, "rewards/no_repetition_reward_func": -0.24209284037351608, "rewards/verse_reward_func": 0.0, "step": 962 }, { "completion_length": 246.59375, "epoch": 7.704, "grad_norm": 2.359375, "kl": 2.7277162075042725, "learning_rate": 4.9555872142365945e-05, "loss": 0.1091, "reward": 1.068219542503357, "reward_std": 1.854051947593689, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2590786218643188, "rewards/no_repetition_reward_func": -0.1908591017127037, "rewards/verse_reward_func": 0.0, "step": 963 }, { "completion_length": 255.265625, "epoch": 7.712, "grad_norm": 4.46875, "kl": 3.5824004411697388, "learning_rate": 4.955324814601324e-05, "loss": 0.1433, "reward": 1.341755211353302, "reward_std": 1.9450767040252686, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.586718589067459, "rewards/no_repetition_reward_func": -0.2371508702635765, "rewards/verse_reward_func": -0.0078125, "step": 964 }, { "completion_length": 251.265625, "epoch": 7.72, "grad_norm": 1.484375, "kl": 2.2429245710372925, "learning_rate": 4.95506164908093e-05, "loss": 0.0897, "reward": 1.0681340396404266, "reward_std": 1.6783549785614014, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.27149498462677, "rewards/no_repetition_reward_func": -0.19554845243692398, "rewards/verse_reward_func": -0.0078125, "step": 965 }, { "completion_length": 255.078125, "epoch": 7.728, "grad_norm": 1.28125, "kl": 2.4161852598190308, "learning_rate": 4.9547977177575014e-05, "loss": 0.0966, "reward": 1.3886639475822449, "reward_std": 2.070668935775757, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6368716955184937, "rewards/no_repetition_reward_func": -0.2325827181339264, "rewards/verse_reward_func": -0.015625, "step": 966 }, { "completion_length": 252.15625, "epoch": 7.736, "grad_norm": 1.4296875, "kl": 2.348904609680176, "learning_rate": 4.9545330207133664e-05, "loss": 0.094, "reward": 1.529417872428894, "reward_std": 2.518564224243164, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7518261671066284, "rewards/no_repetition_reward_func": -0.22240828722715378, "rewards/verse_reward_func": 0.0, "step": 967 }, { "completion_length": 253.109375, "epoch": 7.744, "grad_norm": 1.359375, "kl": 1.886633276939392, "learning_rate": 4.954267558031092e-05, "loss": 0.0755, "reward": 1.226347029209137, "reward_std": 2.107405126094818, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4502041935920715, "rewards/no_repetition_reward_func": -0.21604468673467636, "rewards/verse_reward_func": -0.0078125, "step": 968 }, { "completion_length": 249.484375, "epoch": 7.752, "grad_norm": 1.1640625, "kl": 1.952187180519104, "learning_rate": 4.9540013297934826e-05, "loss": 0.0781, "reward": 1.5704030990600586, "reward_std": 2.081385016441345, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.793421983718872, "rewards/no_repetition_reward_func": -0.2073938027024269, "rewards/verse_reward_func": -0.015625, "step": 969 }, { "completion_length": 242.25, "epoch": 7.76, "grad_norm": 2.859375, "kl": 2.553574800491333, "learning_rate": 4.953734336083583e-05, "loss": 0.1021, "reward": 0.876029372215271, "reward_std": 1.7609411478042603, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.0952139794826508, "rewards/no_repetition_reward_func": -0.19574709236621857, "rewards/verse_reward_func": -0.0234375, "step": 970 }, { "completion_length": 254.8125, "epoch": 7.768, "grad_norm": 0.97265625, "kl": 1.7756646275520325, "learning_rate": 4.953466576984675e-05, "loss": 0.071, "reward": 0.9416195452213287, "reward_std": 1.904327154159546, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1693038046360016, "rewards/no_repetition_reward_func": -0.22768420726060867, "rewards/verse_reward_func": 0.0, "step": 971 }, { "completion_length": 251.3125, "epoch": 7.776, "grad_norm": 1.1640625, "kl": 2.409230947494507, "learning_rate": 4.953198052580281e-05, "loss": 0.0964, "reward": 1.093723624944687, "reward_std": 1.9783670902252197, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3162738978862762, "rewards/no_repetition_reward_func": -0.206925168633461, "rewards/verse_reward_func": -0.015625, "step": 972 }, { "completion_length": 240.359375, "epoch": 7.784, "grad_norm": 1.5, "kl": 1.903691828250885, "learning_rate": 4.952928762954161e-05, "loss": 0.0761, "reward": 1.4235520958900452, "reward_std": 2.355111002922058, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6646637320518494, "rewards/no_repetition_reward_func": -0.22548670321702957, "rewards/verse_reward_func": -0.015625, "step": 973 }, { "completion_length": 256.0, "epoch": 7.792, "grad_norm": 2.109375, "kl": 1.9986734986305237, "learning_rate": 4.9526587081903145e-05, "loss": 0.0799, "reward": 1.401249349117279, "reward_std": 2.2018154859542847, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6703412532806396, "rewards/no_repetition_reward_func": -0.2534667178988457, "rewards/verse_reward_func": -0.015625, "step": 974 }, { "completion_length": 252.859375, "epoch": 7.8, "grad_norm": 1.6875, "kl": 1.554544448852539, "learning_rate": 4.952387888372979e-05, "loss": 0.0622, "reward": 1.9162665605545044, "reward_std": 2.352985680103302, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.17028272151947, "rewards/no_repetition_reward_func": -0.25401605665683746, "rewards/verse_reward_func": 0.0, "step": 975 }, { "completion_length": 239.71875, "epoch": 7.808, "grad_norm": 1.78125, "kl": 2.4385889172554016, "learning_rate": 4.952116303586631e-05, "loss": 0.0975, "reward": 0.5943225026130676, "reward_std": 1.4196311235427856, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.7981665730476379, "rewards/no_repetition_reward_func": -0.1725940853357315, "rewards/verse_reward_func": -0.03125, "step": 976 }, { "completion_length": 247.078125, "epoch": 7.816, "grad_norm": 0.8828125, "kl": 1.6003650426864624, "learning_rate": 4.951843953915985e-05, "loss": 0.064, "reward": 1.4198012351989746, "reward_std": 2.3520020246505737, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6765921711921692, "rewards/no_repetition_reward_func": -0.24897851049900055, "rewards/verse_reward_func": -0.0078125, "step": 977 }, { "completion_length": 247.109375, "epoch": 7.824, "grad_norm": 1.203125, "kl": 2.02906733751297, "learning_rate": 4.951570839445995e-05, "loss": 0.0812, "reward": 1.3608390092849731, "reward_std": 2.395640254020691, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.637468934059143, "rewards/no_repetition_reward_func": -0.24537988007068634, "rewards/verse_reward_func": -0.03125, "step": 978 }, { "completion_length": 253.28125, "epoch": 7.832, "grad_norm": 2.515625, "kl": 3.031827211380005, "learning_rate": 4.951296960261853e-05, "loss": 0.1213, "reward": 1.3307018876075745, "reward_std": 2.0265473127365112, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.586198627948761, "rewards/no_repetition_reward_func": -0.23987183719873428, "rewards/verse_reward_func": -0.015625, "step": 979 }, { "completion_length": 246.6875, "epoch": 7.84, "grad_norm": 2.078125, "kl": 2.5738492012023926, "learning_rate": 4.95102231644899e-05, "loss": 0.103, "reward": 1.5103654861450195, "reward_std": 2.7099368572235107, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8163443207740784, "rewards/no_repetition_reward_func": -0.2669164314866066, "rewards/verse_reward_func": -0.0390625, "step": 980 }, { "completion_length": 245.328125, "epoch": 7.848, "grad_norm": 1.75, "kl": 2.447508931159973, "learning_rate": 4.9507469080930734e-05, "loss": 0.0979, "reward": 1.7979114055633545, "reward_std": 2.529129981994629, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0699830651283264, "rewards/no_repetition_reward_func": -0.2564467266201973, "rewards/verse_reward_func": -0.015625, "step": 981 }, { "completion_length": 256.0, "epoch": 7.856, "grad_norm": 1.109375, "kl": 2.2522255778312683, "learning_rate": 4.9504707352800125e-05, "loss": 0.0901, "reward": 1.4444116353988647, "reward_std": 2.1904162168502808, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7142583131790161, "rewards/no_repetition_reward_func": -0.25422173738479614, "rewards/verse_reward_func": -0.015625, "step": 982 }, { "completion_length": 247.65625, "epoch": 7.864, "grad_norm": 5.75, "kl": 3.35110604763031, "learning_rate": 4.9501937980959545e-05, "loss": 0.134, "reward": 1.628995418548584, "reward_std": 2.5862799882888794, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.910351812839508, "rewards/no_repetition_reward_func": -0.2501063793897629, "rewards/verse_reward_func": -0.03125, "step": 983 }, { "completion_length": 252.1875, "epoch": 7.872, "grad_norm": 4.75, "kl": 3.7082040309906006, "learning_rate": 4.949916096627282e-05, "loss": 0.1483, "reward": 1.206632673740387, "reward_std": 2.2120320796966553, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4566166400909424, "rewards/no_repetition_reward_func": -0.2187339961528778, "rewards/verse_reward_func": -0.03125, "step": 984 }, { "completion_length": 247.859375, "epoch": 7.88, "grad_norm": 1.5703125, "kl": 1.867096722126007, "learning_rate": 4.949637630960617e-05, "loss": 0.0747, "reward": 1.523249864578247, "reward_std": 2.2365838289260864, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7789702415466309, "rewards/no_repetition_reward_func": -0.24009543657302856, "rewards/verse_reward_func": -0.015625, "step": 985 }, { "completion_length": 244.109375, "epoch": 7.888, "grad_norm": 2.265625, "kl": 2.201767086982727, "learning_rate": 4.949358401182824e-05, "loss": 0.0881, "reward": 1.0996905267238617, "reward_std": 1.8654215335845947, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3190231919288635, "rewards/no_repetition_reward_func": -0.21933263540267944, "rewards/verse_reward_func": 0.0, "step": 986 }, { "completion_length": 248.71875, "epoch": 7.896, "grad_norm": 2.578125, "kl": 2.522704243659973, "learning_rate": 4.949078407381e-05, "loss": 0.1009, "reward": 1.1908366978168488, "reward_std": 2.256490707397461, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4508190751075745, "rewards/no_repetition_reward_func": -0.22091986238956451, "rewards/verse_reward_func": -0.0390625, "step": 987 }, { "completion_length": 256.0, "epoch": 7.904, "grad_norm": 3.3125, "kl": 2.6181461811065674, "learning_rate": 4.948797649642484e-05, "loss": 0.1047, "reward": 0.9868910908699036, "reward_std": 1.4774927496910095, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2124682664871216, "rewards/no_repetition_reward_func": -0.20995217561721802, "rewards/verse_reward_func": -0.015625, "step": 988 }, { "completion_length": 252.703125, "epoch": 7.912, "grad_norm": 2.0625, "kl": 1.3931060433387756, "learning_rate": 4.948516128054852e-05, "loss": 0.0557, "reward": 1.544028401374817, "reward_std": 2.5446319580078125, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8273049592971802, "rewards/no_repetition_reward_func": -0.2520265430212021, "rewards/verse_reward_func": -0.03125, "step": 989 }, { "completion_length": 249.109375, "epoch": 7.92, "grad_norm": 1.1171875, "kl": 1.5042359232902527, "learning_rate": 4.948233842705919e-05, "loss": 0.0602, "reward": 1.0979307293891907, "reward_std": 2.1331995725631714, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3977090120315552, "rewards/no_repetition_reward_func": -0.2685282826423645, "rewards/verse_reward_func": -0.03125, "step": 990 }, { "completion_length": 249.8125, "epoch": 7.928, "grad_norm": 0.8359375, "kl": 1.846281111240387, "learning_rate": 4.9479507936837364e-05, "loss": 0.0739, "reward": 1.333755612373352, "reward_std": 1.768705427646637, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5827407836914062, "rewards/no_repetition_reward_func": -0.23336021602153778, "rewards/verse_reward_func": -0.015625, "step": 991 }, { "completion_length": 252.140625, "epoch": 7.936, "grad_norm": 1.7421875, "kl": 2.0539779663085938, "learning_rate": 4.947666981076597e-05, "loss": 0.0822, "reward": 1.3430189490318298, "reward_std": 2.537106156349182, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5928085446357727, "rewards/no_repetition_reward_func": -0.21072707325220108, "rewards/verse_reward_func": -0.0390625, "step": 992 }, { "completion_length": 252.21875, "epoch": 7.944, "grad_norm": 1.2578125, "kl": 1.7016598582267761, "learning_rate": 4.94738240497303e-05, "loss": 0.0681, "reward": 1.208688497543335, "reward_std": 2.1489673256874084, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4399616122245789, "rewards/no_repetition_reward_func": -0.23127318918704987, "rewards/verse_reward_func": 0.0, "step": 993 }, { "completion_length": 243.4375, "epoch": 7.952, "grad_norm": 1.1171875, "kl": 1.5544956922531128, "learning_rate": 4.947097065461801e-05, "loss": 0.0622, "reward": 1.5859720706939697, "reward_std": 2.3570974469184875, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8212417364120483, "rewards/no_repetition_reward_func": -0.2274571657180786, "rewards/verse_reward_func": -0.0078125, "step": 994 }, { "completion_length": 256.0, "epoch": 7.96, "grad_norm": 1.21875, "kl": 2.012086033821106, "learning_rate": 4.946810962631916e-05, "loss": 0.0805, "reward": 1.7889666557312012, "reward_std": 2.6192054748535156, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0731217861175537, "rewards/no_repetition_reward_func": -0.2685301825404167, "rewards/verse_reward_func": -0.015625, "step": 995 }, { "completion_length": 254.0625, "epoch": 7.968, "grad_norm": 1.265625, "kl": 2.3979278802871704, "learning_rate": 4.9465240965726195e-05, "loss": 0.0959, "reward": 1.2362503409385681, "reward_std": 2.215522527694702, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4941574931144714, "rewards/no_repetition_reward_func": -0.2500946968793869, "rewards/verse_reward_func": -0.0078125, "step": 996 }, { "completion_length": 256.0, "epoch": 7.976, "grad_norm": 2.28125, "kl": 2.9495162963867188, "learning_rate": 4.946236467373392e-05, "loss": 0.118, "reward": 1.3666187524795532, "reward_std": 2.2872610688209534, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5830238461494446, "rewards/no_repetition_reward_func": -0.21640509366989136, "rewards/verse_reward_func": 0.0, "step": 997 }, { "completion_length": 251.265625, "epoch": 7.984, "grad_norm": 3.96875, "kl": 2.747784435749054, "learning_rate": 4.945948075123954e-05, "loss": 0.1099, "reward": 1.0644095540046692, "reward_std": 1.9875365495681763, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3037093877792358, "rewards/no_repetition_reward_func": -0.23929984867572784, "rewards/verse_reward_func": 0.0, "step": 998 }, { "completion_length": 253.296875, "epoch": 7.992, "grad_norm": 2.046875, "kl": 1.7312621474266052, "learning_rate": 4.9456589199142637e-05, "loss": 0.0693, "reward": 2.1611011028289795, "reward_std": 2.547036647796631, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4380970001220703, "rewards/no_repetition_reward_func": -0.2691833972930908, "rewards/verse_reward_func": -0.0078125, "step": 999 }, { "completion_length": 241.875, "epoch": 8.0, "grad_norm": 1.5859375, "kl": 2.105294704437256, "learning_rate": 4.9453690018345144e-05, "loss": 0.0842, "reward": 1.577698290348053, "reward_std": 2.5304157733917236, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.822435975074768, "rewards/no_repetition_reward_func": -0.23692519962787628, "rewards/verse_reward_func": -0.0078125, "step": 1000 }, { "completion_length": 252.140625, "epoch": 8.008, "grad_norm": 1.265625, "kl": 1.9843942523002625, "learning_rate": 4.945078320975142e-05, "loss": 0.0794, "reward": 1.5155670642852783, "reward_std": 2.534904718399048, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7758639454841614, "rewards/no_repetition_reward_func": -0.25248442590236664, "rewards/verse_reward_func": -0.0078125, "step": 1001 }, { "completion_length": 256.0, "epoch": 8.016, "grad_norm": 2.140625, "kl": 2.8737980127334595, "learning_rate": 4.9447868774268166e-05, "loss": 0.115, "reward": 1.6928761005401611, "reward_std": 2.7195931673049927, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.929912805557251, "rewards/no_repetition_reward_func": -0.22922424972057343, "rewards/verse_reward_func": -0.0078125, "step": 1002 }, { "completion_length": 250.671875, "epoch": 8.024, "grad_norm": 1.3203125, "kl": 1.7834777235984802, "learning_rate": 4.9444946712804494e-05, "loss": 0.0713, "reward": 1.6837299466133118, "reward_std": 2.1815099716186523, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9502649307250977, "rewards/no_repetition_reward_func": -0.2430974245071411, "rewards/verse_reward_func": -0.0234375, "step": 1003 }, { "completion_length": 253.265625, "epoch": 8.032, "grad_norm": 1.9453125, "kl": 2.0031216740608215, "learning_rate": 4.9442017026271864e-05, "loss": 0.0801, "reward": 1.7454336881637573, "reward_std": 2.444065570831299, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9672438502311707, "rewards/no_repetition_reward_func": -0.21399762481451035, "rewards/verse_reward_func": -0.0078125, "step": 1004 }, { "completion_length": 247.984375, "epoch": 8.04, "grad_norm": 1.4375, "kl": 2.458342432975769, "learning_rate": 4.9439079715584135e-05, "loss": 0.0983, "reward": 1.5548396706581116, "reward_std": 2.1911784410476685, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7807612419128418, "rewards/no_repetition_reward_func": -0.21029659360647202, "rewards/verse_reward_func": -0.015625, "step": 1005 }, { "completion_length": 249.9375, "epoch": 8.048, "grad_norm": 2.328125, "kl": 2.6136163473129272, "learning_rate": 4.943613478165753e-05, "loss": 0.1045, "reward": 1.5214948058128357, "reward_std": 2.3941107988357544, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7649409770965576, "rewards/no_repetition_reward_func": -0.24344606697559357, "rewards/verse_reward_func": 0.0, "step": 1006 }, { "completion_length": 254.15625, "epoch": 8.056, "grad_norm": 410.0, "kl": 19.722984671592712, "learning_rate": 4.943318222541066e-05, "loss": 0.7889, "reward": 1.0360592603683472, "reward_std": 1.6716561913490295, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2682814002037048, "rewards/no_repetition_reward_func": -0.21659712493419647, "rewards/verse_reward_func": -0.015625, "step": 1007 }, { "completion_length": 249.484375, "epoch": 8.064, "grad_norm": 1.4453125, "kl": 2.5408741235733032, "learning_rate": 4.9430222047764506e-05, "loss": 0.1016, "reward": 1.4362399578094482, "reward_std": 2.250568985939026, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6564909219741821, "rewards/no_repetition_reward_func": -0.20462597906589508, "rewards/verse_reward_func": -0.015625, "step": 1008 }, { "completion_length": 250.1875, "epoch": 8.072, "grad_norm": 1.5234375, "kl": 2.7013540267944336, "learning_rate": 4.9427254249642444e-05, "loss": 0.1081, "reward": 1.4080070853233337, "reward_std": 2.3986542224884033, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.654770016670227, "rewards/no_repetition_reward_func": -0.23895039409399033, "rewards/verse_reward_func": -0.0078125, "step": 1009 }, { "completion_length": 240.8125, "epoch": 8.08, "grad_norm": 1.21875, "kl": 2.3663395643234253, "learning_rate": 4.942427883197021e-05, "loss": 0.0947, "reward": 0.985444188117981, "reward_std": 1.888737440109253, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.19329434633255, "rewards/no_repetition_reward_func": -0.20785018056631088, "rewards/verse_reward_func": 0.0, "step": 1010 }, { "completion_length": 246.25, "epoch": 8.088, "grad_norm": 1.3984375, "kl": 2.122484862804413, "learning_rate": 4.94212957956759e-05, "loss": 0.0849, "reward": 1.4935247898101807, "reward_std": 2.253728151321411, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7331838011741638, "rewards/no_repetition_reward_func": -0.23965896666049957, "rewards/verse_reward_func": 0.0, "step": 1011 }, { "completion_length": 255.328125, "epoch": 8.096, "grad_norm": 1.6328125, "kl": 2.7412571907043457, "learning_rate": 4.941830514169004e-05, "loss": 0.1097, "reward": 1.480798602104187, "reward_std": 2.123804271221161, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.719380497932434, "rewards/no_repetition_reward_func": -0.23076947033405304, "rewards/verse_reward_func": -0.0078125, "step": 1012 }, { "completion_length": 254.140625, "epoch": 8.104, "grad_norm": 1.8046875, "kl": 2.2842386960983276, "learning_rate": 4.941530687094548e-05, "loss": 0.0914, "reward": 1.2716558575630188, "reward_std": 2.4434783458709717, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5579733848571777, "rewards/no_repetition_reward_func": -0.2550675719976425, "rewards/verse_reward_func": -0.03125, "step": 1013 }, { "completion_length": 247.75, "epoch": 8.112, "grad_norm": 1.28125, "kl": 2.1160163283348083, "learning_rate": 4.941230098437747e-05, "loss": 0.0846, "reward": 1.551452100276947, "reward_std": 2.2326717376708984, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8144882321357727, "rewards/no_repetition_reward_func": -0.2395985797047615, "rewards/verse_reward_func": -0.0234375, "step": 1014 }, { "completion_length": 256.0, "epoch": 8.12, "grad_norm": 0.890625, "kl": 2.5740855932235718, "learning_rate": 4.940928748292363e-05, "loss": 0.103, "reward": 1.3833904266357422, "reward_std": 2.2961583137512207, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6446579694747925, "rewards/no_repetition_reward_func": -0.2456425577402115, "rewards/verse_reward_func": -0.015625, "step": 1015 }, { "completion_length": 253.0, "epoch": 8.128, "grad_norm": 1.296875, "kl": 1.8367489576339722, "learning_rate": 4.9406266367523945e-05, "loss": 0.0735, "reward": 1.4986603260040283, "reward_std": 2.2482666969299316, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.743171513080597, "rewards/no_repetition_reward_func": -0.2366987094283104, "rewards/verse_reward_func": -0.0078125, "step": 1016 }, { "completion_length": 252.875, "epoch": 8.136, "grad_norm": 0.98828125, "kl": 2.4459389448165894, "learning_rate": 4.9403237639120805e-05, "loss": 0.0978, "reward": 1.231771856546402, "reward_std": 1.923745334148407, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4527851939201355, "rewards/no_repetition_reward_func": -0.2053883895277977, "rewards/verse_reward_func": -0.015625, "step": 1017 }, { "completion_length": 250.046875, "epoch": 8.144, "grad_norm": 1.5859375, "kl": 2.4212554693222046, "learning_rate": 4.940020129865895e-05, "loss": 0.0969, "reward": 1.3461110293865204, "reward_std": 2.2377467155456543, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5881145000457764, "rewards/no_repetition_reward_func": -0.23419103026390076, "rewards/verse_reward_func": -0.0078125, "step": 1018 }, { "completion_length": 252.890625, "epoch": 8.152, "grad_norm": 2.09375, "kl": 3.1101908683776855, "learning_rate": 4.93971573470855e-05, "loss": 0.1244, "reward": 1.7346885800361633, "reward_std": 2.4234217405319214, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9910393357276917, "rewards/no_repetition_reward_func": -0.25635072588920593, "rewards/verse_reward_func": 0.0, "step": 1019 }, { "completion_length": 253.6875, "epoch": 8.16, "grad_norm": 1.6640625, "kl": 2.0224185585975647, "learning_rate": 4.9394105785349944e-05, "loss": 0.0809, "reward": 1.609645962715149, "reward_std": 2.4470778703689575, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8820793628692627, "rewards/no_repetition_reward_func": -0.24118328094482422, "rewards/verse_reward_func": -0.03125, "step": 1020 }, { "completion_length": 251.71875, "epoch": 8.168, "grad_norm": 1.3671875, "kl": 1.899598240852356, "learning_rate": 4.939104661440415e-05, "loss": 0.076, "reward": 1.762359619140625, "reward_std": 2.327043056488037, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0000221729278564, "rewards/no_repetition_reward_func": -0.22984998673200607, "rewards/verse_reward_func": -0.0078125, "step": 1021 }, { "completion_length": 252.9375, "epoch": 8.176, "grad_norm": 1.2890625, "kl": 1.840745449066162, "learning_rate": 4.938797983520237e-05, "loss": 0.0736, "reward": 1.9326311349868774, "reward_std": 2.494999051094055, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.215430974960327, "rewards/no_repetition_reward_func": -0.2749873474240303, "rewards/verse_reward_func": -0.0078125, "step": 1022 }, { "completion_length": 255.28125, "epoch": 8.184, "grad_norm": 0.90625, "kl": 1.778556764125824, "learning_rate": 4.938490544870121e-05, "loss": 0.0711, "reward": 1.6321381628513336, "reward_std": 2.4562448263168335, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9587197303771973, "rewards/no_repetition_reward_func": -0.3109566420316696, "rewards/verse_reward_func": -0.015625, "step": 1023 }, { "completion_length": 254.8125, "epoch": 8.192, "grad_norm": 1.09375, "kl": 3.0205827951431274, "learning_rate": 4.938182345585966e-05, "loss": 0.1208, "reward": 1.586065948009491, "reward_std": 2.6784539222717285, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8394536972045898, "rewards/no_repetition_reward_func": -0.22995032370090485, "rewards/verse_reward_func": -0.0234375, "step": 1024 }, { "completion_length": 250.609375, "epoch": 8.2, "grad_norm": 2.5625, "kl": 3.042997717857361, "learning_rate": 4.937873385763908e-05, "loss": 0.1217, "reward": 1.3179081082344055, "reward_std": 2.1502243280410767, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5819652676582336, "rewards/no_repetition_reward_func": -0.2562447041273117, "rewards/verse_reward_func": -0.0078125, "step": 1025 }, { "completion_length": 245.3125, "epoch": 8.208, "grad_norm": 0.98046875, "kl": 2.4773329496383667, "learning_rate": 4.937563665500321e-05, "loss": 0.0991, "reward": 1.5449820160865784, "reward_std": 2.6573996543884277, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8156715631484985, "rewards/no_repetition_reward_func": -0.2472519800066948, "rewards/verse_reward_func": -0.0234375, "step": 1026 }, { "completion_length": 252.078125, "epoch": 8.216, "grad_norm": 1.8984375, "kl": 2.2389187812805176, "learning_rate": 4.9372531848918145e-05, "loss": 0.0896, "reward": 1.7808492183685303, "reward_std": 2.4611164331436157, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0728620290756226, "rewards/no_repetition_reward_func": -0.2763879522681236, "rewards/verse_reward_func": -0.015625, "step": 1027 }, { "completion_length": 250.15625, "epoch": 8.224, "grad_norm": 1.375, "kl": 2.1670159101486206, "learning_rate": 4.936941944035237e-05, "loss": 0.0867, "reward": 2.0579692125320435, "reward_std": 3.046495795249939, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3272119760513306, "rewards/no_repetition_reward_func": -0.26143014430999756, "rewards/verse_reward_func": -0.0078125, "step": 1028 }, { "completion_length": 255.8125, "epoch": 8.232, "grad_norm": 1.8515625, "kl": 2.20617139339447, "learning_rate": 4.936629943027672e-05, "loss": 0.0882, "reward": 1.5409149527549744, "reward_std": 2.2701094150543213, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8214136958122253, "rewards/no_repetition_reward_func": -0.2570612505078316, "rewards/verse_reward_func": -0.0234375, "step": 1029 }, { "completion_length": 248.171875, "epoch": 8.24, "grad_norm": 1.9296875, "kl": 2.0382161736488342, "learning_rate": 4.9363171819664434e-05, "loss": 0.0815, "reward": 1.841402292251587, "reward_std": 2.7307897806167603, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.157628118991852, "rewards/no_repetition_reward_func": -0.3006007820367813, "rewards/verse_reward_func": -0.015625, "step": 1030 }, { "completion_length": 251.828125, "epoch": 8.248, "grad_norm": 2.078125, "kl": 1.996553897857666, "learning_rate": 4.936003660949108e-05, "loss": 0.0799, "reward": 1.913634181022644, "reward_std": 2.5018612146377563, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1934207677841187, "rewards/no_repetition_reward_func": -0.2641616314649582, "rewards/verse_reward_func": -0.015625, "step": 1031 }, { "completion_length": 256.0, "epoch": 8.256, "grad_norm": 1.4921875, "kl": 2.90777850151062, "learning_rate": 4.935689380073464e-05, "loss": 0.1163, "reward": 1.1877199113368988, "reward_std": 2.1245083808898926, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.451526939868927, "rewards/no_repetition_reward_func": -0.24036958813667297, "rewards/verse_reward_func": -0.0234375, "step": 1032 }, { "completion_length": 252.421875, "epoch": 8.264, "grad_norm": 1.390625, "kl": 2.4815644025802612, "learning_rate": 4.935374339437543e-05, "loss": 0.0993, "reward": 1.7633467316627502, "reward_std": 2.4750747680664062, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9784681797027588, "rewards/no_repetition_reward_func": -0.21512127667665482, "rewards/verse_reward_func": 0.0, "step": 1033 }, { "completion_length": 246.125, "epoch": 8.272, "grad_norm": 2.078125, "kl": 1.9372131824493408, "learning_rate": 4.935058539139615e-05, "loss": 0.0775, "reward": 1.686569333076477, "reward_std": 2.4728318452835083, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9628074169158936, "rewards/no_repetition_reward_func": -0.2606131136417389, "rewards/verse_reward_func": -0.015625, "step": 1034 }, { "completion_length": 252.9375, "epoch": 8.28, "grad_norm": 1.25, "kl": 2.979602098464966, "learning_rate": 4.9347419792781876e-05, "loss": 0.1192, "reward": 1.7556570172309875, "reward_std": 2.6203588247299194, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0019311904907227, "rewards/no_repetition_reward_func": -0.2306491732597351, "rewards/verse_reward_func": -0.015625, "step": 1035 }, { "completion_length": 249.734375, "epoch": 8.288, "grad_norm": 1.65625, "kl": 2.7539724111557007, "learning_rate": 4.934424659952006e-05, "loss": 0.1102, "reward": 1.7521055936813354, "reward_std": 2.618234872817993, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0365670919418335, "rewards/no_repetition_reward_func": -0.28446148335933685, "rewards/verse_reward_func": 0.0, "step": 1036 }, { "completion_length": 250.578125, "epoch": 8.296, "grad_norm": 2.046875, "kl": 2.8382842540740967, "learning_rate": 4.934106581260049e-05, "loss": 0.1135, "reward": 1.2804073095321655, "reward_std": 1.9487804174423218, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5237463116645813, "rewards/no_repetition_reward_func": -0.21208906918764114, "rewards/verse_reward_func": -0.03125, "step": 1037 }, { "completion_length": 250.59375, "epoch": 8.304, "grad_norm": 1.2578125, "kl": 2.5520651936531067, "learning_rate": 4.933787743301534e-05, "loss": 0.1021, "reward": 1.4043711423873901, "reward_std": 2.46442711353302, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.662422776222229, "rewards/no_repetition_reward_func": -0.2346140444278717, "rewards/verse_reward_func": -0.0234375, "step": 1038 }, { "completion_length": 247.734375, "epoch": 8.312, "grad_norm": 2.15625, "kl": 3.133436441421509, "learning_rate": 4.933468146175918e-05, "loss": 0.1253, "reward": 1.2102829813957214, "reward_std": 2.03595632314682, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4113553762435913, "rewards/no_repetition_reward_func": -0.20107237249612808, "rewards/verse_reward_func": 0.0, "step": 1039 }, { "completion_length": 251.359375, "epoch": 8.32, "grad_norm": 2.046875, "kl": 2.7479416131973267, "learning_rate": 4.93314778998289e-05, "loss": 0.1099, "reward": 1.1121116876602173, "reward_std": 1.829235017299652, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3527698516845703, "rewards/no_repetition_reward_func": -0.21722066402435303, "rewards/verse_reward_func": -0.0234375, "step": 1040 }, { "completion_length": 253.59375, "epoch": 8.328, "grad_norm": 1.7265625, "kl": 2.0826823711395264, "learning_rate": 4.93282667482238e-05, "loss": 0.0833, "reward": 1.0794783234596252, "reward_std": 1.9083153009414673, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.326395034790039, "rewards/no_repetition_reward_func": -0.23910418152809143, "rewards/verse_reward_func": -0.0078125, "step": 1041 }, { "completion_length": 243.25, "epoch": 8.336, "grad_norm": 1.8828125, "kl": 1.521207869052887, "learning_rate": 4.9325048007945526e-05, "loss": 0.0608, "reward": 1.6514121890068054, "reward_std": 2.4052048921585083, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.935989260673523, "rewards/no_repetition_reward_func": -0.2767646461725235, "rewards/verse_reward_func": -0.0078125, "step": 1042 }, { "completion_length": 251.421875, "epoch": 8.344, "grad_norm": 1.203125, "kl": 1.3798884749412537, "learning_rate": 4.9321821679998074e-05, "loss": 0.0552, "reward": 1.0128845572471619, "reward_std": 1.7419170141220093, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.259311854839325, "rewards/no_repetition_reward_func": -0.24642734229564667, "rewards/verse_reward_func": 0.0, "step": 1043 }, { "completion_length": 250.0625, "epoch": 8.352, "grad_norm": 1.2265625, "kl": 1.1587218642234802, "learning_rate": 4.9318587765387845e-05, "loss": 0.0463, "reward": 1.8950392603874207, "reward_std": 2.304630398750305, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.160906195640564, "rewards/no_repetition_reward_func": -0.26586705446243286, "rewards/verse_reward_func": 0.0, "step": 1044 }, { "completion_length": 251.234375, "epoch": 8.36, "grad_norm": 1.3828125, "kl": 1.4030361473560333, "learning_rate": 4.9315346265123594e-05, "loss": 0.0561, "reward": 1.2230584025382996, "reward_std": 1.8823055028915405, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.476722538471222, "rewards/no_repetition_reward_func": -0.25366413593292236, "rewards/verse_reward_func": 0.0, "step": 1045 }, { "completion_length": 249.46875, "epoch": 8.368, "grad_norm": 1.0859375, "kl": 1.6154981851577759, "learning_rate": 4.9312097180216414e-05, "loss": 0.0646, "reward": 1.1250810623168945, "reward_std": 2.0056483149528503, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4011136293411255, "rewards/no_repetition_reward_func": -0.2682199776172638, "rewards/verse_reward_func": -0.0078125, "step": 1046 }, { "completion_length": 255.59375, "epoch": 8.376, "grad_norm": 1.2734375, "kl": 1.762284517288208, "learning_rate": 4.9308840511679804e-05, "loss": 0.0705, "reward": 1.2636191248893738, "reward_std": 2.0946953296661377, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5242394804954529, "rewards/no_repetition_reward_func": -0.25280778110027313, "rewards/verse_reward_func": -0.0078125, "step": 1047 }, { "completion_length": 247.296875, "epoch": 8.384, "grad_norm": 2.703125, "kl": 1.1219384968280792, "learning_rate": 4.9305576260529607e-05, "loss": 0.0449, "reward": 1.97087162733078, "reward_std": 2.410091519355774, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2391375303268433, "rewards/no_repetition_reward_func": -0.26045338809490204, "rewards/verse_reward_func": -0.0078125, "step": 1048 }, { "completion_length": 247.734375, "epoch": 8.392, "grad_norm": 1.109375, "kl": 1.7097259163856506, "learning_rate": 4.930230442778403e-05, "loss": 0.0684, "reward": 0.8422871232032776, "reward_std": 1.7099577188491821, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.0446022152900696, "rewards/no_repetition_reward_func": -0.1945025473833084, "rewards/verse_reward_func": -0.0078125, "step": 1049 }, { "completion_length": 250.109375, "epoch": 8.4, "grad_norm": 3.140625, "kl": 1.021303653717041, "learning_rate": 4.929902501446366e-05, "loss": 0.0409, "reward": 2.104056239128113, "reward_std": 2.75792396068573, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3803728222846985, "rewards/no_repetition_reward_func": -0.2685040533542633, "rewards/verse_reward_func": -0.0078125, "step": 1050 }, { "completion_length": 251.015625, "epoch": 8.408, "grad_norm": 3.296875, "kl": 2.299276113510132, "learning_rate": 4.929573802159143e-05, "loss": 0.092, "reward": 1.0339867770671844, "reward_std": 1.729171633720398, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2555366158485413, "rewards/no_repetition_reward_func": -0.21373729407787323, "rewards/verse_reward_func": -0.0078125, "step": 1051 }, { "completion_length": 250.625, "epoch": 8.416, "grad_norm": 1.4921875, "kl": 1.5108332633972168, "learning_rate": 4.9292443450192645e-05, "loss": 0.0604, "reward": 1.447253167629242, "reward_std": 2.0339062809944153, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.711805820465088, "rewards/no_repetition_reward_func": -0.26455260813236237, "rewards/verse_reward_func": 0.0, "step": 1052 }, { "completion_length": 253.1875, "epoch": 8.424, "grad_norm": 1.4296875, "kl": 1.83262300491333, "learning_rate": 4.928914130129498e-05, "loss": 0.0733, "reward": 1.3204105496406555, "reward_std": 1.534000277519226, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5873503684997559, "rewards/no_repetition_reward_func": -0.2591274082660675, "rewards/verse_reward_func": -0.0078125, "step": 1053 }, { "completion_length": 249.46875, "epoch": 8.432, "grad_norm": 1.6953125, "kl": 2.657777786254883, "learning_rate": 4.9285831575928465e-05, "loss": 0.1063, "reward": 1.2954630851745605, "reward_std": 1.9062365293502808, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5251641869544983, "rewards/no_repetition_reward_func": -0.22970110177993774, "rewards/verse_reward_func": 0.0, "step": 1054 }, { "completion_length": 252.265625, "epoch": 8.44, "grad_norm": 1.859375, "kl": 3.054810643196106, "learning_rate": 4.92825142751255e-05, "loss": 0.1222, "reward": 1.3567066192626953, "reward_std": 2.655884623527527, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6193383932113647, "rewards/no_repetition_reward_func": -0.23138166964054108, "rewards/verse_reward_func": -0.03125, "step": 1055 }, { "completion_length": 252.390625, "epoch": 8.448, "grad_norm": 1.125, "kl": 2.316670596599579, "learning_rate": 4.9279189399920844e-05, "loss": 0.0927, "reward": 1.6829927265644073, "reward_std": 2.0918333530426025, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.917274296283722, "rewards/no_repetition_reward_func": -0.23428168892860413, "rewards/verse_reward_func": 0.0, "step": 1056 }, { "completion_length": 252.578125, "epoch": 8.456, "grad_norm": 1.3828125, "kl": 1.7714252471923828, "learning_rate": 4.927585695135162e-05, "loss": 0.0709, "reward": 0.9196666479110718, "reward_std": 1.7638619542121887, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1324989795684814, "rewards/no_repetition_reward_func": -0.2128322795033455, "rewards/verse_reward_func": 0.0, "step": 1057 }, { "completion_length": 244.125, "epoch": 8.464, "grad_norm": 3.03125, "kl": 1.229564368724823, "learning_rate": 4.9272516930457314e-05, "loss": 0.0492, "reward": 1.6949133276939392, "reward_std": 2.4538066387176514, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9386008977890015, "rewards/no_repetition_reward_func": -0.23587501049041748, "rewards/verse_reward_func": -0.0078125, "step": 1058 }, { "completion_length": 246.84375, "epoch": 8.472, "grad_norm": 2.296875, "kl": 1.7565299272537231, "learning_rate": 4.9269169338279766e-05, "loss": 0.0703, "reward": 1.4015214443206787, "reward_std": 2.049657940864563, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.637028157711029, "rewards/no_repetition_reward_func": -0.23550674319267273, "rewards/verse_reward_func": 0.0, "step": 1059 }, { "completion_length": 252.59375, "epoch": 8.48, "grad_norm": 2.5, "kl": 1.371520757675171, "learning_rate": 4.9265814175863186e-05, "loss": 0.0549, "reward": 2.086591064929962, "reward_std": 2.4334068298339844, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3503754138946533, "rewards/no_repetition_reward_func": -0.26378433406352997, "rewards/verse_reward_func": 0.0, "step": 1060 }, { "completion_length": 239.21875, "epoch": 8.488, "grad_norm": 2.734375, "kl": 1.7376953959465027, "learning_rate": 4.926245144425415e-05, "loss": 0.0695, "reward": 1.6497701406478882, "reward_std": 2.602501153945923, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.923996388912201, "rewards/no_repetition_reward_func": -0.2586012929677963, "rewards/verse_reward_func": -0.015625, "step": 1061 }, { "completion_length": 247.0, "epoch": 8.496, "grad_norm": 1.09375, "kl": 2.4437280893325806, "learning_rate": 4.925908114450158e-05, "loss": 0.0977, "reward": 1.2452532351016998, "reward_std": 2.0080602765083313, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.502834975719452, "rewards/no_repetition_reward_func": -0.24195680022239685, "rewards/verse_reward_func": -0.015625, "step": 1062 }, { "completion_length": 256.0, "epoch": 8.504, "grad_norm": 1.0234375, "kl": 2.369669556617737, "learning_rate": 4.925570327765678e-05, "loss": 0.0948, "reward": 1.9167377948760986, "reward_std": 2.4180301427841187, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.217056155204773, "rewards/no_repetition_reward_func": -0.2768808901309967, "rewards/verse_reward_func": -0.0234375, "step": 1063 }, { "completion_length": 247.59375, "epoch": 8.512, "grad_norm": 2.984375, "kl": 2.6792397499084473, "learning_rate": 4.925231784477339e-05, "loss": 0.1072, "reward": 1.539185643196106, "reward_std": 2.466219902038574, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8115651607513428, "rewards/no_repetition_reward_func": -0.26456689834594727, "rewards/verse_reward_func": -0.0078125, "step": 1064 }, { "completion_length": 256.0, "epoch": 8.52, "grad_norm": 1.8984375, "kl": 2.669731616973877, "learning_rate": 4.924892484690743e-05, "loss": 0.1068, "reward": 1.4898350238800049, "reward_std": 2.382685422897339, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7598706483840942, "rewards/no_repetition_reward_func": -0.26222310960292816, "rewards/verse_reward_func": -0.0078125, "step": 1065 }, { "completion_length": 256.0, "epoch": 8.528, "grad_norm": 1.2265625, "kl": 2.0968483686447144, "learning_rate": 4.9245524285117274e-05, "loss": 0.0839, "reward": 1.4425432682037354, "reward_std": 2.3095895051956177, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7171283960342407, "rewards/no_repetition_reward_func": -0.2589602619409561, "rewards/verse_reward_func": -0.015625, "step": 1066 }, { "completion_length": 249.421875, "epoch": 8.536, "grad_norm": 0.82421875, "kl": 1.9186650812625885, "learning_rate": 4.924211616046365e-05, "loss": 0.0767, "reward": 1.9811816811561584, "reward_std": 2.717707872390747, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2733083963394165, "rewards/no_repetition_reward_func": -0.28431424498558044, "rewards/verse_reward_func": -0.0078125, "step": 1067 }, { "completion_length": 254.9375, "epoch": 8.544, "grad_norm": 2.796875, "kl": 1.3492854833602905, "learning_rate": 4.923870047400964e-05, "loss": 0.054, "reward": 2.1229992508888245, "reward_std": 2.5374295711517334, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.390270471572876, "rewards/no_repetition_reward_func": -0.2672712877392769, "rewards/verse_reward_func": 0.0, "step": 1068 }, { "completion_length": 252.515625, "epoch": 8.552, "grad_norm": 2.859375, "kl": 3.419508218765259, "learning_rate": 4.9235277226820695e-05, "loss": 0.1368, "reward": 1.2536267638206482, "reward_std": 2.4033271074295044, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5219549536705017, "rewards/no_repetition_reward_func": -0.2370782196521759, "rewards/verse_reward_func": -0.03125, "step": 1069 }, { "completion_length": 256.0, "epoch": 8.56, "grad_norm": 1.8046875, "kl": 2.748640298843384, "learning_rate": 4.923184641996463e-05, "loss": 0.1099, "reward": 1.816136360168457, "reward_std": 2.3497507572174072, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0830078125, "rewards/no_repetition_reward_func": -0.26687151193618774, "rewards/verse_reward_func": 0.0, "step": 1070 }, { "completion_length": 248.15625, "epoch": 8.568, "grad_norm": 2.6875, "kl": 2.591638505458832, "learning_rate": 4.922840805451161e-05, "loss": 0.1037, "reward": 1.2769336700439453, "reward_std": 2.2431838512420654, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5402846336364746, "rewards/no_repetition_reward_func": -0.2555384933948517, "rewards/verse_reward_func": -0.0078125, "step": 1071 }, { "completion_length": 250.796875, "epoch": 8.576, "grad_norm": 1.5, "kl": 2.1427736282348633, "learning_rate": 4.922496213153416e-05, "loss": 0.0857, "reward": 1.3332757949829102, "reward_std": 1.9523900747299194, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.59617680311203, "rewards/no_repetition_reward_func": -0.2629010081291199, "rewards/verse_reward_func": 0.0, "step": 1072 }, { "completion_length": 243.65625, "epoch": 8.584, "grad_norm": 2.15625, "kl": 2.7664706707000732, "learning_rate": 4.922150865210715e-05, "loss": 0.1107, "reward": 2.0210845470428467, "reward_std": 2.7398242950439453, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3292466402053833, "rewards/no_repetition_reward_func": -0.2690996825695038, "rewards/verse_reward_func": -0.0390625, "step": 1073 }, { "completion_length": 253.328125, "epoch": 8.592, "grad_norm": 1.359375, "kl": 1.6300589442253113, "learning_rate": 4.9218047617307824e-05, "loss": 0.0652, "reward": 1.96805340051651, "reward_std": 2.0417325496673584, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.257491707801819, "rewards/no_repetition_reward_func": -0.2894383668899536, "rewards/verse_reward_func": 0.0, "step": 1074 }, { "completion_length": 246.984375, "epoch": 8.6, "grad_norm": 1.765625, "kl": 1.4632887840270996, "learning_rate": 4.9214579028215776e-05, "loss": 0.0585, "reward": 1.7675957083702087, "reward_std": 2.7721410989761353, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.081535816192627, "rewards/no_repetition_reward_func": -0.2983151227235794, "rewards/verse_reward_func": -0.015625, "step": 1075 }, { "completion_length": 252.703125, "epoch": 8.608, "grad_norm": 1.96875, "kl": 2.257062315940857, "learning_rate": 4.9211102885912965e-05, "loss": 0.0903, "reward": 1.7744180560112, "reward_std": 2.38748562335968, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0137718319892883, "rewards/no_repetition_reward_func": -0.23935379087924957, "rewards/verse_reward_func": 0.0, "step": 1076 }, { "completion_length": 256.0, "epoch": 8.616, "grad_norm": 2.390625, "kl": 3.4973068237304688, "learning_rate": 4.920761919148369e-05, "loss": 0.1399, "reward": 1.2790847420692444, "reward_std": 2.01931095123291, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5267841219902039, "rewards/no_repetition_reward_func": -0.2476992905139923, "rewards/verse_reward_func": 0.0, "step": 1077 }, { "completion_length": 249.421875, "epoch": 8.624, "grad_norm": 4.875, "kl": 3.0611650943756104, "learning_rate": 4.920412794601461e-05, "loss": 0.1224, "reward": 0.6347403824329376, "reward_std": 1.4875072240829468, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.8955379724502563, "rewards/no_repetition_reward_func": -0.25298506021499634, "rewards/verse_reward_func": -0.0078125, "step": 1078 }, { "completion_length": 254.171875, "epoch": 8.632, "grad_norm": 3.9375, "kl": 3.1764299869537354, "learning_rate": 4.9200629150594744e-05, "loss": 0.1271, "reward": 1.4049482345581055, "reward_std": 2.1102922558784485, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6869949102401733, "rewards/no_repetition_reward_func": -0.25860899686813354, "rewards/verse_reward_func": -0.0234375, "step": 1079 }, { "completion_length": 255.9375, "epoch": 8.64, "grad_norm": 1.109375, "kl": 2.2924582958221436, "learning_rate": 4.919712280631547e-05, "loss": 0.0917, "reward": 1.5168760418891907, "reward_std": 2.5411986112594604, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7666918635368347, "rewards/no_repetition_reward_func": -0.24981573224067688, "rewards/verse_reward_func": 0.0, "step": 1080 }, { "completion_length": 253.421875, "epoch": 8.648, "grad_norm": 1.4609375, "kl": 1.6383684873580933, "learning_rate": 4.9193608914270515e-05, "loss": 0.0655, "reward": 1.5489948391914368, "reward_std": 2.082504987716675, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8082539439201355, "rewards/no_repetition_reward_func": -0.25925904512405396, "rewards/verse_reward_func": 0.0, "step": 1081 }, { "completion_length": 251.890625, "epoch": 8.656, "grad_norm": 4.34375, "kl": 2.411911129951477, "learning_rate": 4.9190087475555955e-05, "loss": 0.0965, "reward": 1.778829574584961, "reward_std": 2.691890835762024, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.033872365951538, "rewards/no_repetition_reward_func": -0.23941797763109207, "rewards/verse_reward_func": -0.015625, "step": 1082 }, { "completion_length": 251.8125, "epoch": 8.664, "grad_norm": 0.9921875, "kl": 1.5727295279502869, "learning_rate": 4.918655849127024e-05, "loss": 0.0629, "reward": 1.5000871419906616, "reward_std": 2.163014531135559, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7258697748184204, "rewards/no_repetition_reward_func": -0.225782573223114, "rewards/verse_reward_func": 0.0, "step": 1083 }, { "completion_length": 250.109375, "epoch": 8.672, "grad_norm": 0.98046875, "kl": 1.4015931487083435, "learning_rate": 4.918302196251415e-05, "loss": 0.0561, "reward": 1.2858639359474182, "reward_std": 1.8972795605659485, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5317636728286743, "rewards/no_repetition_reward_func": -0.2380872219800949, "rewards/verse_reward_func": -0.0078125, "step": 1084 }, { "completion_length": 251.375, "epoch": 8.68, "grad_norm": 3.140625, "kl": 1.2614425122737885, "learning_rate": 4.9179477890390825e-05, "loss": 0.0505, "reward": 2.1379603147506714, "reward_std": 2.7795811891555786, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4058536291122437, "rewards/no_repetition_reward_func": -0.2678934335708618, "rewards/verse_reward_func": 0.0, "step": 1085 }, { "completion_length": 252.609375, "epoch": 8.688, "grad_norm": 3.15625, "kl": 1.5243343114852905, "learning_rate": 4.917592627600577e-05, "loss": 0.061, "reward": 2.0307196974754333, "reward_std": 2.7188310623168945, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.273890256881714, "rewards/no_repetition_reward_func": -0.2431708201766014, "rewards/verse_reward_func": 0.0, "step": 1086 }, { "completion_length": 252.203125, "epoch": 8.696, "grad_norm": 1.5390625, "kl": 1.4146641492843628, "learning_rate": 4.917236712046682e-05, "loss": 0.0566, "reward": 1.9977346658706665, "reward_std": 2.393261194229126, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2701677083969116, "rewards/no_repetition_reward_func": -0.2724330574274063, "rewards/verse_reward_func": 0.0, "step": 1087 }, { "completion_length": 256.0, "epoch": 8.704, "grad_norm": 2.1875, "kl": 1.516023874282837, "learning_rate": 4.916880042488419e-05, "loss": 0.0606, "reward": 2.232193350791931, "reward_std": 2.6597900390625, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5214121341705322, "rewards/no_repetition_reward_func": -0.27359381318092346, "rewards/verse_reward_func": -0.015625, "step": 1088 }, { "completion_length": 247.5, "epoch": 8.712, "grad_norm": 1.421875, "kl": 2.4501919746398926, "learning_rate": 4.916522619037043e-05, "loss": 0.098, "reward": 1.4835536479949951, "reward_std": 2.440859794616699, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.721078634262085, "rewards/no_repetition_reward_func": -0.20627496391534805, "rewards/verse_reward_func": -0.03125, "step": 1089 }, { "completion_length": 249.734375, "epoch": 8.72, "grad_norm": 6.53125, "kl": 3.299388289451599, "learning_rate": 4.916164441804044e-05, "loss": 0.132, "reward": 1.4958065152168274, "reward_std": 2.6188732385635376, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.742091178894043, "rewards/no_repetition_reward_func": -0.24628466367721558, "rewards/verse_reward_func": 0.0, "step": 1090 }, { "completion_length": 255.984375, "epoch": 8.728, "grad_norm": 2.859375, "kl": 2.9541711807250977, "learning_rate": 4.915805510901148e-05, "loss": 0.1182, "reward": 1.6589125990867615, "reward_std": 2.46344256401062, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9658462405204773, "rewards/no_repetition_reward_func": -0.275683656334877, "rewards/verse_reward_func": -0.03125, "step": 1091 }, { "completion_length": 253.984375, "epoch": 8.736, "grad_norm": 1.2890625, "kl": 2.6042407751083374, "learning_rate": 4.915445826440316e-05, "loss": 0.1042, "reward": 1.6784538626670837, "reward_std": 2.3289122581481934, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9149656891822815, "rewards/no_repetition_reward_func": -0.23651187866926193, "rewards/verse_reward_func": 0.0, "step": 1092 }, { "completion_length": 251.03125, "epoch": 8.744, "grad_norm": 1.484375, "kl": 2.752676248550415, "learning_rate": 4.9150853885337426e-05, "loss": 0.1101, "reward": 1.3524699807167053, "reward_std": 2.132526218891144, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6024757027626038, "rewards/no_repetition_reward_func": -0.24219323694705963, "rewards/verse_reward_func": -0.0078125, "step": 1093 }, { "completion_length": 243.453125, "epoch": 8.752, "grad_norm": 1.1171875, "kl": 2.676031231880188, "learning_rate": 4.9147241972938596e-05, "loss": 0.107, "reward": 1.6577139496803284, "reward_std": 2.619917631149292, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9166728258132935, "rewards/no_repetition_reward_func": -0.23552124202251434, "rewards/verse_reward_func": -0.0234375, "step": 1094 }, { "completion_length": 252.421875, "epoch": 8.76, "grad_norm": 2.15625, "kl": 2.9422738552093506, "learning_rate": 4.914362252833332e-05, "loss": 0.1177, "reward": 1.4781804084777832, "reward_std": 2.289204716682434, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.728188395500183, "rewards/no_repetition_reward_func": -0.2343830242753029, "rewards/verse_reward_func": -0.015625, "step": 1095 }, { "completion_length": 254.15625, "epoch": 8.768, "grad_norm": 1.984375, "kl": 2.5422645807266235, "learning_rate": 4.913999555265062e-05, "loss": 0.1017, "reward": 2.0058581829071045, "reward_std": 2.5610119104385376, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2457826137542725, "rewards/no_repetition_reward_func": -0.2321120798587799, "rewards/verse_reward_func": -0.0078125, "step": 1096 }, { "completion_length": 244.046875, "epoch": 8.776, "grad_norm": 1.6015625, "kl": 2.3940906524658203, "learning_rate": 4.913636104702183e-05, "loss": 0.0958, "reward": 1.6226460933685303, "reward_std": 2.05150043964386, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8540966510772705, "rewards/no_repetition_reward_func": -0.20801305770874023, "rewards/verse_reward_func": -0.0234375, "step": 1097 }, { "completion_length": 246.140625, "epoch": 8.784, "grad_norm": 2.03125, "kl": 2.4806922674179077, "learning_rate": 4.913271901258067e-05, "loss": 0.0992, "reward": 1.8934123516082764, "reward_std": 2.6250457763671875, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1347644329071045, "rewards/no_repetition_reward_func": -0.22572704404592514, "rewards/verse_reward_func": -0.015625, "step": 1098 }, { "completion_length": 256.0, "epoch": 8.792, "grad_norm": 1.7578125, "kl": 2.324559807777405, "learning_rate": 4.9129069450463186e-05, "loss": 0.093, "reward": 1.8848950266838074, "reward_std": 2.9835546016693115, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1574883460998535, "rewards/no_repetition_reward_func": -0.26478081941604614, "rewards/verse_reward_func": -0.0078125, "step": 1099 }, { "completion_length": 253.921875, "epoch": 8.8, "grad_norm": 2.046875, "kl": 2.2531851530075073, "learning_rate": 4.912541236180779e-05, "loss": 0.0901, "reward": 1.2357702255249023, "reward_std": 1.8091620206832886, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.512952446937561, "rewards/no_repetition_reward_func": -0.2693697214126587, "rewards/verse_reward_func": -0.0078125, "step": 1100 }, { "completion_length": 248.5, "epoch": 8.808, "grad_norm": 2.21875, "kl": 2.167120039463043, "learning_rate": 4.912174774775522e-05, "loss": 0.0867, "reward": 1.8610901236534119, "reward_std": 2.634432315826416, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.122992753982544, "rewards/no_repetition_reward_func": -0.2619026154279709, "rewards/verse_reward_func": 0.0, "step": 1101 }, { "completion_length": 247.796875, "epoch": 8.816, "grad_norm": 2.5625, "kl": 1.942491590976715, "learning_rate": 4.911807560944858e-05, "loss": 0.0777, "reward": 2.3125572204589844, "reward_std": 2.803728699684143, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.615502953529358, "rewards/no_repetition_reward_func": -0.2795082926750183, "rewards/verse_reward_func": -0.0234375, "step": 1102 }, { "completion_length": 247.109375, "epoch": 8.824, "grad_norm": 1.4609375, "kl": 2.467926025390625, "learning_rate": 4.9114395948033296e-05, "loss": 0.0987, "reward": 1.540828824043274, "reward_std": 2.8014880418777466, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7987875938415527, "rewards/no_repetition_reward_func": -0.250146321952343, "rewards/verse_reward_func": -0.0078125, "step": 1103 }, { "completion_length": 249.359375, "epoch": 8.832, "grad_norm": 1.4921875, "kl": 2.734396457672119, "learning_rate": 4.911070876465719e-05, "loss": 0.1094, "reward": 2.3465569019317627, "reward_std": 2.6726170778274536, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.622212529182434, "rewards/no_repetition_reward_func": -0.26003047078847885, "rewards/verse_reward_func": -0.015625, "step": 1104 }, { "completion_length": 252.234375, "epoch": 8.84, "grad_norm": 5.28125, "kl": 3.699983835220337, "learning_rate": 4.910701406047037e-05, "loss": 0.148, "reward": 1.4118596911430359, "reward_std": 2.456843376159668, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6868532299995422, "rewards/no_repetition_reward_func": -0.2671810984611511, "rewards/verse_reward_func": -0.0078125, "step": 1105 }, { "completion_length": 253.15625, "epoch": 8.848, "grad_norm": 1.03125, "kl": 2.3120745420455933, "learning_rate": 4.910331183662533e-05, "loss": 0.0925, "reward": 1.9969865083694458, "reward_std": 2.7096498012542725, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2641947269439697, "rewards/no_repetition_reward_func": -0.2672082930803299, "rewards/verse_reward_func": 0.0, "step": 1106 }, { "completion_length": 252.734375, "epoch": 8.856, "grad_norm": 2.515625, "kl": 3.3111698627471924, "learning_rate": 4.90996020942769e-05, "loss": 0.1324, "reward": 1.3992677330970764, "reward_std": 2.6778810024261475, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6681240797042847, "rewards/no_repetition_reward_func": -0.261043906211853, "rewards/verse_reward_func": -0.0078125, "step": 1107 }, { "completion_length": 248.34375, "epoch": 8.864, "grad_norm": 2.390625, "kl": 2.9769307374954224, "learning_rate": 4.909588483458225e-05, "loss": 0.1191, "reward": 1.494800865650177, "reward_std": 2.422270655632019, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.761991262435913, "rewards/no_repetition_reward_func": -0.25937794148921967, "rewards/verse_reward_func": -0.0078125, "step": 1108 }, { "completion_length": 249.53125, "epoch": 8.872, "grad_norm": 1.65625, "kl": 2.982543468475342, "learning_rate": 4.90921600587009e-05, "loss": 0.1193, "reward": 1.7917951047420502, "reward_std": 2.2437989711761475, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0322745740413666, "rewards/no_repetition_reward_func": -0.2404794618487358, "rewards/verse_reward_func": 0.0, "step": 1109 }, { "completion_length": 250.546875, "epoch": 8.88, "grad_norm": 1.8671875, "kl": 2.863701820373535, "learning_rate": 4.908842776779472e-05, "loss": 0.1145, "reward": 1.584557831287384, "reward_std": 2.3283361196517944, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9064328074455261, "rewards/no_repetition_reward_func": -0.2906250134110451, "rewards/verse_reward_func": -0.03125, "step": 1110 }, { "completion_length": 247.828125, "epoch": 8.888, "grad_norm": 1.3359375, "kl": 2.2128825187683105, "learning_rate": 4.9084687963027894e-05, "loss": 0.0885, "reward": 1.4596824049949646, "reward_std": 2.2846839427948, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7186335325241089, "rewards/no_repetition_reward_func": -0.25895120948553085, "rewards/verse_reward_func": 0.0, "step": 1111 }, { "completion_length": 254.375, "epoch": 8.896, "grad_norm": 1.3046875, "kl": 2.815098285675049, "learning_rate": 4.9080940645567e-05, "loss": 0.1126, "reward": 1.5110222697257996, "reward_std": 2.4951939582824707, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.773724913597107, "rewards/no_repetition_reward_func": -0.2548900693655014, "rewards/verse_reward_func": -0.0078125, "step": 1112 }, { "completion_length": 249.828125, "epoch": 8.904, "grad_norm": 1.6171875, "kl": 1.8564711809158325, "learning_rate": 4.907718581658091e-05, "loss": 0.0743, "reward": 1.5659573078155518, "reward_std": 2.6012041568756104, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8393514156341553, "rewards/no_repetition_reward_func": -0.2577691823244095, "rewards/verse_reward_func": -0.015625, "step": 1113 }, { "completion_length": 252.53125, "epoch": 8.912, "grad_norm": 1.3828125, "kl": 3.0910286903381348, "learning_rate": 4.907342347724087e-05, "loss": 0.1236, "reward": 1.1533099114894867, "reward_std": 2.1013383865356445, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3758221864700317, "rewards/no_repetition_reward_func": -0.20688727498054504, "rewards/verse_reward_func": -0.015625, "step": 1114 }, { "completion_length": 250.171875, "epoch": 8.92, "grad_norm": 1.3046875, "kl": 2.248998761177063, "learning_rate": 4.906965362872047e-05, "loss": 0.09, "reward": 1.7998939752578735, "reward_std": 2.4083306789398193, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0518742203712463, "rewards/no_repetition_reward_func": -0.2441677823662758, "rewards/verse_reward_func": -0.0078125, "step": 1115 }, { "completion_length": 251.484375, "epoch": 8.928, "grad_norm": 1.140625, "kl": 1.8552862405776978, "learning_rate": 4.906587627219562e-05, "loss": 0.0742, "reward": 1.5194830894470215, "reward_std": 1.9832842350006104, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7604107856750488, "rewards/no_repetition_reward_func": -0.23311518132686615, "rewards/verse_reward_func": -0.0078125, "step": 1116 }, { "completion_length": 256.0, "epoch": 8.936, "grad_norm": 1.21875, "kl": 2.5109671354293823, "learning_rate": 4.906209140884459e-05, "loss": 0.1004, "reward": 1.3914742469787598, "reward_std": 2.5306379795074463, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.653431475162506, "rewards/no_repetition_reward_func": -0.25414469838142395, "rewards/verse_reward_func": -0.0078125, "step": 1117 }, { "completion_length": 242.953125, "epoch": 8.943999999999999, "grad_norm": 1.2265625, "kl": 2.673795223236084, "learning_rate": 4.9058299039847975e-05, "loss": 0.107, "reward": 1.212670922279358, "reward_std": 2.2227553129196167, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.440567135810852, "rewards/no_repetition_reward_func": -0.22008365392684937, "rewards/verse_reward_func": -0.0078125, "step": 1118 }, { "completion_length": 247.46875, "epoch": 8.952, "grad_norm": 2.515625, "kl": 2.1899211406707764, "learning_rate": 4.905449916638873e-05, "loss": 0.0876, "reward": 2.0817350149154663, "reward_std": 3.043152928352356, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.372952938079834, "rewards/no_repetition_reward_func": -0.2834053337574005, "rewards/verse_reward_func": -0.0078125, "step": 1119 }, { "completion_length": 252.828125, "epoch": 8.96, "grad_norm": 2.46875, "kl": 2.349913477897644, "learning_rate": 4.905069178965215e-05, "loss": 0.094, "reward": 2.0401649475097656, "reward_std": 2.6040282249450684, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3075157403945923, "rewards/no_repetition_reward_func": -0.26735061407089233, "rewards/verse_reward_func": 0.0, "step": 1120 }, { "completion_length": 252.046875, "epoch": 8.968, "grad_norm": 3.625, "kl": 3.585000514984131, "learning_rate": 4.904687691082585e-05, "loss": 0.1434, "reward": 1.4706612825393677, "reward_std": 2.088826060295105, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6915892362594604, "rewards/no_repetition_reward_func": -0.2209278717637062, "rewards/verse_reward_func": 0.0, "step": 1121 }, { "completion_length": 253.0625, "epoch": 8.975999999999999, "grad_norm": 1.296875, "kl": 2.7030205726623535, "learning_rate": 4.904305453109981e-05, "loss": 0.1081, "reward": 1.8983827829360962, "reward_std": 2.4097453355789185, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1497628688812256, "rewards/no_repetition_reward_func": -0.2357550635933876, "rewards/verse_reward_func": -0.015625, "step": 1122 }, { "completion_length": 241.375, "epoch": 8.984, "grad_norm": 1.5078125, "kl": 1.730399250984192, "learning_rate": 4.9039224651666325e-05, "loss": 0.0692, "reward": 1.517457515001297, "reward_std": 2.213824212551117, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.812053918838501, "rewards/no_repetition_reward_func": -0.2711588442325592, "rewards/verse_reward_func": -0.0234375, "step": 1123 }, { "completion_length": 249.265625, "epoch": 8.992, "grad_norm": 2.125, "kl": 2.5914523601531982, "learning_rate": 4.903538727372005e-05, "loss": 0.1037, "reward": 2.04703426361084, "reward_std": 2.7465816736221313, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3358670473098755, "rewards/no_repetition_reward_func": -0.2810203433036804, "rewards/verse_reward_func": -0.0078125, "step": 1124 }, { "completion_length": 253.6875, "epoch": 9.0, "grad_norm": 2.953125, "kl": 3.0964435338974, "learning_rate": 4.9031542398457974e-05, "loss": 0.1239, "reward": 1.493617296218872, "reward_std": 2.105890452861786, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.727089524269104, "rewards/no_repetition_reward_func": -0.23347223550081253, "rewards/verse_reward_func": 0.0, "step": 1125 }, { "completion_length": 249.15625, "epoch": 9.008, "grad_norm": 3.296875, "kl": 1.5001939535140991, "learning_rate": 4.902769002707942e-05, "loss": 0.06, "reward": 3.140958309173584, "reward_std": 2.9918601512908936, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4717925786972046, "rewards/no_repetition_reward_func": -0.3308341056108475, "rewards/verse_reward_func": 0.0, "step": 1126 }, { "completion_length": 245.15625, "epoch": 9.016, "grad_norm": 1.234375, "kl": 2.139797329902649, "learning_rate": 4.902383016078605e-05, "loss": 0.0856, "reward": 1.5607886910438538, "reward_std": 2.395994782447815, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8235458135604858, "rewards/no_repetition_reward_func": -0.2549445778131485, "rewards/verse_reward_func": -0.0078125, "step": 1127 }, { "completion_length": 250.578125, "epoch": 9.024, "grad_norm": 1.5546875, "kl": 2.533983588218689, "learning_rate": 4.901996280078186e-05, "loss": 0.1014, "reward": 1.8495182991027832, "reward_std": 2.548023223876953, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.142273783683777, "rewards/no_repetition_reward_func": -0.284943051636219, "rewards/verse_reward_func": -0.0078125, "step": 1128 }, { "completion_length": 240.921875, "epoch": 9.032, "grad_norm": 5.96875, "kl": 4.295783758163452, "learning_rate": 4.90160879482732e-05, "loss": 0.1718, "reward": 1.2675798535346985, "reward_std": 2.3179351687431335, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.514914631843567, "rewards/no_repetition_reward_func": -0.22389735281467438, "rewards/verse_reward_func": -0.0234375, "step": 1129 }, { "completion_length": 249.25, "epoch": 9.04, "grad_norm": 1.7890625, "kl": 2.5464459657669067, "learning_rate": 4.9012205604468744e-05, "loss": 0.1019, "reward": 1.6552918553352356, "reward_std": 2.440829634666443, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9226232767105103, "rewards/no_repetition_reward_func": -0.25951893627643585, "rewards/verse_reward_func": -0.0078125, "step": 1130 }, { "completion_length": 249.6875, "epoch": 9.048, "grad_norm": 1.5625, "kl": 2.7251728773117065, "learning_rate": 4.90083157705795e-05, "loss": 0.109, "reward": 1.3364322185516357, "reward_std": 2.2587802410125732, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6093313097953796, "rewards/no_repetition_reward_func": -0.2650865316390991, "rewards/verse_reward_func": -0.0078125, "step": 1131 }, { "completion_length": 251.734375, "epoch": 9.056, "grad_norm": 1.2890625, "kl": 2.3555558919906616, "learning_rate": 4.9004418447818815e-05, "loss": 0.0942, "reward": 1.4957892298698425, "reward_std": 1.8879814147949219, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7432051301002502, "rewards/no_repetition_reward_func": -0.2317906618118286, "rewards/verse_reward_func": -0.015625, "step": 1132 }, { "completion_length": 248.890625, "epoch": 9.064, "grad_norm": 0.96875, "kl": 1.5601673126220703, "learning_rate": 4.900051363740238e-05, "loss": 0.0624, "reward": 1.5903590321540833, "reward_std": 2.517342209815979, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.864476203918457, "rewards/no_repetition_reward_func": -0.2663046717643738, "rewards/verse_reward_func": -0.0078125, "step": 1133 }, { "completion_length": 255.203125, "epoch": 9.072, "grad_norm": 1.8125, "kl": 2.2828345894813538, "learning_rate": 4.8996601340548215e-05, "loss": 0.0913, "reward": 1.6602014303207397, "reward_std": 2.5748274326324463, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.882499635219574, "rewards/no_repetition_reward_func": -0.2222982943058014, "rewards/verse_reward_func": 0.0, "step": 1134 }, { "completion_length": 253.21875, "epoch": 9.08, "grad_norm": 3.0, "kl": 2.133159041404724, "learning_rate": 4.899268155847667e-05, "loss": 0.0853, "reward": 1.7126153707504272, "reward_std": 2.5434141159057617, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9731632471084595, "rewards/no_repetition_reward_func": -0.24492305517196655, "rewards/verse_reward_func": -0.015625, "step": 1135 }, { "completion_length": 254.40625, "epoch": 9.088, "grad_norm": 2.15625, "kl": 2.1502574682235718, "learning_rate": 4.898875429241044e-05, "loss": 0.086, "reward": 2.0261430144309998, "reward_std": 2.721428394317627, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3113884329795837, "rewards/no_repetition_reward_func": -0.27743279933929443, "rewards/verse_reward_func": -0.0078125, "step": 1136 }, { "completion_length": 240.28125, "epoch": 9.096, "grad_norm": 2.203125, "kl": 2.3012523651123047, "learning_rate": 4.898481954357455e-05, "loss": 0.0921, "reward": 1.63975590467453, "reward_std": 2.6206401586532593, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8895055055618286, "rewards/no_repetition_reward_func": -0.22631189972162247, "rewards/verse_reward_func": -0.0234375, "step": 1137 }, { "completion_length": 247.390625, "epoch": 9.104, "grad_norm": 1.390625, "kl": 2.2602781653404236, "learning_rate": 4.898087731319636e-05, "loss": 0.0904, "reward": 2.041706681251526, "reward_std": 2.658132314682007, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3043097257614136, "rewards/no_repetition_reward_func": -0.2626030743122101, "rewards/verse_reward_func": 0.0, "step": 1138 }, { "completion_length": 254.5, "epoch": 9.112, "grad_norm": 1.1796875, "kl": 2.594844102859497, "learning_rate": 4.897692760250556e-05, "loss": 0.1038, "reward": 1.5696528553962708, "reward_std": 2.5858616828918457, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.859424114227295, "rewards/no_repetition_reward_func": -0.281958669424057, "rewards/verse_reward_func": -0.0078125, "step": 1139 }, { "completion_length": 243.015625, "epoch": 9.12, "grad_norm": 15.3125, "kl": 2.7379144430160522, "learning_rate": 4.8972970412734176e-05, "loss": 0.1095, "reward": 1.605180561542511, "reward_std": 2.418820381164551, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.886808454990387, "rewards/no_repetition_reward_func": -0.27381540834903717, "rewards/verse_reward_func": -0.0078125, "step": 1140 }, { "completion_length": 254.53125, "epoch": 9.128, "grad_norm": 2.703125, "kl": 3.34896457195282, "learning_rate": 4.896900574511657e-05, "loss": 0.134, "reward": 1.7385264039039612, "reward_std": 2.5436519384384155, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.003295600414276, "rewards/no_repetition_reward_func": -0.2569568455219269, "rewards/verse_reward_func": -0.0078125, "step": 1141 }, { "completion_length": 249.046875, "epoch": 9.136, "grad_norm": 2.4375, "kl": 2.934778928756714, "learning_rate": 4.8965033600889435e-05, "loss": 0.1174, "reward": 2.3226839303970337, "reward_std": 2.8432700634002686, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.607481598854065, "rewards/no_repetition_reward_func": -0.2769849896430969, "rewards/verse_reward_func": -0.0078125, "step": 1142 }, { "completion_length": 254.65625, "epoch": 9.144, "grad_norm": 1.9921875, "kl": 2.3298450708389282, "learning_rate": 4.8961053981291795e-05, "loss": 0.0932, "reward": 2.0839396715164185, "reward_std": 2.5315533876419067, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3627336025238037, "rewards/no_repetition_reward_func": -0.2787938117980957, "rewards/verse_reward_func": 0.0, "step": 1143 }, { "completion_length": 234.671875, "epoch": 9.152, "grad_norm": 1.5546875, "kl": 2.90571129322052, "learning_rate": 4.8957066887565e-05, "loss": 0.1162, "reward": 2.012732744216919, "reward_std": 2.621542811393738, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2682319283485413, "rewards/no_repetition_reward_func": -0.23987431079149246, "rewards/verse_reward_func": -0.015625, "step": 1144 }, { "completion_length": 251.625, "epoch": 9.16, "grad_norm": 2.90625, "kl": 3.707362413406372, "learning_rate": 4.8953072320952745e-05, "loss": 0.1483, "reward": 1.6843395233154297, "reward_std": 2.7819706201553345, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9469521045684814, "rewards/no_repetition_reward_func": -0.25480007380247116, "rewards/verse_reward_func": -0.0078125, "step": 1145 }, { "completion_length": 248.40625, "epoch": 9.168, "grad_norm": 2.03125, "kl": 2.2296950221061707, "learning_rate": 4.8949070282701034e-05, "loss": 0.0892, "reward": 2.207293391227722, "reward_std": 3.005705237388611, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.47283136844635, "rewards/no_repetition_reward_func": -0.26553822308778763, "rewards/verse_reward_func": 0.0, "step": 1146 }, { "completion_length": 247.609375, "epoch": 9.176, "grad_norm": 2.53125, "kl": 3.904307246208191, "learning_rate": 4.894506077405824e-05, "loss": 0.1562, "reward": 1.652557611465454, "reward_std": 3.1396209001541138, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.932165801525116, "rewards/no_repetition_reward_func": -0.27960845828056335, "rewards/verse_reward_func": 0.0, "step": 1147 }, { "completion_length": 254.84375, "epoch": 9.184, "grad_norm": 4.28125, "kl": 2.9930419921875, "learning_rate": 4.8941043796275015e-05, "loss": 0.1197, "reward": 0.832183450460434, "reward_std": 1.5964691638946533, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.0735182166099548, "rewards/no_repetition_reward_func": -0.22570978105068207, "rewards/verse_reward_func": -0.015625, "step": 1148 }, { "completion_length": 245.875, "epoch": 9.192, "grad_norm": 4.90625, "kl": 3.519420862197876, "learning_rate": 4.893701935060439e-05, "loss": 0.1408, "reward": 0.753002256155014, "reward_std": 1.3478976488113403, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 0.9778568744659424, "rewards/no_repetition_reward_func": -0.2092297002673149, "rewards/verse_reward_func": -0.015625, "step": 1149 }, { "completion_length": 247.796875, "epoch": 9.2, "grad_norm": 4.71875, "kl": 4.314791917800903, "learning_rate": 4.893298743830168e-05, "loss": 0.1726, "reward": 0.9041626453399658, "reward_std": 1.9063243865966797, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1122111082077026, "rewards/no_repetition_reward_func": -0.208048477768898, "rewards/verse_reward_func": 0.0, "step": 1150 }, { "completion_length": 245.859375, "epoch": 9.208, "grad_norm": 1.6484375, "kl": 2.58253413438797, "learning_rate": 4.892894806062458e-05, "loss": 0.1033, "reward": 1.6310515999794006, "reward_std": 2.4017876386642456, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8694331645965576, "rewards/no_repetition_reward_func": -0.23838160932064056, "rewards/verse_reward_func": 0.0, "step": 1151 }, { "completion_length": 255.90625, "epoch": 9.216, "grad_norm": 2.4375, "kl": 1.8560674786567688, "learning_rate": 4.892490121883306e-05, "loss": 0.0742, "reward": 2.262786388397217, "reward_std": 2.780978560447693, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.572746753692627, "rewards/no_repetition_reward_func": -0.3021478056907654, "rewards/verse_reward_func": -0.0078125, "step": 1152 }, { "completion_length": 254.703125, "epoch": 9.224, "grad_norm": 2.25, "kl": 2.5860488414764404, "learning_rate": 4.892084691418947e-05, "loss": 0.1034, "reward": 1.51135915517807, "reward_std": 2.5519449710845947, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7691586017608643, "rewards/no_repetition_reward_func": -0.24217448383569717, "rewards/verse_reward_func": -0.015625, "step": 1153 }, { "completion_length": 254.640625, "epoch": 9.232, "grad_norm": 1.8046875, "kl": 2.0576584339141846, "learning_rate": 4.891678514795843e-05, "loss": 0.0823, "reward": 1.962876558303833, "reward_std": 2.569685220718384, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2546123266220093, "rewards/no_repetition_reward_func": -0.2526734098792076, "rewards/verse_reward_func": -0.0390625, "step": 1154 }, { "completion_length": 244.609375, "epoch": 9.24, "grad_norm": 1.4921875, "kl": 1.9269800186157227, "learning_rate": 4.891271592140695e-05, "loss": 0.0771, "reward": 1.6331895589828491, "reward_std": 2.437903881072998, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8925208449363708, "rewards/no_repetition_reward_func": -0.25151877105236053, "rewards/verse_reward_func": -0.0078125, "step": 1155 }, { "completion_length": 248.59375, "epoch": 9.248, "grad_norm": 1.7265625, "kl": 2.316287040710449, "learning_rate": 4.8908639235804324e-05, "loss": 0.0927, "reward": 1.5309677124023438, "reward_std": 2.595858573913574, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.799743413925171, "rewards/no_repetition_reward_func": -0.2453381046652794, "rewards/verse_reward_func": -0.0234375, "step": 1156 }, { "completion_length": 247.21875, "epoch": 9.256, "grad_norm": 3.671875, "kl": 1.550776481628418, "learning_rate": 4.890455509242218e-05, "loss": 0.062, "reward": 2.217956304550171, "reward_std": 2.788177728652954, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5006788969039917, "rewards/no_repetition_reward_func": -0.27491021156311035, "rewards/verse_reward_func": -0.0078125, "step": 1157 }, { "completion_length": 248.5, "epoch": 9.264, "grad_norm": 2.109375, "kl": 2.6808754801750183, "learning_rate": 4.890046349253448e-05, "loss": 0.1072, "reward": 1.709058701992035, "reward_std": 2.6109557151794434, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.006339907646179, "rewards/no_repetition_reward_func": -0.28165626525878906, "rewards/verse_reward_func": -0.015625, "step": 1158 }, { "completion_length": 253.203125, "epoch": 9.272, "grad_norm": 1.609375, "kl": 2.2707936763763428, "learning_rate": 4.889636443741752e-05, "loss": 0.0908, "reward": 1.598178207874298, "reward_std": 2.756372332572937, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8830049633979797, "rewards/no_repetition_reward_func": -0.2848268002271652, "rewards/verse_reward_func": 0.0, "step": 1159 }, { "completion_length": 249.28125, "epoch": 9.28, "grad_norm": 1.96875, "kl": 2.515147566795349, "learning_rate": 4.889225792834991e-05, "loss": 0.1006, "reward": 1.9872613549232483, "reward_std": 2.8572473526000977, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2691051959991455, "rewards/no_repetition_reward_func": -0.28184399008750916, "rewards/verse_reward_func": 0.0, "step": 1160 }, { "completion_length": 254.3125, "epoch": 9.288, "grad_norm": 1.6640625, "kl": 2.0487438440322876, "learning_rate": 4.888814396661256e-05, "loss": 0.0819, "reward": 1.982334017753601, "reward_std": 2.011467456817627, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2166858911514282, "rewards/no_repetition_reward_func": -0.23435193300247192, "rewards/verse_reward_func": 0.0, "step": 1161 }, { "completion_length": 251.15625, "epoch": 9.296, "grad_norm": 2.40625, "kl": 3.6003459692001343, "learning_rate": 4.888402255348876e-05, "loss": 0.144, "reward": 1.4124696254730225, "reward_std": 2.390692949295044, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6648028492927551, "rewards/no_repetition_reward_func": -0.24452068656682968, "rewards/verse_reward_func": -0.0078125, "step": 1162 }, { "completion_length": 253.28125, "epoch": 9.304, "grad_norm": 1.859375, "kl": 3.0330302715301514, "learning_rate": 4.887989369026409e-05, "loss": 0.1213, "reward": 1.6722046732902527, "reward_std": 2.3462891578674316, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9424437284469604, "rewards/no_repetition_reward_func": -0.26242657005786896, "rewards/verse_reward_func": -0.0078125, "step": 1163 }, { "completion_length": 249.125, "epoch": 9.312, "grad_norm": 1.9140625, "kl": 3.235753297805786, "learning_rate": 4.887575737822645e-05, "loss": 0.1294, "reward": 1.550498902797699, "reward_std": 2.425285816192627, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8081315755844116, "rewards/no_repetition_reward_func": -0.24200773239135742, "rewards/verse_reward_func": -0.015625, "step": 1164 }, { "completion_length": 249.765625, "epoch": 9.32, "grad_norm": 1.484375, "kl": 3.237924575805664, "learning_rate": 4.887161361866608e-05, "loss": 0.1295, "reward": 1.86676025390625, "reward_std": 2.5979608297348022, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.137468636035919, "rewards/no_repetition_reward_func": -0.2628958448767662, "rewards/verse_reward_func": -0.0078125, "step": 1165 }, { "completion_length": 256.0, "epoch": 9.328, "grad_norm": 3.546875, "kl": 2.288266181945801, "learning_rate": 4.8867462412875526e-05, "loss": 0.0915, "reward": 2.4145835638046265, "reward_std": 2.9641144275665283, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.709749460220337, "rewards/no_repetition_reward_func": -0.29516586661338806, "rewards/verse_reward_func": 0.0, "step": 1166 }, { "completion_length": 248.8125, "epoch": 9.336, "grad_norm": 2.390625, "kl": 2.4085655212402344, "learning_rate": 4.886330376214968e-05, "loss": 0.0963, "reward": 1.9806594848632812, "reward_std": 2.9195175170898438, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.233369469642639, "rewards/no_repetition_reward_func": -0.25271011143922806, "rewards/verse_reward_func": 0.0, "step": 1167 }, { "completion_length": 255.875, "epoch": 9.344, "grad_norm": 1.8125, "kl": 2.7482601404190063, "learning_rate": 4.8859137667785735e-05, "loss": 0.1099, "reward": 2.048998475074768, "reward_std": 2.965214729309082, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3410184383392334, "rewards/no_repetition_reward_func": -0.28420737385749817, "rewards/verse_reward_func": -0.0078125, "step": 1168 }, { "completion_length": 250.375, "epoch": 9.352, "grad_norm": 5.21875, "kl": 4.420813083648682, "learning_rate": 4.88549641310832e-05, "loss": 0.1768, "reward": 1.8128425478935242, "reward_std": 2.781289577484131, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0973201990127563, "rewards/no_repetition_reward_func": -0.27666516602039337, "rewards/verse_reward_func": -0.0078125, "step": 1169 }, { "completion_length": 244.875, "epoch": 9.36, "grad_norm": 1.234375, "kl": 2.891939640045166, "learning_rate": 4.885078315334395e-05, "loss": 0.1157, "reward": 2.1118980646133423, "reward_std": 2.508175492286682, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3539611101150513, "rewards/no_repetition_reward_func": -0.2420630306005478, "rewards/verse_reward_func": 0.0, "step": 1170 }, { "completion_length": 252.8125, "epoch": 9.368, "grad_norm": 2.234375, "kl": 2.0577683448791504, "learning_rate": 4.884659473587213e-05, "loss": 0.0823, "reward": 1.9869248867034912, "reward_std": 2.4012848138809204, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.275110363960266, "rewards/no_repetition_reward_func": -0.28818538784980774, "rewards/verse_reward_func": 0.0, "step": 1171 }, { "completion_length": 250.9375, "epoch": 9.376, "grad_norm": 2.734375, "kl": 3.3379716873168945, "learning_rate": 4.884239887997423e-05, "loss": 0.1335, "reward": 1.7037521600723267, "reward_std": 2.4200522899627686, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9755966067314148, "rewards/no_repetition_reward_func": -0.2718444764614105, "rewards/verse_reward_func": 0.0, "step": 1172 }, { "completion_length": 256.0, "epoch": 9.384, "grad_norm": 1.9609375, "kl": 2.9582440853118896, "learning_rate": 4.8838195586959046e-05, "loss": 0.1183, "reward": 2.0850285291671753, "reward_std": 2.292340636253357, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.386132597923279, "rewards/no_repetition_reward_func": -0.30110400915145874, "rewards/verse_reward_func": 0.0, "step": 1173 }, { "completion_length": 254.578125, "epoch": 9.392, "grad_norm": 3.734375, "kl": 2.5663613080978394, "learning_rate": 4.8833984858137715e-05, "loss": 0.1027, "reward": 1.9703121781349182, "reward_std": 2.4652044773101807, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2505042552948, "rewards/no_repetition_reward_func": -0.28019191324710846, "rewards/verse_reward_func": 0.0, "step": 1174 }, { "completion_length": 236.390625, "epoch": 9.4, "grad_norm": 3.84375, "kl": 3.6336015462875366, "learning_rate": 4.882976669482367e-05, "loss": 0.1453, "reward": 1.4848475456237793, "reward_std": 2.4463346004486084, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.729425072669983, "rewards/no_repetition_reward_func": -0.2289525866508484, "rewards/verse_reward_func": -0.015625, "step": 1175 }, { "completion_length": 256.0, "epoch": 9.408, "grad_norm": 8.9375, "kl": 4.438924789428711, "learning_rate": 4.8825541098332706e-05, "loss": 0.1776, "reward": 1.533605694770813, "reward_std": 2.331944704055786, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8013044595718384, "rewards/no_repetition_reward_func": -0.25207386910915375, "rewards/verse_reward_func": -0.015625, "step": 1176 }, { "completion_length": 256.0, "epoch": 9.416, "grad_norm": 1.4375, "kl": 2.34198534488678, "learning_rate": 4.8821308069982867e-05, "loss": 0.0937, "reward": 2.045042395591736, "reward_std": 2.6493154764175415, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3392590284347534, "rewards/no_repetition_reward_func": -0.2785916179418564, "rewards/verse_reward_func": -0.015625, "step": 1177 }, { "completion_length": 255.6875, "epoch": 9.424, "grad_norm": 2.15625, "kl": 2.495872735977173, "learning_rate": 4.881706761109458e-05, "loss": 0.0998, "reward": 1.4978627562522888, "reward_std": 2.5519086718559265, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7383875250816345, "rewards/no_repetition_reward_func": -0.23271214962005615, "rewards/verse_reward_func": -0.0078125, "step": 1178 }, { "completion_length": 253.171875, "epoch": 9.432, "grad_norm": 1.7109375, "kl": 2.4015734791755676, "learning_rate": 4.881281972299055e-05, "loss": 0.0961, "reward": 1.6687864661216736, "reward_std": 2.7096810340881348, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9792430400848389, "rewards/no_repetition_reward_func": -0.3026440292596817, "rewards/verse_reward_func": -0.0078125, "step": 1179 }, { "completion_length": 249.09375, "epoch": 9.44, "grad_norm": 2.015625, "kl": 2.544740915298462, "learning_rate": 4.880856440699582e-05, "loss": 0.1018, "reward": 1.9073689579963684, "reward_std": 2.5849103927612305, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1645455956459045, "rewards/no_repetition_reward_func": -0.24936427175998688, "rewards/verse_reward_func": -0.0078125, "step": 1180 }, { "completion_length": 244.0625, "epoch": 9.448, "grad_norm": 3.390625, "kl": 2.5047240257263184, "learning_rate": 4.880430166443775e-05, "loss": 0.1002, "reward": 2.278413772583008, "reward_std": 2.56055748462677, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5399304628372192, "rewards/no_repetition_reward_func": -0.2380792275071144, "rewards/verse_reward_func": -0.0234375, "step": 1181 }, { "completion_length": 254.078125, "epoch": 9.456, "grad_norm": 1.703125, "kl": 3.134582042694092, "learning_rate": 4.880003149664599e-05, "loss": 0.1254, "reward": 1.4613617658615112, "reward_std": 2.432440757751465, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.697263777256012, "rewards/no_repetition_reward_func": -0.23590179532766342, "rewards/verse_reward_func": 0.0, "step": 1182 }, { "completion_length": 254.15625, "epoch": 9.464, "grad_norm": 2.234375, "kl": 2.3656362295150757, "learning_rate": 4.8795753904952534e-05, "loss": 0.0946, "reward": 1.8989188075065613, "reward_std": 2.6368331909179688, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1683729887008667, "rewards/no_repetition_reward_func": -0.2694539874792099, "rewards/verse_reward_func": 0.0, "step": 1183 }, { "completion_length": 251.484375, "epoch": 9.472, "grad_norm": 1.34375, "kl": 3.4142613410949707, "learning_rate": 4.8791468890691696e-05, "loss": 0.1366, "reward": 1.5486395359039307, "reward_std": 2.5562314987182617, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8058634996414185, "rewards/no_repetition_reward_func": -0.2415989190340042, "rewards/verse_reward_func": -0.015625, "step": 1184 }, { "completion_length": 253.296875, "epoch": 9.48, "grad_norm": 1.75, "kl": 1.8978185653686523, "learning_rate": 4.878717645520008e-05, "loss": 0.0759, "reward": 2.3730040788650513, "reward_std": 2.7540868520736694, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6683238744735718, "rewards/no_repetition_reward_func": -0.2953197956085205, "rewards/verse_reward_func": 0.0, "step": 1185 }, { "completion_length": 255.171875, "epoch": 9.488, "grad_norm": 2.9375, "kl": 3.9555962085723877, "learning_rate": 4.878287659981662e-05, "loss": 0.1582, "reward": 2.0731464624404907, "reward_std": 2.801233649253845, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.326260566711426, "rewards/no_repetition_reward_func": -0.2531139999628067, "rewards/verse_reward_func": 0.0, "step": 1186 }, { "completion_length": 253.140625, "epoch": 9.496, "grad_norm": 2.75, "kl": 3.6541831493377686, "learning_rate": 4.877856932588257e-05, "loss": 0.1462, "reward": 2.450811743736267, "reward_std": 2.7196526527404785, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7363178730010986, "rewards/no_repetition_reward_func": -0.2855059802532196, "rewards/verse_reward_func": 0.0, "step": 1187 }, { "completion_length": 245.0, "epoch": 9.504, "grad_norm": 1.484375, "kl": 3.209267020225525, "learning_rate": 4.877425463474148e-05, "loss": 0.1284, "reward": 2.0716320276260376, "reward_std": 2.717729926109314, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.317912220954895, "rewards/no_repetition_reward_func": -0.24628007411956787, "rewards/verse_reward_func": 0.0, "step": 1188 }, { "completion_length": 245.734375, "epoch": 9.512, "grad_norm": 1.9140625, "kl": 3.2845417261123657, "learning_rate": 4.8769932527739225e-05, "loss": 0.1314, "reward": 1.8358802795410156, "reward_std": 2.4921988248825073, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.100662589073181, "rewards/no_repetition_reward_func": -0.2569698691368103, "rewards/verse_reward_func": -0.0078125, "step": 1189 }, { "completion_length": 249.09375, "epoch": 9.52, "grad_norm": 1.671875, "kl": 3.736693501472473, "learning_rate": 4.8765603006224006e-05, "loss": 0.1495, "reward": 1.5984325408935547, "reward_std": 2.6807854175567627, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8857097029685974, "rewards/no_repetition_reward_func": -0.27946464717388153, "rewards/verse_reward_func": -0.0078125, "step": 1190 }, { "completion_length": 252.5625, "epoch": 9.528, "grad_norm": 1.75, "kl": 2.3369327783584595, "learning_rate": 4.87612660715463e-05, "loss": 0.0935, "reward": 2.4258251190185547, "reward_std": 2.7477927207946777, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.700933814048767, "rewards/no_repetition_reward_func": -0.2751086354255676, "rewards/verse_reward_func": 0.0, "step": 1191 }, { "completion_length": 248.71875, "epoch": 9.536, "grad_norm": 6.09375, "kl": 2.0450711250305176, "learning_rate": 4.8756921725058934e-05, "loss": 0.0818, "reward": 2.8404319286346436, "reward_std": 2.875824213027954, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1085150241851807, "rewards/no_repetition_reward_func": -0.2680831253528595, "rewards/verse_reward_func": 0.0, "step": 1192 }, { "completion_length": 254.078125, "epoch": 9.544, "grad_norm": 1.546875, "kl": 2.195577323436737, "learning_rate": 4.875256996811703e-05, "loss": 0.0878, "reward": 2.096486985683441, "reward_std": 2.1931657195091248, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3441519737243652, "rewards/no_repetition_reward_func": -0.24766495823860168, "rewards/verse_reward_func": 0.0, "step": 1193 }, { "completion_length": 245.6875, "epoch": 9.552, "grad_norm": 1.5078125, "kl": 2.7336496114730835, "learning_rate": 4.874821080207803e-05, "loss": 0.1093, "reward": 1.7974347472190857, "reward_std": 2.6781394481658936, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0635852813720703, "rewards/no_repetition_reward_func": -0.24271298199892044, "rewards/verse_reward_func": -0.0234375, "step": 1194 }, { "completion_length": 249.15625, "epoch": 9.56, "grad_norm": 2.015625, "kl": 2.2950714826583862, "learning_rate": 4.874384422830167e-05, "loss": 0.0918, "reward": 1.747079849243164, "reward_std": 2.4466583728790283, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9716312289237976, "rewards/no_repetition_reward_func": -0.22455143183469772, "rewards/verse_reward_func": 0.0, "step": 1195 }, { "completion_length": 242.609375, "epoch": 9.568, "grad_norm": 1.53125, "kl": 2.663655638694763, "learning_rate": 4.873947024815002e-05, "loss": 0.1065, "reward": 2.213918924331665, "reward_std": 2.591664433479309, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4953285455703735, "rewards/no_repetition_reward_func": -0.2579721659421921, "rewards/verse_reward_func": -0.0234375, "step": 1196 }, { "completion_length": 247.9375, "epoch": 9.576, "grad_norm": 1.6328125, "kl": 2.9836729764938354, "learning_rate": 4.873508886298743e-05, "loss": 0.1193, "reward": 1.5609089732170105, "reward_std": 2.680574655532837, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8274741768836975, "rewards/no_repetition_reward_func": -0.2509402856230736, "rewards/verse_reward_func": -0.015625, "step": 1197 }, { "completion_length": 250.8125, "epoch": 9.584, "grad_norm": 2.59375, "kl": 3.159751296043396, "learning_rate": 4.873070007418059e-05, "loss": 0.1264, "reward": 1.529939591884613, "reward_std": 2.3864853382110596, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.792417287826538, "rewards/no_repetition_reward_func": -0.24685266613960266, "rewards/verse_reward_func": -0.015625, "step": 1198 }, { "completion_length": 251.875, "epoch": 9.592, "grad_norm": 1.5234375, "kl": 2.9432692527770996, "learning_rate": 4.872630388309849e-05, "loss": 0.1177, "reward": 1.324613630771637, "reward_std": 2.2251765727996826, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.571883738040924, "rewards/no_repetition_reward_func": -0.23945754021406174, "rewards/verse_reward_func": -0.0078125, "step": 1199 }, { "completion_length": 243.5, "epoch": 9.6, "grad_norm": 1.8046875, "kl": 2.8490623235702515, "learning_rate": 4.8721900291112415e-05, "loss": 0.114, "reward": 1.2158302068710327, "reward_std": 2.3345847129821777, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4917858839035034, "rewards/no_repetition_reward_func": -0.26814328134059906, "rewards/verse_reward_func": -0.0078125, "step": 1200 }, { "completion_length": 251.171875, "epoch": 9.608, "grad_norm": 2.234375, "kl": 2.613136649131775, "learning_rate": 4.871748929959598e-05, "loss": 0.1045, "reward": 2.559861421585083, "reward_std": 3.015141248703003, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8468964099884033, "rewards/no_repetition_reward_func": -0.27140988409519196, "rewards/verse_reward_func": -0.015625, "step": 1201 }, { "completion_length": 249.046875, "epoch": 9.616, "grad_norm": 2.25, "kl": 2.650425672531128, "learning_rate": 4.8713070909925094e-05, "loss": 0.106, "reward": 1.320601463317871, "reward_std": 1.9714125990867615, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5603336691856384, "rewards/no_repetition_reward_func": -0.2241072952747345, "rewards/verse_reward_func": -0.015625, "step": 1202 }, { "completion_length": 255.640625, "epoch": 9.624, "grad_norm": 1.3984375, "kl": 3.047998547554016, "learning_rate": 4.870864512347797e-05, "loss": 0.1219, "reward": 1.5829907655715942, "reward_std": 2.330949902534485, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8258662223815918, "rewards/no_repetition_reward_func": -0.24287542700767517, "rewards/verse_reward_func": 0.0, "step": 1203 }, { "completion_length": 247.578125, "epoch": 9.632, "grad_norm": 2.984375, "kl": 1.7600687742233276, "learning_rate": 4.870421194163515e-05, "loss": 0.0704, "reward": 2.1513471603393555, "reward_std": 2.769736409187317, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4616281986236572, "rewards/no_repetition_reward_func": -0.31028100848197937, "rewards/verse_reward_func": 0.0, "step": 1204 }, { "completion_length": 252.03125, "epoch": 9.64, "grad_norm": 2.25, "kl": 1.994486689567566, "learning_rate": 4.8699771365779453e-05, "loss": 0.0798, "reward": 2.3225399255752563, "reward_std": 2.838118553161621, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6170549392700195, "rewards/no_repetition_reward_func": -0.28670285642147064, "rewards/verse_reward_func": -0.0078125, "step": 1205 }, { "completion_length": 252.140625, "epoch": 9.648, "grad_norm": 2.734375, "kl": 2.1359052062034607, "learning_rate": 4.8695323397296044e-05, "loss": 0.0854, "reward": 2.592411518096924, "reward_std": 2.793976306915283, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9231314659118652, "rewards/no_repetition_reward_func": -0.3229074478149414, "rewards/verse_reward_func": -0.0078125, "step": 1206 }, { "completion_length": 243.46875, "epoch": 9.656, "grad_norm": 1.59375, "kl": 2.956955075263977, "learning_rate": 4.8690868037572346e-05, "loss": 0.1183, "reward": 1.656231939792633, "reward_std": 2.4973288774490356, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.898297667503357, "rewards/no_repetition_reward_func": -0.24206571280956268, "rewards/verse_reward_func": 0.0, "step": 1207 }, { "completion_length": 250.390625, "epoch": 9.664, "grad_norm": 1.5703125, "kl": 2.7173372507095337, "learning_rate": 4.8686405287998116e-05, "loss": 0.1087, "reward": 1.5364618301391602, "reward_std": 2.5260438919067383, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8347704410552979, "rewards/no_repetition_reward_func": -0.2904961556196213, "rewards/verse_reward_func": -0.0078125, "step": 1208 }, { "completion_length": 243.453125, "epoch": 9.672, "grad_norm": 5.21875, "kl": 3.1686137914657593, "learning_rate": 4.8681935149965416e-05, "loss": 0.1267, "reward": 1.2768777012825012, "reward_std": 1.7241949439048767, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5105502009391785, "rewards/no_repetition_reward_func": -0.23367257416248322, "rewards/verse_reward_func": 0.0, "step": 1209 }, { "completion_length": 252.875, "epoch": 9.68, "grad_norm": 1.4609375, "kl": 2.869908094406128, "learning_rate": 4.867745762486861e-05, "loss": 0.1148, "reward": 2.5923829078674316, "reward_std": 2.558947443962097, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8668729066848755, "rewards/no_repetition_reward_func": -0.2744901105761528, "rewards/verse_reward_func": 0.0, "step": 1210 }, { "completion_length": 254.140625, "epoch": 9.688, "grad_norm": 2.109375, "kl": 3.193397283554077, "learning_rate": 4.8672972714104357e-05, "loss": 0.1277, "reward": 2.841387629508972, "reward_std": 2.9524691104888916, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1553279161453247, "rewards/no_repetition_reward_func": -0.29831530153751373, "rewards/verse_reward_func": -0.015625, "step": 1211 }, { "completion_length": 256.0, "epoch": 9.696, "grad_norm": 1.140625, "kl": 3.2024646997451782, "learning_rate": 4.866848041907164e-05, "loss": 0.1281, "reward": 2.4312747716903687, "reward_std": 2.977389693260193, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.73134446144104, "rewards/no_repetition_reward_func": -0.29225724935531616, "rewards/verse_reward_func": -0.0078125, "step": 1212 }, { "completion_length": 250.453125, "epoch": 9.704, "grad_norm": 3.46875, "kl": 3.6692055463790894, "learning_rate": 4.8663980741171724e-05, "loss": 0.1468, "reward": 1.9102173447608948, "reward_std": 2.8923697471618652, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.25691556930542, "rewards/no_repetition_reward_func": -0.3232606053352356, "rewards/verse_reward_func": -0.0234375, "step": 1213 }, { "completion_length": 251.796875, "epoch": 9.712, "grad_norm": 1.5625, "kl": 3.198261260986328, "learning_rate": 4.865947368180818e-05, "loss": 0.1279, "reward": 2.1653090715408325, "reward_std": 2.713435411453247, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4483929872512817, "rewards/no_repetition_reward_func": -0.283083975315094, "rewards/verse_reward_func": 0.0, "step": 1214 }, { "completion_length": 249.890625, "epoch": 9.72, "grad_norm": 1.75, "kl": 2.6617815494537354, "learning_rate": 4.8654959242386896e-05, "loss": 0.1065, "reward": 1.962666392326355, "reward_std": 2.450185775756836, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2444769144058228, "rewards/no_repetition_reward_func": -0.27399804443120956, "rewards/verse_reward_func": -0.0078125, "step": 1215 }, { "completion_length": 256.0, "epoch": 9.728, "grad_norm": 5.0625, "kl": 2.833557605743408, "learning_rate": 4.865043742431605e-05, "loss": 0.1133, "reward": 2.7558610439300537, "reward_std": 3.1260910034179688, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0438461303710938, "rewards/no_repetition_reward_func": -0.2801726162433624, "rewards/verse_reward_func": -0.0078125, "step": 1216 }, { "completion_length": 248.3125, "epoch": 9.736, "grad_norm": 1.40625, "kl": 3.70027232170105, "learning_rate": 4.8645908229006135e-05, "loss": 0.148, "reward": 1.7137986421585083, "reward_std": 2.6661174297332764, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9485554695129395, "rewards/no_repetition_reward_func": -0.23475682735443115, "rewards/verse_reward_func": 0.0, "step": 1217 }, { "completion_length": 246.390625, "epoch": 9.744, "grad_norm": 2.0, "kl": 2.521375060081482, "learning_rate": 4.8641371657869916e-05, "loss": 0.1009, "reward": 1.6146782636642456, "reward_std": 2.578650951385498, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.870437741279602, "rewards/no_repetition_reward_func": -0.247947059571743, "rewards/verse_reward_func": -0.0078125, "step": 1218 }, { "completion_length": 249.34375, "epoch": 9.752, "grad_norm": 2.78125, "kl": 3.4864702224731445, "learning_rate": 4.863682771232248e-05, "loss": 0.1395, "reward": 1.227284014225006, "reward_std": 2.2073790431022644, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4802438616752625, "rewards/no_repetition_reward_func": -0.25295981019735336, "rewards/verse_reward_func": 0.0, "step": 1219 }, { "completion_length": 254.4375, "epoch": 9.76, "grad_norm": 1.40625, "kl": 2.5549676418304443, "learning_rate": 4.863227639378124e-05, "loss": 0.1022, "reward": 1.6370345950126648, "reward_std": 2.3829137682914734, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9258185029029846, "rewards/no_repetition_reward_func": -0.28878387808799744, "rewards/verse_reward_func": 0.0, "step": 1220 }, { "completion_length": 255.21875, "epoch": 9.768, "grad_norm": 3.09375, "kl": 3.5750011205673218, "learning_rate": 4.862771770366584e-05, "loss": 0.143, "reward": 1.6966673135757446, "reward_std": 2.4657557010650635, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.957975447177887, "rewards/no_repetition_reward_func": -0.253495454788208, "rewards/verse_reward_func": -0.0078125, "step": 1221 }, { "completion_length": 252.765625, "epoch": 9.776, "grad_norm": 5.25, "kl": 1.9367796182632446, "learning_rate": 4.862315164339829e-05, "loss": 0.0775, "reward": 2.462921142578125, "reward_std": 2.913943886756897, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7431492805480957, "rewards/no_repetition_reward_func": -0.2802281826734543, "rewards/verse_reward_func": 0.0, "step": 1222 }, { "completion_length": 249.8125, "epoch": 9.784, "grad_norm": 3.46875, "kl": 1.5684479475021362, "learning_rate": 4.861857821440287e-05, "loss": 0.0627, "reward": 2.4256083965301514, "reward_std": 3.1845189332962036, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7789703607559204, "rewards/no_repetition_reward_func": -0.3455495983362198, "rewards/verse_reward_func": -0.0078125, "step": 1223 }, { "completion_length": 254.71875, "epoch": 9.792, "grad_norm": 3.578125, "kl": 2.0166946053504944, "learning_rate": 4.861399741810615e-05, "loss": 0.0807, "reward": 2.800867199897766, "reward_std": 2.7755078077316284, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0857622623443604, "rewards/no_repetition_reward_func": -0.2848951518535614, "rewards/verse_reward_func": 0.0, "step": 1224 }, { "completion_length": 253.78125, "epoch": 9.8, "grad_norm": 1.5625, "kl": 2.252847194671631, "learning_rate": 4.860940925593703e-05, "loss": 0.0901, "reward": 2.0856690406799316, "reward_std": 2.6139732599258423, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3844391107559204, "rewards/no_repetition_reward_func": -0.28314508497714996, "rewards/verse_reward_func": -0.015625, "step": 1225 }, { "completion_length": 242.6875, "epoch": 9.808, "grad_norm": 2.4375, "kl": 3.193793773651123, "learning_rate": 4.860481372932667e-05, "loss": 0.1278, "reward": 1.8090882897377014, "reward_std": 2.710961103439331, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0817015171051025, "rewards/no_repetition_reward_func": -0.27261335402727127, "rewards/verse_reward_func": 0.0, "step": 1226 }, { "completion_length": 256.0, "epoch": 9.816, "grad_norm": 6.65625, "kl": 4.1241878271102905, "learning_rate": 4.860021083970855e-05, "loss": 0.165, "reward": 1.3663129210472107, "reward_std": 1.9471158385276794, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6425906419754028, "rewards/no_repetition_reward_func": -0.26065272092819214, "rewards/verse_reward_func": -0.015625, "step": 1227 }, { "completion_length": 244.390625, "epoch": 9.824, "grad_norm": 6.78125, "kl": 4.739343643188477, "learning_rate": 4.859560058851844e-05, "loss": 0.1896, "reward": 1.2619640231132507, "reward_std": 2.1212474703788757, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4991361498832703, "rewards/no_repetition_reward_func": -0.22935960441827774, "rewards/verse_reward_func": -0.0078125, "step": 1228 }, { "completion_length": 249.78125, "epoch": 9.832, "grad_norm": 4.96875, "kl": 3.8139359951019287, "learning_rate": 4.85909829771944e-05, "loss": 0.1526, "reward": 1.0807504057884216, "reward_std": 1.9771589636802673, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3306167125701904, "rewards/no_repetition_reward_func": -0.2420538142323494, "rewards/verse_reward_func": -0.0078125, "step": 1229 }, { "completion_length": 256.0, "epoch": 9.84, "grad_norm": 1.1875, "kl": 3.431438684463501, "learning_rate": 4.858635800717681e-05, "loss": 0.1373, "reward": 2.1451767683029175, "reward_std": 3.011893630027771, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.420825719833374, "rewards/no_repetition_reward_func": -0.27564896643161774, "rewards/verse_reward_func": 0.0, "step": 1230 }, { "completion_length": 253.296875, "epoch": 9.848, "grad_norm": 2.046875, "kl": 2.9402785301208496, "learning_rate": 4.8581725679908317e-05, "loss": 0.1176, "reward": 2.5062841176986694, "reward_std": 3.0129066705703735, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.816378116607666, "rewards/no_repetition_reward_func": -0.29446908831596375, "rewards/verse_reward_func": -0.015625, "step": 1231 }, { "completion_length": 251.21875, "epoch": 9.856, "grad_norm": 3.546875, "kl": 1.6959657669067383, "learning_rate": 4.857708599683389e-05, "loss": 0.0678, "reward": 2.4153472185134888, "reward_std": 2.8912326097488403, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7093766927719116, "rewards/no_repetition_reward_func": -0.29402944445610046, "rewards/verse_reward_func": 0.0, "step": 1232 }, { "completion_length": 256.0, "epoch": 9.864, "grad_norm": 2.671875, "kl": 2.84661066532135, "learning_rate": 4.857243895940076e-05, "loss": 0.1139, "reward": 1.733267366886139, "reward_std": 2.0061503648757935, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0046343207359314, "rewards/no_repetition_reward_func": -0.2557419091463089, "rewards/verse_reward_func": -0.015625, "step": 1233 }, { "completion_length": 248.734375, "epoch": 9.872, "grad_norm": 1.3359375, "kl": 2.223117709159851, "learning_rate": 4.856778456905846e-05, "loss": 0.0889, "reward": 1.6540077328681946, "reward_std": 2.155939221382141, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9155056476593018, "rewards/no_repetition_reward_func": -0.2614978402853012, "rewards/verse_reward_func": 0.0, "step": 1234 }, { "completion_length": 253.78125, "epoch": 9.88, "grad_norm": 2.4375, "kl": 3.7625949382781982, "learning_rate": 4.856312282725886e-05, "loss": 0.1505, "reward": 1.856501281261444, "reward_std": 2.3224995136260986, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.104975640773773, "rewards/no_repetition_reward_func": -0.24847444146871567, "rewards/verse_reward_func": 0.0, "step": 1235 }, { "completion_length": 249.984375, "epoch": 9.888, "grad_norm": 2.640625, "kl": 3.0778968334198, "learning_rate": 4.855845373545605e-05, "loss": 0.1231, "reward": 1.2285181879997253, "reward_std": 2.0716179609298706, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4749462604522705, "rewards/no_repetition_reward_func": -0.22299061715602875, "rewards/verse_reward_func": -0.0234375, "step": 1236 }, { "completion_length": 249.984375, "epoch": 9.896, "grad_norm": 2.328125, "kl": 2.0342105627059937, "learning_rate": 4.855377729510648e-05, "loss": 0.0814, "reward": 1.78525972366333, "reward_std": 2.089527666568756, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0757042169570923, "rewards/no_repetition_reward_func": -0.2826320081949234, "rewards/verse_reward_func": -0.0078125, "step": 1237 }, { "completion_length": 256.0, "epoch": 9.904, "grad_norm": 2.890625, "kl": 2.1907383799552917, "learning_rate": 4.8549093507668865e-05, "loss": 0.0876, "reward": 1.9476479887962341, "reward_std": 2.718257427215576, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2305697202682495, "rewards/no_repetition_reward_func": -0.28292177617549896, "rewards/verse_reward_func": 0.0, "step": 1238 }, { "completion_length": 252.484375, "epoch": 9.912, "grad_norm": 1.984375, "kl": 2.1288141012191772, "learning_rate": 4.854440237460418e-05, "loss": 0.0852, "reward": 1.9112936854362488, "reward_std": 2.108749508857727, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1732946038246155, "rewards/no_repetition_reward_func": -0.2620009034872055, "rewards/verse_reward_func": 0.0, "step": 1239 }, { "completion_length": 253.203125, "epoch": 9.92, "grad_norm": 4.875, "kl": 1.5297017693519592, "learning_rate": 4.8539703897375755e-05, "loss": 0.0612, "reward": 2.923712730407715, "reward_std": 2.718656897544861, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.253346800804138, "rewards/no_repetition_reward_func": -0.314008966088295, "rewards/verse_reward_func": -0.015625, "step": 1240 }, { "completion_length": 243.515625, "epoch": 9.928, "grad_norm": 1.46875, "kl": 2.3902801275253296, "learning_rate": 4.853499807744916e-05, "loss": 0.0956, "reward": 1.8492909669876099, "reward_std": 2.6087796688079834, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1048754453659058, "rewards/no_repetition_reward_func": -0.23995966464281082, "rewards/verse_reward_func": -0.015625, "step": 1241 }, { "completion_length": 245.671875, "epoch": 9.936, "grad_norm": 3.328125, "kl": 2.336404800415039, "learning_rate": 4.853028491629228e-05, "loss": 0.0935, "reward": 2.490479826927185, "reward_std": 2.9251874685287476, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8102481365203857, "rewards/no_repetition_reward_func": -0.28851843625307083, "rewards/verse_reward_func": -0.03125, "step": 1242 }, { "completion_length": 250.3125, "epoch": 9.943999999999999, "grad_norm": 2.5, "kl": 2.2542388439178467, "learning_rate": 4.852556441537528e-05, "loss": 0.0902, "reward": 1.8564411401748657, "reward_std": 2.7702407836914062, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1453956365585327, "rewards/no_repetition_reward_func": -0.2811419367790222, "rewards/verse_reward_func": -0.0078125, "step": 1243 }, { "completion_length": 252.46875, "epoch": 9.952, "grad_norm": 1.609375, "kl": 2.9718974828720093, "learning_rate": 4.852083657617061e-05, "loss": 0.1189, "reward": 2.431082010269165, "reward_std": 2.9853192567825317, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7160531282424927, "rewards/no_repetition_reward_func": -0.28497104346752167, "rewards/verse_reward_func": 0.0, "step": 1244 }, { "completion_length": 252.25, "epoch": 9.96, "grad_norm": 3.671875, "kl": 3.698856830596924, "learning_rate": 4.851610140015304e-05, "loss": 0.148, "reward": 1.9986069202423096, "reward_std": 2.811707854270935, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.267258644104004, "rewards/no_repetition_reward_func": -0.2686517685651779, "rewards/verse_reward_func": 0.0, "step": 1245 }, { "completion_length": 253.421875, "epoch": 9.968, "grad_norm": 2.78125, "kl": 3.8203766345977783, "learning_rate": 4.851135888879958e-05, "loss": 0.1528, "reward": 2.1920769214630127, "reward_std": 2.9332029819488525, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.482525110244751, "rewards/no_repetition_reward_func": -0.28263552486896515, "rewards/verse_reward_func": -0.0078125, "step": 1246 }, { "completion_length": 247.5, "epoch": 9.975999999999999, "grad_norm": 2.15625, "kl": 2.5232003927230835, "learning_rate": 4.850660904358956e-05, "loss": 0.1009, "reward": 2.061963379383087, "reward_std": 2.6905816793441772, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.342982530593872, "rewards/no_repetition_reward_func": -0.2810192406177521, "rewards/verse_reward_func": 0.0, "step": 1247 }, { "completion_length": 251.640625, "epoch": 9.984, "grad_norm": 1.859375, "kl": 3.627954602241516, "learning_rate": 4.85018518660046e-05, "loss": 0.1451, "reward": 1.6747651100158691, "reward_std": 2.514962136745453, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.957529067993164, "rewards/no_repetition_reward_func": -0.2749515026807785, "rewards/verse_reward_func": -0.0078125, "step": 1248 }, { "completion_length": 252.84375, "epoch": 9.992, "grad_norm": 3.296875, "kl": 2.726029634475708, "learning_rate": 4.849708735752859e-05, "loss": 0.109, "reward": 2.555816411972046, "reward_std": 3.013188362121582, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8712284564971924, "rewards/no_repetition_reward_func": -0.3154121935367584, "rewards/verse_reward_func": 0.0, "step": 1249 }, { "completion_length": 256.0, "epoch": 10.0, "grad_norm": 1.484375, "kl": 3.098905563354492, "learning_rate": 4.849231551964771e-05, "loss": 0.124, "reward": 2.3589346408843994, "reward_std": 2.4331278800964355, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6464524269104004, "rewards/no_repetition_reward_func": -0.27970530092716217, "rewards/verse_reward_func": -0.0078125, "step": 1250 }, { "completion_length": 249.59375, "epoch": 10.008, "grad_norm": 3.109375, "kl": 3.525046229362488, "learning_rate": 4.8487536353850444e-05, "loss": 0.141, "reward": 2.0705642700195312, "reward_std": 2.4545475840568542, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.386692464351654, "rewards/no_repetition_reward_func": -0.3083159923553467, "rewards/verse_reward_func": -0.0078125, "step": 1251 }, { "completion_length": 251.6875, "epoch": 10.016, "grad_norm": 3.25, "kl": 3.448192000389099, "learning_rate": 4.848274986162754e-05, "loss": 0.1379, "reward": 1.596616506576538, "reward_std": 2.0029642581939697, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8608132004737854, "rewards/no_repetition_reward_func": -0.26419681310653687, "rewards/verse_reward_func": 0.0, "step": 1252 }, { "completion_length": 251.390625, "epoch": 10.024, "grad_norm": 1.9375, "kl": 3.047861933708191, "learning_rate": 4.847795604447204e-05, "loss": 0.1219, "reward": 2.376962423324585, "reward_std": 2.792561888694763, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.672375440597534, "rewards/no_repetition_reward_func": -0.2954131066799164, "rewards/verse_reward_func": 0.0, "step": 1253 }, { "completion_length": 253.03125, "epoch": 10.032, "grad_norm": 2.34375, "kl": 2.763579249382019, "learning_rate": 4.8473154903879276e-05, "loss": 0.1105, "reward": 2.007417678833008, "reward_std": 2.422816038131714, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2762840390205383, "rewards/no_repetition_reward_func": -0.2688664123415947, "rewards/verse_reward_func": 0.0, "step": 1254 }, { "completion_length": 250.9375, "epoch": 10.04, "grad_norm": 6.4375, "kl": 3.7920037508010864, "learning_rate": 4.846834644134686e-05, "loss": 0.1517, "reward": 1.1289617419242859, "reward_std": 2.1534159183502197, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3482713103294373, "rewards/no_repetition_reward_func": -0.21930965781211853, "rewards/verse_reward_func": 0.0, "step": 1255 }, { "completion_length": 256.0, "epoch": 10.048, "grad_norm": 2.578125, "kl": 2.875524163246155, "learning_rate": 4.846353065837467e-05, "loss": 0.115, "reward": 1.6098438501358032, "reward_std": 2.828488349914551, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8772268295288086, "rewards/no_repetition_reward_func": -0.259570449590683, "rewards/verse_reward_func": -0.0078125, "step": 1256 }, { "completion_length": 255.828125, "epoch": 10.056, "grad_norm": 1.7109375, "kl": 3.274288058280945, "learning_rate": 4.845870755646491e-05, "loss": 0.131, "reward": 1.9607408046722412, "reward_std": 2.772236466407776, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.228030562400818, "rewards/no_repetition_reward_func": -0.25947728753089905, "rewards/verse_reward_func": -0.0078125, "step": 1257 }, { "completion_length": 253.125, "epoch": 10.064, "grad_norm": 1.90625, "kl": 3.0240345001220703, "learning_rate": 4.845387713712203e-05, "loss": 0.121, "reward": 2.209833025932312, "reward_std": 2.9695606231689453, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.504556894302368, "rewards/no_repetition_reward_func": -0.28691136837005615, "rewards/verse_reward_func": -0.0078125, "step": 1258 }, { "completion_length": 247.65625, "epoch": 10.072, "grad_norm": 1.734375, "kl": 3.511682629585266, "learning_rate": 4.844903940185276e-05, "loss": 0.1405, "reward": 1.5581154227256775, "reward_std": 2.4176902770996094, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8117173910140991, "rewards/no_repetition_reward_func": -0.23797695338726044, "rewards/verse_reward_func": -0.015625, "step": 1259 }, { "completion_length": 255.859375, "epoch": 10.08, "grad_norm": 2.5, "kl": 3.4592690467834473, "learning_rate": 4.844419435216615e-05, "loss": 0.1384, "reward": 1.916201651096344, "reward_std": 2.2467806339263916, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1758322715759277, "rewards/no_repetition_reward_func": -0.2518182024359703, "rewards/verse_reward_func": -0.0078125, "step": 1260 }, { "completion_length": 247.015625, "epoch": 10.088, "grad_norm": 1.3125, "kl": 3.4056297540664673, "learning_rate": 4.84393419895735e-05, "loss": 0.1362, "reward": 1.550473928451538, "reward_std": 2.4805426597595215, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8251357078552246, "rewards/no_repetition_reward_func": -0.26684923470020294, "rewards/verse_reward_func": -0.0078125, "step": 1261 }, { "completion_length": 255.578125, "epoch": 10.096, "grad_norm": 5.125, "kl": 3.83229923248291, "learning_rate": 4.843448231558839e-05, "loss": 0.1533, "reward": 2.2509000301361084, "reward_std": 2.7748833894729614, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5257670879364014, "rewards/no_repetition_reward_func": -0.26705457270145416, "rewards/verse_reward_func": -0.0078125, "step": 1262 }, { "completion_length": 249.125, "epoch": 10.104, "grad_norm": 5.03125, "kl": 3.462276339530945, "learning_rate": 4.84296153317267e-05, "loss": 0.1385, "reward": 1.919523000717163, "reward_std": 2.6810306310653687, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2486987113952637, "rewards/no_repetition_reward_func": -0.3135507106781006, "rewards/verse_reward_func": -0.015625, "step": 1263 }, { "completion_length": 253.21875, "epoch": 10.112, "grad_norm": 2.953125, "kl": 2.4385993480682373, "learning_rate": 4.8424741039506575e-05, "loss": 0.0975, "reward": 2.925676107406616, "reward_std": 3.0135284662246704, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.214669704437256, "rewards/no_repetition_reward_func": -0.28899364173412323, "rewards/verse_reward_func": 0.0, "step": 1264 }, { "completion_length": 252.8125, "epoch": 10.12, "grad_norm": 2.0625, "kl": 3.0640945434570312, "learning_rate": 4.841985944044845e-05, "loss": 0.1226, "reward": 2.002803325653076, "reward_std": 2.3769125938415527, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2488327026367188, "rewards/no_repetition_reward_func": -0.2460295557975769, "rewards/verse_reward_func": 0.0, "step": 1265 }, { "completion_length": 250.109375, "epoch": 10.128, "grad_norm": 1.7421875, "kl": 3.251824378967285, "learning_rate": 4.8414970536075024e-05, "loss": 0.1301, "reward": 1.9639248847961426, "reward_std": 2.9084123373031616, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2349151372909546, "rewards/no_repetition_reward_func": -0.255365215241909, "rewards/verse_reward_func": -0.015625, "step": 1266 }, { "completion_length": 256.0, "epoch": 10.136, "grad_norm": 2.0625, "kl": 3.693337321281433, "learning_rate": 4.841007432791129e-05, "loss": 0.1477, "reward": 1.7491497993469238, "reward_std": 2.739825487136841, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0220248103141785, "rewards/no_repetition_reward_func": -0.272875040769577, "rewards/verse_reward_func": 0.0, "step": 1267 }, { "completion_length": 249.5, "epoch": 10.144, "grad_norm": 1.671875, "kl": 2.70027232170105, "learning_rate": 4.8405170817484515e-05, "loss": 0.108, "reward": 1.9281284809112549, "reward_std": 2.759758949279785, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2163988947868347, "rewards/no_repetition_reward_func": -0.2726454883813858, "rewards/verse_reward_func": -0.015625, "step": 1268 }, { "completion_length": 246.8125, "epoch": 10.152, "grad_norm": 2.515625, "kl": 3.088896870613098, "learning_rate": 4.8400260006324235e-05, "loss": 0.1236, "reward": 1.554189682006836, "reward_std": 2.2680097222328186, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8138097524642944, "rewards/no_repetition_reward_func": -0.25961995124816895, "rewards/verse_reward_func": 0.0, "step": 1269 }, { "completion_length": 249.75, "epoch": 10.16, "grad_norm": 5.125, "kl": 3.724344849586487, "learning_rate": 4.839534189596228e-05, "loss": 0.149, "reward": 0.8772310763597488, "reward_std": 1.824584722518921, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.1347396671772003, "rewards/no_repetition_reward_func": -0.2184460386633873, "rewards/verse_reward_func": -0.0390625, "step": 1270 }, { "completion_length": 253.3125, "epoch": 10.168, "grad_norm": 2.90625, "kl": 1.8027229309082031, "learning_rate": 4.8390416487932733e-05, "loss": 0.0721, "reward": 2.195545256137848, "reward_std": 2.8064767122268677, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4917187690734863, "rewards/no_repetition_reward_func": -0.2883610129356384, "rewards/verse_reward_func": -0.0078125, "step": 1271 }, { "completion_length": 253.140625, "epoch": 10.176, "grad_norm": 2.4375, "kl": 2.9388519525527954, "learning_rate": 4.8385483783771986e-05, "loss": 0.1176, "reward": 1.8008326888084412, "reward_std": 2.2266916036605835, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0652843713760376, "rewards/no_repetition_reward_func": -0.25663922727108, "rewards/verse_reward_func": -0.0078125, "step": 1272 }, { "completion_length": 252.1875, "epoch": 10.184, "grad_norm": 2.65625, "kl": 2.0015807151794434, "learning_rate": 4.8380543785018677e-05, "loss": 0.0801, "reward": 2.4029496908187866, "reward_std": 3.04882276058197, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7101550102233887, "rewards/no_repetition_reward_func": -0.29939277470111847, "rewards/verse_reward_func": -0.0078125, "step": 1273 }, { "completion_length": 251.5, "epoch": 10.192, "grad_norm": 2.671875, "kl": 2.3842357397079468, "learning_rate": 4.837559649321374e-05, "loss": 0.0954, "reward": 2.173826575279236, "reward_std": 2.9506083726882935, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4486933946609497, "rewards/no_repetition_reward_func": -0.2670543044805527, "rewards/verse_reward_func": -0.0078125, "step": 1274 }, { "completion_length": 251.28125, "epoch": 10.2, "grad_norm": 2.453125, "kl": 2.8792333602905273, "learning_rate": 4.837064190990036e-05, "loss": 0.1152, "reward": 1.2544004917144775, "reward_std": 2.1409807205200195, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5520693063735962, "rewards/no_repetition_reward_func": -0.2820438742637634, "rewards/verse_reward_func": -0.015625, "step": 1275 }, { "completion_length": 243.375, "epoch": 10.208, "grad_norm": 2.0625, "kl": 1.3826093673706055, "learning_rate": 4.8365680036624026e-05, "loss": 0.0553, "reward": 2.6123913526535034, "reward_std": 2.863073706626892, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9141842126846313, "rewards/no_repetition_reward_func": -0.2939804643392563, "rewards/verse_reward_func": -0.0078125, "step": 1276 }, { "completion_length": 252.375, "epoch": 10.216, "grad_norm": 3.09375, "kl": 1.8316097259521484, "learning_rate": 4.8360710874932485e-05, "loss": 0.0733, "reward": 2.023142099380493, "reward_std": 2.7739776372909546, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.324767589569092, "rewards/no_repetition_reward_func": -0.2860003411769867, "rewards/verse_reward_func": -0.015625, "step": 1277 }, { "completion_length": 244.921875, "epoch": 10.224, "grad_norm": 3.453125, "kl": 2.4518778324127197, "learning_rate": 4.8355734426375753e-05, "loss": 0.0981, "reward": 2.241658091545105, "reward_std": 2.9694212675094604, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.528973340988159, "rewards/no_repetition_reward_func": -0.2560652494430542, "rewards/verse_reward_func": -0.03125, "step": 1278 }, { "completion_length": 246.75, "epoch": 10.232, "grad_norm": 1.9375, "kl": 2.437165141105652, "learning_rate": 4.835075069250613e-05, "loss": 0.0975, "reward": 2.207082509994507, "reward_std": 2.6991543769836426, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4567795991897583, "rewards/no_repetition_reward_func": -0.24969705939292908, "rewards/verse_reward_func": 0.0, "step": 1279 }, { "completion_length": 250.40625, "epoch": 10.24, "grad_norm": 2.75, "kl": 3.363276958465576, "learning_rate": 4.834575967487817e-05, "loss": 0.1345, "reward": 1.6130321025848389, "reward_std": 2.28447562456131, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8910455703735352, "rewards/no_repetition_reward_func": -0.2780134081840515, "rewards/verse_reward_func": 0.0, "step": 1280 }, { "completion_length": 243.796875, "epoch": 10.248, "grad_norm": 2.140625, "kl": 3.2561728954315186, "learning_rate": 4.834076137504873e-05, "loss": 0.1302, "reward": 1.6015873551368713, "reward_std": 2.5492557287216187, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8711386322975159, "rewards/no_repetition_reward_func": -0.2617388367652893, "rewards/verse_reward_func": -0.0078125, "step": 1281 }, { "completion_length": 250.953125, "epoch": 10.256, "grad_norm": 3.34375, "kl": 4.347336530685425, "learning_rate": 4.833575579457691e-05, "loss": 0.1739, "reward": 1.6219167709350586, "reward_std": 2.4415628910064697, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8789626359939575, "rewards/no_repetition_reward_func": -0.25704583525657654, "rewards/verse_reward_func": 0.0, "step": 1282 }, { "completion_length": 244.0625, "epoch": 10.264, "grad_norm": 1.515625, "kl": 3.4454232454299927, "learning_rate": 4.83307429350241e-05, "loss": 0.1378, "reward": 1.8616709113121033, "reward_std": 2.8258719444274902, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1348270177841187, "rewards/no_repetition_reward_func": -0.26534371078014374, "rewards/verse_reward_func": -0.0078125, "step": 1283 }, { "completion_length": 248.609375, "epoch": 10.272, "grad_norm": 4.09375, "kl": 3.8260287046432495, "learning_rate": 4.8325722797953945e-05, "loss": 0.153, "reward": 1.1639610528945923, "reward_std": 2.1474438309669495, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3956354260444641, "rewards/no_repetition_reward_func": -0.23167438805103302, "rewards/verse_reward_func": 0.0, "step": 1284 }, { "completion_length": 252.375, "epoch": 10.28, "grad_norm": 3.4375, "kl": 2.9342557191848755, "learning_rate": 4.832069538493237e-05, "loss": 0.1174, "reward": 1.7951459288597107, "reward_std": 2.428572416305542, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0717867016792297, "rewards/no_repetition_reward_func": -0.2688281685113907, "rewards/verse_reward_func": -0.0078125, "step": 1285 }, { "completion_length": 246.5625, "epoch": 10.288, "grad_norm": 2.21875, "kl": 3.044814109802246, "learning_rate": 4.8315660697527566e-05, "loss": 0.1218, "reward": 2.0913552045822144, "reward_std": 2.638360857963562, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3334661722183228, "rewards/no_repetition_reward_func": -0.24211110919713974, "rewards/verse_reward_func": 0.0, "step": 1286 }, { "completion_length": 254.703125, "epoch": 10.296, "grad_norm": 3.40625, "kl": 1.9147295951843262, "learning_rate": 4.831061873730999e-05, "loss": 0.0766, "reward": 2.3139761686325073, "reward_std": 2.694627285003662, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6136510372161865, "rewards/no_repetition_reward_func": -0.2996750771999359, "rewards/verse_reward_func": 0.0, "step": 1287 }, { "completion_length": 249.265625, "epoch": 10.304, "grad_norm": 2.21875, "kl": 2.91803240776062, "learning_rate": 4.830556950585238e-05, "loss": 0.1167, "reward": 1.492853045463562, "reward_std": 2.3730255365371704, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.760094404220581, "rewards/no_repetition_reward_func": -0.25161638110876083, "rewards/verse_reward_func": -0.015625, "step": 1288 }, { "completion_length": 252.125, "epoch": 10.312, "grad_norm": 2.125, "kl": 3.0454258918762207, "learning_rate": 4.8300513004729735e-05, "loss": 0.1218, "reward": 1.7321182489395142, "reward_std": 2.3975436687469482, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9605096578598022, "rewards/no_repetition_reward_func": -0.2283913865685463, "rewards/verse_reward_func": 0.0, "step": 1289 }, { "completion_length": 252.078125, "epoch": 10.32, "grad_norm": 2.0625, "kl": 2.668224573135376, "learning_rate": 4.829544923551931e-05, "loss": 0.1067, "reward": 1.430763602256775, "reward_std": 2.195650100708008, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6673569679260254, "rewards/no_repetition_reward_func": -0.23659338057041168, "rewards/verse_reward_func": 0.0, "step": 1290 }, { "completion_length": 248.578125, "epoch": 10.328, "grad_norm": 4.4375, "kl": 2.239967107772827, "learning_rate": 4.829037819980065e-05, "loss": 0.0896, "reward": 2.329179883003235, "reward_std": 2.797423481941223, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.622012138366699, "rewards/no_repetition_reward_func": -0.28501978516578674, "rewards/verse_reward_func": -0.0078125, "step": 1291 }, { "completion_length": 254.046875, "epoch": 10.336, "grad_norm": 1.796875, "kl": 2.7174497842788696, "learning_rate": 4.828529989915555e-05, "loss": 0.1087, "reward": 1.8406079411506653, "reward_std": 2.606667995452881, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.122002959251404, "rewards/no_repetition_reward_func": -0.28139497339725494, "rewards/verse_reward_func": 0.0, "step": 1292 }, { "completion_length": 250.328125, "epoch": 10.344, "grad_norm": 2.6875, "kl": 2.633272886276245, "learning_rate": 4.828021433516806e-05, "loss": 0.1053, "reward": 2.042636215686798, "reward_std": 2.9702574014663696, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3367022275924683, "rewards/no_repetition_reward_func": -0.2862536609172821, "rewards/verse_reward_func": -0.0078125, "step": 1293 }, { "completion_length": 246.875, "epoch": 10.352, "grad_norm": 1.609375, "kl": 3.244893431663513, "learning_rate": 4.827512150942454e-05, "loss": 0.1298, "reward": 1.4375944435596466, "reward_std": 2.254799962043762, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.683161199092865, "rewards/no_repetition_reward_func": -0.23775430023670197, "rewards/verse_reward_func": -0.0078125, "step": 1294 }, { "completion_length": 248.484375, "epoch": 10.36, "grad_norm": 2.75, "kl": 2.5212031602859497, "learning_rate": 4.8270021423513554e-05, "loss": 0.1008, "reward": 1.808414101600647, "reward_std": 2.6367790699005127, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.050745964050293, "rewards/no_repetition_reward_func": -0.24233174324035645, "rewards/verse_reward_func": 0.0, "step": 1295 }, { "completion_length": 256.0, "epoch": 10.368, "grad_norm": 1.7109375, "kl": 2.292543411254883, "learning_rate": 4.826491407902599e-05, "loss": 0.0917, "reward": 2.2255831360816956, "reward_std": 2.7239913940429688, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5258967876434326, "rewards/no_repetition_reward_func": -0.30031368136405945, "rewards/verse_reward_func": 0.0, "step": 1296 }, { "completion_length": 250.03125, "epoch": 10.376, "grad_norm": 1.8515625, "kl": 3.332329273223877, "learning_rate": 4.8259799477554965e-05, "loss": 0.1333, "reward": 1.266997516155243, "reward_std": 1.9857488870620728, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5222235918045044, "rewards/no_repetition_reward_func": -0.24741347134113312, "rewards/verse_reward_func": -0.0078125, "step": 1297 }, { "completion_length": 252.0625, "epoch": 10.384, "grad_norm": 2.96875, "kl": 2.704203963279724, "learning_rate": 4.825467762069585e-05, "loss": 0.1082, "reward": 2.840353488922119, "reward_std": 2.899942636489868, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.149872899055481, "rewards/no_repetition_reward_func": -0.3095194101333618, "rewards/verse_reward_func": 0.0, "step": 1298 }, { "completion_length": 251.640625, "epoch": 10.392, "grad_norm": 2.265625, "kl": 3.1998804807662964, "learning_rate": 4.824954851004633e-05, "loss": 0.128, "reward": 2.415069103240967, "reward_std": 2.8599241971969604, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6955026388168335, "rewards/no_repetition_reward_func": -0.28043346107006073, "rewards/verse_reward_func": 0.0, "step": 1299 }, { "completion_length": 242.34375, "epoch": 10.4, "grad_norm": 2.25, "kl": 3.2038095593452454, "learning_rate": 4.8244412147206284e-05, "loss": 0.1282, "reward": 2.3991130590438843, "reward_std": 2.6458141803741455, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.685195565223694, "rewards/no_repetition_reward_func": -0.2626451179385185, "rewards/verse_reward_func": -0.0234375, "step": 1300 }, { "completion_length": 250.359375, "epoch": 10.408, "grad_norm": 3.0, "kl": 4.039830923080444, "learning_rate": 4.823926853377791e-05, "loss": 0.1616, "reward": 2.209096372127533, "reward_std": 2.911450743675232, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4776923656463623, "rewards/no_repetition_reward_func": -0.2685958743095398, "rewards/verse_reward_func": 0.0, "step": 1301 }, { "completion_length": 252.546875, "epoch": 10.416, "grad_norm": 2.609375, "kl": 3.847018003463745, "learning_rate": 4.823411767136565e-05, "loss": 0.1539, "reward": 1.7356911897659302, "reward_std": 2.4337085485458374, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0302480459213257, "rewards/no_repetition_reward_func": -0.2867443859577179, "rewards/verse_reward_func": -0.0078125, "step": 1302 }, { "completion_length": 256.0, "epoch": 10.424, "grad_norm": 1.6015625, "kl": 3.043844223022461, "learning_rate": 4.822895956157619e-05, "loss": 0.1218, "reward": 2.607338547706604, "reward_std": 2.8988513946533203, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8907943964004517, "rewards/no_repetition_reward_func": -0.2834556847810745, "rewards/verse_reward_func": 0.0, "step": 1303 }, { "completion_length": 240.09375, "epoch": 10.432, "grad_norm": 4.46875, "kl": 4.600900888442993, "learning_rate": 4.822379420601849e-05, "loss": 0.184, "reward": 1.3911580741405487, "reward_std": 2.4616693258285522, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6218320727348328, "rewards/no_repetition_reward_func": -0.22286155819892883, "rewards/verse_reward_func": -0.0078125, "step": 1304 }, { "completion_length": 248.21875, "epoch": 10.44, "grad_norm": 2.28125, "kl": 3.693882703781128, "learning_rate": 4.821862160630378e-05, "loss": 0.1478, "reward": 2.161581516265869, "reward_std": 3.005698084831238, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.483582019805908, "rewards/no_repetition_reward_func": -0.3141879290342331, "rewards/verse_reward_func": -0.0078125, "step": 1305 }, { "completion_length": 250.4375, "epoch": 10.448, "grad_norm": 1.7421875, "kl": 2.912824273109436, "learning_rate": 4.821344176404554e-05, "loss": 0.1165, "reward": 2.124214768409729, "reward_std": 2.622672915458679, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4160760641098022, "rewards/no_repetition_reward_func": -0.29186147451400757, "rewards/verse_reward_func": 0.0, "step": 1306 }, { "completion_length": 253.640625, "epoch": 10.456, "grad_norm": 2.28125, "kl": 2.8211475610733032, "learning_rate": 4.8208254680859494e-05, "loss": 0.1128, "reward": 1.4363079071044922, "reward_std": 2.523097515106201, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7118335962295532, "rewards/no_repetition_reward_func": -0.27552567422389984, "rewards/verse_reward_func": 0.0, "step": 1307 }, { "completion_length": 247.453125, "epoch": 10.464, "grad_norm": 1.1640625, "kl": 3.006529211997986, "learning_rate": 4.820306035836365e-05, "loss": 0.1203, "reward": 2.024729371070862, "reward_std": 2.6620813608169556, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2945204973220825, "rewards/no_repetition_reward_func": -0.26197852194309235, "rewards/verse_reward_func": -0.0078125, "step": 1308 }, { "completion_length": 253.90625, "epoch": 10.472, "grad_norm": 3.109375, "kl": 2.747805416584015, "learning_rate": 4.819785879817827e-05, "loss": 0.1099, "reward": 2.4381068348884583, "reward_std": 2.898783326148987, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7389614582061768, "rewards/no_repetition_reward_func": -0.29304227232933044, "rewards/verse_reward_func": -0.0078125, "step": 1309 }, { "completion_length": 252.171875, "epoch": 10.48, "grad_norm": 1.8984375, "kl": 3.043882966041565, "learning_rate": 4.8192650001925855e-05, "loss": 0.1218, "reward": 2.0528865456581116, "reward_std": 2.40920090675354, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3329265117645264, "rewards/no_repetition_reward_func": -0.2644149959087372, "rewards/verse_reward_func": -0.015625, "step": 1310 }, { "completion_length": 246.625, "epoch": 10.488, "grad_norm": 2.03125, "kl": 2.83988618850708, "learning_rate": 4.818743397123119e-05, "loss": 0.1136, "reward": 2.169334888458252, "reward_std": 2.601257562637329, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4460864067077637, "rewards/no_repetition_reward_func": -0.26893898844718933, "rewards/verse_reward_func": -0.0078125, "step": 1311 }, { "completion_length": 253.234375, "epoch": 10.496, "grad_norm": 2.375, "kl": 3.2736748456954956, "learning_rate": 4.8182210707721284e-05, "loss": 0.1309, "reward": 2.3728479146957397, "reward_std": 2.831642508506775, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6667507886886597, "rewards/no_repetition_reward_func": -0.2939029186964035, "rewards/verse_reward_func": 0.0, "step": 1312 }, { "completion_length": 249.265625, "epoch": 10.504, "grad_norm": 2.953125, "kl": 2.352568030357361, "learning_rate": 4.8176980213025434e-05, "loss": 0.0941, "reward": 2.40652596950531, "reward_std": 3.1897950172424316, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.718060612678528, "rewards/no_repetition_reward_func": -0.3037220239639282, "rewards/verse_reward_func": -0.0078125, "step": 1313 }, { "completion_length": 247.84375, "epoch": 10.512, "grad_norm": 4.4375, "kl": 4.704723596572876, "learning_rate": 4.817174248877518e-05, "loss": 0.1882, "reward": 1.6775270700454712, "reward_std": 2.6127315759658813, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.953134536743164, "rewards/no_repetition_reward_func": -0.2599825859069824, "rewards/verse_reward_func": -0.015625, "step": 1314 }, { "completion_length": 254.3125, "epoch": 10.52, "grad_norm": 3.546875, "kl": 3.24955677986145, "learning_rate": 4.81664975366043e-05, "loss": 0.13, "reward": 1.772855520248413, "reward_std": 2.3008564710617065, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0669630765914917, "rewards/no_repetition_reward_func": -0.2862951457500458, "rewards/verse_reward_func": -0.0078125, "step": 1315 }, { "completion_length": 250.703125, "epoch": 10.528, "grad_norm": 6.84375, "kl": 4.813642263412476, "learning_rate": 4.8161245358148866e-05, "loss": 0.1925, "reward": 1.2859545350074768, "reward_std": 2.416119694709778, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5327450633049011, "rewards/no_repetition_reward_func": -0.2311655953526497, "rewards/verse_reward_func": -0.015625, "step": 1316 }, { "completion_length": 247.65625, "epoch": 10.536, "grad_norm": 2.265625, "kl": 3.013908863067627, "learning_rate": 4.815598595504717e-05, "loss": 0.1206, "reward": 2.508423686027527, "reward_std": 3.028068780899048, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.822439193725586, "rewards/no_repetition_reward_func": -0.3062030225992203, "rewards/verse_reward_func": -0.0078125, "step": 1317 }, { "completion_length": 249.75, "epoch": 10.544, "grad_norm": 4.71875, "kl": 4.042157173156738, "learning_rate": 4.8150719328939755e-05, "loss": 0.1617, "reward": 1.5203253030776978, "reward_std": 2.5624277591705322, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7720866203308105, "rewards/no_repetition_reward_func": -0.2517612725496292, "rewards/verse_reward_func": 0.0, "step": 1318 }, { "completion_length": 250.296875, "epoch": 10.552, "grad_norm": 1.578125, "kl": 3.162718415260315, "learning_rate": 4.814544548146945e-05, "loss": 0.1265, "reward": 2.1986038088798523, "reward_std": 2.99102520942688, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5083147287368774, "rewards/no_repetition_reward_func": -0.28627343475818634, "rewards/verse_reward_func": -0.0234375, "step": 1319 }, { "completion_length": 256.0, "epoch": 10.56, "grad_norm": 1.6171875, "kl": 3.330010771751404, "learning_rate": 4.8140164414281306e-05, "loss": 0.1332, "reward": 2.212236166000366, "reward_std": 2.7272592782974243, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.49379301071167, "rewards/no_repetition_reward_func": -0.27374427020549774, "rewards/verse_reward_func": -0.0078125, "step": 1320 }, { "completion_length": 256.0, "epoch": 10.568, "grad_norm": 1.703125, "kl": 2.5814000368118286, "learning_rate": 4.813487612902264e-05, "loss": 0.1033, "reward": 2.403175950050354, "reward_std": 2.5798685550689697, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7107036113739014, "rewards/no_repetition_reward_func": -0.30752770602703094, "rewards/verse_reward_func": 0.0, "step": 1321 }, { "completion_length": 252.453125, "epoch": 10.576, "grad_norm": 4.78125, "kl": 2.1548415422439575, "learning_rate": 4.812958062734302e-05, "loss": 0.0862, "reward": 2.924537777900696, "reward_std": 3.141844391822815, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2179362773895264, "rewards/no_repetition_reward_func": -0.29339851438999176, "rewards/verse_reward_func": 0.0, "step": 1322 }, { "completion_length": 254.921875, "epoch": 10.584, "grad_norm": 1.84375, "kl": 2.9587372541427612, "learning_rate": 4.812427791089426e-05, "loss": 0.1183, "reward": 1.975887417793274, "reward_std": 2.5765678882598877, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2608667612075806, "rewards/no_repetition_reward_func": -0.2771669030189514, "rewards/verse_reward_func": -0.0078125, "step": 1323 }, { "completion_length": 253.15625, "epoch": 10.592, "grad_norm": 2.734375, "kl": 3.6237350702285767, "learning_rate": 4.811896798133042e-05, "loss": 0.1449, "reward": 1.7218995094299316, "reward_std": 2.5625245571136475, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0293010473251343, "rewards/no_repetition_reward_func": -0.29958902299404144, "rewards/verse_reward_func": -0.0078125, "step": 1324 }, { "completion_length": 245.609375, "epoch": 10.6, "grad_norm": 3.21875, "kl": 3.0929518938064575, "learning_rate": 4.8113650840307834e-05, "loss": 0.1237, "reward": 2.0483089685440063, "reward_std": 2.550509214401245, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.31319123506546, "rewards/no_repetition_reward_func": -0.2570699006319046, "rewards/verse_reward_func": -0.0078125, "step": 1325 }, { "completion_length": 256.0, "epoch": 10.608, "grad_norm": 2.0625, "kl": 3.34014356136322, "learning_rate": 4.810832648948505e-05, "loss": 0.1336, "reward": 1.8805230855941772, "reward_std": 2.7938913106918335, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.175446391105652, "rewards/no_repetition_reward_func": -0.2949230819940567, "rewards/verse_reward_func": 0.0, "step": 1326 }, { "completion_length": 254.59375, "epoch": 10.616, "grad_norm": 3.84375, "kl": 4.962551832199097, "learning_rate": 4.810299493052289e-05, "loss": 0.1985, "reward": 1.8886666297912598, "reward_std": 2.7392266988754272, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1819838881492615, "rewards/no_repetition_reward_func": -0.2776922136545181, "rewards/verse_reward_func": -0.015625, "step": 1327 }, { "completion_length": 247.015625, "epoch": 10.624, "grad_norm": 1.3046875, "kl": 3.191590905189514, "learning_rate": 4.809765616508443e-05, "loss": 0.1277, "reward": 1.6524938941001892, "reward_std": 2.6462137699127197, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.940325915813446, "rewards/no_repetition_reward_func": -0.2722070813179016, "rewards/verse_reward_func": -0.015625, "step": 1328 }, { "completion_length": 256.0, "epoch": 10.632, "grad_norm": 1.6875, "kl": 2.4409568309783936, "learning_rate": 4.809231019483497e-05, "loss": 0.0976, "reward": 2.602574348449707, "reward_std": 2.784364104270935, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8949451446533203, "rewards/no_repetition_reward_func": -0.2923707515001297, "rewards/verse_reward_func": 0.0, "step": 1329 }, { "completion_length": 255.171875, "epoch": 10.64, "grad_norm": 2.140625, "kl": 2.7050139904022217, "learning_rate": 4.808695702144206e-05, "loss": 0.1082, "reward": 2.122928500175476, "reward_std": 2.619795799255371, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4135327339172363, "rewards/no_repetition_reward_func": -0.29060423374176025, "rewards/verse_reward_func": 0.0, "step": 1330 }, { "completion_length": 247.9375, "epoch": 10.648, "grad_norm": 1.265625, "kl": 3.5829432010650635, "learning_rate": 4.808159664657552e-05, "loss": 0.1433, "reward": 1.956480085849762, "reward_std": 2.6842072010040283, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2436152696609497, "rewards/no_repetition_reward_func": -0.26369766891002655, "rewards/verse_reward_func": -0.0234375, "step": 1331 }, { "completion_length": 255.3125, "epoch": 10.656, "grad_norm": 1.8359375, "kl": 2.8875828981399536, "learning_rate": 4.8076229071907397e-05, "loss": 0.1155, "reward": 1.944990634918213, "reward_std": 2.4214006662368774, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.204452633857727, "rewards/no_repetition_reward_func": -0.259461909532547, "rewards/verse_reward_func": 0.0, "step": 1332 }, { "completion_length": 249.875, "epoch": 10.664, "grad_norm": 2.09375, "kl": 2.8345253467559814, "learning_rate": 4.8070854299111994e-05, "loss": 0.1134, "reward": 1.9924103021621704, "reward_std": 2.6508610248565674, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2427033185958862, "rewards/no_repetition_reward_func": -0.2502930760383606, "rewards/verse_reward_func": 0.0, "step": 1333 }, { "completion_length": 252.1875, "epoch": 10.672, "grad_norm": 3.5, "kl": 2.269514799118042, "learning_rate": 4.8065472329865854e-05, "loss": 0.0908, "reward": 2.256093144416809, "reward_std": 2.740443229675293, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.564081847667694, "rewards/no_repetition_reward_func": -0.2923637926578522, "rewards/verse_reward_func": -0.015625, "step": 1334 }, { "completion_length": 252.609375, "epoch": 10.68, "grad_norm": 3.28125, "kl": 2.593971371650696, "learning_rate": 4.8060083165847754e-05, "loss": 0.1038, "reward": 2.351435899734497, "reward_std": 2.771678924560547, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6557854413986206, "rewards/no_repetition_reward_func": -0.3043495714664459, "rewards/verse_reward_func": 0.0, "step": 1335 }, { "completion_length": 253.6875, "epoch": 10.688, "grad_norm": 1.6953125, "kl": 3.072746992111206, "learning_rate": 4.805468680873874e-05, "loss": 0.1229, "reward": 1.9659852981567383, "reward_std": 2.6221508979797363, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2375733256340027, "rewards/no_repetition_reward_func": -0.2637755498290062, "rewards/verse_reward_func": -0.0078125, "step": 1336 }, { "completion_length": 251.625, "epoch": 10.696, "grad_norm": 1.71875, "kl": 3.362931728363037, "learning_rate": 4.8049283260222075e-05, "loss": 0.1345, "reward": 2.7130777835845947, "reward_std": 2.8334072828292847, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.003902316093445, "rewards/no_repetition_reward_func": -0.2751997113227844, "rewards/verse_reward_func": -0.015625, "step": 1337 }, { "completion_length": 256.0, "epoch": 10.704, "grad_norm": 4.1875, "kl": 3.5371711254119873, "learning_rate": 4.8043872521983294e-05, "loss": 0.1415, "reward": 1.629838466644287, "reward_std": 2.2076377868652344, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8678696155548096, "rewards/no_repetition_reward_func": -0.2380310744047165, "rewards/verse_reward_func": 0.0, "step": 1338 }, { "completion_length": 249.921875, "epoch": 10.712, "grad_norm": 1.640625, "kl": 3.89092755317688, "learning_rate": 4.803845459571014e-05, "loss": 0.1556, "reward": 2.6483899354934692, "reward_std": 2.963521122932434, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.938456654548645, "rewards/no_repetition_reward_func": -0.2900669723749161, "rewards/verse_reward_func": 0.0, "step": 1339 }, { "completion_length": 251.015625, "epoch": 10.72, "grad_norm": 1.6953125, "kl": 3.056521773338318, "learning_rate": 4.803302948309264e-05, "loss": 0.1223, "reward": 2.0388827323913574, "reward_std": 3.085874915122986, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3601553440093994, "rewards/no_repetition_reward_func": -0.31346020102500916, "rewards/verse_reward_func": -0.0078125, "step": 1340 }, { "completion_length": 256.0, "epoch": 10.728, "grad_norm": 2.484375, "kl": 4.00168776512146, "learning_rate": 4.8027597185823016e-05, "loss": 0.1601, "reward": 2.1292672157287598, "reward_std": 2.4782254695892334, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.417388916015625, "rewards/no_repetition_reward_func": -0.28030917048454285, "rewards/verse_reward_func": -0.0078125, "step": 1341 }, { "completion_length": 248.71875, "epoch": 10.736, "grad_norm": 2.484375, "kl": 4.45344090461731, "learning_rate": 4.802215770559577e-05, "loss": 0.1781, "reward": 1.8629655241966248, "reward_std": 2.902949810028076, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1392372846603394, "rewards/no_repetition_reward_func": -0.276271790266037, "rewards/verse_reward_func": 0.0, "step": 1342 }, { "completion_length": 246.53125, "epoch": 10.744, "grad_norm": 3.1875, "kl": 4.233939051628113, "learning_rate": 4.801671104410763e-05, "loss": 0.1694, "reward": 2.215682864189148, "reward_std": 2.5530059337615967, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5147976875305176, "rewards/no_repetition_reward_func": -0.2756774351000786, "rewards/verse_reward_func": -0.0234375, "step": 1343 }, { "completion_length": 256.0, "epoch": 10.752, "grad_norm": 1.3515625, "kl": 3.8447381258010864, "learning_rate": 4.8011257203057556e-05, "loss": 0.1538, "reward": 2.507455587387085, "reward_std": 2.904200553894043, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8185675144195557, "rewards/no_repetition_reward_func": -0.3111118823289871, "rewards/verse_reward_func": 0.0, "step": 1344 }, { "completion_length": 256.0, "epoch": 10.76, "grad_norm": 2.578125, "kl": 2.7347391843795776, "learning_rate": 4.800579618414676e-05, "loss": 0.1094, "reward": 2.510361433029175, "reward_std": 2.942573666572571, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.816643476486206, "rewards/no_repetition_reward_func": -0.2984696179628372, "rewards/verse_reward_func": -0.0078125, "step": 1345 }, { "completion_length": 250.8125, "epoch": 10.768, "grad_norm": 2.046875, "kl": 2.478702664375305, "learning_rate": 4.800032798907869e-05, "loss": 0.0991, "reward": 2.146777093410492, "reward_std": 2.907810926437378, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4431199431419373, "rewards/no_repetition_reward_func": -0.2963428646326065, "rewards/verse_reward_func": 0.0, "step": 1346 }, { "completion_length": 252.359375, "epoch": 10.776, "grad_norm": 2.65625, "kl": 2.496246337890625, "learning_rate": 4.7994852619559016e-05, "loss": 0.0998, "reward": 2.2755404710769653, "reward_std": 2.668538451194763, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.554896831512451, "rewards/no_repetition_reward_func": -0.27154381573200226, "rewards/verse_reward_func": -0.0078125, "step": 1347 }, { "completion_length": 249.21875, "epoch": 10.784, "grad_norm": 2.5625, "kl": 2.1361770033836365, "learning_rate": 4.798937007729568e-05, "loss": 0.0854, "reward": 2.116059899330139, "reward_std": 2.709002137184143, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4273877143859863, "rewards/no_repetition_reward_func": -0.29570285975933075, "rewards/verse_reward_func": -0.015625, "step": 1348 }, { "completion_length": 250.90625, "epoch": 10.792, "grad_norm": 1.5546875, "kl": 2.4848670959472656, "learning_rate": 4.798388036399883e-05, "loss": 0.0994, "reward": 1.9762804508209229, "reward_std": 2.7370413541793823, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.276212453842163, "rewards/no_repetition_reward_func": -0.29993200302124023, "rewards/verse_reward_func": 0.0, "step": 1349 }, { "completion_length": 252.890625, "epoch": 10.8, "grad_norm": 1.6015625, "kl": 3.1192270517349243, "learning_rate": 4.797838348138086e-05, "loss": 0.1248, "reward": 1.8486726880073547, "reward_std": 2.600929617881775, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.102156102657318, "rewards/no_repetition_reward_func": -0.25348352640867233, "rewards/verse_reward_func": 0.0, "step": 1350 }, { "completion_length": 256.0, "epoch": 10.808, "grad_norm": 1.78125, "kl": 2.5885530710220337, "learning_rate": 4.797287943115641e-05, "loss": 0.1035, "reward": 2.485548257827759, "reward_std": 2.6000871658325195, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.750664472579956, "rewards/no_repetition_reward_func": -0.25730374455451965, "rewards/verse_reward_func": -0.0078125, "step": 1351 }, { "completion_length": 256.0, "epoch": 10.816, "grad_norm": 1.3203125, "kl": 2.8831558227539062, "learning_rate": 4.796736821504235e-05, "loss": 0.1153, "reward": 1.7678515911102295, "reward_std": 2.510586977005005, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.080283522605896, "rewards/no_repetition_reward_func": -0.3124317228794098, "rewards/verse_reward_func": 0.0, "step": 1352 }, { "completion_length": 250.390625, "epoch": 10.824, "grad_norm": 3.03125, "kl": 3.7304065227508545, "learning_rate": 4.7961849834757786e-05, "loss": 0.1492, "reward": 1.6605530381202698, "reward_std": 2.4473907947540283, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9226118922233582, "rewards/no_repetition_reward_func": -0.2620589807629585, "rewards/verse_reward_func": 0.0, "step": 1353 }, { "completion_length": 229.03125, "epoch": 10.832, "grad_norm": 2.25, "kl": 3.4643821716308594, "learning_rate": 4.795632429202405e-05, "loss": 0.1386, "reward": 2.2751299142837524, "reward_std": 2.6269054412841797, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5430864095687866, "rewards/no_repetition_reward_func": -0.26014402508735657, "rewards/verse_reward_func": -0.0078125, "step": 1354 }, { "completion_length": 252.53125, "epoch": 10.84, "grad_norm": 1.96875, "kl": 3.3714133501052856, "learning_rate": 4.79507915885647e-05, "loss": 0.1349, "reward": 1.9844201803207397, "reward_std": 2.687579393386841, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2670334577560425, "rewards/no_repetition_reward_func": -0.2826133370399475, "rewards/verse_reward_func": 0.0, "step": 1355 }, { "completion_length": 249.84375, "epoch": 10.848, "grad_norm": 1.6796875, "kl": 2.659888982772827, "learning_rate": 4.794525172610558e-05, "loss": 0.1064, "reward": 1.932328462600708, "reward_std": 2.6967087984085083, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2071949243545532, "rewards/no_repetition_reward_func": -0.267053984105587, "rewards/verse_reward_func": -0.0078125, "step": 1356 }, { "completion_length": 252.203125, "epoch": 10.856, "grad_norm": 3.5625, "kl": 2.869115114212036, "learning_rate": 4.793970470637469e-05, "loss": 0.1148, "reward": 2.2939200401306152, "reward_std": 2.288434863090515, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5574350357055664, "rewards/no_repetition_reward_func": -0.255702368915081, "rewards/verse_reward_func": -0.0078125, "step": 1357 }, { "completion_length": 246.09375, "epoch": 10.864, "grad_norm": 1.078125, "kl": 2.880110740661621, "learning_rate": 4.793415053110233e-05, "loss": 0.1152, "reward": 2.0104297399520874, "reward_std": 2.427007794380188, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2589653730392456, "rewards/no_repetition_reward_func": -0.2485356479883194, "rewards/verse_reward_func": 0.0, "step": 1358 }, { "completion_length": 253.140625, "epoch": 10.872, "grad_norm": 1.8203125, "kl": 2.937825322151184, "learning_rate": 4.792858920202099e-05, "loss": 0.1175, "reward": 1.3863151669502258, "reward_std": 2.451823353767395, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6441611051559448, "rewards/no_repetition_reward_func": -0.257845938205719, "rewards/verse_reward_func": 0.0, "step": 1359 }, { "completion_length": 248.03125, "epoch": 10.88, "grad_norm": 2.34375, "kl": 2.7919254302978516, "learning_rate": 4.7923020720865414e-05, "loss": 0.1117, "reward": 1.878045678138733, "reward_std": 2.528690814971924, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.122705101966858, "rewards/no_repetition_reward_func": -0.22903435677289963, "rewards/verse_reward_func": -0.015625, "step": 1360 }, { "completion_length": 252.015625, "epoch": 10.888, "grad_norm": 2.75, "kl": 2.405871093273163, "learning_rate": 4.791744508937256e-05, "loss": 0.0962, "reward": 1.9797090291976929, "reward_std": 2.809287667274475, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2551900148391724, "rewards/no_repetition_reward_func": -0.2676684781908989, "rewards/verse_reward_func": -0.0078125, "step": 1361 }, { "completion_length": 247.203125, "epoch": 10.896, "grad_norm": 2.375, "kl": 2.353765606880188, "learning_rate": 4.791186230928163e-05, "loss": 0.0942, "reward": 2.5597909688949585, "reward_std": 2.492441773414612, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8151276111602783, "rewards/no_repetition_reward_func": -0.25533680617809296, "rewards/verse_reward_func": 0.0, "step": 1362 }, { "completion_length": 248.546875, "epoch": 10.904, "grad_norm": 3.578125, "kl": 2.953064799308777, "learning_rate": 4.790627238233405e-05, "loss": 0.1181, "reward": 2.504089593887329, "reward_std": 2.9070205688476562, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.784755825996399, "rewards/no_repetition_reward_func": -0.28066644072532654, "rewards/verse_reward_func": 0.0, "step": 1363 }, { "completion_length": 251.203125, "epoch": 10.912, "grad_norm": 1.84375, "kl": 2.8218621015548706, "learning_rate": 4.7900675310273466e-05, "loss": 0.1129, "reward": 2.259797692298889, "reward_std": 2.6035677194595337, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5439951419830322, "rewards/no_repetition_reward_func": -0.2841973900794983, "rewards/verse_reward_func": 0.0, "step": 1364 }, { "completion_length": 253.40625, "epoch": 10.92, "grad_norm": 2.375, "kl": 2.4135371446609497, "learning_rate": 4.789507109484579e-05, "loss": 0.0965, "reward": 2.2230355739593506, "reward_std": 2.772768974304199, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.504231810569763, "rewards/no_repetition_reward_func": -0.2811962962150574, "rewards/verse_reward_func": 0.0, "step": 1365 }, { "completion_length": 250.671875, "epoch": 10.928, "grad_norm": 2.796875, "kl": 3.769008159637451, "learning_rate": 4.78894597377991e-05, "loss": 0.1508, "reward": 2.134775996208191, "reward_std": 2.5823646783828735, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3719266653060913, "rewards/no_repetition_reward_func": -0.2293381467461586, "rewards/verse_reward_func": -0.0078125, "step": 1366 }, { "completion_length": 249.890625, "epoch": 10.936, "grad_norm": 2.65625, "kl": 3.4541831016540527, "learning_rate": 4.7883841240883766e-05, "loss": 0.1382, "reward": 1.8632030487060547, "reward_std": 2.4229297637939453, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1271411180496216, "rewards/no_repetition_reward_func": -0.2639380991458893, "rewards/verse_reward_func": 0.0, "step": 1367 }, { "completion_length": 237.015625, "epoch": 10.943999999999999, "grad_norm": 2.1875, "kl": 4.729701042175293, "learning_rate": 4.7878215605852336e-05, "loss": 0.1892, "reward": 2.243142306804657, "reward_std": 3.019248127937317, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5458379983901978, "rewards/no_repetition_reward_func": -0.26363329589366913, "rewards/verse_reward_func": -0.0390625, "step": 1368 }, { "completion_length": 255.5, "epoch": 10.952, "grad_norm": 1.6875, "kl": 3.1063482761383057, "learning_rate": 4.787258283445962e-05, "loss": 0.1243, "reward": 2.601607918739319, "reward_std": 3.014058470726013, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.888041377067566, "rewards/no_repetition_reward_func": -0.28643345832824707, "rewards/verse_reward_func": 0.0, "step": 1369 }, { "completion_length": 252.671875, "epoch": 10.96, "grad_norm": 2.8125, "kl": 3.1931101083755493, "learning_rate": 4.7866942928462625e-05, "loss": 0.1277, "reward": 2.34971284866333, "reward_std": 2.9500157833099365, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6060460805892944, "rewards/no_repetition_reward_func": -0.25633344054222107, "rewards/verse_reward_func": 0.0, "step": 1370 }, { "completion_length": 243.046875, "epoch": 10.968, "grad_norm": 3.21875, "kl": 3.935868501663208, "learning_rate": 4.786129588962061e-05, "loss": 0.1574, "reward": 2.009409546852112, "reward_std": 2.3048598170280457, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2671884894371033, "rewards/no_repetition_reward_func": -0.25777900218963623, "rewards/verse_reward_func": 0.0, "step": 1371 }, { "completion_length": 250.0625, "epoch": 10.975999999999999, "grad_norm": 1.8984375, "kl": 3.73074209690094, "learning_rate": 4.7855641719695023e-05, "loss": 0.1492, "reward": 2.522826313972473, "reward_std": 2.825343132019043, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8091877698898315, "rewards/no_repetition_reward_func": -0.286361500620842, "rewards/verse_reward_func": 0.0, "step": 1372 }, { "completion_length": 245.375, "epoch": 10.984, "grad_norm": 3.21875, "kl": 4.049211621284485, "learning_rate": 4.7849980420449594e-05, "loss": 0.162, "reward": 1.6349931359291077, "reward_std": 2.356738030910492, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9187715649604797, "rewards/no_repetition_reward_func": -0.2837785482406616, "rewards/verse_reward_func": 0.0, "step": 1373 }, { "completion_length": 251.375, "epoch": 10.992, "grad_norm": 2.96875, "kl": 4.388161897659302, "learning_rate": 4.7844311993650205e-05, "loss": 0.1755, "reward": 2.358746886253357, "reward_std": 3.011545181274414, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.668431520462036, "rewards/no_repetition_reward_func": -0.29405947029590607, "rewards/verse_reward_func": -0.015625, "step": 1374 }, { "completion_length": 249.3125, "epoch": 11.0, "grad_norm": 4.71875, "kl": 3.0861343145370483, "learning_rate": 4.783863644106502e-05, "loss": 0.1234, "reward": 3.040076494216919, "reward_std": 3.187499523162842, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.34237003326416, "rewards/no_repetition_reward_func": -0.30229364335536957, "rewards/verse_reward_func": 0.0, "step": 1375 }, { "completion_length": 255.296875, "epoch": 11.008, "grad_norm": 25.5, "kl": 4.307069778442383, "learning_rate": 4.7832953764464405e-05, "loss": 0.1723, "reward": 2.437124490737915, "reward_std": 2.7058725357055664, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7550963163375854, "rewards/no_repetition_reward_func": -0.30234667658805847, "rewards/verse_reward_func": -0.015625, "step": 1376 }, { "completion_length": 255.0625, "epoch": 11.016, "grad_norm": 3.0, "kl": 4.378148794174194, "learning_rate": 4.782726396562094e-05, "loss": 0.1751, "reward": 2.080992102622986, "reward_std": 2.871680736541748, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.354092836380005, "rewards/no_repetition_reward_func": -0.26528816670179367, "rewards/verse_reward_func": -0.0078125, "step": 1377 }, { "completion_length": 249.359375, "epoch": 11.024, "grad_norm": 3.265625, "kl": 2.808307647705078, "learning_rate": 4.782156704630944e-05, "loss": 0.1123, "reward": 2.374382972717285, "reward_std": 2.756473422050476, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.627641201019287, "rewards/no_repetition_reward_func": -0.24544572830200195, "rewards/verse_reward_func": -0.0078125, "step": 1378 }, { "completion_length": 251.5, "epoch": 11.032, "grad_norm": 1.984375, "kl": 3.8494696617126465, "learning_rate": 4.781586300830693e-05, "loss": 0.154, "reward": 1.7430866956710815, "reward_std": 2.7611011266708374, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.993477463722229, "rewards/no_repetition_reward_func": -0.2503907233476639, "rewards/verse_reward_func": 0.0, "step": 1379 }, { "completion_length": 254.015625, "epoch": 11.04, "grad_norm": 1.3515625, "kl": 3.8769129514694214, "learning_rate": 4.781015185339266e-05, "loss": 0.1551, "reward": 2.084280252456665, "reward_std": 2.90617835521698, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.371787190437317, "rewards/no_repetition_reward_func": -0.2875068783760071, "rewards/verse_reward_func": 0.0, "step": 1380 }, { "completion_length": 249.59375, "epoch": 11.048, "grad_norm": 4.96875, "kl": 4.078204154968262, "learning_rate": 4.78044335833481e-05, "loss": 0.1631, "reward": 1.2766412496566772, "reward_std": 2.4115830659866333, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5353778004646301, "rewards/no_repetition_reward_func": -0.2509239464998245, "rewards/verse_reward_func": -0.0078125, "step": 1381 }, { "completion_length": 252.421875, "epoch": 11.056, "grad_norm": 3.78125, "kl": 4.313994407653809, "learning_rate": 4.779870819995694e-05, "loss": 0.1726, "reward": 1.3398266434669495, "reward_std": 2.303655743598938, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5862170457839966, "rewards/no_repetition_reward_func": -0.23857785761356354, "rewards/verse_reward_func": -0.0078125, "step": 1382 }, { "completion_length": 251.703125, "epoch": 11.064, "grad_norm": 1.6640625, "kl": 3.4602925777435303, "learning_rate": 4.779297570500509e-05, "loss": 0.1384, "reward": 2.196622848510742, "reward_std": 2.7477359771728516, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.473855137825012, "rewards/no_repetition_reward_func": -0.2694196552038193, "rewards/verse_reward_func": -0.0078125, "step": 1383 }, { "completion_length": 249.34375, "epoch": 11.072, "grad_norm": 3.40625, "kl": 2.9055079221725464, "learning_rate": 4.7787236100280685e-05, "loss": 0.1162, "reward": 3.3426032066345215, "reward_std": 2.8505979776382446, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.650767207145691, "rewards/no_repetition_reward_func": -0.30035144090652466, "rewards/verse_reward_func": -0.0078125, "step": 1384 }, { "completion_length": 242.953125, "epoch": 11.08, "grad_norm": 4.34375, "kl": 1.9767450094223022, "learning_rate": 4.778148938757406e-05, "loss": 0.0791, "reward": 2.7057905197143555, "reward_std": 2.689351201057434, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9730775356292725, "rewards/no_repetition_reward_func": -0.26728709042072296, "rewards/verse_reward_func": 0.0, "step": 1385 }, { "completion_length": 254.046875, "epoch": 11.088, "grad_norm": 2.859375, "kl": 2.7234426736831665, "learning_rate": 4.7775735568677775e-05, "loss": 0.1089, "reward": 2.984671115875244, "reward_std": 2.973259687423706, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.296342134475708, "rewards/no_repetition_reward_func": -0.31167101860046387, "rewards/verse_reward_func": 0.0, "step": 1386 }, { "completion_length": 249.75, "epoch": 11.096, "grad_norm": 3.6875, "kl": 2.8489136695861816, "learning_rate": 4.776997464538662e-05, "loss": 0.114, "reward": 2.5997816920280457, "reward_std": 2.953573226928711, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8939223289489746, "rewards/no_repetition_reward_func": -0.2941405922174454, "rewards/verse_reward_func": 0.0, "step": 1387 }, { "completion_length": 244.96875, "epoch": 11.104, "grad_norm": 2.53125, "kl": 3.950275421142578, "learning_rate": 4.776420661949758e-05, "loss": 0.158, "reward": 1.5561586618423462, "reward_std": 2.743949055671692, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8560425639152527, "rewards/no_repetition_reward_func": -0.28425896167755127, "rewards/verse_reward_func": -0.015625, "step": 1388 }, { "completion_length": 243.453125, "epoch": 11.112, "grad_norm": 2.265625, "kl": 3.1693522930145264, "learning_rate": 4.775843149280986e-05, "loss": 0.1268, "reward": 2.504851818084717, "reward_std": 3.0548813343048096, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.757436752319336, "rewards/no_repetition_reward_func": -0.24477262794971466, "rewards/verse_reward_func": -0.0078125, "step": 1389 }, { "completion_length": 245.171875, "epoch": 11.12, "grad_norm": 2.078125, "kl": 3.6431931257247925, "learning_rate": 4.775264926712489e-05, "loss": 0.1457, "reward": 2.42355477809906, "reward_std": 2.9490623474121094, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7608059644699097, "rewards/no_repetition_reward_func": -0.3216260224580765, "rewards/verse_reward_func": -0.015625, "step": 1390 }, { "completion_length": 247.28125, "epoch": 11.128, "grad_norm": 3.765625, "kl": 4.09588623046875, "learning_rate": 4.7746859944246325e-05, "loss": 0.1638, "reward": 2.302524209022522, "reward_std": 2.5742255449295044, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.561556577682495, "rewards/no_repetition_reward_func": -0.2590325400233269, "rewards/verse_reward_func": 0.0, "step": 1391 }, { "completion_length": 246.0625, "epoch": 11.136, "grad_norm": 1.3984375, "kl": 4.4651665687561035, "learning_rate": 4.7741063525980004e-05, "loss": 0.1786, "reward": 1.967834711074829, "reward_std": 2.8207294940948486, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.232142686843872, "rewards/no_repetition_reward_func": -0.2564954087138176, "rewards/verse_reward_func": -0.0078125, "step": 1392 }, { "completion_length": 251.046875, "epoch": 11.144, "grad_norm": 2.34375, "kl": 3.6994175910949707, "learning_rate": 4.7735260014133986e-05, "loss": 0.148, "reward": 2.5667773485183716, "reward_std": 2.841654062271118, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.83006489276886, "rewards/no_repetition_reward_func": -0.2632875144481659, "rewards/verse_reward_func": 0.0, "step": 1393 }, { "completion_length": 248.0625, "epoch": 11.152, "grad_norm": 3.8125, "kl": 4.758953332901001, "learning_rate": 4.772944941051856e-05, "loss": 0.1904, "reward": 1.8080269694328308, "reward_std": 2.5220839977264404, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.080416262149811, "rewards/no_repetition_reward_func": -0.2723892629146576, "rewards/verse_reward_func": 0.0, "step": 1394 }, { "completion_length": 253.328125, "epoch": 11.16, "grad_norm": 2.265625, "kl": 3.136154890060425, "learning_rate": 4.772363171694622e-05, "loss": 0.1254, "reward": 2.526859760284424, "reward_std": 2.8118081092834473, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8529539108276367, "rewards/no_repetition_reward_func": -0.3260943293571472, "rewards/verse_reward_func": 0.0, "step": 1395 }, { "completion_length": 253.6875, "epoch": 11.168, "grad_norm": 1.8515625, "kl": 3.865527868270874, "learning_rate": 4.7717806935231665e-05, "loss": 0.1546, "reward": 2.9715360403060913, "reward_std": 3.0615196228027344, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2466506958007812, "rewards/no_repetition_reward_func": -0.2673020511865616, "rewards/verse_reward_func": -0.0078125, "step": 1396 }, { "completion_length": 254.09375, "epoch": 11.176, "grad_norm": 3.671875, "kl": 5.37794303894043, "learning_rate": 4.771197506719181e-05, "loss": 0.2151, "reward": 1.776896059513092, "reward_std": 2.731958031654358, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0522356033325195, "rewards/no_repetition_reward_func": -0.2675269842147827, "rewards/verse_reward_func": -0.0078125, "step": 1397 }, { "completion_length": 254.765625, "epoch": 11.184, "grad_norm": 4.75, "kl": 5.2697601318359375, "learning_rate": 4.770613611464577e-05, "loss": 0.2108, "reward": 1.9551005363464355, "reward_std": 2.518898844718933, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.244024693965912, "rewards/no_repetition_reward_func": -0.28111155331134796, "rewards/verse_reward_func": -0.0078125, "step": 1398 }, { "completion_length": 254.90625, "epoch": 11.192, "grad_norm": 3.328125, "kl": 3.608241081237793, "learning_rate": 4.7700290079414896e-05, "loss": 0.1443, "reward": 2.1032938361167908, "reward_std": 2.77923321723938, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.416294574737549, "rewards/no_repetition_reward_func": -0.28956303000450134, "rewards/verse_reward_func": -0.0234375, "step": 1399 }, { "completion_length": 236.15625, "epoch": 11.2, "grad_norm": 2.171875, "kl": 2.863968014717102, "learning_rate": 4.769443696332272e-05, "loss": 0.1146, "reward": 2.120300531387329, "reward_std": 2.7195208072662354, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.414489269256592, "rewards/no_repetition_reward_func": -0.27856360375881195, "rewards/verse_reward_func": -0.015625, "step": 1400 }, { "completion_length": 239.671875, "epoch": 11.208, "grad_norm": 3.09375, "kl": 2.7302470207214355, "learning_rate": 4.7688576768194994e-05, "loss": 0.1092, "reward": 2.5751630067825317, "reward_std": 2.7583353519439697, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8709888458251953, "rewards/no_repetition_reward_func": -0.2802008390426636, "rewards/verse_reward_func": -0.015625, "step": 1401 }, { "completion_length": 253.1875, "epoch": 11.216, "grad_norm": 1.8046875, "kl": 2.9687591791152954, "learning_rate": 4.768270949585968e-05, "loss": 0.1188, "reward": 1.7087826132774353, "reward_std": 2.379034996032715, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9591907262802124, "rewards/no_repetition_reward_func": -0.25040819495916367, "rewards/verse_reward_func": 0.0, "step": 1402 }, { "completion_length": 246.21875, "epoch": 11.224, "grad_norm": 2.203125, "kl": 2.5603320598602295, "learning_rate": 4.767683514814696e-05, "loss": 0.1024, "reward": 2.4878557920455933, "reward_std": 2.6575539112091064, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7600079774856567, "rewards/no_repetition_reward_func": -0.272152379155159, "rewards/verse_reward_func": 0.0, "step": 1403 }, { "completion_length": 254.578125, "epoch": 11.232, "grad_norm": 3.15625, "kl": 3.983857274055481, "learning_rate": 4.767095372688918e-05, "loss": 0.1594, "reward": 1.3946443796157837, "reward_std": 2.322476029396057, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.630854070186615, "rewards/no_repetition_reward_func": -0.23620973527431488, "rewards/verse_reward_func": 0.0, "step": 1404 }, { "completion_length": 252.578125, "epoch": 11.24, "grad_norm": 3.421875, "kl": 2.691891670227051, "learning_rate": 4.7665065233920945e-05, "loss": 0.1077, "reward": 2.5280121564865112, "reward_std": 3.0364010334014893, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.803342580795288, "rewards/no_repetition_reward_func": -0.26751798391342163, "rewards/verse_reward_func": -0.0078125, "step": 1405 }, { "completion_length": 245.40625, "epoch": 11.248, "grad_norm": 1.578125, "kl": 3.8366944789886475, "learning_rate": 4.765916967107903e-05, "loss": 0.1535, "reward": 1.764283001422882, "reward_std": 2.5486903190612793, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0664801597595215, "rewards/no_repetition_reward_func": -0.28657183796167374, "rewards/verse_reward_func": -0.015625, "step": 1406 }, { "completion_length": 237.703125, "epoch": 11.256, "grad_norm": 3.71875, "kl": 3.436556577682495, "learning_rate": 4.7653267040202436e-05, "loss": 0.1375, "reward": 1.6404356956481934, "reward_std": 2.545310616493225, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9019222259521484, "rewards/no_repetition_reward_func": -0.25367411971092224, "rewards/verse_reward_func": -0.0078125, "step": 1407 }, { "completion_length": 252.390625, "epoch": 11.264, "grad_norm": 3.0, "kl": 2.489121913909912, "learning_rate": 4.764735734313236e-05, "loss": 0.0996, "reward": 2.748911738395691, "reward_std": 3.1270227432250977, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.101662755012512, "rewards/no_repetition_reward_func": -0.3449384570121765, "rewards/verse_reward_func": -0.0078125, "step": 1408 }, { "completion_length": 251.21875, "epoch": 11.272, "grad_norm": 2.34375, "kl": 2.6183162927627563, "learning_rate": 4.764144058171219e-05, "loss": 0.1047, "reward": 2.469453811645508, "reward_std": 2.8280112743377686, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7415764331817627, "rewards/no_repetition_reward_func": -0.2721227556467056, "rewards/verse_reward_func": 0.0, "step": 1409 }, { "completion_length": 240.140625, "epoch": 11.28, "grad_norm": 1.59375, "kl": 3.475897431373596, "learning_rate": 4.763551675778755e-05, "loss": 0.139, "reward": 2.084083080291748, "reward_std": 2.745254397392273, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3762781620025635, "rewards/no_repetition_reward_func": -0.2843824625015259, "rewards/verse_reward_func": -0.0078125, "step": 1410 }, { "completion_length": 256.0, "epoch": 11.288, "grad_norm": 6.53125, "kl": 5.127103805541992, "learning_rate": 4.7629585873206226e-05, "loss": 0.2051, "reward": 1.8347488641738892, "reward_std": 2.2697455883026123, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.075053095817566, "rewards/no_repetition_reward_func": -0.24030432850122452, "rewards/verse_reward_func": 0.0, "step": 1411 }, { "completion_length": 246.78125, "epoch": 11.296, "grad_norm": 3.84375, "kl": 4.388891935348511, "learning_rate": 4.762364792981825e-05, "loss": 0.1756, "reward": 1.965996503829956, "reward_std": 2.6018396615982056, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.251864790916443, "rewards/no_repetition_reward_func": -0.278055801987648, "rewards/verse_reward_func": -0.0078125, "step": 1412 }, { "completion_length": 250.796875, "epoch": 11.304, "grad_norm": 2.25, "kl": 3.8081759214401245, "learning_rate": 4.761770292947582e-05, "loss": 0.1523, "reward": 2.099653720855713, "reward_std": 2.568287968635559, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.338091492652893, "rewards/no_repetition_reward_func": -0.23843776434659958, "rewards/verse_reward_func": 0.0, "step": 1413 }, { "completion_length": 253.1875, "epoch": 11.312, "grad_norm": 1.9765625, "kl": 3.3483256101608276, "learning_rate": 4.7611750874033356e-05, "loss": 0.1339, "reward": 2.257361054420471, "reward_std": 2.980707049369812, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5394067764282227, "rewards/no_repetition_reward_func": -0.26642075181007385, "rewards/verse_reward_func": -0.015625, "step": 1414 }, { "completion_length": 248.84375, "epoch": 11.32, "grad_norm": 1.8125, "kl": 2.918802857398987, "learning_rate": 4.760579176534747e-05, "loss": 0.1168, "reward": 2.1673821210861206, "reward_std": 2.8615787029266357, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.43472957611084, "rewards/no_repetition_reward_func": -0.267347514629364, "rewards/verse_reward_func": 0.0, "step": 1415 }, { "completion_length": 256.0, "epoch": 11.328, "grad_norm": 4.625, "kl": 2.7937204837799072, "learning_rate": 4.759982560527698e-05, "loss": 0.1117, "reward": 2.5698012709617615, "reward_std": 3.095607042312622, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8971214294433594, "rewards/no_repetition_reward_func": -0.3195077031850815, "rewards/verse_reward_func": -0.0078125, "step": 1416 }, { "completion_length": 253.890625, "epoch": 11.336, "grad_norm": 3.21875, "kl": 3.9287558794021606, "learning_rate": 4.759385239568289e-05, "loss": 0.1572, "reward": 1.6594293117523193, "reward_std": 2.6796395778656006, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9442824125289917, "rewards/no_repetition_reward_func": -0.2848529666662216, "rewards/verse_reward_func": 0.0, "step": 1417 }, { "completion_length": 256.0, "epoch": 11.344, "grad_norm": 1.9140625, "kl": 3.484013557434082, "learning_rate": 4.758787213842842e-05, "loss": 0.1394, "reward": 1.8962982296943665, "reward_std": 2.707392930984497, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1409127712249756, "rewards/no_repetition_reward_func": -0.2446145862340927, "rewards/verse_reward_func": 0.0, "step": 1418 }, { "completion_length": 243.453125, "epoch": 11.352, "grad_norm": 3.640625, "kl": 3.262161612510681, "learning_rate": 4.758188483537898e-05, "loss": 0.1305, "reward": 1.8344472646713257, "reward_std": 2.707538604736328, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1399129033088684, "rewards/no_repetition_reward_func": -0.2898406684398651, "rewards/verse_reward_func": -0.015625, "step": 1419 }, { "completion_length": 242.71875, "epoch": 11.36, "grad_norm": 3.71875, "kl": 3.8848646879196167, "learning_rate": 4.7575890488402185e-05, "loss": 0.1554, "reward": 1.1943838596343994, "reward_std": 1.975242257118225, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4253132939338684, "rewards/no_repetition_reward_func": -0.23092950135469437, "rewards/verse_reward_func": 0.0, "step": 1420 }, { "completion_length": 254.6875, "epoch": 11.368, "grad_norm": 2.65625, "kl": 3.0254708528518677, "learning_rate": 4.7569889099367824e-05, "loss": 0.121, "reward": 2.1846804022789, "reward_std": 2.8844258785247803, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4930742979049683, "rewards/no_repetition_reward_func": -0.29276901483535767, "rewards/verse_reward_func": -0.015625, "step": 1421 }, { "completion_length": 255.75, "epoch": 11.376, "grad_norm": 3.046875, "kl": 4.046651601791382, "learning_rate": 4.756388067014792e-05, "loss": 0.1619, "reward": 2.790343761444092, "reward_std": 3.366278648376465, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1017489433288574, "rewards/no_repetition_reward_func": -0.31140534579753876, "rewards/verse_reward_func": 0.0, "step": 1422 }, { "completion_length": 244.65625, "epoch": 11.384, "grad_norm": 1.84375, "kl": 3.595371723175049, "learning_rate": 4.7557865202616656e-05, "loss": 0.1438, "reward": 1.9659053683280945, "reward_std": 2.5923471450805664, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2749109268188477, "rewards/no_repetition_reward_func": -0.293380469083786, "rewards/verse_reward_func": -0.015625, "step": 1423 }, { "completion_length": 250.125, "epoch": 11.392, "grad_norm": 1.8203125, "kl": 5.0546863079071045, "learning_rate": 4.7551842698650436e-05, "loss": 0.2022, "reward": 1.9339213371276855, "reward_std": 2.72743821144104, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2004005908966064, "rewards/no_repetition_reward_func": -0.2586667239665985, "rewards/verse_reward_func": -0.0078125, "step": 1424 }, { "completion_length": 256.0, "epoch": 11.4, "grad_norm": 2.671875, "kl": 4.41279935836792, "learning_rate": 4.754581316012785e-05, "loss": 0.1765, "reward": 1.6381369829177856, "reward_std": 2.4656646251678467, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9113428592681885, "rewards/no_repetition_reward_func": -0.26539337635040283, "rewards/verse_reward_func": -0.0078125, "step": 1425 }, { "completion_length": 246.90625, "epoch": 11.408, "grad_norm": 4.0625, "kl": 2.9418437480926514, "learning_rate": 4.753977658892967e-05, "loss": 0.1177, "reward": 2.5557929277420044, "reward_std": 3.0544365644454956, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.844936966896057, "rewards/no_repetition_reward_func": -0.28914394974708557, "rewards/verse_reward_func": 0.0, "step": 1426 }, { "completion_length": 244.21875, "epoch": 11.416, "grad_norm": 2.84375, "kl": 3.2445383071899414, "learning_rate": 4.753373298693888e-05, "loss": 0.1298, "reward": 2.461775541305542, "reward_std": 2.9131386280059814, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.73680579662323, "rewards/no_repetition_reward_func": -0.27503031492233276, "rewards/verse_reward_func": 0.0, "step": 1427 }, { "completion_length": 250.390625, "epoch": 11.424, "grad_norm": 2.03125, "kl": 2.9455360174179077, "learning_rate": 4.752768235604065e-05, "loss": 0.1178, "reward": 2.060371160507202, "reward_std": 2.514532446861267, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3487476110458374, "rewards/no_repetition_reward_func": -0.2805640548467636, "rewards/verse_reward_func": -0.0078125, "step": 1428 }, { "completion_length": 247.5, "epoch": 11.432, "grad_norm": 2.328125, "kl": 2.7265427112579346, "learning_rate": 4.752162469812234e-05, "loss": 0.1091, "reward": 2.129593014717102, "reward_std": 3.0131359100341797, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4244539737701416, "rewards/no_repetition_reward_func": -0.28704849630594254, "rewards/verse_reward_func": -0.0078125, "step": 1429 }, { "completion_length": 249.078125, "epoch": 11.44, "grad_norm": 1.6796875, "kl": 2.683492660522461, "learning_rate": 4.7515560015073514e-05, "loss": 0.1073, "reward": 2.9622665643692017, "reward_std": 2.8745810985565186, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.307505488395691, "rewards/no_repetition_reward_func": -0.3296140730381012, "rewards/verse_reward_func": -0.015625, "step": 1430 }, { "completion_length": 248.671875, "epoch": 11.448, "grad_norm": 3.609375, "kl": 2.4995776414871216, "learning_rate": 4.7509488308785905e-05, "loss": 0.1, "reward": 3.1986334323883057, "reward_std": 2.769779324531555, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.48834764957428, "rewards/no_repetition_reward_func": -0.289714053273201, "rewards/verse_reward_func": 0.0, "step": 1431 }, { "completion_length": 245.78125, "epoch": 11.456, "grad_norm": 2.953125, "kl": 2.62058687210083, "learning_rate": 4.750340958115346e-05, "loss": 0.1048, "reward": 2.709551215171814, "reward_std": 2.928840160369873, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.966099977493286, "rewards/no_repetition_reward_func": -0.24873638153076172, "rewards/verse_reward_func": -0.0078125, "step": 1432 }, { "completion_length": 252.515625, "epoch": 11.464, "grad_norm": 1.5703125, "kl": 3.808983087539673, "learning_rate": 4.749732383407229e-05, "loss": 0.1524, "reward": 2.0816768407821655, "reward_std": 2.796705722808838, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4023276567459106, "rewards/no_repetition_reward_func": -0.30502578616142273, "rewards/verse_reward_func": -0.015625, "step": 1433 }, { "completion_length": 245.203125, "epoch": 11.472, "grad_norm": 3.421875, "kl": 4.051374554634094, "learning_rate": 4.749123106944073e-05, "loss": 0.1621, "reward": 1.873068392276764, "reward_std": 2.645816683769226, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.142529606819153, "rewards/no_repetition_reward_func": -0.26164858043193817, "rewards/verse_reward_func": -0.0078125, "step": 1434 }, { "completion_length": 254.859375, "epoch": 11.48, "grad_norm": 1.640625, "kl": 3.5840072631835938, "learning_rate": 4.7485131289159276e-05, "loss": 0.1434, "reward": 2.4048542976379395, "reward_std": 2.9355480670928955, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6530213356018066, "rewards/no_repetition_reward_func": -0.24816709756851196, "rewards/verse_reward_func": 0.0, "step": 1435 }, { "completion_length": 251.09375, "epoch": 11.488, "grad_norm": 2.171875, "kl": 4.274404168128967, "learning_rate": 4.747902449513063e-05, "loss": 0.171, "reward": 2.2169395685195923, "reward_std": 2.856274366378784, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.521633744239807, "rewards/no_repetition_reward_func": -0.2812568247318268, "rewards/verse_reward_func": -0.0234375, "step": 1436 }, { "completion_length": 253.5625, "epoch": 11.496, "grad_norm": 2.71875, "kl": 4.3026204109191895, "learning_rate": 4.7472910689259655e-05, "loss": 0.1721, "reward": 2.3636155128479004, "reward_std": 3.1536020040512085, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.671193242073059, "rewards/no_repetition_reward_func": -0.29195283353328705, "rewards/verse_reward_func": -0.015625, "step": 1437 }, { "completion_length": 250.75, "epoch": 11.504, "grad_norm": 3.171875, "kl": 3.0357565879821777, "learning_rate": 4.7466789873453444e-05, "loss": 0.1214, "reward": 2.573945999145508, "reward_std": 2.987426280975342, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.881423234939575, "rewards/no_repetition_reward_func": -0.3074771463871002, "rewards/verse_reward_func": 0.0, "step": 1438 }, { "completion_length": 255.8125, "epoch": 11.512, "grad_norm": 3.546875, "kl": 4.161548137664795, "learning_rate": 4.746066204962123e-05, "loss": 0.1665, "reward": 2.491811752319336, "reward_std": 3.1064592599868774, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7919405698776245, "rewards/no_repetition_reward_func": -0.30012889206409454, "rewards/verse_reward_func": 0.0, "step": 1439 }, { "completion_length": 256.0, "epoch": 11.52, "grad_norm": 2.078125, "kl": 3.439037799835205, "learning_rate": 4.745452721967446e-05, "loss": 0.1376, "reward": 2.6783100962638855, "reward_std": 2.9473276138305664, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0165529251098633, "rewards/no_repetition_reward_func": -0.3304302990436554, "rewards/verse_reward_func": -0.0078125, "step": 1440 }, { "completion_length": 248.328125, "epoch": 11.528, "grad_norm": 1.4453125, "kl": 3.5773948431015015, "learning_rate": 4.744838538552677e-05, "loss": 0.1431, "reward": 2.4403775930404663, "reward_std": 2.83901047706604, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7260469794273376, "rewards/no_repetition_reward_func": -0.27785682678222656, "rewards/verse_reward_func": -0.0078125, "step": 1441 }, { "completion_length": 243.765625, "epoch": 11.536, "grad_norm": 3.296875, "kl": 2.6763546466827393, "learning_rate": 4.744223654909397e-05, "loss": 0.1071, "reward": 2.4094706177711487, "reward_std": 2.896107316017151, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7148131132125854, "rewards/no_repetition_reward_func": -0.2975301295518875, "rewards/verse_reward_func": -0.0078125, "step": 1442 }, { "completion_length": 247.890625, "epoch": 11.544, "grad_norm": 5.25, "kl": 4.440422534942627, "learning_rate": 4.743608071229405e-05, "loss": 0.1776, "reward": 1.6008878946304321, "reward_std": 2.4523125886917114, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8570128679275513, "rewards/no_repetition_reward_func": -0.24831251800060272, "rewards/verse_reward_func": -0.0078125, "step": 1443 }, { "completion_length": 250.046875, "epoch": 11.552, "grad_norm": 1.9453125, "kl": 4.1227240562438965, "learning_rate": 4.742991787704719e-05, "loss": 0.1649, "reward": 2.140823721885681, "reward_std": 2.6592756509780884, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3972493410110474, "rewards/no_repetition_reward_func": -0.24861300736665726, "rewards/verse_reward_func": -0.0078125, "step": 1444 }, { "completion_length": 249.0, "epoch": 11.56, "grad_norm": 2.265625, "kl": 3.2194279432296753, "learning_rate": 4.742374804527575e-05, "loss": 0.1288, "reward": 2.0915863513946533, "reward_std": 2.7534130811691284, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.433660864830017, "rewards/no_repetition_reward_func": -0.31863702833652496, "rewards/verse_reward_func": -0.0234375, "step": 1445 }, { "completion_length": 248.71875, "epoch": 11.568, "grad_norm": 1.609375, "kl": 3.1568996906280518, "learning_rate": 4.741757121890428e-05, "loss": 0.1263, "reward": 1.8900858759880066, "reward_std": 2.670133590698242, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.176891267299652, "rewards/no_repetition_reward_func": -0.27118048816919327, "rewards/verse_reward_func": -0.015625, "step": 1446 }, { "completion_length": 246.296875, "epoch": 11.576, "grad_norm": 5.375, "kl": 3.797896385192871, "learning_rate": 4.741138739985951e-05, "loss": 0.1519, "reward": 1.4967263340950012, "reward_std": 2.0553784370422363, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7142857313156128, "rewards/no_repetition_reward_func": -0.20974691957235336, "rewards/verse_reward_func": -0.0078125, "step": 1447 }, { "completion_length": 255.234375, "epoch": 11.584, "grad_norm": 3.296875, "kl": 2.8744736909866333, "learning_rate": 4.740519659007033e-05, "loss": 0.115, "reward": 2.4772456288337708, "reward_std": 2.9156529903411865, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.782726526260376, "rewards/no_repetition_reward_func": -0.30548095703125, "rewards/verse_reward_func": 0.0, "step": 1448 }, { "completion_length": 242.328125, "epoch": 11.592, "grad_norm": 2.609375, "kl": 4.388376235961914, "learning_rate": 4.739899879146785e-05, "loss": 0.1755, "reward": 1.7991290092468262, "reward_std": 2.8369168043136597, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.057693302631378, "rewards/no_repetition_reward_func": -0.24293921142816544, "rewards/verse_reward_func": -0.015625, "step": 1449 }, { "completion_length": 256.0, "epoch": 11.6, "grad_norm": 1.3828125, "kl": 3.56706964969635, "learning_rate": 4.7392794005985326e-05, "loss": 0.1427, "reward": 2.0104540586471558, "reward_std": 2.5407952070236206, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.255297303199768, "rewards/no_repetition_reward_func": -0.24484343826770782, "rewards/verse_reward_func": 0.0, "step": 1450 }, { "completion_length": 250.84375, "epoch": 11.608, "grad_norm": 1.5234375, "kl": 2.867061138153076, "learning_rate": 4.7386582235558205e-05, "loss": 0.1147, "reward": 2.142375946044922, "reward_std": 2.353951096534729, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4013184309005737, "rewards/no_repetition_reward_func": -0.2589426711201668, "rewards/verse_reward_func": 0.0, "step": 1451 }, { "completion_length": 250.453125, "epoch": 11.616, "grad_norm": 2.265625, "kl": 2.5575376749038696, "learning_rate": 4.738036348212412e-05, "loss": 0.1023, "reward": 2.1317203044891357, "reward_std": 2.5689467191696167, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.412233829498291, "rewards/no_repetition_reward_func": -0.2805134207010269, "rewards/verse_reward_func": 0.0, "step": 1452 }, { "completion_length": 250.21875, "epoch": 11.624, "grad_norm": 2.75, "kl": 3.5423158407211304, "learning_rate": 4.737413774762287e-05, "loss": 0.1417, "reward": 1.9124073386192322, "reward_std": 2.574357032775879, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.197325587272644, "rewards/no_repetition_reward_func": -0.2771056592464447, "rewards/verse_reward_func": -0.0078125, "step": 1453 }, { "completion_length": 238.6875, "epoch": 11.632, "grad_norm": 2.171875, "kl": 2.75149667263031, "learning_rate": 4.7367905033996445e-05, "loss": 0.1101, "reward": 2.5310314893722534, "reward_std": 3.023085355758667, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.834367275238037, "rewards/no_repetition_reward_func": -0.29552340507507324, "rewards/verse_reward_func": -0.0078125, "step": 1454 }, { "completion_length": 248.765625, "epoch": 11.64, "grad_norm": 2.015625, "kl": 3.3382132053375244, "learning_rate": 4.7361665343189e-05, "loss": 0.1335, "reward": 2.6226797103881836, "reward_std": 3.2192310094833374, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9365991353988647, "rewards/no_repetition_reward_func": -0.3061068654060364, "rewards/verse_reward_func": -0.0078125, "step": 1455 }, { "completion_length": 255.171875, "epoch": 11.648, "grad_norm": 2.171875, "kl": 4.677009582519531, "learning_rate": 4.735541867714687e-05, "loss": 0.1871, "reward": 2.178377151489258, "reward_std": 2.4362186193466187, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4687159061431885, "rewards/no_repetition_reward_func": -0.2903386950492859, "rewards/verse_reward_func": 0.0, "step": 1456 }, { "completion_length": 239.578125, "epoch": 11.656, "grad_norm": 4.78125, "kl": 5.167854428291321, "learning_rate": 4.734916503781856e-05, "loss": 0.2067, "reward": 1.5173451900482178, "reward_std": 2.21795392036438, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7731432914733887, "rewards/no_repetition_reward_func": -0.2401731088757515, "rewards/verse_reward_func": -0.015625, "step": 1457 }, { "completion_length": 252.5, "epoch": 11.664, "grad_norm": 2.171875, "kl": 4.212717533111572, "learning_rate": 4.7342904427154766e-05, "loss": 0.1685, "reward": 2.629697561264038, "reward_std": 3.0341954231262207, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.957536220550537, "rewards/no_repetition_reward_func": -0.3200264126062393, "rewards/verse_reward_func": -0.0078125, "step": 1458 }, { "completion_length": 246.21875, "epoch": 11.672, "grad_norm": 5.5, "kl": 5.2565598487854, "learning_rate": 4.733663684710835e-05, "loss": 0.2103, "reward": 1.6041860580444336, "reward_std": 2.4051380157470703, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8731110095977783, "rewards/no_repetition_reward_func": -0.2611125409603119, "rewards/verse_reward_func": -0.0078125, "step": 1459 }, { "completion_length": 255.375, "epoch": 11.68, "grad_norm": 6.34375, "kl": 5.056926488876343, "learning_rate": 4.733036229963435e-05, "loss": 0.2023, "reward": 1.9182190895080566, "reward_std": 2.62939989566803, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.155701756477356, "rewards/no_repetition_reward_func": -0.23748257756233215, "rewards/verse_reward_func": 0.0, "step": 1460 }, { "completion_length": 249.140625, "epoch": 11.688, "grad_norm": 3.296875, "kl": 3.97802197933197, "learning_rate": 4.732408078668995e-05, "loss": 0.1591, "reward": 1.8531900644302368, "reward_std": 2.632315158843994, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.139577269554138, "rewards/no_repetition_reward_func": -0.2785748243331909, "rewards/verse_reward_func": -0.0078125, "step": 1461 }, { "completion_length": 233.34375, "epoch": 11.696, "grad_norm": 1.9765625, "kl": 3.537346839904785, "learning_rate": 4.731779231023456e-05, "loss": 0.1415, "reward": 1.7380582094192505, "reward_std": 2.59045672416687, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9665826559066772, "rewards/no_repetition_reward_func": -0.22852441668510437, "rewards/verse_reward_func": 0.0, "step": 1462 }, { "completion_length": 252.625, "epoch": 11.704, "grad_norm": 2.9375, "kl": 3.5637978315353394, "learning_rate": 4.731149687222972e-05, "loss": 0.1426, "reward": 2.570344030857086, "reward_std": 3.025832772254944, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8805952072143555, "rewards/no_repetition_reward_func": -0.31025145947933197, "rewards/verse_reward_func": 0.0, "step": 1463 }, { "completion_length": 247.953125, "epoch": 11.712, "grad_norm": 1.78125, "kl": 3.3282885551452637, "learning_rate": 4.730519447463916e-05, "loss": 0.1331, "reward": 2.584825277328491, "reward_std": 3.144259810447693, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.894326329231262, "rewards/no_repetition_reward_func": -0.3016885444521904, "rewards/verse_reward_func": -0.0078125, "step": 1464 }, { "completion_length": 239.921875, "epoch": 11.72, "grad_norm": 3.203125, "kl": 3.4396421909332275, "learning_rate": 4.7298885119428773e-05, "loss": 0.1376, "reward": 1.8151575922966003, "reward_std": 2.9088103771209717, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0810914039611816, "rewards/no_repetition_reward_func": -0.25030873715877533, "rewards/verse_reward_func": -0.015625, "step": 1465 }, { "completion_length": 252.765625, "epoch": 11.728, "grad_norm": 3.265625, "kl": 3.380464196205139, "learning_rate": 4.729256880856662e-05, "loss": 0.1352, "reward": 1.8148704767227173, "reward_std": 2.627306818962097, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.053056478500366, "rewards/no_repetition_reward_func": -0.2381858378648758, "rewards/verse_reward_func": 0.0, "step": 1466 }, { "completion_length": 245.3125, "epoch": 11.736, "grad_norm": 5.09375, "kl": 2.5386969447135925, "learning_rate": 4.728624554402295e-05, "loss": 0.1015, "reward": 2.0615084171295166, "reward_std": 2.8800175189971924, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3565753698349, "rewards/no_repetition_reward_func": -0.28725457191467285, "rewards/verse_reward_func": -0.0078125, "step": 1467 }, { "completion_length": 246.609375, "epoch": 11.744, "grad_norm": 2.734375, "kl": 2.560022711753845, "learning_rate": 4.7279915327770155e-05, "loss": 0.1024, "reward": 2.365167021751404, "reward_std": 2.661555767059326, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6327301263809204, "rewards/no_repetition_reward_func": -0.26756298542022705, "rewards/verse_reward_func": 0.0, "step": 1468 }, { "completion_length": 253.15625, "epoch": 11.752, "grad_norm": 2.171875, "kl": 2.8062055110931396, "learning_rate": 4.727357816178282e-05, "loss": 0.1122, "reward": 2.0655288696289062, "reward_std": 2.631769895553589, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3513420820236206, "rewards/no_repetition_reward_func": -0.28581322729587555, "rewards/verse_reward_func": 0.0, "step": 1469 }, { "completion_length": 251.328125, "epoch": 11.76, "grad_norm": 1.53125, "kl": 3.20220685005188, "learning_rate": 4.7267234048037664e-05, "loss": 0.1281, "reward": 1.2493318915367126, "reward_std": 2.155321478843689, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.4866751432418823, "rewards/no_repetition_reward_func": -0.23734323680400848, "rewards/verse_reward_func": 0.0, "step": 1470 }, { "completion_length": 246.34375, "epoch": 11.768, "grad_norm": 2.53125, "kl": 1.8518226742744446, "learning_rate": 4.7260882988513624e-05, "loss": 0.0741, "reward": 2.8223456144332886, "reward_std": 2.9113965034484863, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1450639963150024, "rewards/no_repetition_reward_func": -0.3149058222770691, "rewards/verse_reward_func": -0.0078125, "step": 1471 }, { "completion_length": 240.65625, "epoch": 11.776, "grad_norm": 1.4296875, "kl": 3.073288917541504, "learning_rate": 4.725452498519175e-05, "loss": 0.1229, "reward": 2.0070979595184326, "reward_std": 2.739479899406433, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2677148580551147, "rewards/no_repetition_reward_func": -0.25280432403087616, "rewards/verse_reward_func": -0.0078125, "step": 1472 }, { "completion_length": 254.375, "epoch": 11.784, "grad_norm": 2.53125, "kl": 3.0057761669158936, "learning_rate": 4.7248160040055304e-05, "loss": 0.1202, "reward": 2.3748353719711304, "reward_std": 2.8028218746185303, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6534643173217773, "rewards/no_repetition_reward_func": -0.27862879633903503, "rewards/verse_reward_func": 0.0, "step": 1473 }, { "completion_length": 248.828125, "epoch": 11.792, "grad_norm": 2.65625, "kl": 2.4118518829345703, "learning_rate": 4.724178815508967e-05, "loss": 0.0965, "reward": 2.998030185699463, "reward_std": 3.0175098180770874, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2730374336242676, "rewards/no_repetition_reward_func": -0.2750071734189987, "rewards/verse_reward_func": 0.0, "step": 1474 }, { "completion_length": 256.0, "epoch": 11.8, "grad_norm": 1.5234375, "kl": 2.7532291412353516, "learning_rate": 4.723540933228244e-05, "loss": 0.1101, "reward": 2.1511335372924805, "reward_std": 2.6112412214279175, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.421731472015381, "rewards/no_repetition_reward_func": -0.2705979272723198, "rewards/verse_reward_func": 0.0, "step": 1475 }, { "completion_length": 247.75, "epoch": 11.808, "grad_norm": 2.84375, "kl": 3.950162172317505, "learning_rate": 4.722902357362333e-05, "loss": 0.158, "reward": 1.6601126790046692, "reward_std": 2.3901976346969604, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9050532579421997, "rewards/no_repetition_reward_func": -0.24494053423404694, "rewards/verse_reward_func": 0.0, "step": 1476 }, { "completion_length": 248.53125, "epoch": 11.816, "grad_norm": 1.4765625, "kl": 4.548578977584839, "learning_rate": 4.722263088110426e-05, "loss": 0.1819, "reward": 2.218106985092163, "reward_std": 2.9302589893341064, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4980812072753906, "rewards/no_repetition_reward_func": -0.27216193079948425, "rewards/verse_reward_func": -0.0078125, "step": 1477 }, { "completion_length": 252.3125, "epoch": 11.824, "grad_norm": 1.3359375, "kl": 2.5808829069137573, "learning_rate": 4.721623125671927e-05, "loss": 0.1032, "reward": 2.4177781343460083, "reward_std": 2.9052971601486206, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.705030083656311, "rewards/no_repetition_reward_func": -0.2872518002986908, "rewards/verse_reward_func": 0.0, "step": 1478 }, { "completion_length": 244.875, "epoch": 11.832, "grad_norm": 2.59375, "kl": 3.607999801635742, "learning_rate": 4.720982470246459e-05, "loss": 0.1443, "reward": 1.6557728052139282, "reward_std": 2.6774500608444214, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9314895868301392, "rewards/no_repetition_reward_func": -0.2600918561220169, "rewards/verse_reward_func": -0.015625, "step": 1479 }, { "completion_length": 254.9375, "epoch": 11.84, "grad_norm": 5.3125, "kl": 2.550771713256836, "learning_rate": 4.720341122033862e-05, "loss": 0.102, "reward": 3.4705597162246704, "reward_std": 3.2434006929397583, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.795412540435791, "rewards/no_repetition_reward_func": -0.32485270500183105, "rewards/verse_reward_func": 0.0, "step": 1480 }, { "completion_length": 250.03125, "epoch": 11.848, "grad_norm": 2.03125, "kl": 3.4045567512512207, "learning_rate": 4.719699081234188e-05, "loss": 0.1362, "reward": 2.4242258071899414, "reward_std": 3.0344295501708984, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7289189100265503, "rewards/no_repetition_reward_func": -0.30469319224357605, "rewards/verse_reward_func": 0.0, "step": 1481 }, { "completion_length": 245.71875, "epoch": 11.856, "grad_norm": 2.1875, "kl": 4.118790030479431, "learning_rate": 4.7190563480477095e-05, "loss": 0.1648, "reward": 2.2497977018356323, "reward_std": 2.609958529472351, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5114676356315613, "rewards/no_repetition_reward_func": -0.2538573741912842, "rewards/verse_reward_func": -0.0078125, "step": 1482 }, { "completion_length": 252.375, "epoch": 11.864, "grad_norm": 3.265625, "kl": 3.2672505378723145, "learning_rate": 4.718412922674913e-05, "loss": 0.1307, "reward": 3.14945387840271, "reward_std": 2.9592857360839844, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4457002878189087, "rewards/no_repetition_reward_func": -0.2962462455034256, "rewards/verse_reward_func": 0.0, "step": 1483 }, { "completion_length": 244.515625, "epoch": 11.872, "grad_norm": 2.59375, "kl": 4.47635555267334, "learning_rate": 4.717768805316501e-05, "loss": 0.1791, "reward": 1.893887996673584, "reward_std": 2.779241919517517, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.206426739692688, "rewards/no_repetition_reward_func": -0.2969137877225876, "rewards/verse_reward_func": -0.015625, "step": 1484 }, { "completion_length": 249.390625, "epoch": 11.88, "grad_norm": 2.453125, "kl": 3.6804773807525635, "learning_rate": 4.71712399617339e-05, "loss": 0.1472, "reward": 2.5270848274230957, "reward_std": 2.522564172744751, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7905627489089966, "rewards/no_repetition_reward_func": -0.25566551089286804, "rewards/verse_reward_func": -0.0078125, "step": 1485 }, { "completion_length": 252.96875, "epoch": 11.888, "grad_norm": 3.375, "kl": 4.275830507278442, "learning_rate": 4.7164784954467166e-05, "loss": 0.171, "reward": 2.24162495136261, "reward_std": 2.6823394298553467, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.508546233177185, "rewards/no_repetition_reward_func": -0.2669212520122528, "rewards/verse_reward_func": 0.0, "step": 1486 }, { "completion_length": 255.515625, "epoch": 11.896, "grad_norm": 2.390625, "kl": 3.9875149726867676, "learning_rate": 4.715832303337829e-05, "loss": 0.1595, "reward": 2.320760428905487, "reward_std": 2.3950027227401733, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.61419016122818, "rewards/no_repetition_reward_func": -0.28561727702617645, "rewards/verse_reward_func": -0.0078125, "step": 1487 }, { "completion_length": 239.5, "epoch": 11.904, "grad_norm": 2.3125, "kl": 3.048241972923279, "learning_rate": 4.715185420048295e-05, "loss": 0.1219, "reward": 2.633526563644409, "reward_std": 2.881089925765991, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9378137588500977, "rewards/no_repetition_reward_func": -0.2886620908975601, "rewards/verse_reward_func": -0.015625, "step": 1488 }, { "completion_length": 256.0, "epoch": 11.912, "grad_norm": 2.53125, "kl": 3.4861639738082886, "learning_rate": 4.714537845779894e-05, "loss": 0.1394, "reward": 2.6307790279388428, "reward_std": 2.538844108581543, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9134130477905273, "rewards/no_repetition_reward_func": -0.28263382613658905, "rewards/verse_reward_func": 0.0, "step": 1489 }, { "completion_length": 252.28125, "epoch": 11.92, "grad_norm": 1.875, "kl": 3.485668659210205, "learning_rate": 4.713889580734623e-05, "loss": 0.1394, "reward": 2.919156789779663, "reward_std": 3.058621883392334, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.218210458755493, "rewards/no_repetition_reward_func": -0.2990535944700241, "rewards/verse_reward_func": 0.0, "step": 1490 }, { "completion_length": 239.59375, "epoch": 11.928, "grad_norm": 3.640625, "kl": 4.110300183296204, "learning_rate": 4.7132406251146935e-05, "loss": 0.1644, "reward": 1.5188137292861938, "reward_std": 2.3752867579460144, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7585408687591553, "rewards/no_repetition_reward_func": -0.23972709476947784, "rewards/verse_reward_func": 0.0, "step": 1491 }, { "completion_length": 247.875, "epoch": 11.936, "grad_norm": 2.375, "kl": 3.7817633152008057, "learning_rate": 4.712590979122534e-05, "loss": 0.1513, "reward": 1.926757276058197, "reward_std": 2.2533448934555054, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2009244561195374, "rewards/no_repetition_reward_func": -0.2663547992706299, "rewards/verse_reward_func": -0.0078125, "step": 1492 }, { "completion_length": 248.5, "epoch": 11.943999999999999, "grad_norm": 3.046875, "kl": 3.503788471221924, "learning_rate": 4.7119406429607885e-05, "loss": 0.1402, "reward": 2.244926929473877, "reward_std": 2.3382813930511475, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5200005769729614, "rewards/no_repetition_reward_func": -0.2750736251473427, "rewards/verse_reward_func": 0.0, "step": 1493 }, { "completion_length": 241.953125, "epoch": 11.952, "grad_norm": 2.765625, "kl": 3.3204089403152466, "learning_rate": 4.711289616832312e-05, "loss": 0.1328, "reward": 2.43573796749115, "reward_std": 2.9273163080215454, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7219150066375732, "rewards/no_repetition_reward_func": -0.2705521881580353, "rewards/verse_reward_func": -0.015625, "step": 1494 }, { "completion_length": 255.03125, "epoch": 11.96, "grad_norm": 1.9453125, "kl": 3.390212655067444, "learning_rate": 4.710637900940181e-05, "loss": 0.1356, "reward": 1.8860268592834473, "reward_std": 2.641677737236023, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2257564663887024, "rewards/no_repetition_reward_func": -0.32410475611686707, "rewards/verse_reward_func": -0.015625, "step": 1495 }, { "completion_length": 252.9375, "epoch": 11.968, "grad_norm": 3.890625, "kl": 2.691067337989807, "learning_rate": 4.709985495487682e-05, "loss": 0.1076, "reward": 2.0333617329597473, "reward_std": 2.81036376953125, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3245980739593506, "rewards/no_repetition_reward_func": -0.29123610258102417, "rewards/verse_reward_func": 0.0, "step": 1496 }, { "completion_length": 248.703125, "epoch": 11.975999999999999, "grad_norm": 3.234375, "kl": 2.5306073427200317, "learning_rate": 4.7093324006783214e-05, "loss": 0.1012, "reward": 2.4309087991714478, "reward_std": 2.569080352783203, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7114627361297607, "rewards/no_repetition_reward_func": -0.2727413848042488, "rewards/verse_reward_func": -0.0078125, "step": 1497 }, { "completion_length": 250.71875, "epoch": 11.984, "grad_norm": 2.890625, "kl": 3.3384557962417603, "learning_rate": 4.708678616715815e-05, "loss": 0.1335, "reward": 2.3784173727035522, "reward_std": 2.7437543869018555, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6343114376068115, "rewards/no_repetition_reward_func": -0.25589410960674286, "rewards/verse_reward_func": 0.0, "step": 1498 }, { "completion_length": 250.234375, "epoch": 11.992, "grad_norm": 1.8125, "kl": 3.3198487758636475, "learning_rate": 4.708024143804097e-05, "loss": 0.1328, "reward": 1.8058490753173828, "reward_std": 2.9438871145248413, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.082804501056671, "rewards/no_repetition_reward_func": -0.26133041083812714, "rewards/verse_reward_func": -0.015625, "step": 1499 }, { "completion_length": 256.0, "epoch": 12.0, "grad_norm": 2.078125, "kl": 3.2968926429748535, "learning_rate": 4.707368982147318e-05, "loss": 0.1319, "reward": 2.010868728160858, "reward_std": 2.8088139295578003, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.334894895553589, "rewards/no_repetition_reward_func": -0.3084012120962143, "rewards/verse_reward_func": -0.015625, "step": 1500 }, { "completion_length": 249.984375, "epoch": 12.008, "grad_norm": 3.25, "kl": 4.634685635566711, "learning_rate": 4.706713131949839e-05, "loss": 0.1854, "reward": 2.12497079372406, "reward_std": 2.7270032167434692, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4252110719680786, "rewards/no_repetition_reward_func": -0.29242755472660065, "rewards/verse_reward_func": -0.0078125, "step": 1501 }, { "completion_length": 255.859375, "epoch": 12.016, "grad_norm": 2.5625, "kl": 3.7367050647735596, "learning_rate": 4.7060565934162394e-05, "loss": 0.1495, "reward": 2.3373706340789795, "reward_std": 2.9707438945770264, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6569210290908813, "rewards/no_repetition_reward_func": -0.31173792481422424, "rewards/verse_reward_func": -0.0078125, "step": 1502 }, { "completion_length": 254.0625, "epoch": 12.024, "grad_norm": 2.734375, "kl": 4.806992769241333, "learning_rate": 4.705399366751312e-05, "loss": 0.1923, "reward": 2.2560269832611084, "reward_std": 3.0969432592391968, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5315204858779907, "rewards/no_repetition_reward_func": -0.2754933536052704, "rewards/verse_reward_func": 0.0, "step": 1503 }, { "completion_length": 246.921875, "epoch": 12.032, "grad_norm": 2.484375, "kl": 3.9626848697662354, "learning_rate": 4.7047414521600644e-05, "loss": 0.1585, "reward": 3.312117338180542, "reward_std": 2.7439775466918945, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.6071455478668213, "rewards/no_repetition_reward_func": -0.2794031649827957, "rewards/verse_reward_func": -0.015625, "step": 1504 }, { "completion_length": 245.1875, "epoch": 12.04, "grad_norm": 2.03125, "kl": 3.8578898906707764, "learning_rate": 4.704082849847718e-05, "loss": 0.1543, "reward": 2.14883953332901, "reward_std": 2.8592716455459595, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.403266429901123, "rewards/no_repetition_reward_func": -0.25442682206630707, "rewards/verse_reward_func": 0.0, "step": 1505 }, { "completion_length": 254.65625, "epoch": 12.048, "grad_norm": 4.625, "kl": 5.848163604736328, "learning_rate": 4.70342356001971e-05, "loss": 0.2339, "reward": 1.9316662549972534, "reward_std": 2.668682098388672, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1941776275634766, "rewards/no_repetition_reward_func": -0.25469888746738434, "rewards/verse_reward_func": -0.0078125, "step": 1506 }, { "completion_length": 245.390625, "epoch": 12.056, "grad_norm": 3.46875, "kl": 4.057803153991699, "learning_rate": 4.702763582881692e-05, "loss": 0.1623, "reward": 2.036598801612854, "reward_std": 2.6801297664642334, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.324217677116394, "rewards/no_repetition_reward_func": -0.2876189649105072, "rewards/verse_reward_func": 0.0, "step": 1507 }, { "completion_length": 242.40625, "epoch": 12.064, "grad_norm": 2.296875, "kl": 5.118767738342285, "learning_rate": 4.702102918639528e-05, "loss": 0.2048, "reward": 2.0806329250335693, "reward_std": 2.994666814804077, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.35556960105896, "rewards/no_repetition_reward_func": -0.2749367356300354, "rewards/verse_reward_func": 0.0, "step": 1508 }, { "completion_length": 249.96875, "epoch": 12.072, "grad_norm": 2.421875, "kl": 3.9036948680877686, "learning_rate": 4.7014415674993e-05, "loss": 0.1561, "reward": 1.8956271409988403, "reward_std": 2.914084553718567, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.187867224216461, "rewards/no_repetition_reward_func": -0.2844276577234268, "rewards/verse_reward_func": -0.0078125, "step": 1509 }, { "completion_length": 251.28125, "epoch": 12.08, "grad_norm": 4.0, "kl": 4.605770826339722, "learning_rate": 4.7007795296673006e-05, "loss": 0.1842, "reward": 2.1277196407318115, "reward_std": 2.8582128286361694, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4066230058670044, "rewards/no_repetition_reward_func": -0.27890343964099884, "rewards/verse_reward_func": 0.0, "step": 1510 }, { "completion_length": 240.9375, "epoch": 12.088, "grad_norm": 4.34375, "kl": 3.1220861673355103, "learning_rate": 4.700116805350039e-05, "loss": 0.1249, "reward": 1.8336971402168274, "reward_std": 2.6630314588546753, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.135917067527771, "rewards/no_repetition_reward_func": -0.2944074273109436, "rewards/verse_reward_func": -0.0078125, "step": 1511 }, { "completion_length": 249.28125, "epoch": 12.096, "grad_norm": 1.4140625, "kl": 3.6537041664123535, "learning_rate": 4.699453394754236e-05, "loss": 0.1461, "reward": 1.8715592622756958, "reward_std": 2.7418770790100098, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.121035397052765, "rewards/no_repetition_reward_func": -0.24166356772184372, "rewards/verse_reward_func": -0.0078125, "step": 1512 }, { "completion_length": 251.3125, "epoch": 12.104, "grad_norm": 3.484375, "kl": 2.7582645416259766, "learning_rate": 4.6987892980868296e-05, "loss": 0.1103, "reward": 2.3276318311691284, "reward_std": 2.9909467697143555, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5877432823181152, "rewards/no_repetition_reward_func": -0.2601114436984062, "rewards/verse_reward_func": 0.0, "step": 1513 }, { "completion_length": 250.609375, "epoch": 12.112, "grad_norm": 1.6796875, "kl": 2.9124616384506226, "learning_rate": 4.69812451555497e-05, "loss": 0.1165, "reward": 2.1112231016159058, "reward_std": 2.6439887285232544, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3748972415924072, "rewards/no_repetition_reward_func": -0.25586146861314774, "rewards/verse_reward_func": -0.0078125, "step": 1514 }, { "completion_length": 249.515625, "epoch": 12.12, "grad_norm": 2.75, "kl": 2.9101046323776245, "learning_rate": 4.6974590473660216e-05, "loss": 0.1164, "reward": 2.126152753829956, "reward_std": 2.6689809560775757, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.418108820915222, "rewards/no_repetition_reward_func": -0.29195602238178253, "rewards/verse_reward_func": 0.0, "step": 1515 }, { "completion_length": 240.5625, "epoch": 12.128, "grad_norm": 3.75, "kl": 2.0203434228897095, "learning_rate": 4.696792893727562e-05, "loss": 0.0808, "reward": 3.0600892305374146, "reward_std": 2.787590742111206, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.334978938102722, "rewards/no_repetition_reward_func": -0.27488958835601807, "rewards/verse_reward_func": 0.0, "step": 1516 }, { "completion_length": 255.796875, "epoch": 12.136, "grad_norm": 1.625, "kl": 3.2952362298965454, "learning_rate": 4.696126054847385e-05, "loss": 0.1318, "reward": 2.1500788927078247, "reward_std": 2.4258298873901367, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4069448709487915, "rewards/no_repetition_reward_func": -0.2568659484386444, "rewards/verse_reward_func": 0.0, "step": 1517 }, { "completion_length": 250.828125, "epoch": 12.144, "grad_norm": 2.296875, "kl": 3.162207841873169, "learning_rate": 4.695458530933494e-05, "loss": 0.1265, "reward": 2.3241885900497437, "reward_std": 2.753738045692444, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.601695716381073, "rewards/no_repetition_reward_func": -0.27750709652900696, "rewards/verse_reward_func": 0.0, "step": 1518 }, { "completion_length": 249.40625, "epoch": 12.152, "grad_norm": 1.9296875, "kl": 3.3465439081192017, "learning_rate": 4.694790322194111e-05, "loss": 0.1339, "reward": 1.974013090133667, "reward_std": 2.6270841360092163, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2286285758018494, "rewards/no_repetition_reward_func": -0.2546154633164406, "rewards/verse_reward_func": 0.0, "step": 1519 }, { "completion_length": 252.65625, "epoch": 12.16, "grad_norm": 1.578125, "kl": 3.808385133743286, "learning_rate": 4.694121428837668e-05, "loss": 0.1523, "reward": 2.321211040019989, "reward_std": 2.733747959136963, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5778300762176514, "rewards/no_repetition_reward_func": -0.25661899149417877, "rewards/verse_reward_func": 0.0, "step": 1520 }, { "completion_length": 256.0, "epoch": 12.168, "grad_norm": 2.4375, "kl": 3.4354625940322876, "learning_rate": 4.693451851072811e-05, "loss": 0.1374, "reward": 2.5249496698379517, "reward_std": 3.2045823335647583, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.800291657447815, "rewards/no_repetition_reward_func": -0.2753419131040573, "rewards/verse_reward_func": 0.0, "step": 1521 }, { "completion_length": 249.078125, "epoch": 12.176, "grad_norm": 2.09375, "kl": 3.4387468099594116, "learning_rate": 4.692781589108402e-05, "loss": 0.1375, "reward": 2.4532846212387085, "reward_std": 2.868033766746521, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7501282691955566, "rewards/no_repetition_reward_func": -0.2890309691429138, "rewards/verse_reward_func": -0.0078125, "step": 1522 }, { "completion_length": 247.9375, "epoch": 12.184, "grad_norm": 2.765625, "kl": 4.87101936340332, "learning_rate": 4.6921106431535135e-05, "loss": 0.1948, "reward": 2.2673299312591553, "reward_std": 2.7900381088256836, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.536052167415619, "rewards/no_repetition_reward_func": -0.2687222883105278, "rewards/verse_reward_func": 0.0, "step": 1523 }, { "completion_length": 250.875, "epoch": 12.192, "grad_norm": 6.90625, "kl": 5.0711469650268555, "learning_rate": 4.691439013417433e-05, "loss": 0.2028, "reward": 1.7360734939575195, "reward_std": 2.465748429298401, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0056235790252686, "rewards/no_repetition_reward_func": -0.2617376446723938, "rewards/verse_reward_func": -0.0078125, "step": 1524 }, { "completion_length": 256.0, "epoch": 12.2, "grad_norm": 5.125, "kl": 5.087576389312744, "learning_rate": 4.690766700109659e-05, "loss": 0.2035, "reward": 2.4158817529678345, "reward_std": 2.8255298137664795, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7227914333343506, "rewards/no_repetition_reward_func": -0.299097016453743, "rewards/verse_reward_func": -0.0078125, "step": 1525 }, { "completion_length": 251.78125, "epoch": 12.208, "grad_norm": 2.359375, "kl": 3.6044843196868896, "learning_rate": 4.690093703439907e-05, "loss": 0.1442, "reward": 2.5814255475997925, "reward_std": 2.5165865421295166, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8563640117645264, "rewards/no_repetition_reward_func": -0.27493828535079956, "rewards/verse_reward_func": 0.0, "step": 1526 }, { "completion_length": 246.234375, "epoch": 12.216, "grad_norm": 3.375, "kl": 3.5593690872192383, "learning_rate": 4.689420023618104e-05, "loss": 0.1424, "reward": 2.425694227218628, "reward_std": 2.5648248195648193, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7465940713882446, "rewards/no_repetition_reward_func": -0.32089969515800476, "rewards/verse_reward_func": 0.0, "step": 1527 }, { "completion_length": 248.34375, "epoch": 12.224, "grad_norm": 1.671875, "kl": 4.070478081703186, "learning_rate": 4.688745660854388e-05, "loss": 0.1628, "reward": 1.8955583572387695, "reward_std": 2.825953722000122, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1672468185424805, "rewards/no_repetition_reward_func": -0.2716883569955826, "rewards/verse_reward_func": 0.0, "step": 1528 }, { "completion_length": 249.78125, "epoch": 12.232, "grad_norm": 4.25, "kl": 3.113553762435913, "learning_rate": 4.688070615359114e-05, "loss": 0.1245, "reward": 2.9728399515151978, "reward_std": 3.1375190019607544, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2838354110717773, "rewards/no_repetition_reward_func": -0.2953703999519348, "rewards/verse_reward_func": -0.015625, "step": 1529 }, { "completion_length": 248.296875, "epoch": 12.24, "grad_norm": 2.953125, "kl": 3.608412504196167, "learning_rate": 4.687394887342845e-05, "loss": 0.1443, "reward": 2.5340943336486816, "reward_std": 2.9180474281311035, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.813134789466858, "rewards/no_repetition_reward_func": -0.2790403515100479, "rewards/verse_reward_func": 0.0, "step": 1530 }, { "completion_length": 249.09375, "epoch": 12.248, "grad_norm": 3.921875, "kl": 4.12192976474762, "learning_rate": 4.686718477016361e-05, "loss": 0.1649, "reward": 2.7650296688079834, "reward_std": 2.5658528804779053, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.068142294883728, "rewards/no_repetition_reward_func": -0.29530027508735657, "rewards/verse_reward_func": -0.0078125, "step": 1531 }, { "completion_length": 253.40625, "epoch": 12.256, "grad_norm": 2.703125, "kl": 4.734085321426392, "learning_rate": 4.6860413845906534e-05, "loss": 0.1894, "reward": 2.3116105794906616, "reward_std": 2.785853147506714, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.603518486022949, "rewards/no_repetition_reward_func": -0.28409547358751297, "rewards/verse_reward_func": -0.0078125, "step": 1532 }, { "completion_length": 237.953125, "epoch": 12.264, "grad_norm": 2.5625, "kl": 4.302041292190552, "learning_rate": 4.6853636102769274e-05, "loss": 0.1721, "reward": 2.1374688148498535, "reward_std": 3.013204336166382, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.395845651626587, "rewards/no_repetition_reward_func": -0.2505643367767334, "rewards/verse_reward_func": -0.0078125, "step": 1533 }, { "completion_length": 248.359375, "epoch": 12.272, "grad_norm": 3.171875, "kl": 4.456336975097656, "learning_rate": 4.684685154286599e-05, "loss": 0.1783, "reward": 2.483131170272827, "reward_std": 2.928344488143921, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7805169820785522, "rewards/no_repetition_reward_func": -0.2895735204219818, "rewards/verse_reward_func": -0.0078125, "step": 1534 }, { "completion_length": 253.1875, "epoch": 12.28, "grad_norm": 2.265625, "kl": 4.0724263191223145, "learning_rate": 4.684006016831297e-05, "loss": 0.1629, "reward": 2.140250325202942, "reward_std": 2.7519150972366333, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4131187200546265, "rewards/no_repetition_reward_func": -0.27286839485168457, "rewards/verse_reward_func": 0.0, "step": 1535 }, { "completion_length": 247.1875, "epoch": 12.288, "grad_norm": 4.46875, "kl": 3.5626593828201294, "learning_rate": 4.6833261981228646e-05, "loss": 0.1425, "reward": 2.1625808477401733, "reward_std": 2.4110140800476074, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4321274161338806, "rewards/no_repetition_reward_func": -0.25392163544893265, "rewards/verse_reward_func": -0.015625, "step": 1536 }, { "completion_length": 255.53125, "epoch": 12.296, "grad_norm": 2.390625, "kl": 3.5176831483840942, "learning_rate": 4.682645698373357e-05, "loss": 0.1407, "reward": 1.9635945558547974, "reward_std": 3.047981858253479, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2495614290237427, "rewards/no_repetition_reward_func": -0.2859669178724289, "rewards/verse_reward_func": 0.0, "step": 1537 }, { "completion_length": 251.15625, "epoch": 12.304, "grad_norm": 2.578125, "kl": 3.0394207239151, "learning_rate": 4.68196451779504e-05, "loss": 0.1216, "reward": 2.399913966655731, "reward_std": 3.074670433998108, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7058215141296387, "rewards/no_repetition_reward_func": -0.30590756237506866, "rewards/verse_reward_func": 0.0, "step": 1538 }, { "completion_length": 254.984375, "epoch": 12.312, "grad_norm": 4.125, "kl": 3.8723056316375732, "learning_rate": 4.6812826566003934e-05, "loss": 0.1549, "reward": 2.3103225231170654, "reward_std": 2.77334201335907, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.577003240585327, "rewards/no_repetition_reward_func": -0.2666807323694229, "rewards/verse_reward_func": 0.0, "step": 1539 }, { "completion_length": 252.71875, "epoch": 12.32, "grad_norm": 5.3125, "kl": 2.927532911300659, "learning_rate": 4.68060011500211e-05, "loss": 0.1171, "reward": 3.0318820476531982, "reward_std": 3.2110323905944824, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.358364462852478, "rewards/no_repetition_reward_func": -0.318669855594635, "rewards/verse_reward_func": -0.0078125, "step": 1540 }, { "completion_length": 250.15625, "epoch": 12.328, "grad_norm": 2.8125, "kl": 3.404276728630066, "learning_rate": 4.6799168932130915e-05, "loss": 0.1362, "reward": 2.6189157962799072, "reward_std": 3.1074635982513428, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.920172691345215, "rewards/no_repetition_reward_func": -0.29344433546066284, "rewards/verse_reward_func": -0.0078125, "step": 1541 }, { "completion_length": 253.9375, "epoch": 12.336, "grad_norm": 3.328125, "kl": 3.427414059638977, "learning_rate": 4.679232991446456e-05, "loss": 0.1371, "reward": 2.6045703887939453, "reward_std": 3.1448276042938232, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8834075927734375, "rewards/no_repetition_reward_func": -0.2710246741771698, "rewards/verse_reward_func": -0.0078125, "step": 1542 }, { "completion_length": 250.3125, "epoch": 12.344, "grad_norm": 2.515625, "kl": 4.212011337280273, "learning_rate": 4.678548409915532e-05, "loss": 0.1685, "reward": 2.6972970962524414, "reward_std": 3.0932153463363647, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0166683197021484, "rewards/no_repetition_reward_func": -0.311558797955513, "rewards/verse_reward_func": -0.0078125, "step": 1543 }, { "completion_length": 243.875, "epoch": 12.352, "grad_norm": 3.03125, "kl": 4.994361400604248, "learning_rate": 4.677863148833859e-05, "loss": 0.1998, "reward": 2.304375469684601, "reward_std": 3.099339723587036, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6082667112350464, "rewards/no_repetition_reward_func": -0.30389124155044556, "rewards/verse_reward_func": 0.0, "step": 1544 }, { "completion_length": 246.0, "epoch": 12.36, "grad_norm": 4.34375, "kl": 5.815626859664917, "learning_rate": 4.6771772084151885e-05, "loss": 0.2326, "reward": 2.0677966475486755, "reward_std": 2.9331037998199463, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3281033039093018, "rewards/no_repetition_reward_func": -0.2603066563606262, "rewards/verse_reward_func": 0.0, "step": 1545 }, { "completion_length": 252.703125, "epoch": 12.368, "grad_norm": 3.796875, "kl": 5.620810031890869, "learning_rate": 4.676490588873486e-05, "loss": 0.2248, "reward": 2.4439457058906555, "reward_std": 3.0489479303359985, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7576628923416138, "rewards/no_repetition_reward_func": -0.30590465664863586, "rewards/verse_reward_func": -0.0078125, "step": 1546 }, { "completion_length": 249.515625, "epoch": 12.376, "grad_norm": 1.421875, "kl": 3.397670030593872, "learning_rate": 4.675803290422927e-05, "loss": 0.1359, "reward": 2.7066489458084106, "reward_std": 2.704930543899536, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9929767847061157, "rewards/no_repetition_reward_func": -0.2863279730081558, "rewards/verse_reward_func": 0.0, "step": 1547 }, { "completion_length": 247.59375, "epoch": 12.384, "grad_norm": 7.53125, "kl": 4.900902509689331, "learning_rate": 4.6751153132779e-05, "loss": 0.196, "reward": 2.69244384765625, "reward_std": 2.827739953994751, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.958349823951721, "rewards/no_repetition_reward_func": -0.2659060209989548, "rewards/verse_reward_func": 0.0, "step": 1548 }, { "completion_length": 249.609375, "epoch": 12.392, "grad_norm": 1.953125, "kl": 4.103926658630371, "learning_rate": 4.674426657653003e-05, "loss": 0.1642, "reward": 2.476846933364868, "reward_std": 3.0556347370147705, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.779103636741638, "rewards/no_repetition_reward_func": -0.29444420337677, "rewards/verse_reward_func": -0.0078125, "step": 1549 }, { "completion_length": 245.203125, "epoch": 12.4, "grad_norm": 2.828125, "kl": 3.255561590194702, "learning_rate": 4.6737373237630476e-05, "loss": 0.1302, "reward": 2.626900553703308, "reward_std": 3.263106346130371, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.901752471923828, "rewards/no_repetition_reward_func": -0.2748519778251648, "rewards/verse_reward_func": 0.0, "step": 1550 }, { "completion_length": 252.09375, "epoch": 12.408, "grad_norm": 2.921875, "kl": 3.654764175415039, "learning_rate": 4.6730473118230575e-05, "loss": 0.1462, "reward": 2.439825713634491, "reward_std": 2.949162006378174, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7416162490844727, "rewards/no_repetition_reward_func": -0.30179065465927124, "rewards/verse_reward_func": 0.0, "step": 1551 }, { "completion_length": 247.71875, "epoch": 12.416, "grad_norm": 1.7109375, "kl": 3.7255133390426636, "learning_rate": 4.6723566220482664e-05, "loss": 0.149, "reward": 2.2404309511184692, "reward_std": 2.839890241622925, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.535091519355774, "rewards/no_repetition_reward_func": -0.2946605607867241, "rewards/verse_reward_func": 0.0, "step": 1552 }, { "completion_length": 249.109375, "epoch": 12.424, "grad_norm": 2.4375, "kl": 4.486394166946411, "learning_rate": 4.6716652546541194e-05, "loss": 0.1795, "reward": 2.480069160461426, "reward_std": 2.8997230529785156, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7383867502212524, "rewards/no_repetition_reward_func": -0.2583176791667938, "rewards/verse_reward_func": 0.0, "step": 1553 }, { "completion_length": 256.0, "epoch": 12.432, "grad_norm": 2.609375, "kl": 3.947481632232666, "learning_rate": 4.6709732098562745e-05, "loss": 0.1579, "reward": 2.2526224851608276, "reward_std": 2.813735008239746, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.524580478668213, "rewards/no_repetition_reward_func": -0.2641454115509987, "rewards/verse_reward_func": -0.0078125, "step": 1554 }, { "completion_length": 256.0, "epoch": 12.44, "grad_norm": 4.125, "kl": 2.985386371612549, "learning_rate": 4.670280487870598e-05, "loss": 0.1194, "reward": 2.706083655357361, "reward_std": 3.290062189102173, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0570309162139893, "rewards/no_repetition_reward_func": -0.3509472906589508, "rewards/verse_reward_func": 0.0, "step": 1555 }, { "completion_length": 254.421875, "epoch": 12.448, "grad_norm": 2.59375, "kl": 4.035356521606445, "learning_rate": 4.6695870889131724e-05, "loss": 0.1614, "reward": 2.456921339035034, "reward_std": 2.777892589569092, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.766453981399536, "rewards/no_repetition_reward_func": -0.3095323145389557, "rewards/verse_reward_func": 0.0, "step": 1556 }, { "completion_length": 255.375, "epoch": 12.456, "grad_norm": 2.15625, "kl": 4.170536756515503, "learning_rate": 4.668893013200286e-05, "loss": 0.1668, "reward": 2.623794436454773, "reward_std": 2.656400680541992, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9395121335983276, "rewards/no_repetition_reward_func": -0.3079051971435547, "rewards/verse_reward_func": -0.0078125, "step": 1557 }, { "completion_length": 247.8125, "epoch": 12.464, "grad_norm": 2.75, "kl": 4.730407953262329, "learning_rate": 4.6681982609484416e-05, "loss": 0.1892, "reward": 2.2689220905303955, "reward_std": 2.9416545629501343, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.550053119659424, "rewards/no_repetition_reward_func": -0.2733186185359955, "rewards/verse_reward_func": -0.0078125, "step": 1558 }, { "completion_length": 239.984375, "epoch": 12.472, "grad_norm": 2.78125, "kl": 4.270904064178467, "learning_rate": 4.667502832374352e-05, "loss": 0.1708, "reward": 2.148416578769684, "reward_std": 2.4802931547164917, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.385526418685913, "rewards/no_repetition_reward_func": -0.23710975050926208, "rewards/verse_reward_func": 0.0, "step": 1559 }, { "completion_length": 251.8125, "epoch": 12.48, "grad_norm": 3.0625, "kl": 4.104586243629456, "learning_rate": 4.6668067276949414e-05, "loss": 0.1642, "reward": 2.5091147422790527, "reward_std": 2.8335156440734863, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7843884229660034, "rewards/no_repetition_reward_func": -0.2752735912799835, "rewards/verse_reward_func": 0.0, "step": 1560 }, { "completion_length": 252.625, "epoch": 12.488, "grad_norm": 1.203125, "kl": 3.9929888248443604, "learning_rate": 4.666109947127343e-05, "loss": 0.1597, "reward": 2.00976026058197, "reward_std": 2.9222036600112915, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2865872383117676, "rewards/no_repetition_reward_func": -0.2612020969390869, "rewards/verse_reward_func": -0.015625, "step": 1561 }, { "completion_length": 247.890625, "epoch": 12.496, "grad_norm": 1.46875, "kl": 3.517458915710449, "learning_rate": 4.665412490888904e-05, "loss": 0.1407, "reward": 2.407650053501129, "reward_std": 2.658068895339966, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6946340799331665, "rewards/no_repetition_reward_func": -0.27135899662971497, "rewards/verse_reward_func": -0.015625, "step": 1562 }, { "completion_length": 255.21875, "epoch": 12.504, "grad_norm": 2.46875, "kl": 3.5077797174453735, "learning_rate": 4.66471435919718e-05, "loss": 0.1403, "reward": 2.3187525868415833, "reward_std": 2.93926739692688, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.619118094444275, "rewards/no_repetition_reward_func": -0.3003654479980469, "rewards/verse_reward_func": 0.0, "step": 1563 }, { "completion_length": 239.640625, "epoch": 12.512, "grad_norm": 4.1875, "kl": 2.4381712675094604, "learning_rate": 4.6640155522699374e-05, "loss": 0.0975, "reward": 2.3454463481903076, "reward_std": 3.026001214981079, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.647749185562134, "rewards/no_repetition_reward_func": -0.28667794167995453, "rewards/verse_reward_func": -0.015625, "step": 1564 }, { "completion_length": 239.921875, "epoch": 12.52, "grad_norm": 2.25, "kl": 3.290971279144287, "learning_rate": 4.6633160703251554e-05, "loss": 0.1316, "reward": 1.1618568897247314, "reward_std": 2.19096577167511, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3927636742591858, "rewards/no_repetition_reward_func": -0.22309430688619614, "rewards/verse_reward_func": -0.0078125, "step": 1565 }, { "completion_length": 246.0625, "epoch": 12.528, "grad_norm": 2.265625, "kl": 2.1287012100219727, "learning_rate": 4.6626159135810205e-05, "loss": 0.0851, "reward": 2.7194902896881104, "reward_std": 3.003477096557617, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.011320471763611, "rewards/no_repetition_reward_func": -0.2918301522731781, "rewards/verse_reward_func": 0.0, "step": 1566 }, { "completion_length": 250.984375, "epoch": 12.536, "grad_norm": 1.6640625, "kl": 3.2455450296401978, "learning_rate": 4.661915082255932e-05, "loss": 0.1298, "reward": 1.7991175055503845, "reward_std": 2.52807080745697, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0450727939605713, "rewards/no_repetition_reward_func": -0.24595524370670319, "rewards/verse_reward_func": 0.0, "step": 1567 }, { "completion_length": 244.28125, "epoch": 12.544, "grad_norm": 2.578125, "kl": 2.7456140518188477, "learning_rate": 4.6612135765685e-05, "loss": 0.1098, "reward": 2.544510006904602, "reward_std": 2.8930994272232056, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8124988079071045, "rewards/no_repetition_reward_func": -0.2601761370897293, "rewards/verse_reward_func": -0.0078125, "step": 1568 }, { "completion_length": 246.53125, "epoch": 12.552, "grad_norm": 2.71875, "kl": 3.4318257570266724, "learning_rate": 4.660511396737541e-05, "loss": 0.1373, "reward": 1.491709053516388, "reward_std": 2.4351656436920166, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7615267634391785, "rewards/no_repetition_reward_func": -0.26200519502162933, "rewards/verse_reward_func": -0.0078125, "step": 1569 }, { "completion_length": 251.265625, "epoch": 12.56, "grad_norm": 2.359375, "kl": 2.5929224491119385, "learning_rate": 4.659808542982088e-05, "loss": 0.1037, "reward": 2.4966673851013184, "reward_std": 2.7653582096099854, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.812593698501587, "rewards/no_repetition_reward_func": -0.30811387300491333, "rewards/verse_reward_func": -0.0078125, "step": 1570 }, { "completion_length": 252.96875, "epoch": 12.568, "grad_norm": 3.78125, "kl": 2.1330649852752686, "learning_rate": 4.65910501552138e-05, "loss": 0.0853, "reward": 3.028707504272461, "reward_std": 2.9332239627838135, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3349616527557373, "rewards/no_repetition_reward_func": -0.3062540143728256, "rewards/verse_reward_func": 0.0, "step": 1571 }, { "completion_length": 255.078125, "epoch": 12.576, "grad_norm": 1.640625, "kl": 3.7391297817230225, "learning_rate": 4.6584008145748656e-05, "loss": 0.1496, "reward": 2.193704128265381, "reward_std": 2.8470299243927, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.448217988014221, "rewards/no_repetition_reward_func": -0.2545137405395508, "rewards/verse_reward_func": 0.0, "step": 1572 }, { "completion_length": 250.75, "epoch": 12.584, "grad_norm": 3.796875, "kl": 3.7188148498535156, "learning_rate": 4.657695940362207e-05, "loss": 0.1488, "reward": 1.8857189118862152, "reward_std": 2.1792563796043396, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.162529706954956, "rewards/no_repetition_reward_func": -0.276810884475708, "rewards/verse_reward_func": 0.0, "step": 1573 }, { "completion_length": 253.96875, "epoch": 12.592, "grad_norm": 2.75, "kl": 3.7829365730285645, "learning_rate": 4.6569903931032735e-05, "loss": 0.1513, "reward": 1.8821672201156616, "reward_std": 2.552983045578003, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1527838706970215, "rewards/no_repetition_reward_func": -0.2706165462732315, "rewards/verse_reward_func": 0.0, "step": 1574 }, { "completion_length": 248.828125, "epoch": 12.6, "grad_norm": 1.828125, "kl": 2.9481921195983887, "learning_rate": 4.656284173018144e-05, "loss": 0.1179, "reward": 2.618268370628357, "reward_std": 2.6889638900756836, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8937289714813232, "rewards/no_repetition_reward_func": -0.2676480710506439, "rewards/verse_reward_func": -0.0078125, "step": 1575 }, { "completion_length": 251.03125, "epoch": 12.608, "grad_norm": 2.984375, "kl": 3.1413286924362183, "learning_rate": 4.65557728032711e-05, "loss": 0.1257, "reward": 3.1104440689086914, "reward_std": 3.1986730098724365, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4022765159606934, "rewards/no_repetition_reward_func": -0.29183250665664673, "rewards/verse_reward_func": 0.0, "step": 1576 }, { "completion_length": 243.140625, "epoch": 12.616, "grad_norm": 1.5859375, "kl": 3.389531135559082, "learning_rate": 4.6548697152506705e-05, "loss": 0.1356, "reward": 2.3858824968338013, "reward_std": 2.6893216371536255, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6578874588012695, "rewards/no_repetition_reward_func": -0.2720048427581787, "rewards/verse_reward_func": 0.0, "step": 1577 }, { "completion_length": 254.546875, "epoch": 12.624, "grad_norm": 3.921875, "kl": 3.5605335235595703, "learning_rate": 4.654161478009536e-05, "loss": 0.1424, "reward": 2.972916603088379, "reward_std": 3.031718611717224, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2493650913238525, "rewards/no_repetition_reward_func": -0.276448592543602, "rewards/verse_reward_func": 0.0, "step": 1578 }, { "completion_length": 256.0, "epoch": 12.632, "grad_norm": 4.28125, "kl": 4.924998998641968, "learning_rate": 4.653452568824625e-05, "loss": 0.197, "reward": 2.053754210472107, "reward_std": 2.4409373998641968, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2973082065582275, "rewards/no_repetition_reward_func": -0.24355414509773254, "rewards/verse_reward_func": 0.0, "step": 1579 }, { "completion_length": 245.09375, "epoch": 12.64, "grad_norm": 3.015625, "kl": 5.541895627975464, "learning_rate": 4.652742987917066e-05, "loss": 0.2217, "reward": 1.9203497171401978, "reward_std": 2.7667644023895264, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.195128083229065, "rewards/no_repetition_reward_func": -0.2747783660888672, "rewards/verse_reward_func": 0.0, "step": 1580 }, { "completion_length": 254.765625, "epoch": 12.648, "grad_norm": 3.1875, "kl": 5.053730010986328, "learning_rate": 4.652032735508198e-05, "loss": 0.2021, "reward": 1.684786856174469, "reward_std": 2.7591278553009033, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9607250690460205, "rewards/no_repetition_reward_func": -0.2759382277727127, "rewards/verse_reward_func": 0.0, "step": 1581 }, { "completion_length": 252.578125, "epoch": 12.656, "grad_norm": 3.109375, "kl": 5.315795421600342, "learning_rate": 4.651321811819568e-05, "loss": 0.2126, "reward": 2.245721220970154, "reward_std": 2.6677541732788086, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4955414533615112, "rewards/no_repetition_reward_func": -0.23419517278671265, "rewards/verse_reward_func": -0.015625, "step": 1582 }, { "completion_length": 256.0, "epoch": 12.664, "grad_norm": 1.9765625, "kl": 3.773398995399475, "learning_rate": 4.650610217072934e-05, "loss": 0.1509, "reward": 2.349616229534149, "reward_std": 2.973053216934204, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.63826847076416, "rewards/no_repetition_reward_func": -0.27302733063697815, "rewards/verse_reward_func": -0.015625, "step": 1583 }, { "completion_length": 250.15625, "epoch": 12.672, "grad_norm": 1.9296875, "kl": 3.690843343734741, "learning_rate": 4.649897951490262e-05, "loss": 0.1476, "reward": 2.2663938999176025, "reward_std": 2.5871471166610718, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5561612844467163, "rewards/no_repetition_reward_func": -0.28195489943027496, "rewards/verse_reward_func": -0.0078125, "step": 1584 }, { "completion_length": 254.015625, "epoch": 12.68, "grad_norm": 3.609375, "kl": 2.7285141944885254, "learning_rate": 4.649185015293728e-05, "loss": 0.1091, "reward": 2.733781576156616, "reward_std": 3.305975556373596, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0390512943267822, "rewards/no_repetition_reward_func": -0.30526983737945557, "rewards/verse_reward_func": 0.0, "step": 1585 }, { "completion_length": 245.875, "epoch": 12.688, "grad_norm": 4.9375, "kl": 2.2793538570404053, "learning_rate": 4.648471408705717e-05, "loss": 0.0912, "reward": 3.0104904174804688, "reward_std": 3.1172502040863037, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.351611614227295, "rewards/no_repetition_reward_func": -0.3333085775375366, "rewards/verse_reward_func": -0.0078125, "step": 1586 }, { "completion_length": 256.0, "epoch": 12.696, "grad_norm": 3.046875, "kl": 3.281447649002075, "learning_rate": 4.647757131948822e-05, "loss": 0.1313, "reward": 2.051602363586426, "reward_std": 2.844096064567566, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3031927347183228, "rewards/no_repetition_reward_func": -0.2515902817249298, "rewards/verse_reward_func": 0.0, "step": 1587 }, { "completion_length": 252.234375, "epoch": 12.704, "grad_norm": 2.25, "kl": 3.500048041343689, "learning_rate": 4.647042185245847e-05, "loss": 0.14, "reward": 2.046271562576294, "reward_std": 2.5991275310516357, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3025336265563965, "rewards/no_repetition_reward_func": -0.2484496533870697, "rewards/verse_reward_func": -0.0078125, "step": 1588 }, { "completion_length": 248.765625, "epoch": 12.712, "grad_norm": 1.8203125, "kl": 4.466235637664795, "learning_rate": 4.6463265688198044e-05, "loss": 0.1786, "reward": 1.6937569379806519, "reward_std": 2.6542885303497314, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9214704036712646, "rewards/no_repetition_reward_func": -0.2277134731411934, "rewards/verse_reward_func": 0.0, "step": 1589 }, { "completion_length": 251.0625, "epoch": 12.72, "grad_norm": 3.046875, "kl": 3.0322742462158203, "learning_rate": 4.645610282893915e-05, "loss": 0.1213, "reward": 2.134419858455658, "reward_std": 2.4416396617889404, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4054864645004272, "rewards/no_repetition_reward_func": -0.26325415819883347, "rewards/verse_reward_func": -0.0078125, "step": 1590 }, { "completion_length": 252.640625, "epoch": 12.728, "grad_norm": 2.9375, "kl": 3.4713964462280273, "learning_rate": 4.6448933276916076e-05, "loss": 0.1389, "reward": 2.2035019993782043, "reward_std": 2.746744990348816, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4999300837516785, "rewards/no_repetition_reward_func": -0.2886155918240547, "rewards/verse_reward_func": -0.0078125, "step": 1591 }, { "completion_length": 250.375, "epoch": 12.736, "grad_norm": 3.734375, "kl": 3.981875419616699, "learning_rate": 4.644175703436522e-05, "loss": 0.1593, "reward": 2.1089504957199097, "reward_std": 2.4935922622680664, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.33998966217041, "rewards/no_repetition_reward_func": -0.23103928565979004, "rewards/verse_reward_func": 0.0, "step": 1592 }, { "completion_length": 255.34375, "epoch": 12.744, "grad_norm": 2.359375, "kl": 3.159983992576599, "learning_rate": 4.6434574103525044e-05, "loss": 0.1264, "reward": 2.4315325021743774, "reward_std": 3.0672194957733154, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7413724660873413, "rewards/no_repetition_reward_func": -0.3098400831222534, "rewards/verse_reward_func": 0.0, "step": 1593 }, { "completion_length": 252.265625, "epoch": 12.752, "grad_norm": 2.265625, "kl": 4.32274055480957, "learning_rate": 4.6427384486636113e-05, "loss": 0.1729, "reward": 1.8006508946418762, "reward_std": 2.791788935661316, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0858190059661865, "rewards/no_repetition_reward_func": -0.2773558348417282, "rewards/verse_reward_func": -0.0078125, "step": 1594 }, { "completion_length": 244.65625, "epoch": 12.76, "grad_norm": 2.0, "kl": 4.19868004322052, "learning_rate": 4.642018818594107e-05, "loss": 0.1679, "reward": 1.9910603761672974, "reward_std": 2.866353154182434, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2412872314453125, "rewards/no_repetition_reward_func": -0.2424144595861435, "rewards/verse_reward_func": -0.0078125, "step": 1595 }, { "completion_length": 252.53125, "epoch": 12.768, "grad_norm": 1.7265625, "kl": 4.330926895141602, "learning_rate": 4.6412985203684654e-05, "loss": 0.1732, "reward": 1.9610577821731567, "reward_std": 2.6825592517852783, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.240176558494568, "rewards/no_repetition_reward_func": -0.2791188955307007, "rewards/verse_reward_func": 0.0, "step": 1596 }, { "completion_length": 253.75, "epoch": 12.776, "grad_norm": 2.4375, "kl": 3.8826438188552856, "learning_rate": 4.640577554211366e-05, "loss": 0.1553, "reward": 2.4255930185317993, "reward_std": 2.8456201553344727, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.711156964302063, "rewards/no_repetition_reward_func": -0.2855638861656189, "rewards/verse_reward_func": 0.0, "step": 1597 }, { "completion_length": 255.921875, "epoch": 12.784, "grad_norm": 3.96875, "kl": 3.4424796104431152, "learning_rate": 4.639855920347701e-05, "loss": 0.1377, "reward": 2.3136165142059326, "reward_std": 2.28543883562088, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5641914010047913, "rewards/no_repetition_reward_func": -0.25057485699653625, "rewards/verse_reward_func": 0.0, "step": 1598 }, { "completion_length": 251.09375, "epoch": 12.792, "grad_norm": 1.6953125, "kl": 3.8580838441848755, "learning_rate": 4.6391336190025644e-05, "loss": 0.1543, "reward": 2.3102734088897705, "reward_std": 2.6650291681289673, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5607060194015503, "rewards/no_repetition_reward_func": -0.2504325807094574, "rewards/verse_reward_func": 0.0, "step": 1599 }, { "completion_length": 251.25, "epoch": 12.8, "grad_norm": 2.25, "kl": 3.535294771194458, "learning_rate": 4.638410650401267e-05, "loss": 0.1414, "reward": 2.4238080978393555, "reward_std": 2.9378156661987305, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.723520517349243, "rewards/no_repetition_reward_func": -0.29190002381801605, "rewards/verse_reward_func": -0.0078125, "step": 1600 }, { "completion_length": 255.453125, "epoch": 12.808, "grad_norm": 5.8125, "kl": 2.971329689025879, "learning_rate": 4.6376870147693196e-05, "loss": 0.1189, "reward": 2.3552982807159424, "reward_std": 2.5208593010902405, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6680002212524414, "rewards/no_repetition_reward_func": -0.31270190328359604, "rewards/verse_reward_func": 0.0, "step": 1601 }, { "completion_length": 254.375, "epoch": 12.816, "grad_norm": 2.796875, "kl": 3.6561684608459473, "learning_rate": 4.6369627123324465e-05, "loss": 0.1462, "reward": 1.8005212545394897, "reward_std": 2.4567192792892456, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.07392281293869, "rewards/no_repetition_reward_func": -0.2655891329050064, "rewards/verse_reward_func": -0.0078125, "step": 1602 }, { "completion_length": 253.78125, "epoch": 12.824, "grad_norm": 2.609375, "kl": 3.758575201034546, "learning_rate": 4.636237743316578e-05, "loss": 0.1503, "reward": 2.3231531381607056, "reward_std": 2.7339552640914917, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5974961519241333, "rewards/no_repetition_reward_func": -0.26653042435646057, "rewards/verse_reward_func": -0.0078125, "step": 1603 }, { "completion_length": 256.0, "epoch": 12.832, "grad_norm": 2.15625, "kl": 3.5705360174179077, "learning_rate": 4.635512107947851e-05, "loss": 0.1428, "reward": 2.4775960445404053, "reward_std": 2.851416230201721, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7538812160491943, "rewards/no_repetition_reward_func": -0.2762853279709816, "rewards/verse_reward_func": 0.0, "step": 1604 }, { "completion_length": 250.25, "epoch": 12.84, "grad_norm": 9.1875, "kl": 2.8245049715042114, "learning_rate": 4.6347858064526125e-05, "loss": 0.113, "reward": 2.7389620542526245, "reward_std": 3.063877820968628, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0597740411758423, "rewards/no_repetition_reward_func": -0.3129996508359909, "rewards/verse_reward_func": -0.0078125, "step": 1605 }, { "completion_length": 242.609375, "epoch": 12.848, "grad_norm": 3.15625, "kl": 3.43243145942688, "learning_rate": 4.634058839057417e-05, "loss": 0.1373, "reward": 2.2643831968307495, "reward_std": 2.8018252849578857, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5186190605163574, "rewards/no_repetition_reward_func": -0.23861081898212433, "rewards/verse_reward_func": -0.015625, "step": 1606 }, { "completion_length": 249.375, "epoch": 12.856, "grad_norm": 1.8203125, "kl": 4.278658032417297, "learning_rate": 4.6333312059890256e-05, "loss": 0.1711, "reward": 2.0799254179000854, "reward_std": 3.082559823989868, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3480145931243896, "rewards/no_repetition_reward_func": -0.260276734828949, "rewards/verse_reward_func": -0.0078125, "step": 1607 }, { "completion_length": 251.0625, "epoch": 12.864, "grad_norm": 1.4375, "kl": 3.4388214349746704, "learning_rate": 4.6326029074744074e-05, "loss": 0.1376, "reward": 2.866774320602417, "reward_std": 2.932356834411621, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.186600685119629, "rewards/no_repetition_reward_func": -0.3120136559009552, "rewards/verse_reward_func": -0.0078125, "step": 1608 }, { "completion_length": 256.0, "epoch": 12.872, "grad_norm": 2.640625, "kl": 4.291573524475098, "learning_rate": 4.63187394374074e-05, "loss": 0.1717, "reward": 2.4849112033843994, "reward_std": 3.1986958980560303, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8236485719680786, "rewards/no_repetition_reward_func": -0.3309251219034195, "rewards/verse_reward_func": -0.0078125, "step": 1609 }, { "completion_length": 243.75, "epoch": 12.88, "grad_norm": 2.5, "kl": 2.8356359004974365, "learning_rate": 4.631144315015407e-05, "loss": 0.1134, "reward": 2.620771288871765, "reward_std": 2.8412325382232666, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8905365467071533, "rewards/no_repetition_reward_func": -0.26976536214351654, "rewards/verse_reward_func": 0.0, "step": 1610 }, { "completion_length": 253.234375, "epoch": 12.888, "grad_norm": 1.8671875, "kl": 4.4500412940979, "learning_rate": 4.630414021525999e-05, "loss": 0.178, "reward": 2.674447536468506, "reward_std": 3.2485474348068237, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9714683294296265, "rewards/no_repetition_reward_func": -0.28920823335647583, "rewards/verse_reward_func": -0.0078125, "step": 1611 }, { "completion_length": 249.234375, "epoch": 12.896, "grad_norm": 2.0625, "kl": 5.750308036804199, "learning_rate": 4.629683063500319e-05, "loss": 0.23, "reward": 2.0177950859069824, "reward_std": 2.7992489337921143, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.273171901702881, "rewards/no_repetition_reward_func": -0.24756435304880142, "rewards/verse_reward_func": -0.0078125, "step": 1612 }, { "completion_length": 246.84375, "epoch": 12.904, "grad_norm": 2.0, "kl": 4.739517450332642, "learning_rate": 4.62895144116637e-05, "loss": 0.1896, "reward": 1.8426084518432617, "reward_std": 2.701866626739502, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.107474446296692, "rewards/no_repetition_reward_func": -0.24924106895923615, "rewards/verse_reward_func": -0.015625, "step": 1613 }, { "completion_length": 245.96875, "epoch": 12.912, "grad_norm": 4.40625, "kl": 5.136662483215332, "learning_rate": 4.628219154752367e-05, "loss": 0.2055, "reward": 1.7513875365257263, "reward_std": 2.629231095314026, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.023848533630371, "rewards/no_repetition_reward_func": -0.2646483778953552, "rewards/verse_reward_func": -0.0078125, "step": 1614 }, { "completion_length": 247.203125, "epoch": 12.92, "grad_norm": 2.625, "kl": 4.695785760879517, "learning_rate": 4.6274862044867304e-05, "loss": 0.1878, "reward": 2.2384159564971924, "reward_std": 2.7528955936431885, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5087674856185913, "rewards/no_repetition_reward_func": -0.2547266334295273, "rewards/verse_reward_func": -0.015625, "step": 1615 }, { "completion_length": 251.1875, "epoch": 12.928, "grad_norm": 2.0625, "kl": 3.984424114227295, "learning_rate": 4.626752590598088e-05, "loss": 0.1594, "reward": 2.029416859149933, "reward_std": 2.618002414703369, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3232550621032715, "rewards/no_repetition_reward_func": -0.2860257178544998, "rewards/verse_reward_func": -0.0078125, "step": 1616 }, { "completion_length": 236.578125, "epoch": 12.936, "grad_norm": 1.953125, "kl": 3.328640937805176, "learning_rate": 4.626018313315275e-05, "loss": 0.1331, "reward": 1.994189739227295, "reward_std": 2.943313479423523, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.341765880584717, "rewards/no_repetition_reward_func": -0.3241385966539383, "rewards/verse_reward_func": -0.0234375, "step": 1617 }, { "completion_length": 252.25, "epoch": 12.943999999999999, "grad_norm": 1.8125, "kl": 3.8770912885665894, "learning_rate": 4.625283372867333e-05, "loss": 0.1551, "reward": 2.253300666809082, "reward_std": 2.5057183504104614, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.472425103187561, "rewards/no_repetition_reward_func": -0.2191244587302208, "rewards/verse_reward_func": 0.0, "step": 1618 }, { "completion_length": 244.765625, "epoch": 12.952, "grad_norm": 2.125, "kl": 3.100154399871826, "learning_rate": 4.6245477694835106e-05, "loss": 0.124, "reward": 2.117991268634796, "reward_std": 2.6869235038757324, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.379185914993286, "rewards/no_repetition_reward_func": -0.2533821612596512, "rewards/verse_reward_func": -0.0078125, "step": 1619 }, { "completion_length": 242.5, "epoch": 12.96, "grad_norm": 2.5, "kl": 3.8033212423324585, "learning_rate": 4.6238115033932636e-05, "loss": 0.1521, "reward": 2.1443495750427246, "reward_std": 2.8015562295913696, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.390329957008362, "rewards/no_repetition_reward_func": -0.24598023295402527, "rewards/verse_reward_func": 0.0, "step": 1620 }, { "completion_length": 253.375, "epoch": 12.968, "grad_norm": 3.234375, "kl": 3.325142502784729, "learning_rate": 4.623074574826254e-05, "loss": 0.133, "reward": 2.808706760406494, "reward_std": 2.9600670337677, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1043779850006104, "rewards/no_repetition_reward_func": -0.2956711947917938, "rewards/verse_reward_func": 0.0, "step": 1621 }, { "completion_length": 251.984375, "epoch": 12.975999999999999, "grad_norm": 2.859375, "kl": 3.2056641578674316, "learning_rate": 4.622336984012351e-05, "loss": 0.1282, "reward": 3.01991069316864, "reward_std": 3.33636212348938, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3423221111297607, "rewards/no_repetition_reward_func": -0.3145989626646042, "rewards/verse_reward_func": -0.0078125, "step": 1622 }, { "completion_length": 255.125, "epoch": 12.984, "grad_norm": 3.78125, "kl": 2.9843409061431885, "learning_rate": 4.621598731181629e-05, "loss": 0.1194, "reward": 2.902596354484558, "reward_std": 2.845318555831909, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2192413806915283, "rewards/no_repetition_reward_func": -0.31664517521858215, "rewards/verse_reward_func": 0.0, "step": 1623 }, { "completion_length": 245.953125, "epoch": 12.992, "grad_norm": 2.0625, "kl": 3.432937264442444, "learning_rate": 4.6208598165643715e-05, "loss": 0.1373, "reward": 2.722576379776001, "reward_std": 3.3208985328674316, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0045971870422363, "rewards/no_repetition_reward_func": -0.27420854568481445, "rewards/verse_reward_func": -0.0078125, "step": 1624 }, { "completion_length": 256.0, "epoch": 13.0, "grad_norm": 2.9375, "kl": 4.368974685668945, "learning_rate": 4.620120240391065e-05, "loss": 0.1748, "reward": 2.715998411178589, "reward_std": 2.805495023727417, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.018716812133789, "rewards/no_repetition_reward_func": -0.30271854996681213, "rewards/verse_reward_func": 0.0, "step": 1625 }, { "completion_length": 251.515625, "epoch": 13.008, "grad_norm": 3.671875, "kl": 5.104730844497681, "learning_rate": 4.619380002892406e-05, "loss": 0.2042, "reward": 2.0482877492904663, "reward_std": 2.81010639667511, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.341486096382141, "rewards/no_repetition_reward_func": -0.2853858172893524, "rewards/verse_reward_func": -0.0078125, "step": 1626 }, { "completion_length": 249.25, "epoch": 13.016, "grad_norm": 6.6875, "kl": 5.739724159240723, "learning_rate": 4.618639104299294e-05, "loss": 0.2296, "reward": 2.1276735663414, "reward_std": 3.052404046058655, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4207042455673218, "rewards/no_repetition_reward_func": -0.2930306941270828, "rewards/verse_reward_func": 0.0, "step": 1627 }, { "completion_length": 247.984375, "epoch": 13.024, "grad_norm": 3.046875, "kl": 4.676158666610718, "learning_rate": 4.617897544842836e-05, "loss": 0.187, "reward": 1.9743176102638245, "reward_std": 2.5174434185028076, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2204543352127075, "rewards/no_repetition_reward_func": -0.2383243292570114, "rewards/verse_reward_func": -0.0078125, "step": 1628 }, { "completion_length": 251.078125, "epoch": 13.032, "grad_norm": 1.6171875, "kl": 4.224807500839233, "learning_rate": 4.617155324754346e-05, "loss": 0.169, "reward": 2.2442694902420044, "reward_std": 2.741036891937256, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5413626432418823, "rewards/no_repetition_reward_func": -0.2970931828022003, "rewards/verse_reward_func": 0.0, "step": 1629 }, { "completion_length": 249.359375, "epoch": 13.04, "grad_norm": 2.6875, "kl": 4.188807725906372, "learning_rate": 4.616412444265345e-05, "loss": 0.1676, "reward": 2.7665112018585205, "reward_std": 3.02338707447052, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.034608840942383, "rewards/no_repetition_reward_func": -0.2680976092815399, "rewards/verse_reward_func": 0.0, "step": 1630 }, { "completion_length": 248.328125, "epoch": 13.048, "grad_norm": 4.34375, "kl": 3.70311176776886, "learning_rate": 4.6156689036075555e-05, "loss": 0.1481, "reward": 2.6809115409851074, "reward_std": 3.4678597450256348, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9886187314987183, "rewards/no_repetition_reward_func": -0.2920822203159332, "rewards/verse_reward_func": -0.015625, "step": 1631 }, { "completion_length": 250.0, "epoch": 13.056, "grad_norm": 2.953125, "kl": 3.353654623031616, "learning_rate": 4.614924703012911e-05, "loss": 0.1341, "reward": 2.2276500463485718, "reward_std": 2.8446160554885864, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4972522258758545, "rewards/no_repetition_reward_func": -0.26960205286741257, "rewards/verse_reward_func": 0.0, "step": 1632 }, { "completion_length": 252.25, "epoch": 13.064, "grad_norm": 2.546875, "kl": 3.887268900871277, "learning_rate": 4.614179842713547e-05, "loss": 0.1555, "reward": 2.0975492000579834, "reward_std": 2.664811849594116, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3957309126853943, "rewards/no_repetition_reward_func": -0.29818175733089447, "rewards/verse_reward_func": 0.0, "step": 1633 }, { "completion_length": 253.484375, "epoch": 13.072, "grad_norm": 2.59375, "kl": 4.180517673492432, "learning_rate": 4.6134343229418075e-05, "loss": 0.1672, "reward": 1.900677502155304, "reward_std": 2.526578664779663, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.119898557662964, "rewards/no_repetition_reward_func": -0.2192210704088211, "rewards/verse_reward_func": 0.0, "step": 1634 }, { "completion_length": 249.140625, "epoch": 13.08, "grad_norm": 1.5625, "kl": 3.7327382564544678, "learning_rate": 4.612688143930242e-05, "loss": 0.1493, "reward": 2.316321909427643, "reward_std": 2.7757043838500977, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6048243045806885, "rewards/no_repetition_reward_func": -0.27287760376930237, "rewards/verse_reward_func": -0.015625, "step": 1635 }, { "completion_length": 252.671875, "epoch": 13.088, "grad_norm": 3.671875, "kl": 4.274078607559204, "learning_rate": 4.611941305911602e-05, "loss": 0.171, "reward": 2.2869625091552734, "reward_std": 3.1011722087860107, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.537584066390991, "rewards/no_repetition_reward_func": -0.24280904233455658, "rewards/verse_reward_func": -0.0078125, "step": 1636 }, { "completion_length": 248.671875, "epoch": 13.096, "grad_norm": 3.515625, "kl": 4.006612777709961, "learning_rate": 4.61119380911885e-05, "loss": 0.1603, "reward": 2.012388288974762, "reward_std": 2.0713589191436768, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.290987014770508, "rewards/no_repetition_reward_func": -0.278598889708519, "rewards/verse_reward_func": 0.0, "step": 1637 }, { "completion_length": 248.265625, "epoch": 13.104, "grad_norm": 4.28125, "kl": 5.479404449462891, "learning_rate": 4.610445653785151e-05, "loss": 0.2192, "reward": 1.8992301225662231, "reward_std": 2.9859501123428345, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.191808819770813, "rewards/no_repetition_reward_func": -0.28476622700691223, "rewards/verse_reward_func": -0.0078125, "step": 1638 }, { "completion_length": 252.78125, "epoch": 13.112, "grad_norm": 3.625, "kl": 4.5054768323898315, "learning_rate": 4.6096968401438745e-05, "loss": 0.1802, "reward": 3.3363776206970215, "reward_std": 3.3842943906784058, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.667829990386963, "rewards/no_repetition_reward_func": -0.3314523547887802, "rewards/verse_reward_func": 0.0, "step": 1639 }, { "completion_length": 253.578125, "epoch": 13.12, "grad_norm": 4.6875, "kl": 3.936478853225708, "learning_rate": 4.6089473684285974e-05, "loss": 0.1575, "reward": 2.9183318614959717, "reward_std": 3.185209274291992, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.202083945274353, "rewards/no_repetition_reward_func": -0.2759396582841873, "rewards/verse_reward_func": -0.0078125, "step": 1640 }, { "completion_length": 252.1875, "epoch": 13.128, "grad_norm": 3.15625, "kl": 4.549424648284912, "learning_rate": 4.608197238873101e-05, "loss": 0.182, "reward": 2.1147838830947876, "reward_std": 2.7106353044509888, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.360417604446411, "rewards/no_repetition_reward_func": -0.2456338107585907, "rewards/verse_reward_func": 0.0, "step": 1641 }, { "completion_length": 249.890625, "epoch": 13.136, "grad_norm": 3.0, "kl": 5.158722400665283, "learning_rate": 4.607446451711372e-05, "loss": 0.2063, "reward": 1.645550549030304, "reward_std": 2.237249732017517, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9173412322998047, "rewards/no_repetition_reward_func": -0.2561657279729843, "rewards/verse_reward_func": -0.015625, "step": 1642 }, { "completion_length": 254.203125, "epoch": 13.144, "grad_norm": 2.484375, "kl": 3.8473596572875977, "learning_rate": 4.6066950071776015e-05, "loss": 0.1539, "reward": 2.7225561141967773, "reward_std": 3.02406108379364, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0047541856765747, "rewards/no_repetition_reward_func": -0.28219807147979736, "rewards/verse_reward_func": 0.0, "step": 1643 }, { "completion_length": 247.609375, "epoch": 13.152, "grad_norm": 3.515625, "kl": 2.7976187467575073, "learning_rate": 4.605942905506188e-05, "loss": 0.1119, "reward": 3.34137225151062, "reward_std": 3.2763376235961914, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.6729177236557007, "rewards/no_repetition_reward_func": -0.3315454125404358, "rewards/verse_reward_func": 0.0, "step": 1644 }, { "completion_length": 250.1875, "epoch": 13.16, "grad_norm": 1.6953125, "kl": 4.485804796218872, "learning_rate": 4.605190146931731e-05, "loss": 0.1794, "reward": 2.031406581401825, "reward_std": 2.8578234910964966, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3116648197174072, "rewards/no_repetition_reward_func": -0.280258372426033, "rewards/verse_reward_func": 0.0, "step": 1645 }, { "completion_length": 254.9375, "epoch": 13.168, "grad_norm": 3.53125, "kl": 4.11227285861969, "learning_rate": 4.6044367316890386e-05, "loss": 0.1645, "reward": 2.6847424507141113, "reward_std": 2.9973864555358887, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9733091592788696, "rewards/no_repetition_reward_func": -0.2885664850473404, "rewards/verse_reward_func": 0.0, "step": 1646 }, { "completion_length": 255.03125, "epoch": 13.176, "grad_norm": 2.046875, "kl": 4.15923547744751, "learning_rate": 4.6036826600131216e-05, "loss": 0.1664, "reward": 2.8880350589752197, "reward_std": 2.8073447942733765, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2016241550445557, "rewards/no_repetition_reward_func": -0.3135893791913986, "rewards/verse_reward_func": 0.0, "step": 1647 }, { "completion_length": 248.984375, "epoch": 13.184, "grad_norm": 3.234375, "kl": 4.397980213165283, "learning_rate": 4.602927932139197e-05, "loss": 0.1759, "reward": 1.9115755558013916, "reward_std": 2.665304660797119, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1670045256614685, "rewards/no_repetition_reward_func": -0.23980404436588287, "rewards/verse_reward_func": -0.015625, "step": 1648 }, { "completion_length": 244.59375, "epoch": 13.192, "grad_norm": 6.8125, "kl": 5.356084108352661, "learning_rate": 4.602172548302684e-05, "loss": 0.2142, "reward": 2.004522204399109, "reward_std": 2.8402737379074097, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2618343830108643, "rewards/no_repetition_reward_func": -0.2573120445013046, "rewards/verse_reward_func": 0.0, "step": 1649 }, { "completion_length": 252.28125, "epoch": 13.2, "grad_norm": 2.328125, "kl": 4.63471794128418, "learning_rate": 4.601416508739211e-05, "loss": 0.1854, "reward": 2.385565996170044, "reward_std": 2.983916759490967, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.669820487499237, "rewards/no_repetition_reward_func": -0.26862942427396774, "rewards/verse_reward_func": -0.015625, "step": 1650 }, { "completion_length": 252.0, "epoch": 13.208, "grad_norm": 4.0, "kl": 4.913161754608154, "learning_rate": 4.6006598136846056e-05, "loss": 0.1965, "reward": 2.0912469625473022, "reward_std": 3.2701234817504883, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3790184259414673, "rewards/no_repetition_reward_func": -0.2721465826034546, "rewards/verse_reward_func": -0.015625, "step": 1651 }, { "completion_length": 250.78125, "epoch": 13.216, "grad_norm": 2.296875, "kl": 3.979487419128418, "learning_rate": 4.599902463374903e-05, "loss": 0.1592, "reward": 2.2103891372680664, "reward_std": 2.687657117843628, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4886398315429688, "rewards/no_repetition_reward_func": -0.27825070172548294, "rewards/verse_reward_func": 0.0, "step": 1652 }, { "completion_length": 254.875, "epoch": 13.224, "grad_norm": 2.9375, "kl": 3.509347081184387, "learning_rate": 4.599144458046343e-05, "loss": 0.1404, "reward": 2.251941680908203, "reward_std": 2.7385374307632446, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5103448629379272, "rewards/no_repetition_reward_func": -0.2584032788872719, "rewards/verse_reward_func": 0.0, "step": 1653 }, { "completion_length": 250.171875, "epoch": 13.232, "grad_norm": 2.09375, "kl": 4.0967795848846436, "learning_rate": 4.598385797935368e-05, "loss": 0.1639, "reward": 2.3251513242721558, "reward_std": 2.639124870300293, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.616198182106018, "rewards/no_repetition_reward_func": -0.2910469025373459, "rewards/verse_reward_func": 0.0, "step": 1654 }, { "completion_length": 231.890625, "epoch": 13.24, "grad_norm": 2.53125, "kl": 3.2347965240478516, "learning_rate": 4.597626483278625e-05, "loss": 0.1294, "reward": 2.2281484603881836, "reward_std": 2.8726131916046143, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.52078378200531, "rewards/no_repetition_reward_func": -0.2691977918148041, "rewards/verse_reward_func": -0.0234375, "step": 1655 }, { "completion_length": 254.1875, "epoch": 13.248, "grad_norm": 2.328125, "kl": 3.27033531665802, "learning_rate": 4.596866514312967e-05, "loss": 0.1308, "reward": 2.2200883626937866, "reward_std": 3.063509941101074, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5093235969543457, "rewards/no_repetition_reward_func": -0.28923527896404266, "rewards/verse_reward_func": 0.0, "step": 1656 }, { "completion_length": 249.578125, "epoch": 13.256, "grad_norm": 2.65625, "kl": 4.02722954750061, "learning_rate": 4.596105891275449e-05, "loss": 0.1611, "reward": 1.33786541223526, "reward_std": 2.3921438455581665, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5888969898223877, "rewards/no_repetition_reward_func": -0.2432190403342247, "rewards/verse_reward_func": -0.0078125, "step": 1657 }, { "completion_length": 254.75, "epoch": 13.264, "grad_norm": 4.46875, "kl": 1.890956699848175, "learning_rate": 4.5953446144033316e-05, "loss": 0.0756, "reward": 2.864677667617798, "reward_std": 3.1819498538970947, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2125309705734253, "rewards/no_repetition_reward_func": -0.34004098176956177, "rewards/verse_reward_func": -0.0078125, "step": 1658 }, { "completion_length": 240.515625, "epoch": 13.272, "grad_norm": 3.15625, "kl": 2.27107834815979, "learning_rate": 4.594582683934078e-05, "loss": 0.0908, "reward": 1.8848015666007996, "reward_std": 2.891184449195862, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1999140977859497, "rewards/no_repetition_reward_func": -0.29948751628398895, "rewards/verse_reward_func": -0.015625, "step": 1659 }, { "completion_length": 250.046875, "epoch": 13.28, "grad_norm": 4.09375, "kl": 2.304242253303528, "learning_rate": 4.593820100105355e-05, "loss": 0.0922, "reward": 2.423457384109497, "reward_std": 2.8345847129821777, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7180817127227783, "rewards/no_repetition_reward_func": -0.2868116945028305, "rewards/verse_reward_func": -0.0078125, "step": 1660 }, { "completion_length": 244.421875, "epoch": 13.288, "grad_norm": 3.765625, "kl": 2.9295694828033447, "learning_rate": 4.593056863155034e-05, "loss": 0.1172, "reward": 1.49612295627594, "reward_std": 2.051497220993042, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7474604845046997, "rewards/no_repetition_reward_func": -0.2279001995921135, "rewards/verse_reward_func": -0.0234375, "step": 1661 }, { "completion_length": 249.953125, "epoch": 13.296, "grad_norm": 4.0, "kl": 2.191919982433319, "learning_rate": 4.5922929733211926e-05, "loss": 0.0877, "reward": 2.398244857788086, "reward_std": 2.8106894493103027, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6851630210876465, "rewards/no_repetition_reward_func": -0.27910567820072174, "rewards/verse_reward_func": -0.0078125, "step": 1662 }, { "completion_length": 247.3125, "epoch": 13.304, "grad_norm": 2.703125, "kl": 2.7602556943893433, "learning_rate": 4.591528430842107e-05, "loss": 0.1104, "reward": 1.9511936902999878, "reward_std": 2.6085662841796875, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2031731605529785, "rewards/no_repetition_reward_func": -0.24416686594486237, "rewards/verse_reward_func": -0.0078125, "step": 1663 }, { "completion_length": 253.890625, "epoch": 13.312, "grad_norm": 3.078125, "kl": 3.0297396183013916, "learning_rate": 4.59076323595626e-05, "loss": 0.1212, "reward": 1.825307548046112, "reward_std": 2.6876723766326904, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.08513343334198, "rewards/no_repetition_reward_func": -0.2598258703947067, "rewards/verse_reward_func": 0.0, "step": 1664 }, { "completion_length": 249.09375, "epoch": 13.32, "grad_norm": 1.4296875, "kl": 2.9390772581100464, "learning_rate": 4.589997388902338e-05, "loss": 0.1176, "reward": 1.669019103050232, "reward_std": 2.615586757659912, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.953645646572113, "rewards/no_repetition_reward_func": -0.2768140882253647, "rewards/verse_reward_func": -0.0078125, "step": 1665 }, { "completion_length": 253.984375, "epoch": 13.328, "grad_norm": 4.125, "kl": 5.091781854629517, "learning_rate": 4.589230889919232e-05, "loss": 0.2037, "reward": 1.6045320630073547, "reward_std": 2.469750165939331, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8994295597076416, "rewards/no_repetition_reward_func": -0.27927258610725403, "rewards/verse_reward_func": -0.015625, "step": 1666 }, { "completion_length": 246.140625, "epoch": 13.336, "grad_norm": 1.1640625, "kl": 3.717679977416992, "learning_rate": 4.5884637392460314e-05, "loss": 0.1487, "reward": 2.0192466974258423, "reward_std": 2.7767070531845093, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.265177011489868, "rewards/no_repetition_reward_func": -0.24593020975589752, "rewards/verse_reward_func": 0.0, "step": 1667 }, { "completion_length": 245.125, "epoch": 13.344, "grad_norm": 2.1875, "kl": 2.7540762424468994, "learning_rate": 4.5876959371220344e-05, "loss": 0.1102, "reward": 2.1455233097076416, "reward_std": 2.74330997467041, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4195228815078735, "rewards/no_repetition_reward_func": -0.2739996165037155, "rewards/verse_reward_func": 0.0, "step": 1668 }, { "completion_length": 245.1875, "epoch": 13.352, "grad_norm": 2.546875, "kl": 3.675679564476013, "learning_rate": 4.5869274837867394e-05, "loss": 0.147, "reward": 1.9372786283493042, "reward_std": 2.281585931777954, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1972174048423767, "rewards/no_repetition_reward_func": -0.2521262913942337, "rewards/verse_reward_func": -0.0078125, "step": 1669 }, { "completion_length": 243.578125, "epoch": 13.36, "grad_norm": 1.5390625, "kl": 4.090217471122742, "learning_rate": 4.586158379479848e-05, "loss": 0.1636, "reward": 1.9523930549621582, "reward_std": 2.645199418067932, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2253633737564087, "rewards/no_repetition_reward_func": -0.2651579678058624, "rewards/verse_reward_func": -0.0078125, "step": 1670 }, { "completion_length": 233.984375, "epoch": 13.368, "grad_norm": 2.0625, "kl": 4.306548595428467, "learning_rate": 4.585388624441267e-05, "loss": 0.1723, "reward": 1.7137166261672974, "reward_std": 2.4620689153671265, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.925137460231781, "rewards/no_repetition_reward_func": -0.20360829681158066, "rewards/verse_reward_func": -0.0078125, "step": 1671 }, { "completion_length": 250.625, "epoch": 13.376, "grad_norm": 1.8515625, "kl": 4.394012928009033, "learning_rate": 4.5846182189111035e-05, "loss": 0.1758, "reward": 2.1364643573760986, "reward_std": 2.884698271751404, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3927210569381714, "rewards/no_repetition_reward_func": -0.24844422936439514, "rewards/verse_reward_func": -0.0078125, "step": 1672 }, { "completion_length": 252.96875, "epoch": 13.384, "grad_norm": 1.9609375, "kl": 3.8197368383407593, "learning_rate": 4.58384716312967e-05, "loss": 0.1528, "reward": 1.890413522720337, "reward_std": 2.580256223678589, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.177826225757599, "rewards/no_repetition_reward_func": -0.27960020303726196, "rewards/verse_reward_func": -0.0078125, "step": 1673 }, { "completion_length": 256.0, "epoch": 13.392, "grad_norm": 4.125, "kl": 4.612240672111511, "learning_rate": 4.583075457337479e-05, "loss": 0.1845, "reward": 2.1055628061294556, "reward_std": 2.698671817779541, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3713635206222534, "rewards/no_repetition_reward_func": -0.2658007964491844, "rewards/verse_reward_func": 0.0, "step": 1674 }, { "completion_length": 252.0625, "epoch": 13.4, "grad_norm": 2.328125, "kl": 4.913311243057251, "learning_rate": 4.5823031017752485e-05, "loss": 0.1965, "reward": 1.7545798420906067, "reward_std": 2.6858019828796387, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0334397554397583, "rewards/no_repetition_reward_func": -0.271047480404377, "rewards/verse_reward_func": -0.0078125, "step": 1675 }, { "completion_length": 249.65625, "epoch": 13.408, "grad_norm": 3.625, "kl": 4.61419939994812, "learning_rate": 4.581530096683898e-05, "loss": 0.1846, "reward": 1.6683043241500854, "reward_std": 2.665731191635132, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9466273784637451, "rewards/no_repetition_reward_func": -0.2626979649066925, "rewards/verse_reward_func": -0.015625, "step": 1676 }, { "completion_length": 255.15625, "epoch": 13.416, "grad_norm": 2.1875, "kl": 3.716050386428833, "learning_rate": 4.580756442304549e-05, "loss": 0.1486, "reward": 2.107650876045227, "reward_std": 2.581113576889038, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3446531295776367, "rewards/no_repetition_reward_func": -0.23700212687253952, "rewards/verse_reward_func": 0.0, "step": 1677 }, { "completion_length": 251.953125, "epoch": 13.424, "grad_norm": 3.046875, "kl": 3.123883008956909, "learning_rate": 4.579982138878527e-05, "loss": 0.125, "reward": 1.970259428024292, "reward_std": 2.273935079574585, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2581133246421814, "rewards/no_repetition_reward_func": -0.280041441321373, "rewards/verse_reward_func": -0.0078125, "step": 1678 }, { "completion_length": 243.546875, "epoch": 13.432, "grad_norm": 1.734375, "kl": 3.455186605453491, "learning_rate": 4.579207186647357e-05, "loss": 0.1382, "reward": 2.1758108139038086, "reward_std": 2.6128307580947876, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4179106950759888, "rewards/no_repetition_reward_func": -0.24209962785243988, "rewards/verse_reward_func": 0.0, "step": 1679 }, { "completion_length": 248.84375, "epoch": 13.44, "grad_norm": 2.390625, "kl": 2.8989177942276, "learning_rate": 4.5784315858527715e-05, "loss": 0.116, "reward": 2.0673792362213135, "reward_std": 2.9206875562667847, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3770468831062317, "rewards/no_repetition_reward_func": -0.29404258728027344, "rewards/verse_reward_func": -0.015625, "step": 1680 }, { "completion_length": 256.0, "epoch": 13.448, "grad_norm": 2.890625, "kl": 2.920210123062134, "learning_rate": 4.5776553367367e-05, "loss": 0.1168, "reward": 1.8380903005599976, "reward_std": 2.220283627510071, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1026394367218018, "rewards/no_repetition_reward_func": -0.2567365765571594, "rewards/verse_reward_func": -0.0078125, "step": 1681 }, { "completion_length": 253.171875, "epoch": 13.456, "grad_norm": 3.671875, "kl": 2.912128210067749, "learning_rate": 4.576878439541278e-05, "loss": 0.1165, "reward": 2.4390889406204224, "reward_std": 3.1681156158447266, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7189290523529053, "rewards/no_repetition_reward_func": -0.2720276266336441, "rewards/verse_reward_func": -0.0078125, "step": 1682 }, { "completion_length": 256.0, "epoch": 13.464, "grad_norm": 2.734375, "kl": 2.9727033376693726, "learning_rate": 4.57610089450884e-05, "loss": 0.1189, "reward": 1.7515432238578796, "reward_std": 2.409308075904846, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9706501364707947, "rewards/no_repetition_reward_func": -0.21910694986581802, "rewards/verse_reward_func": 0.0, "step": 1683 }, { "completion_length": 250.359375, "epoch": 13.472, "grad_norm": 1.9453125, "kl": 2.5901741981506348, "learning_rate": 4.575322701881926e-05, "loss": 0.1036, "reward": 2.5479485988616943, "reward_std": 2.8928229808807373, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.836195707321167, "rewards/no_repetition_reward_func": -0.2882471829652786, "rewards/verse_reward_func": 0.0, "step": 1684 }, { "completion_length": 249.90625, "epoch": 13.48, "grad_norm": 2.96875, "kl": 2.5362844467163086, "learning_rate": 4.574543861903274e-05, "loss": 0.1015, "reward": 3.3165457248687744, "reward_std": 3.2462618350982666, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.62071692943573, "rewards/no_repetition_reward_func": -0.2963586449623108, "rewards/verse_reward_func": -0.0078125, "step": 1685 }, { "completion_length": 251.875, "epoch": 13.488, "grad_norm": 1.96875, "kl": 3.687262535095215, "learning_rate": 4.5737643748158295e-05, "loss": 0.1475, "reward": 2.3890390396118164, "reward_std": 3.0588302612304688, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6996114253997803, "rewards/no_repetition_reward_func": -0.2871347963809967, "rewards/verse_reward_func": -0.0234375, "step": 1686 }, { "completion_length": 253.0, "epoch": 13.496, "grad_norm": 1.984375, "kl": 2.8730051517486572, "learning_rate": 4.5729842408627334e-05, "loss": 0.1149, "reward": 2.4320584535598755, "reward_std": 2.6369214057922363, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7179629802703857, "rewards/no_repetition_reward_func": -0.28590449690818787, "rewards/verse_reward_func": 0.0, "step": 1687 }, { "completion_length": 249.6875, "epoch": 13.504, "grad_norm": 3.375, "kl": 3.33628249168396, "learning_rate": 4.572203460287333e-05, "loss": 0.1335, "reward": 3.1800918579101562, "reward_std": 3.1374958753585815, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.500280976295471, "rewards/no_repetition_reward_func": -0.32018908858299255, "rewards/verse_reward_func": 0.0, "step": 1688 }, { "completion_length": 249.484375, "epoch": 13.512, "grad_norm": 5.4375, "kl": 4.854228377342224, "learning_rate": 4.5714220333331756e-05, "loss": 0.1942, "reward": 2.2792898416519165, "reward_std": 2.876086950302124, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.569337248802185, "rewards/no_repetition_reward_func": -0.2822348028421402, "rewards/verse_reward_func": -0.0078125, "step": 1689 }, { "completion_length": 248.390625, "epoch": 13.52, "grad_norm": 2.59375, "kl": 4.985257863998413, "learning_rate": 4.5706399602440106e-05, "loss": 0.1994, "reward": 2.2062944173812866, "reward_std": 2.965130090713501, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.466514825820923, "rewards/no_repetition_reward_func": -0.26022032648324966, "rewards/verse_reward_func": 0.0, "step": 1690 }, { "completion_length": 246.75, "epoch": 13.528, "grad_norm": 4.53125, "kl": 5.858150482177734, "learning_rate": 4.569857241263788e-05, "loss": 0.2343, "reward": 2.046017646789551, "reward_std": 2.9357162714004517, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3642853498458862, "rewards/no_repetition_reward_func": -0.29483023285865784, "rewards/verse_reward_func": -0.0234375, "step": 1691 }, { "completion_length": 252.640625, "epoch": 13.536, "grad_norm": 1.8046875, "kl": 3.9924211502075195, "learning_rate": 4.56907387663666e-05, "loss": 0.1597, "reward": 2.9323856830596924, "reward_std": 2.8765382766723633, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.210922122001648, "rewards/no_repetition_reward_func": -0.2629115432500839, "rewards/verse_reward_func": -0.015625, "step": 1692 }, { "completion_length": 239.921875, "epoch": 13.544, "grad_norm": 3.140625, "kl": 4.040839433670044, "learning_rate": 4.568289866606981e-05, "loss": 0.1616, "reward": 2.4770607948303223, "reward_std": 2.93771755695343, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7284144163131714, "rewards/no_repetition_reward_func": -0.2357284501194954, "rewards/verse_reward_func": -0.015625, "step": 1693 }, { "completion_length": 254.0, "epoch": 13.552, "grad_norm": 8.125, "kl": 5.71073842048645, "learning_rate": 4.567505211419305e-05, "loss": 0.2284, "reward": 2.1465721130371094, "reward_std": 2.9691349267959595, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.448259711265564, "rewards/no_repetition_reward_func": -0.2782500237226486, "rewards/verse_reward_func": -0.0234375, "step": 1694 }, { "completion_length": 249.4375, "epoch": 13.56, "grad_norm": 2.671875, "kl": 3.4419909715652466, "learning_rate": 4.566719911318389e-05, "loss": 0.1377, "reward": 2.3095489740371704, "reward_std": 2.8382492065429688, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.60455858707428, "rewards/no_repetition_reward_func": -0.29500965774059296, "rewards/verse_reward_func": 0.0, "step": 1695 }, { "completion_length": 250.21875, "epoch": 13.568, "grad_norm": 2.828125, "kl": 3.963308334350586, "learning_rate": 4.565933966549189e-05, "loss": 0.1585, "reward": 2.511783182621002, "reward_std": 2.9961788654327393, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.804235100746155, "rewards/no_repetition_reward_func": -0.29245200753211975, "rewards/verse_reward_func": 0.0, "step": 1696 }, { "completion_length": 243.921875, "epoch": 13.576, "grad_norm": 2.390625, "kl": 4.561949729919434, "learning_rate": 4.565147377356864e-05, "loss": 0.1825, "reward": 2.621750593185425, "reward_std": 2.9461309909820557, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9188504219055176, "rewards/no_repetition_reward_func": -0.2971000224351883, "rewards/verse_reward_func": 0.0, "step": 1697 }, { "completion_length": 244.40625, "epoch": 13.584, "grad_norm": 4.71875, "kl": 4.803455352783203, "learning_rate": 4.5643601439867734e-05, "loss": 0.1921, "reward": 1.7896308898925781, "reward_std": 2.318223714828491, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.024853467941284, "rewards/no_repetition_reward_func": -0.22741014510393143, "rewards/verse_reward_func": -0.0078125, "step": 1698 }, { "completion_length": 249.265625, "epoch": 13.592, "grad_norm": 2.65625, "kl": 4.316707611083984, "learning_rate": 4.5635722666844775e-05, "loss": 0.1727, "reward": 2.1383265256881714, "reward_std": 2.6431469917297363, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.385954260826111, "rewards/no_repetition_reward_func": -0.23981529474258423, "rewards/verse_reward_func": -0.0078125, "step": 1699 }, { "completion_length": 252.203125, "epoch": 13.6, "grad_norm": 1.9453125, "kl": 3.676575779914856, "learning_rate": 4.562783745695738e-05, "loss": 0.1471, "reward": 2.0189965963363647, "reward_std": 2.4824658632278442, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.272496283054352, "rewards/no_repetition_reward_func": -0.24568723142147064, "rewards/verse_reward_func": -0.0078125, "step": 1700 }, { "completion_length": 249.171875, "epoch": 13.608, "grad_norm": 2.265625, "kl": 3.252159357070923, "learning_rate": 4.561994581266516e-05, "loss": 0.1301, "reward": 1.9005310535430908, "reward_std": 2.335246443748474, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1711507439613342, "rewards/no_repetition_reward_func": -0.26280713081359863, "rewards/verse_reward_func": -0.0078125, "step": 1701 }, { "completion_length": 250.40625, "epoch": 13.616, "grad_norm": 2.25, "kl": 3.690682888031006, "learning_rate": 4.561204773642974e-05, "loss": 0.1476, "reward": 2.368348717689514, "reward_std": 3.2491910457611084, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6428537368774414, "rewards/no_repetition_reward_func": -0.27450504899024963, "rewards/verse_reward_func": 0.0, "step": 1702 }, { "completion_length": 256.0, "epoch": 13.624, "grad_norm": 3.515625, "kl": 3.2182289361953735, "learning_rate": 4.560414323071477e-05, "loss": 0.1287, "reward": 2.3344736099243164, "reward_std": 3.180867910385132, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.645545721054077, "rewards/no_repetition_reward_func": -0.31107211112976074, "rewards/verse_reward_func": 0.0, "step": 1703 }, { "completion_length": 250.09375, "epoch": 13.632, "grad_norm": 3.28125, "kl": 3.1336575746536255, "learning_rate": 4.559623229798587e-05, "loss": 0.1253, "reward": 2.39687716960907, "reward_std": 2.7086052894592285, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6782991886138916, "rewards/no_repetition_reward_func": -0.2814221978187561, "rewards/verse_reward_func": 0.0, "step": 1704 }, { "completion_length": 244.875, "epoch": 13.64, "grad_norm": 2.40625, "kl": 3.867690086364746, "learning_rate": 4.558831494071069e-05, "loss": 0.1547, "reward": 1.6447088122367859, "reward_std": 2.454597592353821, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9202802181243896, "rewards/no_repetition_reward_func": -0.2677590250968933, "rewards/verse_reward_func": -0.0078125, "step": 1705 }, { "completion_length": 242.546875, "epoch": 13.648, "grad_norm": 3.921875, "kl": 3.0206483602523804, "learning_rate": 4.558039116135887e-05, "loss": 0.1208, "reward": 2.6449145078659058, "reward_std": 2.9817419052124023, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9256250858306885, "rewards/no_repetition_reward_func": -0.2728980630636215, "rewards/verse_reward_func": -0.0078125, "step": 1706 }, { "completion_length": 251.5625, "epoch": 13.656, "grad_norm": 3.5625, "kl": 2.8739278316497803, "learning_rate": 4.5572460962402075e-05, "loss": 0.115, "reward": 2.820172071456909, "reward_std": 3.2241469621658325, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0763334035873413, "rewards/no_repetition_reward_func": -0.25616148114204407, "rewards/verse_reward_func": 0.0, "step": 1707 }, { "completion_length": 256.0, "epoch": 13.664, "grad_norm": 2.53125, "kl": 3.924473762512207, "learning_rate": 4.556452434631395e-05, "loss": 0.157, "reward": 2.524057984352112, "reward_std": 2.7853387594223022, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.851511597633362, "rewards/no_repetition_reward_func": -0.31182868778705597, "rewards/verse_reward_func": -0.015625, "step": 1708 }, { "completion_length": 249.765625, "epoch": 13.672, "grad_norm": 2.421875, "kl": 3.1792197227478027, "learning_rate": 4.555658131557015e-05, "loss": 0.1272, "reward": 2.5503060817718506, "reward_std": 2.17777281999588, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8224538564682007, "rewards/no_repetition_reward_func": -0.2721479535102844, "rewards/verse_reward_func": 0.0, "step": 1709 }, { "completion_length": 245.40625, "epoch": 13.68, "grad_norm": 1.6640625, "kl": 3.845124840736389, "learning_rate": 4.5548631872648326e-05, "loss": 0.1538, "reward": 2.4357234239578247, "reward_std": 2.9422730207443237, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.72645103931427, "rewards/no_repetition_reward_func": -0.290727823972702, "rewards/verse_reward_func": 0.0, "step": 1710 }, { "completion_length": 252.46875, "epoch": 13.688, "grad_norm": 2.234375, "kl": 4.8145105838775635, "learning_rate": 4.5540676020028145e-05, "loss": 0.1926, "reward": 2.3605563640594482, "reward_std": 2.8264135122299194, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.603538155555725, "rewards/no_repetition_reward_func": -0.24298188090324402, "rewards/verse_reward_func": 0.0, "step": 1711 }, { "completion_length": 247.171875, "epoch": 13.696, "grad_norm": 1.953125, "kl": 3.7356691360473633, "learning_rate": 4.553271376019125e-05, "loss": 0.1494, "reward": 2.6988784074783325, "reward_std": 3.0865734815597534, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.999395251274109, "rewards/no_repetition_reward_func": -0.3005170673131943, "rewards/verse_reward_func": 0.0, "step": 1712 }, { "completion_length": 253.59375, "epoch": 13.704, "grad_norm": 2.125, "kl": 3.7914953231811523, "learning_rate": 4.55247450956213e-05, "loss": 0.1517, "reward": 2.846498131752014, "reward_std": 3.1401236057281494, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.148310899734497, "rewards/no_repetition_reward_func": -0.3018127679824829, "rewards/verse_reward_func": 0.0, "step": 1713 }, { "completion_length": 252.21875, "epoch": 13.712, "grad_norm": 3.4375, "kl": 4.760542631149292, "learning_rate": 4.5516770028803954e-05, "loss": 0.1904, "reward": 2.3441882133483887, "reward_std": 2.8038140535354614, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6340219974517822, "rewards/no_repetition_reward_func": -0.2820213586091995, "rewards/verse_reward_func": -0.0078125, "step": 1714 }, { "completion_length": 251.59375, "epoch": 13.72, "grad_norm": 4.03125, "kl": 5.601217269897461, "learning_rate": 4.550878856222685e-05, "loss": 0.224, "reward": 2.189577102661133, "reward_std": 2.5945372581481934, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4811770915985107, "rewards/no_repetition_reward_func": -0.2837876230478287, "rewards/verse_reward_func": -0.0078125, "step": 1715 }, { "completion_length": 253.671875, "epoch": 13.728, "grad_norm": 2.75, "kl": 3.9383225440979004, "learning_rate": 4.5500800698379624e-05, "loss": 0.1575, "reward": 2.9934273958206177, "reward_std": 3.3686869144439697, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.319751262664795, "rewards/no_repetition_reward_func": -0.31851159036159515, "rewards/verse_reward_func": -0.0078125, "step": 1716 }, { "completion_length": 245.96875, "epoch": 13.736, "grad_norm": 4.09375, "kl": 5.2617785930633545, "learning_rate": 4.5492806439753935e-05, "loss": 0.2105, "reward": 2.148388624191284, "reward_std": 3.070664167404175, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.445790410041809, "rewards/no_repetition_reward_func": -0.28958944976329803, "rewards/verse_reward_func": -0.0078125, "step": 1717 }, { "completion_length": 253.09375, "epoch": 13.744, "grad_norm": 5.3125, "kl": 4.755767345428467, "learning_rate": 4.548480578884341e-05, "loss": 0.1902, "reward": 2.819474458694458, "reward_std": 2.9494788646698, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0912928581237793, "rewards/no_repetition_reward_func": -0.2718184143304825, "rewards/verse_reward_func": 0.0, "step": 1718 }, { "completion_length": 253.109375, "epoch": 13.752, "grad_norm": 1.21875, "kl": 4.4694178104400635, "learning_rate": 4.547679874814368e-05, "loss": 0.1788, "reward": 2.5335140228271484, "reward_std": 3.0266380310058594, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.850054621696472, "rewards/no_repetition_reward_func": -0.30091556906700134, "rewards/verse_reward_func": -0.015625, "step": 1719 }, { "completion_length": 247.15625, "epoch": 13.76, "grad_norm": 2.234375, "kl": 4.659998178482056, "learning_rate": 4.5468785320152365e-05, "loss": 0.1864, "reward": 2.5890671014785767, "reward_std": 3.149357795715332, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8977246284484863, "rewards/no_repetition_reward_func": -0.30865731835365295, "rewards/verse_reward_func": 0.0, "step": 1720 }, { "completion_length": 250.765625, "epoch": 13.768, "grad_norm": 4.21875, "kl": 5.192217826843262, "learning_rate": 4.5460765507369084e-05, "loss": 0.2077, "reward": 2.184368848800659, "reward_std": 2.7685853242874146, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4373319149017334, "rewards/no_repetition_reward_func": -0.2451508566737175, "rewards/verse_reward_func": -0.0078125, "step": 1721 }, { "completion_length": 250.734375, "epoch": 13.776, "grad_norm": 3.734375, "kl": 3.7204320430755615, "learning_rate": 4.5452739312295436e-05, "loss": 0.1488, "reward": 2.895632743835449, "reward_std": 2.925487995147705, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1949998140335083, "rewards/no_repetition_reward_func": -0.29155467450618744, "rewards/verse_reward_func": -0.0078125, "step": 1722 }, { "completion_length": 254.6875, "epoch": 13.784, "grad_norm": 3.71875, "kl": 3.1026341915130615, "learning_rate": 4.5444706737435014e-05, "loss": 0.1241, "reward": 2.736980438232422, "reward_std": 3.114777445793152, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.059443950653076, "rewards/no_repetition_reward_func": -0.3068384826183319, "rewards/verse_reward_func": -0.015625, "step": 1723 }, { "completion_length": 251.859375, "epoch": 13.792, "grad_norm": 2.703125, "kl": 4.380930185317993, "learning_rate": 4.543666778529342e-05, "loss": 0.1752, "reward": 2.080538511276245, "reward_std": 2.5924755334854126, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.320311188697815, "rewards/no_repetition_reward_func": -0.23977258056402206, "rewards/verse_reward_func": 0.0, "step": 1724 }, { "completion_length": 252.390625, "epoch": 13.8, "grad_norm": 2.484375, "kl": 4.093225717544556, "learning_rate": 4.542862245837821e-05, "loss": 0.1637, "reward": 2.123079776763916, "reward_std": 3.03855037689209, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4214353561401367, "rewards/no_repetition_reward_func": -0.2905430793762207, "rewards/verse_reward_func": -0.0078125, "step": 1725 }, { "completion_length": 253.1875, "epoch": 13.808, "grad_norm": 3.703125, "kl": 2.217728614807129, "learning_rate": 4.542057075919897e-05, "loss": 0.0887, "reward": 2.769731283187866, "reward_std": 3.0146028995513916, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0703046321868896, "rewards/no_repetition_reward_func": -0.3005734533071518, "rewards/verse_reward_func": 0.0, "step": 1726 }, { "completion_length": 248.859375, "epoch": 13.816, "grad_norm": 5.90625, "kl": 2.4602530002593994, "learning_rate": 4.5412512690267246e-05, "loss": 0.0984, "reward": 3.4311505556106567, "reward_std": 3.3317021131515503, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.765580892562866, "rewards/no_repetition_reward_func": -0.3266178369522095, "rewards/verse_reward_func": -0.0078125, "step": 1727 }, { "completion_length": 256.0, "epoch": 13.824, "grad_norm": 1.9921875, "kl": 3.257045865058899, "learning_rate": 4.540444825409657e-05, "loss": 0.1303, "reward": 2.1933799386024475, "reward_std": 2.644131660461426, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4187241792678833, "rewards/no_repetition_reward_func": -0.22534435987472534, "rewards/verse_reward_func": 0.0, "step": 1728 }, { "completion_length": 252.421875, "epoch": 13.832, "grad_norm": 4.21875, "kl": 2.6965590715408325, "learning_rate": 4.5396377453202466e-05, "loss": 0.1079, "reward": 3.3202112913131714, "reward_std": 3.0955992937088013, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.6063971519470215, "rewards/no_repetition_reward_func": -0.2861858010292053, "rewards/verse_reward_func": 0.0, "step": 1729 }, { "completion_length": 247.515625, "epoch": 13.84, "grad_norm": 2.953125, "kl": 3.543998956680298, "learning_rate": 4.5388300290102456e-05, "loss": 0.1418, "reward": 2.5101767778396606, "reward_std": 3.0376522541046143, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7851279973983765, "rewards/no_repetition_reward_func": -0.26713868975639343, "rewards/verse_reward_func": -0.0078125, "step": 1730 }, { "completion_length": 247.640625, "epoch": 13.848, "grad_norm": 1.71875, "kl": 3.884377598762512, "learning_rate": 4.538021676731603e-05, "loss": 0.1554, "reward": 2.477137863636017, "reward_std": 2.8303321599960327, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7575374841690063, "rewards/no_repetition_reward_func": -0.2647745758295059, "rewards/verse_reward_func": -0.015625, "step": 1731 }, { "completion_length": 255.515625, "epoch": 13.856, "grad_norm": 3.6875, "kl": 4.425583124160767, "learning_rate": 4.5372126887364655e-05, "loss": 0.177, "reward": 2.602999448776245, "reward_std": 3.347878336906433, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.927147388458252, "rewards/no_repetition_reward_func": -0.3241479843854904, "rewards/verse_reward_func": 0.0, "step": 1732 }, { "completion_length": 242.96875, "epoch": 13.864, "grad_norm": 2.671875, "kl": 4.936306715011597, "learning_rate": 4.536403065277182e-05, "loss": 0.1975, "reward": 2.4140288829803467, "reward_std": 3.2588449716567993, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7317023277282715, "rewards/no_repetition_reward_func": -0.3020485192537308, "rewards/verse_reward_func": -0.015625, "step": 1733 }, { "completion_length": 248.703125, "epoch": 13.872, "grad_norm": 3.09375, "kl": 4.235710144042969, "learning_rate": 4.535592806606294e-05, "loss": 0.1694, "reward": 2.8302289247512817, "reward_std": 2.814329981803894, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.103149175643921, "rewards/no_repetition_reward_func": -0.2729201912879944, "rewards/verse_reward_func": 0.0, "step": 1734 }, { "completion_length": 254.734375, "epoch": 13.88, "grad_norm": 3.4375, "kl": 5.598885536193848, "learning_rate": 4.534781912976546e-05, "loss": 0.224, "reward": 1.9099829196929932, "reward_std": 2.414042592048645, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.18095600605011, "rewards/no_repetition_reward_func": -0.27097301185131073, "rewards/verse_reward_func": 0.0, "step": 1735 }, { "completion_length": 251.0, "epoch": 13.888, "grad_norm": 8.125, "kl": 6.6703877449035645, "learning_rate": 4.533970384640877e-05, "loss": 0.2668, "reward": 2.479905605316162, "reward_std": 3.08694064617157, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7897177934646606, "rewards/no_repetition_reward_func": -0.30199968814849854, "rewards/verse_reward_func": -0.0078125, "step": 1736 }, { "completion_length": 243.609375, "epoch": 13.896, "grad_norm": 2.96875, "kl": 4.391346216201782, "learning_rate": 4.533158221852427e-05, "loss": 0.1757, "reward": 2.330491781234741, "reward_std": 2.676068663597107, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6405001878738403, "rewards/no_repetition_reward_func": -0.3100084364414215, "rewards/verse_reward_func": 0.0, "step": 1737 }, { "completion_length": 251.15625, "epoch": 13.904, "grad_norm": 2.109375, "kl": 5.66410493850708, "learning_rate": 4.5323454248645324e-05, "loss": 0.2266, "reward": 2.1134159564971924, "reward_std": 3.0524508953094482, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3920440673828125, "rewards/no_repetition_reward_func": -0.2786279618740082, "rewards/verse_reward_func": 0.0, "step": 1738 }, { "completion_length": 249.75, "epoch": 13.912, "grad_norm": 3.375, "kl": 5.471651077270508, "learning_rate": 4.531531993930727e-05, "loss": 0.2189, "reward": 2.0480798482894897, "reward_std": 2.84161913394928, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2994332313537598, "rewards/no_repetition_reward_func": -0.2513531967997551, "rewards/verse_reward_func": 0.0, "step": 1739 }, { "completion_length": 244.921875, "epoch": 13.92, "grad_norm": 4.5, "kl": 5.318350553512573, "learning_rate": 4.530717929304743e-05, "loss": 0.2127, "reward": 1.902035653591156, "reward_std": 2.872814178466797, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.156486988067627, "rewards/no_repetition_reward_func": -0.24663883447647095, "rewards/verse_reward_func": -0.0078125, "step": 1740 }, { "completion_length": 245.75, "epoch": 13.928, "grad_norm": 2.96875, "kl": 3.037355661392212, "learning_rate": 4.529903231240511e-05, "loss": 0.1215, "reward": 3.0005966424942017, "reward_std": 2.807459592819214, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.326305866241455, "rewards/no_repetition_reward_func": -0.3022715747356415, "rewards/verse_reward_func": -0.0234375, "step": 1741 }, { "completion_length": 250.734375, "epoch": 13.936, "grad_norm": 4.625, "kl": 2.9252469539642334, "learning_rate": 4.529087899992156e-05, "loss": 0.117, "reward": 2.8197948932647705, "reward_std": 3.0412325859069824, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1220489740371704, "rewards/no_repetition_reward_func": -0.2866288274526596, "rewards/verse_reward_func": -0.015625, "step": 1742 }, { "completion_length": 249.34375, "epoch": 13.943999999999999, "grad_norm": 2.9375, "kl": 2.985764980316162, "learning_rate": 4.5282719358140056e-05, "loss": 0.1194, "reward": 2.9705060720443726, "reward_std": 3.149346947669983, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2712008953094482, "rewards/no_repetition_reward_func": -0.3006947487592697, "rewards/verse_reward_func": 0.0, "step": 1743 }, { "completion_length": 244.265625, "epoch": 13.952, "grad_norm": 1.875, "kl": 3.829693555831909, "learning_rate": 4.52745533896058e-05, "loss": 0.1532, "reward": 1.7140703201293945, "reward_std": 2.547049880027771, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9531445503234863, "rewards/no_repetition_reward_func": -0.23907408863306046, "rewards/verse_reward_func": 0.0, "step": 1744 }, { "completion_length": 244.046875, "epoch": 13.96, "grad_norm": 4.71875, "kl": 3.073602080345154, "learning_rate": 4.5266381096866e-05, "loss": 0.1229, "reward": 2.884386420249939, "reward_std": 3.076311945915222, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1767771244049072, "rewards/no_repetition_reward_func": -0.2845783531665802, "rewards/verse_reward_func": -0.0078125, "step": 1745 }, { "completion_length": 255.09375, "epoch": 13.968, "grad_norm": 2.3125, "kl": 3.0709869861602783, "learning_rate": 4.525820248246982e-05, "loss": 0.1228, "reward": 1.8666207790374756, "reward_std": 2.5087666511535645, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1711609959602356, "rewards/no_repetition_reward_func": -0.2967277020215988, "rewards/verse_reward_func": -0.0078125, "step": 1746 }, { "completion_length": 240.078125, "epoch": 13.975999999999999, "grad_norm": 1.7421875, "kl": 2.931776285171509, "learning_rate": 4.5250017548968404e-05, "loss": 0.1173, "reward": 1.578087329864502, "reward_std": 2.0256584882736206, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.802833378314972, "rewards/no_repetition_reward_func": -0.22474610805511475, "rewards/verse_reward_func": 0.0, "step": 1747 }, { "completion_length": 253.0, "epoch": 13.984, "grad_norm": 3.515625, "kl": 4.339308381080627, "learning_rate": 4.524182629891486e-05, "loss": 0.1736, "reward": 1.5318278074264526, "reward_std": 2.624584913253784, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7990872859954834, "rewards/no_repetition_reward_func": -0.2672593966126442, "rewards/verse_reward_func": 0.0, "step": 1748 }, { "completion_length": 250.59375, "epoch": 13.992, "grad_norm": 4.78125, "kl": 3.379760980606079, "learning_rate": 4.523362873486427e-05, "loss": 0.1352, "reward": 2.4932992458343506, "reward_std": 3.3593684434890747, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7906527519226074, "rewards/no_repetition_reward_func": -0.297353595495224, "rewards/verse_reward_func": 0.0, "step": 1749 }, { "completion_length": 256.0, "epoch": 14.0, "grad_norm": 3.453125, "kl": 3.272552013397217, "learning_rate": 4.522542485937369e-05, "loss": 0.1309, "reward": 2.8560644388198853, "reward_std": 3.2426209449768066, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.147520899772644, "rewards/no_repetition_reward_func": -0.29145656526088715, "rewards/verse_reward_func": 0.0, "step": 1750 }, { "completion_length": 249.421875, "epoch": 14.008, "grad_norm": 1.9609375, "kl": 3.511127233505249, "learning_rate": 4.521721467500213e-05, "loss": 0.1404, "reward": 2.2062907218933105, "reward_std": 2.5841275453567505, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4947898387908936, "rewards/no_repetition_reward_func": -0.2806866094470024, "rewards/verse_reward_func": -0.0078125, "step": 1751 }, { "completion_length": 253.40625, "epoch": 14.016, "grad_norm": 7.3125, "kl": 3.860497236251831, "learning_rate": 4.5208998184310596e-05, "loss": 0.1544, "reward": 1.1119602024555206, "reward_std": 2.1979626417160034, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3542629480361938, "rewards/no_repetition_reward_func": -0.2344902604818344, "rewards/verse_reward_func": -0.0078125, "step": 1752 }, { "completion_length": 247.46875, "epoch": 14.024, "grad_norm": 2.296875, "kl": 3.5282368659973145, "learning_rate": 4.5200775389862026e-05, "loss": 0.1411, "reward": 2.116565704345703, "reward_std": 2.9730846881866455, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.42029070854187, "rewards/no_repetition_reward_func": -0.29591263830661774, "rewards/verse_reward_func": -0.0078125, "step": 1753 }, { "completion_length": 243.625, "epoch": 14.032, "grad_norm": 4.34375, "kl": 3.0140492916107178, "learning_rate": 4.519254629422136e-05, "loss": 0.1206, "reward": 2.9423269033432007, "reward_std": 3.037073850631714, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2078553438186646, "rewards/no_repetition_reward_func": -0.25771602243185043, "rewards/verse_reward_func": -0.0078125, "step": 1754 }, { "completion_length": 248.65625, "epoch": 14.04, "grad_norm": 3.609375, "kl": 3.8083488941192627, "learning_rate": 4.5184310899955465e-05, "loss": 0.1523, "reward": 1.6654375195503235, "reward_std": 2.6326860189437866, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9194166660308838, "rewards/no_repetition_reward_func": -0.2539791837334633, "rewards/verse_reward_func": 0.0, "step": 1755 }, { "completion_length": 247.046875, "epoch": 14.048, "grad_norm": 3.015625, "kl": 3.1367238759994507, "learning_rate": 4.51760692096332e-05, "loss": 0.1255, "reward": 2.719232678413391, "reward_std": 2.8645386695861816, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.991205930709839, "rewards/no_repetition_reward_func": -0.2719734385609627, "rewards/verse_reward_func": 0.0, "step": 1756 }, { "completion_length": 242.15625, "epoch": 14.056, "grad_norm": 2.03125, "kl": 3.4890546798706055, "learning_rate": 4.516782122582538e-05, "loss": 0.1396, "reward": 2.391808032989502, "reward_std": 2.6707422733306885, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6774314641952515, "rewards/no_repetition_reward_func": -0.2778109461069107, "rewards/verse_reward_func": -0.0078125, "step": 1757 }, { "completion_length": 250.78125, "epoch": 14.064, "grad_norm": 1.890625, "kl": 3.417982816696167, "learning_rate": 4.5159566951104796e-05, "loss": 0.1367, "reward": 2.566420316696167, "reward_std": 2.5970778465270996, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8445847034454346, "rewards/no_repetition_reward_func": -0.2703518271446228, "rewards/verse_reward_func": -0.0078125, "step": 1758 }, { "completion_length": 247.0, "epoch": 14.072, "grad_norm": 3.640625, "kl": 4.252184867858887, "learning_rate": 4.5151306388046175e-05, "loss": 0.1701, "reward": 2.173549771308899, "reward_std": 2.8577747344970703, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.468281388282776, "rewards/no_repetition_reward_func": -0.2869190573692322, "rewards/verse_reward_func": -0.0078125, "step": 1759 }, { "completion_length": 246.078125, "epoch": 14.08, "grad_norm": 2.46875, "kl": 3.2648465633392334, "learning_rate": 4.5143039539226234e-05, "loss": 0.1306, "reward": 2.749457359313965, "reward_std": 2.8607685565948486, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.030969262123108, "rewards/no_repetition_reward_func": -0.28151196241378784, "rewards/verse_reward_func": 0.0, "step": 1760 }, { "completion_length": 249.1875, "epoch": 14.088, "grad_norm": 4.09375, "kl": 5.381285905838013, "learning_rate": 4.513476640722362e-05, "loss": 0.2153, "reward": 1.8941888213157654, "reward_std": 2.874802827835083, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1194307804107666, "rewards/no_repetition_reward_func": -0.22524191439151764, "rewards/verse_reward_func": 0.0, "step": 1761 }, { "completion_length": 252.515625, "epoch": 14.096, "grad_norm": 1.9375, "kl": 3.8091182708740234, "learning_rate": 4.512648699461897e-05, "loss": 0.1524, "reward": 2.621338725090027, "reward_std": 3.1074483394622803, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9212546348571777, "rewards/no_repetition_reward_func": -0.2921034246683121, "rewards/verse_reward_func": -0.0078125, "step": 1762 }, { "completion_length": 250.375, "epoch": 14.104, "grad_norm": 2.3125, "kl": 3.8519086837768555, "learning_rate": 4.511820130399485e-05, "loss": 0.1541, "reward": 2.6963549852371216, "reward_std": 2.904787063598633, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.984862208366394, "rewards/no_repetition_reward_func": -0.28069470822811127, "rewards/verse_reward_func": -0.0078125, "step": 1763 }, { "completion_length": 247.953125, "epoch": 14.112, "grad_norm": 2.703125, "kl": 4.279860734939575, "learning_rate": 4.510990933793583e-05, "loss": 0.1712, "reward": 2.3426941633224487, "reward_std": 2.7091991901397705, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.599573850631714, "rewards/no_repetition_reward_func": -0.25687965750694275, "rewards/verse_reward_func": 0.0, "step": 1764 }, { "completion_length": 241.765625, "epoch": 14.12, "grad_norm": 1.5546875, "kl": 3.727641224861145, "learning_rate": 4.510161109902837e-05, "loss": 0.1491, "reward": 2.312682271003723, "reward_std": 2.800874710083008, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5634900331497192, "rewards/no_repetition_reward_func": -0.2351827621459961, "rewards/verse_reward_func": -0.015625, "step": 1765 }, { "completion_length": 254.359375, "epoch": 14.128, "grad_norm": 2.015625, "kl": 4.129788756370544, "learning_rate": 4.509330658986095e-05, "loss": 0.1652, "reward": 2.8041939735412598, "reward_std": 2.951021909713745, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0776227712631226, "rewards/no_repetition_reward_func": -0.2734287232160568, "rewards/verse_reward_func": 0.0, "step": 1766 }, { "completion_length": 248.46875, "epoch": 14.136, "grad_norm": 2.84375, "kl": 3.74922251701355, "learning_rate": 4.508499581302398e-05, "loss": 0.15, "reward": 2.709437370300293, "reward_std": 3.1650904417037964, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.015303134918213, "rewards/no_repetition_reward_func": -0.2902405261993408, "rewards/verse_reward_func": -0.015625, "step": 1767 }, { "completion_length": 254.46875, "epoch": 14.144, "grad_norm": 1.8984375, "kl": 4.056035161018372, "learning_rate": 4.507667877110982e-05, "loss": 0.1622, "reward": 2.753948211669922, "reward_std": 2.535078525543213, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.075125813484192, "rewards/no_repetition_reward_func": -0.3211776316165924, "rewards/verse_reward_func": 0.0, "step": 1768 }, { "completion_length": 245.21875, "epoch": 14.152, "grad_norm": 2.859375, "kl": 2.8039227724075317, "learning_rate": 4.506835546671278e-05, "loss": 0.1122, "reward": 2.634944438934326, "reward_std": 3.1202389001846313, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.958567261695862, "rewards/no_repetition_reward_func": -0.30799782276153564, "rewards/verse_reward_func": -0.015625, "step": 1769 }, { "completion_length": 250.828125, "epoch": 14.16, "grad_norm": 1.53125, "kl": 3.7240785360336304, "learning_rate": 4.5060025902429174e-05, "loss": 0.149, "reward": 3.0668487548828125, "reward_std": 3.272927403450012, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3821563720703125, "rewards/no_repetition_reward_func": -0.31530751287937164, "rewards/verse_reward_func": 0.0, "step": 1770 }, { "completion_length": 252.03125, "epoch": 14.168, "grad_norm": 2.453125, "kl": 4.685540795326233, "learning_rate": 4.5051690080857176e-05, "loss": 0.1874, "reward": 2.2333303689956665, "reward_std": 2.933764934539795, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.489618420600891, "rewards/no_repetition_reward_func": -0.24847546219825745, "rewards/verse_reward_func": -0.0078125, "step": 1771 }, { "completion_length": 244.84375, "epoch": 14.176, "grad_norm": 1.7109375, "kl": 4.260264039039612, "learning_rate": 4.504334800459699e-05, "loss": 0.1704, "reward": 2.195716381072998, "reward_std": 2.469349980354309, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.451645016670227, "rewards/no_repetition_reward_func": -0.2481161206960678, "rewards/verse_reward_func": -0.0078125, "step": 1772 }, { "completion_length": 253.359375, "epoch": 14.184, "grad_norm": 4.0, "kl": 3.8924070596694946, "learning_rate": 4.5034999676250745e-05, "loss": 0.1557, "reward": 2.6566542387008667, "reward_std": 2.9220328330993652, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9827581644058228, "rewards/no_repetition_reward_func": -0.3261037766933441, "rewards/verse_reward_func": 0.0, "step": 1773 }, { "completion_length": 255.34375, "epoch": 14.192, "grad_norm": 2.4375, "kl": 3.6222591400146484, "learning_rate": 4.5026645098422515e-05, "loss": 0.1449, "reward": 3.194288730621338, "reward_std": 2.8832738399505615, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.51345431804657, "rewards/no_repetition_reward_func": -0.31916551291942596, "rewards/verse_reward_func": 0.0, "step": 1774 }, { "completion_length": 251.890625, "epoch": 14.2, "grad_norm": 3.328125, "kl": 5.780743598937988, "learning_rate": 4.5018284273718336e-05, "loss": 0.2312, "reward": 1.8158433437347412, "reward_std": 2.8015044927597046, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.101439654827118, "rewards/no_repetition_reward_func": -0.28559623658657074, "rewards/verse_reward_func": 0.0, "step": 1775 }, { "completion_length": 249.671875, "epoch": 14.208, "grad_norm": 2.921875, "kl": 4.990322589874268, "learning_rate": 4.5009917204746184e-05, "loss": 0.1996, "reward": 2.6285295486450195, "reward_std": 3.2578872442245483, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9236810207366943, "rewards/no_repetition_reward_func": -0.2873389720916748, "rewards/verse_reward_func": -0.0078125, "step": 1776 }, { "completion_length": 249.40625, "epoch": 14.216, "grad_norm": 4.3125, "kl": 5.751516342163086, "learning_rate": 4.5001543894115975e-05, "loss": 0.2301, "reward": 1.8228195905685425, "reward_std": 2.776708245277405, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.090874969959259, "rewards/no_repetition_reward_func": -0.2602430135011673, "rewards/verse_reward_func": -0.0078125, "step": 1777 }, { "completion_length": 252.296875, "epoch": 14.224, "grad_norm": 4.625, "kl": 5.727651834487915, "learning_rate": 4.499316434443959e-05, "loss": 0.2291, "reward": 1.036066472530365, "reward_std": 2.2351250648498535, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.2982123494148254, "rewards/no_repetition_reward_func": -0.25433337688446045, "rewards/verse_reward_func": -0.0078125, "step": 1778 }, { "completion_length": 244.8125, "epoch": 14.232, "grad_norm": 8.1875, "kl": 5.382858037948608, "learning_rate": 4.4984778558330844e-05, "loss": 0.2153, "reward": 2.6674057245254517, "reward_std": 3.117820143699646, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0089213848114014, "rewards/no_repetition_reward_func": -0.32589058578014374, "rewards/verse_reward_func": -0.015625, "step": 1779 }, { "completion_length": 251.125, "epoch": 14.24, "grad_norm": 3.53125, "kl": 3.867187976837158, "learning_rate": 4.4976386538405495e-05, "loss": 0.1547, "reward": 2.3696213960647583, "reward_std": 3.3224802017211914, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.665179967880249, "rewards/no_repetition_reward_func": -0.2955585718154907, "rewards/verse_reward_func": 0.0, "step": 1780 }, { "completion_length": 251.671875, "epoch": 14.248, "grad_norm": 2.390625, "kl": 4.053664922714233, "learning_rate": 4.496798828728126e-05, "loss": 0.1621, "reward": 2.5447016954421997, "reward_std": 3.196216106414795, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.816840410232544, "rewards/no_repetition_reward_func": -0.26432621479034424, "rewards/verse_reward_func": -0.0078125, "step": 1781 }, { "completion_length": 254.28125, "epoch": 14.256, "grad_norm": 1.5859375, "kl": 4.050326943397522, "learning_rate": 4.495958380757779e-05, "loss": 0.162, "reward": 2.638184666633606, "reward_std": 2.8058277368545532, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9243208169937134, "rewards/no_repetition_reward_func": -0.28613607585430145, "rewards/verse_reward_func": 0.0, "step": 1782 }, { "completion_length": 249.484375, "epoch": 14.264, "grad_norm": 1.796875, "kl": 3.9833984375, "learning_rate": 4.4951173101916675e-05, "loss": 0.1593, "reward": 2.3976532220840454, "reward_std": 2.7840806245803833, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6796776056289673, "rewards/no_repetition_reward_func": -0.28202445805072784, "rewards/verse_reward_func": 0.0, "step": 1783 }, { "completion_length": 248.40625, "epoch": 14.272, "grad_norm": 3.515625, "kl": 4.572150945663452, "learning_rate": 4.494275617292144e-05, "loss": 0.1829, "reward": 1.8403697609901428, "reward_std": 2.502837061882019, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1334909200668335, "rewards/no_repetition_reward_func": -0.2853085994720459, "rewards/verse_reward_func": -0.0078125, "step": 1784 }, { "completion_length": 247.1875, "epoch": 14.28, "grad_norm": 1.953125, "kl": 4.263514757156372, "learning_rate": 4.493433302321759e-05, "loss": 0.1705, "reward": 2.385098695755005, "reward_std": 3.074066400527954, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.66718065738678, "rewards/no_repetition_reward_func": -0.2820819020271301, "rewards/verse_reward_func": 0.0, "step": 1785 }, { "completion_length": 245.40625, "epoch": 14.288, "grad_norm": 5.0625, "kl": 4.825958728790283, "learning_rate": 4.492590365543253e-05, "loss": 0.193, "reward": 2.0116246938705444, "reward_std": 2.452575206756592, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2634665966033936, "rewards/no_repetition_reward_func": -0.2518419921398163, "rewards/verse_reward_func": 0.0, "step": 1786 }, { "completion_length": 251.625, "epoch": 14.296, "grad_norm": 2.796875, "kl": 3.210962653160095, "learning_rate": 4.491746807219561e-05, "loss": 0.1284, "reward": 3.1296169757843018, "reward_std": 2.9620234966278076, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4258166551589966, "rewards/no_repetition_reward_func": -0.296199694275856, "rewards/verse_reward_func": 0.0, "step": 1787 }, { "completion_length": 245.140625, "epoch": 14.304, "grad_norm": 2.1875, "kl": 3.791170120239258, "learning_rate": 4.490902627613813e-05, "loss": 0.1516, "reward": 2.319908857345581, "reward_std": 3.105176329612732, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5957401990890503, "rewards/no_repetition_reward_func": -0.2758314162492752, "rewards/verse_reward_func": 0.0, "step": 1788 }, { "completion_length": 250.28125, "epoch": 14.312, "grad_norm": 2.609375, "kl": 4.379751682281494, "learning_rate": 4.4900578269893335e-05, "loss": 0.1752, "reward": 2.0424436926841736, "reward_std": 2.6098092794418335, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.297577440738678, "rewards/no_repetition_reward_func": -0.23950877040624619, "rewards/verse_reward_func": -0.015625, "step": 1789 }, { "completion_length": 236.734375, "epoch": 14.32, "grad_norm": 3.765625, "kl": 3.0369112491607666, "learning_rate": 4.4892124056096386e-05, "loss": 0.1215, "reward": 2.781141996383667, "reward_std": 3.2917760610580444, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0768797397613525, "rewards/no_repetition_reward_func": -0.26448775827884674, "rewards/verse_reward_func": -0.03125, "step": 1790 }, { "completion_length": 248.4375, "epoch": 14.328, "grad_norm": 5.0625, "kl": 2.5210052728652954, "learning_rate": 4.4883663637384396e-05, "loss": 0.1008, "reward": 3.571039080619812, "reward_std": 3.1257827281951904, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.8528475761413574, "rewards/no_repetition_reward_func": -0.28180843591690063, "rewards/verse_reward_func": 0.0, "step": 1791 }, { "completion_length": 245.671875, "epoch": 14.336, "grad_norm": 2.09375, "kl": 3.1567513942718506, "learning_rate": 4.487519701639641e-05, "loss": 0.1263, "reward": 2.4958252906799316, "reward_std": 2.8222858905792236, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.742443323135376, "rewards/no_repetition_reward_func": -0.24661807715892792, "rewards/verse_reward_func": 0.0, "step": 1792 }, { "completion_length": 248.265625, "epoch": 14.344, "grad_norm": 2.359375, "kl": 3.5542523860931396, "learning_rate": 4.486672419577339e-05, "loss": 0.1422, "reward": 2.4227916598320007, "reward_std": 2.969240665435791, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.681581497192383, "rewards/no_repetition_reward_func": -0.2509773001074791, "rewards/verse_reward_func": -0.0078125, "step": 1793 }, { "completion_length": 250.96875, "epoch": 14.352, "grad_norm": 2.875, "kl": 2.9588615894317627, "learning_rate": 4.4858245178158276e-05, "loss": 0.1184, "reward": 2.793813467025757, "reward_std": 2.439748764038086, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.063905119895935, "rewards/no_repetition_reward_func": -0.270091712474823, "rewards/verse_reward_func": 0.0, "step": 1794 }, { "completion_length": 241.140625, "epoch": 14.36, "grad_norm": 1.640625, "kl": 3.94584059715271, "learning_rate": 4.484975996619589e-05, "loss": 0.1578, "reward": 2.487336754798889, "reward_std": 2.9272719621658325, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7414228916168213, "rewards/no_repetition_reward_func": -0.2462734878063202, "rewards/verse_reward_func": -0.0078125, "step": 1795 }, { "completion_length": 246.671875, "epoch": 14.368, "grad_norm": 2.21875, "kl": 3.857462167739868, "learning_rate": 4.484126856253301e-05, "loss": 0.1543, "reward": 2.5974042415618896, "reward_std": 2.8784581422805786, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8695536851882935, "rewards/no_repetition_reward_func": -0.26433703303337097, "rewards/verse_reward_func": -0.0078125, "step": 1796 }, { "completion_length": 250.609375, "epoch": 14.376, "grad_norm": 1.640625, "kl": 4.341941595077515, "learning_rate": 4.483277096981836e-05, "loss": 0.1737, "reward": 3.024122476577759, "reward_std": 2.8332141637802124, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3329321146011353, "rewards/no_repetition_reward_func": -0.3009970486164093, "rewards/verse_reward_func": -0.0078125, "step": 1797 }, { "completion_length": 252.40625, "epoch": 14.384, "grad_norm": 2.3125, "kl": 4.195339679718018, "learning_rate": 4.482426719070258e-05, "loss": 0.1678, "reward": 2.914252996444702, "reward_std": 3.0291353464126587, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.18025803565979, "rewards/no_repetition_reward_func": -0.26600518822669983, "rewards/verse_reward_func": 0.0, "step": 1798 }, { "completion_length": 246.15625, "epoch": 14.392, "grad_norm": 3.109375, "kl": 4.787959575653076, "learning_rate": 4.481575722783821e-05, "loss": 0.1915, "reward": 2.2694958448410034, "reward_std": 2.646822929382324, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5268222093582153, "rewards/no_repetition_reward_func": -0.25732631981372833, "rewards/verse_reward_func": 0.0, "step": 1799 }, { "completion_length": 256.0, "epoch": 14.4, "grad_norm": 1.2890625, "kl": 4.483238458633423, "learning_rate": 4.480724108387977e-05, "loss": 0.1793, "reward": 2.823270559310913, "reward_std": 3.2667609453201294, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.123884916305542, "rewards/no_repetition_reward_func": -0.3006143569946289, "rewards/verse_reward_func": 0.0, "step": 1800 }, { "completion_length": 251.84375, "epoch": 14.408, "grad_norm": 2.921875, "kl": 5.640796184539795, "learning_rate": 4.479871876148368e-05, "loss": 0.2256, "reward": 2.305836498737335, "reward_std": 2.6395256519317627, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.595874786376953, "rewards/no_repetition_reward_func": -0.2822257876396179, "rewards/verse_reward_func": -0.0078125, "step": 1801 }, { "completion_length": 248.625, "epoch": 14.416, "grad_norm": 3.09375, "kl": 4.735196828842163, "learning_rate": 4.4790190263308306e-05, "loss": 0.1894, "reward": 2.3559324741363525, "reward_std": 2.76284122467041, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6371819972991943, "rewards/no_repetition_reward_func": -0.2812494784593582, "rewards/verse_reward_func": 0.0, "step": 1802 }, { "completion_length": 256.0, "epoch": 14.424, "grad_norm": 3.40625, "kl": 4.777745485305786, "learning_rate": 4.4781655592013914e-05, "loss": 0.1911, "reward": 2.9308176040649414, "reward_std": 3.102312922477722, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.206018090248108, "rewards/no_repetition_reward_func": -0.2673879414796829, "rewards/verse_reward_func": -0.0078125, "step": 1803 }, { "completion_length": 249.625, "epoch": 14.432, "grad_norm": 4.03125, "kl": 3.801825165748596, "learning_rate": 4.477311475026271e-05, "loss": 0.1521, "reward": 2.9218965768814087, "reward_std": 2.807782769203186, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.195674180984497, "rewards/no_repetition_reward_func": -0.2737775146961212, "rewards/verse_reward_func": 0.0, "step": 1804 }, { "completion_length": 254.421875, "epoch": 14.44, "grad_norm": 2.765625, "kl": 4.050289034843445, "learning_rate": 4.4764567740718825e-05, "loss": 0.162, "reward": 2.691061854362488, "reward_std": 2.9888393878936768, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9636778831481934, "rewards/no_repetition_reward_func": -0.2726162075996399, "rewards/verse_reward_func": 0.0, "step": 1805 }, { "completion_length": 251.359375, "epoch": 14.448, "grad_norm": 2.40625, "kl": 4.331995010375977, "learning_rate": 4.475601456604831e-05, "loss": 0.1733, "reward": 2.516990065574646, "reward_std": 3.195584297180176, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8140170574188232, "rewards/no_repetition_reward_func": -0.2970270663499832, "rewards/verse_reward_func": 0.0, "step": 1806 }, { "completion_length": 248.875, "epoch": 14.456, "grad_norm": 1.7265625, "kl": 4.553014755249023, "learning_rate": 4.4747455228919146e-05, "loss": 0.1821, "reward": 2.774810552597046, "reward_std": 3.0112557411193848, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0548455715179443, "rewards/no_repetition_reward_func": -0.2800348997116089, "rewards/verse_reward_func": 0.0, "step": 1807 }, { "completion_length": 246.265625, "epoch": 14.464, "grad_norm": 4.21875, "kl": 5.443798303604126, "learning_rate": 4.4738889732001234e-05, "loss": 0.2178, "reward": 1.5542046427726746, "reward_std": 2.442275285720825, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8449243903160095, "rewards/no_repetition_reward_func": -0.2829073369503021, "rewards/verse_reward_func": -0.0078125, "step": 1808 }, { "completion_length": 252.328125, "epoch": 14.472, "grad_norm": 3.109375, "kl": 3.4431127309799194, "learning_rate": 4.473031807796639e-05, "loss": 0.1377, "reward": 3.1344839334487915, "reward_std": 3.1871596574783325, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4603382349014282, "rewards/no_repetition_reward_func": -0.3180417865514755, "rewards/verse_reward_func": -0.0078125, "step": 1809 }, { "completion_length": 252.21875, "epoch": 14.48, "grad_norm": 2.75, "kl": 4.129770994186401, "learning_rate": 4.4721740269488355e-05, "loss": 0.1652, "reward": 2.7518630027770996, "reward_std": 2.779923677444458, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.032968521118164, "rewards/no_repetition_reward_func": -0.28110551834106445, "rewards/verse_reward_func": 0.0, "step": 1810 }, { "completion_length": 256.0, "epoch": 14.488, "grad_norm": 4.8125, "kl": 4.043972969055176, "learning_rate": 4.471315630924279e-05, "loss": 0.1618, "reward": 3.2187689542770386, "reward_std": 3.395465850830078, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.5168683528900146, "rewards/no_repetition_reward_func": -0.29809945821762085, "rewards/verse_reward_func": 0.0, "step": 1811 }, { "completion_length": 251.703125, "epoch": 14.496, "grad_norm": 1.7890625, "kl": 4.617461085319519, "learning_rate": 4.470456619990727e-05, "loss": 0.1847, "reward": 2.4050228595733643, "reward_std": 3.039270520210266, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6872713565826416, "rewards/no_repetition_reward_func": -0.282248392701149, "rewards/verse_reward_func": 0.0, "step": 1812 }, { "completion_length": 249.015625, "epoch": 14.504, "grad_norm": 2.6875, "kl": 3.636535406112671, "learning_rate": 4.46959699441613e-05, "loss": 0.1455, "reward": 2.9422446489334106, "reward_std": 2.8693348169326782, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2109930515289307, "rewards/no_repetition_reward_func": -0.26093585789203644, "rewards/verse_reward_func": -0.0078125, "step": 1813 }, { "completion_length": 253.578125, "epoch": 14.512, "grad_norm": 1.8125, "kl": 3.016451358795166, "learning_rate": 4.46873675446863e-05, "loss": 0.1207, "reward": 3.075165033340454, "reward_std": 2.961665153503418, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3627349138259888, "rewards/no_repetition_reward_func": -0.2875698208808899, "rewards/verse_reward_func": 0.0, "step": 1814 }, { "completion_length": 249.890625, "epoch": 14.52, "grad_norm": 2.125, "kl": 4.463883638381958, "learning_rate": 4.4678759004165584e-05, "loss": 0.1786, "reward": 1.78596431016922, "reward_std": 2.62847900390625, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0493826866149902, "rewards/no_repetition_reward_func": -0.25560592859983444, "rewards/verse_reward_func": -0.0078125, "step": 1815 }, { "completion_length": 254.859375, "epoch": 14.528, "grad_norm": 3.1875, "kl": 4.457666873931885, "learning_rate": 4.4670144325284414e-05, "loss": 0.1783, "reward": 1.875624656677246, "reward_std": 2.4850217700004578, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1553549766540527, "rewards/no_repetition_reward_func": -0.27973026037216187, "rewards/verse_reward_func": 0.0, "step": 1816 }, { "completion_length": 254.203125, "epoch": 14.536, "grad_norm": 1.59375, "kl": 4.19492495059967, "learning_rate": 4.466152351072994e-05, "loss": 0.1678, "reward": 2.3203917741775513, "reward_std": 2.478487730026245, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.575574040412903, "rewards/no_repetition_reward_func": -0.25518227368593216, "rewards/verse_reward_func": 0.0, "step": 1817 }, { "completion_length": 252.421875, "epoch": 14.544, "grad_norm": 1.78125, "kl": 4.233865022659302, "learning_rate": 4.465289656319124e-05, "loss": 0.1694, "reward": 2.9016464948654175, "reward_std": 3.3079299926757812, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1970046758651733, "rewards/no_repetition_reward_func": -0.2875458151102066, "rewards/verse_reward_func": -0.0078125, "step": 1818 }, { "completion_length": 256.0, "epoch": 14.552, "grad_norm": 3.359375, "kl": 3.2068958282470703, "learning_rate": 4.464426348535931e-05, "loss": 0.1283, "reward": 3.125808298587799, "reward_std": 2.6489832401275635, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4167027473449707, "rewards/no_repetition_reward_func": -0.2908947169780731, "rewards/verse_reward_func": 0.0, "step": 1819 }, { "completion_length": 255.21875, "epoch": 14.56, "grad_norm": 4.5, "kl": 3.431750535964966, "learning_rate": 4.4635624279927044e-05, "loss": 0.1373, "reward": 3.5217444896698, "reward_std": 3.109593629837036, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.8269734382629395, "rewards/no_repetition_reward_func": -0.30522893369197845, "rewards/verse_reward_func": 0.0, "step": 1820 }, { "completion_length": 253.59375, "epoch": 14.568, "grad_norm": 2.890625, "kl": 4.753268480300903, "learning_rate": 4.462697894958926e-05, "loss": 0.1901, "reward": 1.5587921142578125, "reward_std": 2.6653748750686646, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8084181547164917, "rewards/no_repetition_reward_func": -0.24962595105171204, "rewards/verse_reward_func": 0.0, "step": 1821 }, { "completion_length": 256.0, "epoch": 14.576, "grad_norm": 1.8671875, "kl": 4.0989670753479, "learning_rate": 4.461832749704268e-05, "loss": 0.164, "reward": 2.1589738726615906, "reward_std": 3.110671877861023, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4644415378570557, "rewards/no_repetition_reward_func": -0.3054676055908203, "rewards/verse_reward_func": 0.0, "step": 1822 }, { "completion_length": 255.53125, "epoch": 14.584, "grad_norm": 1.484375, "kl": 4.250131845474243, "learning_rate": 4.460966992498593e-05, "loss": 0.17, "reward": 2.7535674571990967, "reward_std": 3.173603653907776, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.024683952331543, "rewards/no_repetition_reward_func": -0.27111653983592987, "rewards/verse_reward_func": 0.0, "step": 1823 }, { "completion_length": 256.0, "epoch": 14.592, "grad_norm": 2.40625, "kl": 4.600650787353516, "learning_rate": 4.460100623611955e-05, "loss": 0.184, "reward": 2.4680298566818237, "reward_std": 3.040310025215149, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.737311601638794, "rewards/no_repetition_reward_func": -0.2692818343639374, "rewards/verse_reward_func": 0.0, "step": 1824 }, { "completion_length": 251.09375, "epoch": 14.6, "grad_norm": 2.421875, "kl": 5.256101369857788, "learning_rate": 4.4592336433146e-05, "loss": 0.2102, "reward": 2.3612990379333496, "reward_std": 2.704888105392456, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6311511993408203, "rewards/no_repetition_reward_func": -0.2698521018028259, "rewards/verse_reward_func": 0.0, "step": 1825 }, { "completion_length": 253.125, "epoch": 14.608, "grad_norm": 2.453125, "kl": 4.410403728485107, "learning_rate": 4.458366051876962e-05, "loss": 0.1764, "reward": 2.821632981300354, "reward_std": 2.820295810699463, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.113127112388611, "rewards/no_repetition_reward_func": -0.2836814671754837, "rewards/verse_reward_func": -0.0078125, "step": 1826 }, { "completion_length": 254.359375, "epoch": 14.616, "grad_norm": 3.109375, "kl": 3.8401384353637695, "learning_rate": 4.45749784956967e-05, "loss": 0.1536, "reward": 2.8387051820755005, "reward_std": 3.1318271160125732, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1340079307556152, "rewards/no_repetition_reward_func": -0.2953025698661804, "rewards/verse_reward_func": 0.0, "step": 1827 }, { "completion_length": 241.0, "epoch": 14.624, "grad_norm": 2.609375, "kl": 5.423281192779541, "learning_rate": 4.456629036663537e-05, "loss": 0.2169, "reward": 1.5759941935539246, "reward_std": 2.4277517795562744, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7993090748786926, "rewards/no_repetition_reward_func": -0.2155023217201233, "rewards/verse_reward_func": -0.0078125, "step": 1828 }, { "completion_length": 252.640625, "epoch": 14.632, "grad_norm": 2.21875, "kl": 5.179393768310547, "learning_rate": 4.455759613429573e-05, "loss": 0.2072, "reward": 2.6007460355758667, "reward_std": 3.285610318183899, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.887034296989441, "rewards/no_repetition_reward_func": -0.28628823161125183, "rewards/verse_reward_func": 0.0, "step": 1829 }, { "completion_length": 252.46875, "epoch": 14.64, "grad_norm": 1.8671875, "kl": 4.192113399505615, "learning_rate": 4.454889580138975e-05, "loss": 0.1677, "reward": 2.7010464668273926, "reward_std": 2.8870044946670532, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9912238121032715, "rewards/no_repetition_reward_func": -0.29017725586891174, "rewards/verse_reward_func": 0.0, "step": 1830 }, { "completion_length": 250.640625, "epoch": 14.648, "grad_norm": 2.078125, "kl": 4.35615611076355, "learning_rate": 4.4540189370631315e-05, "loss": 0.1742, "reward": 2.6122719049453735, "reward_std": 3.2431514263153076, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8765735626220703, "rewards/no_repetition_reward_func": -0.2643016278743744, "rewards/verse_reward_func": 0.0, "step": 1831 }, { "completion_length": 253.296875, "epoch": 14.656, "grad_norm": 1.734375, "kl": 5.07262110710144, "learning_rate": 4.45314768447362e-05, "loss": 0.2029, "reward": 2.479531764984131, "reward_std": 2.8077590465545654, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7593469619750977, "rewards/no_repetition_reward_func": -0.2641901820898056, "rewards/verse_reward_func": -0.015625, "step": 1832 }, { "completion_length": 252.390625, "epoch": 14.664, "grad_norm": 2.890625, "kl": 4.501817941665649, "learning_rate": 4.4522758226422076e-05, "loss": 0.1801, "reward": 1.899569034576416, "reward_std": 2.602607011795044, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.193479895591736, "rewards/no_repetition_reward_func": -0.262660875916481, "rewards/verse_reward_func": -0.03125, "step": 1833 }, { "completion_length": 253.203125, "epoch": 14.672, "grad_norm": 3.03125, "kl": 4.91625714302063, "learning_rate": 4.451403351840855e-05, "loss": 0.1967, "reward": 2.4297455549240112, "reward_std": 2.8692647218704224, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.736194610595703, "rewards/no_repetition_reward_func": -0.29863667488098145, "rewards/verse_reward_func": -0.0078125, "step": 1834 }, { "completion_length": 250.4375, "epoch": 14.68, "grad_norm": 2.046875, "kl": 3.950467109680176, "learning_rate": 4.450530272341709e-05, "loss": 0.158, "reward": 2.049817442893982, "reward_std": 2.896643877029419, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.33883535861969, "rewards/no_repetition_reward_func": -0.2890178859233856, "rewards/verse_reward_func": 0.0, "step": 1835 }, { "completion_length": 252.21875, "epoch": 14.688, "grad_norm": 2.1875, "kl": 4.065912485122681, "learning_rate": 4.449656584417108e-05, "loss": 0.1626, "reward": 2.3970214128494263, "reward_std": 2.939082980155945, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6588934659957886, "rewards/no_repetition_reward_func": -0.2618721127510071, "rewards/verse_reward_func": 0.0, "step": 1836 }, { "completion_length": 245.015625, "epoch": 14.696, "grad_norm": 1.96875, "kl": 3.9754998683929443, "learning_rate": 4.4487822883395805e-05, "loss": 0.159, "reward": 1.4907759428024292, "reward_std": 2.417338788509369, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7189738154411316, "rewards/no_repetition_reward_func": -0.2203853502869606, "rewards/verse_reward_func": -0.0078125, "step": 1837 }, { "completion_length": 251.953125, "epoch": 14.704, "grad_norm": 2.71875, "kl": 2.4655613899230957, "learning_rate": 4.447907384381843e-05, "loss": 0.0986, "reward": 2.81820547580719, "reward_std": 2.948077917098999, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1382399797439575, "rewards/no_repetition_reward_func": -0.3200347274541855, "rewards/verse_reward_func": 0.0, "step": 1838 }, { "completion_length": 251.875, "epoch": 14.712, "grad_norm": 3.390625, "kl": 3.240431070327759, "learning_rate": 4.447031872816804e-05, "loss": 0.1296, "reward": 2.4244571924209595, "reward_std": 3.010670304298401, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.706884741783142, "rewards/no_repetition_reward_func": -0.28242744505405426, "rewards/verse_reward_func": 0.0, "step": 1839 }, { "completion_length": 248.390625, "epoch": 14.72, "grad_norm": 2.109375, "kl": 3.0757944583892822, "learning_rate": 4.4461557539175594e-05, "loss": 0.123, "reward": 2.5173128843307495, "reward_std": 2.7114415168762207, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.759902000427246, "rewards/no_repetition_reward_func": -0.24258901178836823, "rewards/verse_reward_func": 0.0, "step": 1840 }, { "completion_length": 248.28125, "epoch": 14.728, "grad_norm": 1.9609375, "kl": 3.380487084388733, "learning_rate": 4.445279027957395e-05, "loss": 0.1352, "reward": 1.9766022562980652, "reward_std": 3.0480031967163086, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2500163316726685, "rewards/no_repetition_reward_func": -0.25778916478157043, "rewards/verse_reward_func": -0.015625, "step": 1841 }, { "completion_length": 256.0, "epoch": 14.736, "grad_norm": 2.03125, "kl": 2.772907853126526, "learning_rate": 4.444401695209788e-05, "loss": 0.1109, "reward": 2.941509962081909, "reward_std": 3.030388832092285, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.234591841697693, "rewards/no_repetition_reward_func": -0.2930818945169449, "rewards/verse_reward_func": 0.0, "step": 1842 }, { "completion_length": 255.109375, "epoch": 14.744, "grad_norm": 2.484375, "kl": 3.5471608638763428, "learning_rate": 4.443523755948401e-05, "loss": 0.1419, "reward": 2.473457455635071, "reward_std": 2.7015466690063477, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7639594078063965, "rewards/no_repetition_reward_func": -0.2905019521713257, "rewards/verse_reward_func": 0.0, "step": 1843 }, { "completion_length": 251.296875, "epoch": 14.752, "grad_norm": 3.078125, "kl": 3.173816680908203, "learning_rate": 4.4426452104470903e-05, "loss": 0.127, "reward": 3.055323839187622, "reward_std": 3.1059885025024414, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3407124280929565, "rewards/no_repetition_reward_func": -0.28538845479488373, "rewards/verse_reward_func": 0.0, "step": 1844 }, { "completion_length": 253.828125, "epoch": 14.76, "grad_norm": 7.21875, "kl": 5.704965353012085, "learning_rate": 4.441766058979898e-05, "loss": 0.2282, "reward": 2.3820245265960693, "reward_std": 2.75676691532135, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6417534351348877, "rewards/no_repetition_reward_func": -0.2597290128469467, "rewards/verse_reward_func": 0.0, "step": 1845 }, { "completion_length": 248.671875, "epoch": 14.768, "grad_norm": 2.71875, "kl": 3.9242098331451416, "learning_rate": 4.4408863018210564e-05, "loss": 0.157, "reward": 2.4091466665267944, "reward_std": 2.994455099105835, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.700688898563385, "rewards/no_repetition_reward_func": -0.28372974693775177, "rewards/verse_reward_func": -0.0078125, "step": 1846 }, { "completion_length": 242.015625, "epoch": 14.776, "grad_norm": 2.046875, "kl": 5.28900146484375, "learning_rate": 4.440005939244986e-05, "loss": 0.2116, "reward": 2.2642041444778442, "reward_std": 2.876192092895508, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5111364126205444, "rewards/no_repetition_reward_func": -0.24693214148283005, "rewards/verse_reward_func": 0.0, "step": 1847 }, { "completion_length": 256.0, "epoch": 14.784, "grad_norm": 2.609375, "kl": 5.180809259414673, "learning_rate": 4.439124971526297e-05, "loss": 0.2072, "reward": 1.8117353320121765, "reward_std": 2.7671796083450317, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.064277708530426, "rewards/no_repetition_reward_func": -0.25254227966070175, "rewards/verse_reward_func": 0.0, "step": 1848 }, { "completion_length": 252.0625, "epoch": 14.792, "grad_norm": 1.53125, "kl": 4.622786521911621, "learning_rate": 4.4382433989397895e-05, "loss": 0.1849, "reward": 2.1147470474243164, "reward_std": 3.0137985944747925, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.408128261566162, "rewards/no_repetition_reward_func": -0.29338113963603973, "rewards/verse_reward_func": 0.0, "step": 1849 }, { "completion_length": 250.390625, "epoch": 14.8, "grad_norm": 5.03125, "kl": 4.557992696762085, "learning_rate": 4.4373612217604496e-05, "loss": 0.1823, "reward": 2.300025463104248, "reward_std": 2.8826818466186523, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6109598875045776, "rewards/no_repetition_reward_func": -0.29530946910381317, "rewards/verse_reward_func": -0.015625, "step": 1850 }, { "completion_length": 254.921875, "epoch": 14.808, "grad_norm": 3.125, "kl": 4.363868951797485, "learning_rate": 4.436478440263453e-05, "loss": 0.1746, "reward": 2.256150484085083, "reward_std": 3.125204920768738, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5696496963500977, "rewards/no_repetition_reward_func": -0.3134990930557251, "rewards/verse_reward_func": 0.0, "step": 1851 }, { "completion_length": 251.984375, "epoch": 14.816, "grad_norm": 2.8125, "kl": 3.4506146907806396, "learning_rate": 4.4355950547241645e-05, "loss": 0.138, "reward": 2.734968423843384, "reward_std": 3.1154600381851196, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.036012291908264, "rewards/no_repetition_reward_func": -0.30104397237300873, "rewards/verse_reward_func": 0.0, "step": 1852 }, { "completion_length": 248.390625, "epoch": 14.824, "grad_norm": 2.125, "kl": 4.102378487586975, "learning_rate": 4.434711065418137e-05, "loss": 0.1641, "reward": 3.027980327606201, "reward_std": 3.0138401985168457, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3145415782928467, "rewards/no_repetition_reward_func": -0.2865613251924515, "rewards/verse_reward_func": 0.0, "step": 1853 }, { "completion_length": 250.0, "epoch": 14.832, "grad_norm": 2.46875, "kl": 4.096634149551392, "learning_rate": 4.433826472621112e-05, "loss": 0.1639, "reward": 2.6889551877975464, "reward_std": 2.9878644943237305, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9684120416641235, "rewards/no_repetition_reward_func": -0.2638319432735443, "rewards/verse_reward_func": -0.015625, "step": 1854 }, { "completion_length": 249.296875, "epoch": 14.84, "grad_norm": 2.78125, "kl": 5.097870826721191, "learning_rate": 4.432941276609018e-05, "loss": 0.2039, "reward": 1.985856294631958, "reward_std": 2.6159902811050415, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.242559790611267, "rewards/no_repetition_reward_func": -0.24889087677001953, "rewards/verse_reward_func": -0.0078125, "step": 1855 }, { "completion_length": 251.3125, "epoch": 14.848, "grad_norm": 3.859375, "kl": 4.948017358779907, "learning_rate": 4.4320554776579747e-05, "loss": 0.1979, "reward": 2.3628143072128296, "reward_std": 3.0431692600250244, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.665433645248413, "rewards/no_repetition_reward_func": -0.2869942635297775, "rewards/verse_reward_func": -0.015625, "step": 1856 }, { "completion_length": 239.8125, "epoch": 14.856, "grad_norm": 2.171875, "kl": 4.599810600280762, "learning_rate": 4.431169076044286e-05, "loss": 0.184, "reward": 2.8366806507110596, "reward_std": 3.2231770753860474, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.126090168952942, "rewards/no_repetition_reward_func": -0.28940968215465546, "rewards/verse_reward_func": 0.0, "step": 1857 }, { "completion_length": 244.90625, "epoch": 14.864, "grad_norm": 2.890625, "kl": 4.627578258514404, "learning_rate": 4.4302820720444456e-05, "loss": 0.1851, "reward": 2.296171009540558, "reward_std": 2.786367654800415, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.541451573371887, "rewards/no_repetition_reward_func": -0.24528054893016815, "rewards/verse_reward_func": 0.0, "step": 1858 }, { "completion_length": 249.796875, "epoch": 14.872, "grad_norm": 3.0625, "kl": 4.776402711868286, "learning_rate": 4.429394465935136e-05, "loss": 0.1911, "reward": 1.4702594876289368, "reward_std": 2.709187626838684, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7683289647102356, "rewards/no_repetition_reward_func": -0.29025696218013763, "rewards/verse_reward_func": -0.0078125, "step": 1859 }, { "completion_length": 245.578125, "epoch": 14.88, "grad_norm": 1.921875, "kl": 5.087552070617676, "learning_rate": 4.428506257993226e-05, "loss": 0.2035, "reward": 1.88869708776474, "reward_std": 3.074682831764221, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1450915336608887, "rewards/no_repetition_reward_func": -0.24858197569847107, "rewards/verse_reward_func": -0.0078125, "step": 1860 }, { "completion_length": 249.765625, "epoch": 14.888, "grad_norm": 6.0625, "kl": 5.0192787647247314, "learning_rate": 4.427617448495772e-05, "loss": 0.2008, "reward": 1.753590703010559, "reward_std": 2.516616106033325, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.015731453895569, "rewards/no_repetition_reward_func": -0.2621408700942993, "rewards/verse_reward_func": 0.0, "step": 1861 }, { "completion_length": 243.109375, "epoch": 14.896, "grad_norm": 1.8515625, "kl": 3.8709921836853027, "learning_rate": 4.4267280377200205e-05, "loss": 0.1548, "reward": 2.1341161727905273, "reward_std": 3.1095060110092163, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.441476583480835, "rewards/no_repetition_reward_func": -0.2839231342077255, "rewards/verse_reward_func": -0.0234375, "step": 1862 }, { "completion_length": 248.734375, "epoch": 14.904, "grad_norm": 1.8203125, "kl": 3.9121187925338745, "learning_rate": 4.425838025943403e-05, "loss": 0.1565, "reward": 2.4694554805755615, "reward_std": 2.9350932836532593, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7146151065826416, "rewards/no_repetition_reward_func": -0.24515949189662933, "rewards/verse_reward_func": 0.0, "step": 1863 }, { "completion_length": 245.6875, "epoch": 14.912, "grad_norm": 3.046875, "kl": 3.710153579711914, "learning_rate": 4.424947413443539e-05, "loss": 0.1484, "reward": 1.8261713981628418, "reward_std": 2.745505690574646, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.100281536579132, "rewards/no_repetition_reward_func": -0.26629768311977386, "rewards/verse_reward_func": -0.0078125, "step": 1864 }, { "completion_length": 254.71875, "epoch": 14.92, "grad_norm": 1.984375, "kl": 3.2060375213623047, "learning_rate": 4.4240562004982364e-05, "loss": 0.1282, "reward": 2.2043367624282837, "reward_std": 2.9244617223739624, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.490149736404419, "rewards/no_repetition_reward_func": -0.28581298887729645, "rewards/verse_reward_func": 0.0, "step": 1865 }, { "completion_length": 252.296875, "epoch": 14.928, "grad_norm": 3.859375, "kl": 2.787001609802246, "learning_rate": 4.423164387385489e-05, "loss": 0.1115, "reward": 3.033717393875122, "reward_std": 3.2611695528030396, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3291003704071045, "rewards/no_repetition_reward_func": -0.2953827530145645, "rewards/verse_reward_func": 0.0, "step": 1866 }, { "completion_length": 248.609375, "epoch": 14.936, "grad_norm": 3.296875, "kl": 2.81294322013855, "learning_rate": 4.422271974383479e-05, "loss": 0.1125, "reward": 2.5519602298736572, "reward_std": 2.7808061838150024, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.822941780090332, "rewards/no_repetition_reward_func": -0.2631688714027405, "rewards/verse_reward_func": -0.0078125, "step": 1867 }, { "completion_length": 246.609375, "epoch": 14.943999999999999, "grad_norm": 2.859375, "kl": 3.1912113428115845, "learning_rate": 4.4213789617705746e-05, "loss": 0.1276, "reward": 2.6248862743377686, "reward_std": 2.7810055017471313, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.880353808403015, "rewards/no_repetition_reward_func": -0.2554675042629242, "rewards/verse_reward_func": 0.0, "step": 1868 }, { "completion_length": 255.296875, "epoch": 14.952, "grad_norm": 3.078125, "kl": 3.205079197883606, "learning_rate": 4.420485349825332e-05, "loss": 0.1282, "reward": 2.9114489555358887, "reward_std": 3.1877580881118774, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1780483722686768, "rewards/no_repetition_reward_func": -0.26659920811653137, "rewards/verse_reward_func": 0.0, "step": 1869 }, { "completion_length": 253.9375, "epoch": 14.96, "grad_norm": 2.96875, "kl": 3.4468188285827637, "learning_rate": 4.4195911388264946e-05, "loss": 0.1379, "reward": 2.8370156288146973, "reward_std": 2.917057991027832, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1079938411712646, "rewards/no_repetition_reward_func": -0.2709781378507614, "rewards/verse_reward_func": 0.0, "step": 1870 }, { "completion_length": 253.28125, "epoch": 14.968, "grad_norm": 2.578125, "kl": 3.4750524759292603, "learning_rate": 4.41869632905299e-05, "loss": 0.139, "reward": 1.873529076576233, "reward_std": 2.685386538505554, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1658542156219482, "rewards/no_repetition_reward_func": -0.29232506453990936, "rewards/verse_reward_func": 0.0, "step": 1871 }, { "completion_length": 248.25, "epoch": 14.975999999999999, "grad_norm": 2.1875, "kl": 3.5758755207061768, "learning_rate": 4.417800920783937e-05, "loss": 0.143, "reward": 2.6642918586730957, "reward_std": 2.815982699394226, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.949272632598877, "rewards/no_repetition_reward_func": -0.2849808782339096, "rewards/verse_reward_func": 0.0, "step": 1872 }, { "completion_length": 252.640625, "epoch": 14.984, "grad_norm": 2.8125, "kl": 4.544521689414978, "learning_rate": 4.4169049142986376e-05, "loss": 0.1818, "reward": 2.0366923809051514, "reward_std": 2.638840079307556, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3031647205352783, "rewards/no_repetition_reward_func": -0.2664725184440613, "rewards/verse_reward_func": 0.0, "step": 1873 }, { "completion_length": 254.171875, "epoch": 14.992, "grad_norm": 4.34375, "kl": 4.749820709228516, "learning_rate": 4.4160083098765815e-05, "loss": 0.19, "reward": 2.450315833091736, "reward_std": 3.0567514896392822, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.726682424545288, "rewards/no_repetition_reward_func": -0.2685542181134224, "rewards/verse_reward_func": -0.0078125, "step": 1874 }, { "completion_length": 248.9375, "epoch": 15.0, "grad_norm": 0.875, "kl": 4.73206090927124, "learning_rate": 4.415111107797445e-05, "loss": 0.1893, "reward": 2.554244041442871, "reward_std": 3.2521865367889404, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.812561511993408, "rewards/no_repetition_reward_func": -0.2583175301551819, "rewards/verse_reward_func": 0.0, "step": 1875 }, { "completion_length": 251.921875, "epoch": 15.008, "grad_norm": 2.90625, "kl": 4.531435251235962, "learning_rate": 4.414213308341092e-05, "loss": 0.1813, "reward": 2.729182720184326, "reward_std": 2.921720862388611, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.993027091026306, "rewards/no_repetition_reward_func": -0.263844333589077, "rewards/verse_reward_func": 0.0, "step": 1876 }, { "completion_length": 244.671875, "epoch": 15.016, "grad_norm": 4.3125, "kl": 4.536525130271912, "learning_rate": 4.413314911787569e-05, "loss": 0.1815, "reward": 2.229756236076355, "reward_std": 2.738888382911682, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4858208894729614, "rewards/no_repetition_reward_func": -0.24043989181518555, "rewards/verse_reward_func": -0.015625, "step": 1877 }, { "completion_length": 253.5, "epoch": 15.024, "grad_norm": 3.78125, "kl": 3.3333545923233032, "learning_rate": 4.4124159184171134e-05, "loss": 0.1333, "reward": 2.793879508972168, "reward_std": 2.891487717628479, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1043639183044434, "rewards/no_repetition_reward_func": -0.31048448383808136, "rewards/verse_reward_func": 0.0, "step": 1878 }, { "completion_length": 252.03125, "epoch": 15.032, "grad_norm": 2.765625, "kl": 4.532201051712036, "learning_rate": 4.411516328510145e-05, "loss": 0.1813, "reward": 2.8042054176330566, "reward_std": 2.895455241203308, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0967854261398315, "rewards/no_repetition_reward_func": -0.28476740419864655, "rewards/verse_reward_func": -0.0078125, "step": 1879 }, { "completion_length": 256.0, "epoch": 15.04, "grad_norm": 2.5625, "kl": 4.593896389007568, "learning_rate": 4.410616142347273e-05, "loss": 0.1838, "reward": 2.8596420288085938, "reward_std": 3.2948659658432007, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1500972509384155, "rewards/no_repetition_reward_func": -0.29045508801937103, "rewards/verse_reward_func": 0.0, "step": 1880 }, { "completion_length": 256.0, "epoch": 15.048, "grad_norm": 3.0, "kl": 4.181808114051819, "learning_rate": 4.409715360209289e-05, "loss": 0.1673, "reward": 2.225751042366028, "reward_std": 2.9429705142974854, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.509200096130371, "rewards/no_repetition_reward_func": -0.26782407611608505, "rewards/verse_reward_func": -0.015625, "step": 1881 }, { "completion_length": 250.484375, "epoch": 15.056, "grad_norm": 4.40625, "kl": 4.841065526008606, "learning_rate": 4.4088139823771744e-05, "loss": 0.1936, "reward": 2.316160798072815, "reward_std": 2.3947535157203674, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.558337390422821, "rewards/no_repetition_reward_func": -0.2421766147017479, "rewards/verse_reward_func": 0.0, "step": 1882 }, { "completion_length": 251.84375, "epoch": 15.064, "grad_norm": 3.4375, "kl": 4.696600437164307, "learning_rate": 4.407912009132093e-05, "loss": 0.1879, "reward": 2.6073551177978516, "reward_std": 2.933571457862854, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.870538115501404, "rewards/no_repetition_reward_func": -0.26318296045064926, "rewards/verse_reward_func": 0.0, "step": 1883 }, { "completion_length": 242.65625, "epoch": 15.072, "grad_norm": 2.328125, "kl": 5.920075416564941, "learning_rate": 4.407009440755396e-05, "loss": 0.2368, "reward": 2.2604880332946777, "reward_std": 3.0735535621643066, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5679364800453186, "rewards/no_repetition_reward_func": -0.2683858275413513, "rewards/verse_reward_func": -0.0390625, "step": 1884 }, { "completion_length": 247.09375, "epoch": 15.08, "grad_norm": 2.140625, "kl": 4.161958456039429, "learning_rate": 4.40610627752862e-05, "loss": 0.1665, "reward": 2.281181812286377, "reward_std": 2.75810968875885, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5475757122039795, "rewards/no_repetition_reward_func": -0.26639416068792343, "rewards/verse_reward_func": 0.0, "step": 1885 }, { "completion_length": 256.0, "epoch": 15.088, "grad_norm": 1.8515625, "kl": 3.830838918685913, "learning_rate": 4.4052025197334864e-05, "loss": 0.1532, "reward": 2.940195679664612, "reward_std": 2.66694712638855, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.208266019821167, "rewards/no_repetition_reward_func": -0.26807017624378204, "rewards/verse_reward_func": 0.0, "step": 1886 }, { "completion_length": 243.671875, "epoch": 15.096, "grad_norm": 2.671875, "kl": 5.416364908218384, "learning_rate": 4.404298167651905e-05, "loss": 0.2167, "reward": 2.105177164077759, "reward_std": 2.873661160469055, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.369183301925659, "rewards/no_repetition_reward_func": -0.2561938762664795, "rewards/verse_reward_func": -0.0078125, "step": 1887 }, { "completion_length": 255.40625, "epoch": 15.104, "grad_norm": 1.8203125, "kl": 4.347419023513794, "learning_rate": 4.403393221565966e-05, "loss": 0.1739, "reward": 2.323427677154541, "reward_std": 3.0234490633010864, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.619462490081787, "rewards/no_repetition_reward_func": -0.2725972980260849, "rewards/verse_reward_func": -0.0234375, "step": 1888 }, { "completion_length": 253.8125, "epoch": 15.112, "grad_norm": 3.15625, "kl": 3.7790915966033936, "learning_rate": 4.40248768175795e-05, "loss": 0.1512, "reward": 2.883841037750244, "reward_std": 3.2332721948623657, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.206042528152466, "rewards/no_repetition_reward_func": -0.3065764755010605, "rewards/verse_reward_func": -0.015625, "step": 1889 }, { "completion_length": 251.875, "epoch": 15.12, "grad_norm": 1.390625, "kl": 4.2918713092803955, "learning_rate": 4.401581548510318e-05, "loss": 0.1717, "reward": 2.270224094390869, "reward_std": 2.995190143585205, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.569191336631775, "rewards/no_repetition_reward_func": -0.2833421677350998, "rewards/verse_reward_func": -0.015625, "step": 1890 }, { "completion_length": 255.453125, "epoch": 15.128, "grad_norm": 1.6171875, "kl": 4.5571818351745605, "learning_rate": 4.4006748221057206e-05, "loss": 0.1823, "reward": 1.7804439663887024, "reward_std": 2.716550588607788, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.06441193819046, "rewards/no_repetition_reward_func": -0.2839677780866623, "rewards/verse_reward_func": 0.0, "step": 1891 }, { "completion_length": 254.171875, "epoch": 15.136, "grad_norm": 1.546875, "kl": 4.331311225891113, "learning_rate": 4.3997675028269906e-05, "loss": 0.1733, "reward": 2.109216570854187, "reward_std": 2.5870800018310547, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3898308277130127, "rewards/no_repetition_reward_func": -0.2806141674518585, "rewards/verse_reward_func": 0.0, "step": 1892 }, { "completion_length": 248.484375, "epoch": 15.144, "grad_norm": 2.03125, "kl": 4.220902442932129, "learning_rate": 4.3988595909571464e-05, "loss": 0.1688, "reward": 2.420858144760132, "reward_std": 2.812934398651123, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.66280460357666, "rewards/no_repetition_reward_func": -0.24194657057523727, "rewards/verse_reward_func": 0.0, "step": 1893 }, { "completion_length": 245.921875, "epoch": 15.152, "grad_norm": 3.265625, "kl": 2.9288421869277954, "learning_rate": 4.3979510867793917e-05, "loss": 0.1172, "reward": 2.4544533491134644, "reward_std": 2.867854595184326, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7228652238845825, "rewards/no_repetition_reward_func": -0.2684117928147316, "rewards/verse_reward_func": 0.0, "step": 1894 }, { "completion_length": 254.21875, "epoch": 15.16, "grad_norm": 5.625, "kl": 3.4401066303253174, "learning_rate": 4.3970419905771145e-05, "loss": 0.1376, "reward": 3.253246545791626, "reward_std": 3.4306578636169434, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.5709320306777954, "rewards/no_repetition_reward_func": -0.3098730742931366, "rewards/verse_reward_func": -0.0078125, "step": 1895 }, { "completion_length": 248.3125, "epoch": 15.168, "grad_norm": 1.46875, "kl": 4.5326573848724365, "learning_rate": 4.396132302633886e-05, "loss": 0.1813, "reward": 2.1803849935531616, "reward_std": 3.2329858541488647, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.475848615169525, "rewards/no_repetition_reward_func": -0.27983856201171875, "rewards/verse_reward_func": -0.015625, "step": 1896 }, { "completion_length": 253.421875, "epoch": 15.176, "grad_norm": 1.859375, "kl": 5.212777137756348, "learning_rate": 4.395222023233466e-05, "loss": 0.2085, "reward": 2.4821897745132446, "reward_std": 3.073082447052002, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.788223147392273, "rewards/no_repetition_reward_func": -0.3060333877801895, "rewards/verse_reward_func": 0.0, "step": 1897 }, { "completion_length": 247.71875, "epoch": 15.184, "grad_norm": 3.4375, "kl": 4.607198476791382, "learning_rate": 4.394311152659796e-05, "loss": 0.1843, "reward": 2.1931151747703552, "reward_std": 2.637712240219116, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4560368061065674, "rewards/no_repetition_reward_func": -0.262921467423439, "rewards/verse_reward_func": 0.0, "step": 1898 }, { "completion_length": 243.453125, "epoch": 15.192, "grad_norm": 2.484375, "kl": 4.661084175109863, "learning_rate": 4.393399691197e-05, "loss": 0.1864, "reward": 2.1909136176109314, "reward_std": 3.016323685646057, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.479037880897522, "rewards/no_repetition_reward_func": -0.288124218583107, "rewards/verse_reward_func": 0.0, "step": 1899 }, { "completion_length": 250.015625, "epoch": 15.2, "grad_norm": 4.15625, "kl": 5.231050968170166, "learning_rate": 4.3924876391293915e-05, "loss": 0.2092, "reward": 1.9974160194396973, "reward_std": 2.5526092052459717, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.245129942893982, "rewards/no_repetition_reward_func": -0.24771391600370407, "rewards/verse_reward_func": 0.0, "step": 1900 }, { "completion_length": 246.3125, "epoch": 15.208, "grad_norm": 3.703125, "kl": 4.385200262069702, "learning_rate": 4.391574996741463e-05, "loss": 0.1754, "reward": 2.4095932245254517, "reward_std": 2.961822032928467, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6634762287139893, "rewards/no_repetition_reward_func": -0.2460707277059555, "rewards/verse_reward_func": -0.0078125, "step": 1901 }, { "completion_length": 248.171875, "epoch": 15.216, "grad_norm": 3.015625, "kl": 4.038858890533447, "learning_rate": 4.390661764317895e-05, "loss": 0.1616, "reward": 2.767369270324707, "reward_std": 2.991055488586426, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.063042998313904, "rewards/no_repetition_reward_func": -0.29567359387874603, "rewards/verse_reward_func": 0.0, "step": 1902 }, { "completion_length": 243.734375, "epoch": 15.224, "grad_norm": 3.671875, "kl": 3.9687020778656006, "learning_rate": 4.38974794214355e-05, "loss": 0.1587, "reward": 1.8985273838043213, "reward_std": 2.423047721385956, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1652644872665405, "rewards/no_repetition_reward_func": -0.2667369693517685, "rewards/verse_reward_func": 0.0, "step": 1903 }, { "completion_length": 252.1875, "epoch": 15.232, "grad_norm": 2.21875, "kl": 3.924084782600403, "learning_rate": 4.388833530503473e-05, "loss": 0.157, "reward": 2.1461784839630127, "reward_std": 2.9792336225509644, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.432167887687683, "rewards/no_repetition_reward_func": -0.2859894782304764, "rewards/verse_reward_func": 0.0, "step": 1904 }, { "completion_length": 248.609375, "epoch": 15.24, "grad_norm": 1.890625, "kl": 3.7125208377838135, "learning_rate": 4.387918529682898e-05, "loss": 0.1485, "reward": 2.172814726829529, "reward_std": 2.474555730819702, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.473851501941681, "rewards/no_repetition_reward_func": -0.29322443902492523, "rewards/verse_reward_func": -0.0078125, "step": 1905 }, { "completion_length": 249.1875, "epoch": 15.248, "grad_norm": 3.25, "kl": 3.41409432888031, "learning_rate": 4.387002939967237e-05, "loss": 0.1366, "reward": 2.6360021829605103, "reward_std": 3.2203450202941895, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.945147752761841, "rewards/no_repetition_reward_func": -0.30133335292339325, "rewards/verse_reward_func": -0.0078125, "step": 1906 }, { "completion_length": 254.703125, "epoch": 15.256, "grad_norm": 2.296875, "kl": 3.7533833980560303, "learning_rate": 4.386086761642091e-05, "loss": 0.1501, "reward": 2.400071620941162, "reward_std": 2.957008123397827, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6867438554763794, "rewards/no_repetition_reward_func": -0.28667210042476654, "rewards/verse_reward_func": 0.0, "step": 1907 }, { "completion_length": 242.875, "epoch": 15.264, "grad_norm": 3.21875, "kl": 3.3113869428634644, "learning_rate": 4.3851699949932396e-05, "loss": 0.1325, "reward": 2.4913793802261353, "reward_std": 2.82290256023407, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.772277593612671, "rewards/no_repetition_reward_func": -0.2808980941772461, "rewards/verse_reward_func": 0.0, "step": 1908 }, { "completion_length": 253.109375, "epoch": 15.272, "grad_norm": 1.5546875, "kl": 4.303768873214722, "learning_rate": 4.3842526403066486e-05, "loss": 0.1722, "reward": 2.509332060813904, "reward_std": 3.417597770690918, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.795004725456238, "rewards/no_repetition_reward_func": -0.2856725752353668, "rewards/verse_reward_func": 0.0, "step": 1909 }, { "completion_length": 245.53125, "epoch": 15.28, "grad_norm": 3.9375, "kl": 2.8169608116149902, "learning_rate": 4.3833346978684675e-05, "loss": 0.1127, "reward": 3.265145182609558, "reward_std": 3.1535102128982544, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.5584676265716553, "rewards/no_repetition_reward_func": -0.2855100482702255, "rewards/verse_reward_func": -0.0078125, "step": 1910 }, { "completion_length": 243.8125, "epoch": 15.288, "grad_norm": 2.8125, "kl": 3.593822717666626, "learning_rate": 4.382416167965028e-05, "loss": 0.1438, "reward": 2.5144923329353333, "reward_std": 2.8530243635177612, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.815654933452606, "rewards/no_repetition_reward_func": -0.28553760051727295, "rewards/verse_reward_func": -0.015625, "step": 1911 }, { "completion_length": 247.609375, "epoch": 15.296, "grad_norm": 2.390625, "kl": 4.771560192108154, "learning_rate": 4.381497050882845e-05, "loss": 0.1909, "reward": 2.1835964918136597, "reward_std": 3.0253257751464844, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.440984845161438, "rewards/no_repetition_reward_func": -0.25738830119371414, "rewards/verse_reward_func": 0.0, "step": 1912 }, { "completion_length": 250.09375, "epoch": 15.304, "grad_norm": 2.390625, "kl": 4.728482723236084, "learning_rate": 4.380577346908618e-05, "loss": 0.1891, "reward": 2.015505790710449, "reward_std": 2.725213408470154, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2532848119735718, "rewards/no_repetition_reward_func": -0.23777902126312256, "rewards/verse_reward_func": 0.0, "step": 1913 }, { "completion_length": 247.625, "epoch": 15.312, "grad_norm": 4.03125, "kl": 5.505231618881226, "learning_rate": 4.379657056329228e-05, "loss": 0.2202, "reward": 2.1163841485977173, "reward_std": 2.928795099258423, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3871248960494995, "rewards/no_repetition_reward_func": -0.2707407772541046, "rewards/verse_reward_func": 0.0, "step": 1914 }, { "completion_length": 252.90625, "epoch": 15.32, "grad_norm": 3.5625, "kl": 4.251887321472168, "learning_rate": 4.3787361794317405e-05, "loss": 0.1701, "reward": 2.718013048171997, "reward_std": 3.32116961479187, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.020382046699524, "rewards/no_repetition_reward_func": -0.30236899852752686, "rewards/verse_reward_func": 0.0, "step": 1915 }, { "completion_length": 242.140625, "epoch": 15.328, "grad_norm": 2.90625, "kl": 6.014037132263184, "learning_rate": 4.3778147165034025e-05, "loss": 0.2406, "reward": 1.9815815687179565, "reward_std": 2.8168119192123413, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2414259910583496, "rewards/no_repetition_reward_func": -0.2598443478345871, "rewards/verse_reward_func": 0.0, "step": 1916 }, { "completion_length": 250.328125, "epoch": 15.336, "grad_norm": 4.4375, "kl": 5.419339060783386, "learning_rate": 4.376892667831644e-05, "loss": 0.2168, "reward": 2.0263261795043945, "reward_std": 2.655854821205139, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.302597165107727, "rewards/no_repetition_reward_func": -0.26064593344926834, "rewards/verse_reward_func": -0.015625, "step": 1917 }, { "completion_length": 253.5, "epoch": 15.344, "grad_norm": 3.515625, "kl": 3.8267600536346436, "learning_rate": 4.375970033704077e-05, "loss": 0.1531, "reward": 2.9901148080825806, "reward_std": 3.0067862272262573, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2818915843963623, "rewards/no_repetition_reward_func": -0.2917768359184265, "rewards/verse_reward_func": 0.0, "step": 1918 }, { "completion_length": 251.875, "epoch": 15.352, "grad_norm": 1.84375, "kl": 4.422020077705383, "learning_rate": 4.375046814408499e-05, "loss": 0.1769, "reward": 2.5016796588897705, "reward_std": 2.8148467540740967, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7703840732574463, "rewards/no_repetition_reward_func": -0.26870445907115936, "rewards/verse_reward_func": 0.0, "step": 1919 }, { "completion_length": 249.15625, "epoch": 15.36, "grad_norm": 1.9140625, "kl": 4.484491586685181, "learning_rate": 4.374123010232888e-05, "loss": 0.1794, "reward": 1.7850871086120605, "reward_std": 2.8338329792022705, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0788137912750244, "rewards/no_repetition_reward_func": -0.2781016528606415, "rewards/verse_reward_func": -0.015625, "step": 1920 }, { "completion_length": 242.015625, "epoch": 15.368, "grad_norm": 1.828125, "kl": 3.9763710498809814, "learning_rate": 4.373198621465404e-05, "loss": 0.1591, "reward": 2.1851909160614014, "reward_std": 2.837754964828491, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.44238543510437, "rewards/no_repetition_reward_func": -0.25719480216503143, "rewards/verse_reward_func": 0.0, "step": 1921 }, { "completion_length": 252.984375, "epoch": 15.376, "grad_norm": 2.34375, "kl": 4.335798025131226, "learning_rate": 4.372273648394389e-05, "loss": 0.1734, "reward": 2.4258484840393066, "reward_std": 3.2498821020126343, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.712462902069092, "rewards/no_repetition_reward_func": -0.27880212664604187, "rewards/verse_reward_func": -0.0078125, "step": 1922 }, { "completion_length": 254.375, "epoch": 15.384, "grad_norm": 2.9375, "kl": 4.5338757038116455, "learning_rate": 4.37134809130837e-05, "loss": 0.1814, "reward": 3.1548917293548584, "reward_std": 3.460691452026367, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.483484983444214, "rewards/no_repetition_reward_func": -0.320780485868454, "rewards/verse_reward_func": -0.0078125, "step": 1923 }, { "completion_length": 251.3125, "epoch": 15.392, "grad_norm": 5.21875, "kl": 3.7421762943267822, "learning_rate": 4.370421950496054e-05, "loss": 0.1497, "reward": 2.768610119819641, "reward_std": 3.032158851623535, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.051107168197632, "rewards/no_repetition_reward_func": -0.28249718248844147, "rewards/verse_reward_func": 0.0, "step": 1924 }, { "completion_length": 251.453125, "epoch": 15.4, "grad_norm": 2.390625, "kl": 4.822558164596558, "learning_rate": 4.36949522624633e-05, "loss": 0.1929, "reward": 2.18838232755661, "reward_std": 2.7282036542892456, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.458312153816223, "rewards/no_repetition_reward_func": -0.26992981135845184, "rewards/verse_reward_func": 0.0, "step": 1925 }, { "completion_length": 253.234375, "epoch": 15.408, "grad_norm": 2.8125, "kl": 4.014338135719299, "learning_rate": 4.368567918848269e-05, "loss": 0.1606, "reward": 3.002095580101013, "reward_std": 3.0627713203430176, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.275018811225891, "rewards/no_repetition_reward_func": -0.27292345464229584, "rewards/verse_reward_func": 0.0, "step": 1926 }, { "completion_length": 248.90625, "epoch": 15.416, "grad_norm": 2.5625, "kl": 4.402298212051392, "learning_rate": 4.3676400285911256e-05, "loss": 0.1761, "reward": 2.181672692298889, "reward_std": 2.9000786542892456, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4376134872436523, "rewards/no_repetition_reward_func": -0.25594085454940796, "rewards/verse_reward_func": 0.0, "step": 1927 }, { "completion_length": 248.0625, "epoch": 15.424, "grad_norm": 3.0, "kl": 4.081200003623962, "learning_rate": 4.3667115557643336e-05, "loss": 0.1632, "reward": 2.989501714706421, "reward_std": 3.241212248802185, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.278168201446533, "rewards/no_repetition_reward_func": -0.2886667847633362, "rewards/verse_reward_func": 0.0, "step": 1928 }, { "completion_length": 249.765625, "epoch": 15.432, "grad_norm": 1.6484375, "kl": 5.271691560745239, "learning_rate": 4.3657825006575106e-05, "loss": 0.2109, "reward": 2.167717218399048, "reward_std": 3.058754801750183, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.435531795024872, "rewards/no_repetition_reward_func": -0.26781467348337173, "rewards/verse_reward_func": 0.0, "step": 1929 }, { "completion_length": 248.609375, "epoch": 15.44, "grad_norm": 3.671875, "kl": 5.780256509780884, "learning_rate": 4.3648528635604556e-05, "loss": 0.2312, "reward": 1.773254632949829, "reward_std": 2.6782466173171997, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0259182453155518, "rewards/no_repetition_reward_func": -0.23703864961862564, "rewards/verse_reward_func": -0.015625, "step": 1930 }, { "completion_length": 256.0, "epoch": 15.448, "grad_norm": 4.625, "kl": 4.371606826782227, "learning_rate": 4.363922644763147e-05, "loss": 0.1749, "reward": 2.5364439487457275, "reward_std": 2.2873001098632812, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.801063060760498, "rewards/no_repetition_reward_func": -0.26461896300315857, "rewards/verse_reward_func": 0.0, "step": 1931 }, { "completion_length": 249.03125, "epoch": 15.456, "grad_norm": 1.78125, "kl": 5.115919351577759, "learning_rate": 4.362991844555749e-05, "loss": 0.2046, "reward": 2.7217509746551514, "reward_std": 2.8447054624557495, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9835731983184814, "rewards/no_repetition_reward_func": -0.26182226836681366, "rewards/verse_reward_func": 0.0, "step": 1932 }, { "completion_length": 247.203125, "epoch": 15.464, "grad_norm": 3.5625, "kl": 4.6056342124938965, "learning_rate": 4.3620604632286024e-05, "loss": 0.1842, "reward": 1.8213541507720947, "reward_std": 2.5315921306610107, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0594805479049683, "rewards/no_repetition_reward_func": -0.23812635242938995, "rewards/verse_reward_func": 0.0, "step": 1933 }, { "completion_length": 254.09375, "epoch": 15.472, "grad_norm": 2.484375, "kl": 4.536587715148926, "learning_rate": 4.361128501072231e-05, "loss": 0.1815, "reward": 2.8992295265197754, "reward_std": 3.4515867233276367, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2055424451828003, "rewards/no_repetition_reward_func": -0.2828752398490906, "rewards/verse_reward_func": -0.0234375, "step": 1934 }, { "completion_length": 252.875, "epoch": 15.48, "grad_norm": 2.921875, "kl": 4.416388034820557, "learning_rate": 4.3601959583773415e-05, "loss": 0.1767, "reward": 2.397129237651825, "reward_std": 2.797735810279846, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.673172414302826, "rewards/no_repetition_reward_func": -0.2760431617498398, "rewards/verse_reward_func": 0.0, "step": 1935 }, { "completion_length": 251.484375, "epoch": 15.488, "grad_norm": 3.78125, "kl": 4.603592395782471, "learning_rate": 4.35926283543482e-05, "loss": 0.1841, "reward": 2.28049236536026, "reward_std": 2.588022232055664, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.568165898323059, "rewards/no_repetition_reward_func": -0.28767336905002594, "rewards/verse_reward_func": 0.0, "step": 1936 }, { "completion_length": 256.0, "epoch": 15.496, "grad_norm": 2.640625, "kl": 3.9685909748077393, "learning_rate": 4.358329132535733e-05, "loss": 0.1587, "reward": 2.613938808441162, "reward_std": 3.139711380004883, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9137908220291138, "rewards/no_repetition_reward_func": -0.29985208809375763, "rewards/verse_reward_func": 0.0, "step": 1937 }, { "completion_length": 244.015625, "epoch": 15.504, "grad_norm": 4.28125, "kl": 3.6361342668533325, "learning_rate": 4.35739484997133e-05, "loss": 0.1454, "reward": 2.9562315940856934, "reward_std": 3.2454217672348022, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.237965703010559, "rewards/no_repetition_reward_func": -0.2739216536283493, "rewards/verse_reward_func": -0.0078125, "step": 1938 }, { "completion_length": 249.0, "epoch": 15.512, "grad_norm": 2.515625, "kl": 3.3106751441955566, "learning_rate": 4.356459988033039e-05, "loss": 0.1324, "reward": 2.3192033767700195, "reward_std": 2.9060747623443604, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6016876697540283, "rewards/no_repetition_reward_func": -0.2824844568967819, "rewards/verse_reward_func": 0.0, "step": 1939 }, { "completion_length": 252.09375, "epoch": 15.52, "grad_norm": 3.484375, "kl": 3.567946672439575, "learning_rate": 4.355524547012471e-05, "loss": 0.1427, "reward": 2.76688814163208, "reward_std": 3.0252751111984253, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.069984793663025, "rewards/no_repetition_reward_func": -0.29528407752513885, "rewards/verse_reward_func": -0.0078125, "step": 1940 }, { "completion_length": 252.515625, "epoch": 15.528, "grad_norm": 2.765625, "kl": 3.3430105447769165, "learning_rate": 4.354588527201414e-05, "loss": 0.1337, "reward": 3.2398449182510376, "reward_std": 3.2068936824798584, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.536606788635254, "rewards/no_repetition_reward_func": -0.2967619299888611, "rewards/verse_reward_func": 0.0, "step": 1941 }, { "completion_length": 251.0625, "epoch": 15.536, "grad_norm": 3.40625, "kl": 4.70851993560791, "learning_rate": 4.353651928891842e-05, "loss": 0.1883, "reward": 2.12472665309906, "reward_std": 2.8912397623062134, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.425307035446167, "rewards/no_repetition_reward_func": -0.300580233335495, "rewards/verse_reward_func": 0.0, "step": 1942 }, { "completion_length": 247.59375, "epoch": 15.544, "grad_norm": 1.71875, "kl": 4.551942348480225, "learning_rate": 4.352714752375906e-05, "loss": 0.1821, "reward": 2.7611074447631836, "reward_std": 3.2555607557296753, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0359328985214233, "rewards/no_repetition_reward_func": -0.2748256325721741, "rewards/verse_reward_func": 0.0, "step": 1943 }, { "completion_length": 248.59375, "epoch": 15.552, "grad_norm": 3.9375, "kl": 5.562926769256592, "learning_rate": 4.351776997945936e-05, "loss": 0.2225, "reward": 2.271789073944092, "reward_std": 2.5765098333358765, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5597381591796875, "rewards/no_repetition_reward_func": -0.28013670444488525, "rewards/verse_reward_func": -0.0078125, "step": 1944 }, { "completion_length": 251.0, "epoch": 15.56, "grad_norm": 3.390625, "kl": 3.9454039335250854, "learning_rate": 4.350838665894446e-05, "loss": 0.1578, "reward": 3.0472158193588257, "reward_std": 2.944372773170471, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3321704864501953, "rewards/no_repetition_reward_func": -0.2849546819925308, "rewards/verse_reward_func": 0.0, "step": 1945 }, { "completion_length": 256.0, "epoch": 15.568, "grad_norm": 3.828125, "kl": 5.0523985624313354, "learning_rate": 4.3498997565141267e-05, "loss": 0.2021, "reward": 2.369828999042511, "reward_std": 2.89030385017395, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6626380681991577, "rewards/no_repetition_reward_func": -0.28499653935432434, "rewards/verse_reward_func": -0.0078125, "step": 1946 }, { "completion_length": 245.578125, "epoch": 15.576, "grad_norm": 1.9765625, "kl": 4.624791145324707, "learning_rate": 4.348960270097851e-05, "loss": 0.185, "reward": 2.871240973472595, "reward_std": 2.9992635250091553, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1556233167648315, "rewards/no_repetition_reward_func": -0.2843824103474617, "rewards/verse_reward_func": 0.0, "step": 1947 }, { "completion_length": 251.046875, "epoch": 15.584, "grad_norm": 3.84375, "kl": 4.882596015930176, "learning_rate": 4.348020206938672e-05, "loss": 0.1953, "reward": 2.5311899185180664, "reward_std": 2.2062649726867676, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8103060722351074, "rewards/no_repetition_reward_func": -0.271303728222847, "rewards/verse_reward_func": -0.0078125, "step": 1948 }, { "completion_length": 254.234375, "epoch": 15.592, "grad_norm": 1.75, "kl": 4.732769250869751, "learning_rate": 4.3470795673298206e-05, "loss": 0.1893, "reward": 2.2005144357681274, "reward_std": 2.9487167596817017, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4639862775802612, "rewards/no_repetition_reward_func": -0.2634718641638756, "rewards/verse_reward_func": 0.0, "step": 1949 }, { "completion_length": 247.421875, "epoch": 15.6, "grad_norm": 2.515625, "kl": 4.52947211265564, "learning_rate": 4.3461383515647106e-05, "loss": 0.1812, "reward": 2.3382840156555176, "reward_std": 3.0953516960144043, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6190015077590942, "rewards/no_repetition_reward_func": -0.280717596411705, "rewards/verse_reward_func": 0.0, "step": 1950 }, { "completion_length": 254.484375, "epoch": 15.608, "grad_norm": 1.921875, "kl": 4.650259494781494, "learning_rate": 4.345196559936932e-05, "loss": 0.186, "reward": 2.418691635131836, "reward_std": 2.6909754276275635, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6910871267318726, "rewards/no_repetition_reward_func": -0.25677046179771423, "rewards/verse_reward_func": -0.015625, "step": 1951 }, { "completion_length": 250.09375, "epoch": 15.616, "grad_norm": 1.921875, "kl": 4.498561143875122, "learning_rate": 4.3442541927402566e-05, "loss": 0.1799, "reward": 2.409904420375824, "reward_std": 2.397567629814148, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.664545953273773, "rewards/no_repetition_reward_func": -0.25464149564504623, "rewards/verse_reward_func": 0.0, "step": 1952 }, { "completion_length": 249.375, "epoch": 15.624, "grad_norm": 1.9296875, "kl": 4.28048300743103, "learning_rate": 4.3433112502686355e-05, "loss": 0.1712, "reward": 2.6278034448623657, "reward_std": 2.8868753910064697, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9071340560913086, "rewards/no_repetition_reward_func": -0.27151794731616974, "rewards/verse_reward_func": -0.0078125, "step": 1953 }, { "completion_length": 253.375, "epoch": 15.632, "grad_norm": 1.2421875, "kl": 4.895526885986328, "learning_rate": 4.3423677328161996e-05, "loss": 0.1958, "reward": 2.0899171829223633, "reward_std": 3.136238217353821, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.363005757331848, "rewards/no_repetition_reward_func": -0.26527608931064606, "rewards/verse_reward_func": -0.0078125, "step": 1954 }, { "completion_length": 249.640625, "epoch": 15.64, "grad_norm": 1.9609375, "kl": 4.366714954376221, "learning_rate": 4.3414236406772584e-05, "loss": 0.1747, "reward": 2.308618903160095, "reward_std": 3.002771019935608, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5784831047058105, "rewards/no_repetition_reward_func": -0.26986412703990936, "rewards/verse_reward_func": 0.0, "step": 1955 }, { "completion_length": 256.0, "epoch": 15.648, "grad_norm": 8.375, "kl": 4.9999778270721436, "learning_rate": 4.3404789741463e-05, "loss": 0.2, "reward": 2.058157980442047, "reward_std": 2.9569430351257324, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3305609226226807, "rewards/no_repetition_reward_func": -0.27240297198295593, "rewards/verse_reward_func": 0.0, "step": 1956 }, { "completion_length": 252.140625, "epoch": 15.656, "grad_norm": 1.515625, "kl": 3.7666057348251343, "learning_rate": 4.3395337335179945e-05, "loss": 0.1507, "reward": 2.6250863075256348, "reward_std": 2.947804570198059, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.936606526374817, "rewards/no_repetition_reward_func": -0.29589517414569855, "rewards/verse_reward_func": -0.015625, "step": 1957 }, { "completion_length": 252.546875, "epoch": 15.664, "grad_norm": 2.609375, "kl": 3.746150493621826, "learning_rate": 4.338587919087187e-05, "loss": 0.1498, "reward": 2.3785486221313477, "reward_std": 3.120712995529175, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6863811016082764, "rewards/no_repetition_reward_func": -0.30783237516880035, "rewards/verse_reward_func": 0.0, "step": 1958 }, { "completion_length": 253.671875, "epoch": 15.672, "grad_norm": 2.375, "kl": 5.311643362045288, "learning_rate": 4.3376415311489056e-05, "loss": 0.2125, "reward": 1.8172283172607422, "reward_std": 2.5802093744277954, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.084168314933777, "rewards/no_repetition_reward_func": -0.2669401168823242, "rewards/verse_reward_func": 0.0, "step": 1959 }, { "completion_length": 251.546875, "epoch": 15.68, "grad_norm": 2.109375, "kl": 4.367805480957031, "learning_rate": 4.336694569998354e-05, "loss": 0.1747, "reward": 2.3341182470321655, "reward_std": 3.05521559715271, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6307636499404907, "rewards/no_repetition_reward_func": -0.2966454029083252, "rewards/verse_reward_func": 0.0, "step": 1960 }, { "completion_length": 251.625, "epoch": 15.688, "grad_norm": 1.1640625, "kl": 3.7506661415100098, "learning_rate": 4.335747035930916e-05, "loss": 0.15, "reward": 2.1130565404891968, "reward_std": 2.8980079889297485, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.354825496673584, "rewards/no_repetition_reward_func": -0.233956441283226, "rewards/verse_reward_func": -0.0078125, "step": 1961 }, { "completion_length": 251.578125, "epoch": 15.696, "grad_norm": 6.46875, "kl": 5.626330614089966, "learning_rate": 4.334798929242155e-05, "loss": 0.2251, "reward": 1.4031208753585815, "reward_std": 2.1796507239341736, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6704459190368652, "rewards/no_repetition_reward_func": -0.2516999989748001, "rewards/verse_reward_func": -0.015625, "step": 1962 }, { "completion_length": 246.484375, "epoch": 15.704, "grad_norm": 2.875, "kl": 4.463950872421265, "learning_rate": 4.3338502502278134e-05, "loss": 0.1786, "reward": 1.8979089856147766, "reward_std": 2.58233904838562, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1786863803863525, "rewards/no_repetition_reward_func": -0.249527245759964, "rewards/verse_reward_func": -0.03125, "step": 1963 }, { "completion_length": 253.90625, "epoch": 15.712, "grad_norm": 2.6875, "kl": 3.801874041557312, "learning_rate": 4.3329009991838084e-05, "loss": 0.1521, "reward": 2.6338316202163696, "reward_std": 2.749624729156494, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.916925549507141, "rewards/no_repetition_reward_func": -0.28309397399425507, "rewards/verse_reward_func": 0.0, "step": 1964 }, { "completion_length": 252.875, "epoch": 15.72, "grad_norm": 3.890625, "kl": 3.367870569229126, "learning_rate": 4.331951176406239e-05, "loss": 0.1347, "reward": 2.917152762413025, "reward_std": 2.838918685913086, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.233766794204712, "rewards/no_repetition_reward_func": -0.3166138827800751, "rewards/verse_reward_func": 0.0, "step": 1965 }, { "completion_length": 250.828125, "epoch": 15.728, "grad_norm": 3.390625, "kl": 4.659736156463623, "learning_rate": 4.3310007821913836e-05, "loss": 0.1864, "reward": 1.449320912361145, "reward_std": 2.5617053508758545, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7476667165756226, "rewards/no_repetition_reward_func": -0.2827208787202835, "rewards/verse_reward_func": -0.015625, "step": 1966 }, { "completion_length": 256.0, "epoch": 15.736, "grad_norm": 2.71875, "kl": 4.194053411483765, "learning_rate": 4.330049816835694e-05, "loss": 0.1678, "reward": 2.1081700325012207, "reward_std": 2.2959336042404175, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.346763849258423, "rewards/no_repetition_reward_func": -0.23859386146068573, "rewards/verse_reward_func": 0.0, "step": 1967 }, { "completion_length": 252.15625, "epoch": 15.744, "grad_norm": 3.90625, "kl": 3.440519332885742, "learning_rate": 4.3290982806358046e-05, "loss": 0.1376, "reward": 2.8931998014450073, "reward_std": 3.4139212369918823, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1777459383010864, "rewards/no_repetition_reward_func": -0.2845461368560791, "rewards/verse_reward_func": 0.0, "step": 1968 }, { "completion_length": 248.203125, "epoch": 15.752, "grad_norm": 1.6015625, "kl": 4.254030227661133, "learning_rate": 4.3281461738885274e-05, "loss": 0.1702, "reward": 1.7430872321128845, "reward_std": 2.53483247756958, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9559172987937927, "rewards/no_repetition_reward_func": -0.20501763373613358, "rewards/verse_reward_func": -0.0078125, "step": 1969 }, { "completion_length": 244.328125, "epoch": 15.76, "grad_norm": 4.03125, "kl": 3.0615835189819336, "learning_rate": 4.3271934968908514e-05, "loss": 0.1225, "reward": 2.636647343635559, "reward_std": 3.1606961488723755, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9142372608184814, "rewards/no_repetition_reward_func": -0.2775898575782776, "rewards/verse_reward_func": 0.0, "step": 1970 }, { "completion_length": 248.609375, "epoch": 15.768, "grad_norm": 2.4375, "kl": 3.8852884769439697, "learning_rate": 4.3262402499399404e-05, "loss": 0.1554, "reward": 2.012938380241394, "reward_std": 3.0235801935195923, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2981162667274475, "rewards/no_repetition_reward_func": -0.27736541628837585, "rewards/verse_reward_func": -0.0078125, "step": 1971 }, { "completion_length": 253.09375, "epoch": 15.776, "grad_norm": 4.46875, "kl": 4.676281571388245, "learning_rate": 4.325286433333142e-05, "loss": 0.1871, "reward": 1.7938870191574097, "reward_std": 2.507715106010437, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0383352041244507, "rewards/no_repetition_reward_func": -0.23663564771413803, "rewards/verse_reward_func": -0.0078125, "step": 1972 }, { "completion_length": 246.1875, "epoch": 15.784, "grad_norm": 2.921875, "kl": 2.9422214031219482, "learning_rate": 4.3243320473679785e-05, "loss": 0.1177, "reward": 2.959365963935852, "reward_std": 2.9565658569335938, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2432082891464233, "rewards/no_repetition_reward_func": -0.2838422358036041, "rewards/verse_reward_func": 0.0, "step": 1973 }, { "completion_length": 250.0, "epoch": 15.792, "grad_norm": 4.0625, "kl": 5.37203311920166, "learning_rate": 4.323377092342148e-05, "loss": 0.2149, "reward": 1.5855215191841125, "reward_std": 2.6637980937957764, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8340381979942322, "rewards/no_repetition_reward_func": -0.24070428311824799, "rewards/verse_reward_func": -0.0078125, "step": 1974 }, { "completion_length": 248.171875, "epoch": 15.8, "grad_norm": 3.53125, "kl": 3.9619189500808716, "learning_rate": 4.3224215685535294e-05, "loss": 0.1585, "reward": 2.256523013114929, "reward_std": 3.31779682636261, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.529141664505005, "rewards/no_repetition_reward_func": -0.2569936290383339, "rewards/verse_reward_func": -0.015625, "step": 1975 }, { "completion_length": 245.796875, "epoch": 15.808, "grad_norm": 2.34375, "kl": 3.268476724624634, "learning_rate": 4.321465476300177e-05, "loss": 0.1307, "reward": 2.392178237438202, "reward_std": 2.5054941177368164, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6400293111801147, "rewards/no_repetition_reward_func": -0.24003854393959045, "rewards/verse_reward_func": -0.0078125, "step": 1976 }, { "completion_length": 246.046875, "epoch": 15.816, "grad_norm": 2.328125, "kl": 4.069667458534241, "learning_rate": 4.3205088158803226e-05, "loss": 0.1628, "reward": 2.1872477531433105, "reward_std": 2.344725251197815, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4798280000686646, "rewards/no_repetition_reward_func": -0.26914268732070923, "rewards/verse_reward_func": -0.0234375, "step": 1977 }, { "completion_length": 249.515625, "epoch": 15.824, "grad_norm": 1.8515625, "kl": 3.8402979373931885, "learning_rate": 4.319551587592376e-05, "loss": 0.1536, "reward": 1.9208924770355225, "reward_std": 2.5809158086776733, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.171612560749054, "rewards/no_repetition_reward_func": -0.24290768057107925, "rewards/verse_reward_func": -0.0078125, "step": 1978 }, { "completion_length": 250.609375, "epoch": 15.832, "grad_norm": 1.6796875, "kl": 3.9259971380233765, "learning_rate": 4.318593791734924e-05, "loss": 0.157, "reward": 1.6694721579551697, "reward_std": 2.7407898902893066, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8858425617218018, "rewards/no_repetition_reward_func": -0.21637049317359924, "rewards/verse_reward_func": 0.0, "step": 1979 }, { "completion_length": 252.46875, "epoch": 15.84, "grad_norm": 2.109375, "kl": 4.325313687324524, "learning_rate": 4.31763542860673e-05, "loss": 0.173, "reward": 2.6802098751068115, "reward_std": 3.1546664237976074, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9617995023727417, "rewards/no_repetition_reward_func": -0.258152037858963, "rewards/verse_reward_func": -0.0234375, "step": 1980 }, { "completion_length": 251.640625, "epoch": 15.848, "grad_norm": 1.671875, "kl": 4.082546830177307, "learning_rate": 4.3166764985067343e-05, "loss": 0.1633, "reward": 1.8293527960777283, "reward_std": 2.5938640832901, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0788639187812805, "rewards/no_repetition_reward_func": -0.2495112270116806, "rewards/verse_reward_func": 0.0, "step": 1981 }, { "completion_length": 254.5625, "epoch": 15.856, "grad_norm": 2.015625, "kl": 4.749397277832031, "learning_rate": 4.3157170017340545e-05, "loss": 0.19, "reward": 2.0503318309783936, "reward_std": 2.7902292013168335, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.291667938232422, "rewards/no_repetition_reward_func": -0.2413361594080925, "rewards/verse_reward_func": 0.0, "step": 1982 }, { "completion_length": 252.765625, "epoch": 15.864, "grad_norm": 3.203125, "kl": 2.8974578380584717, "learning_rate": 4.314756938587984e-05, "loss": 0.1159, "reward": 2.809098482131958, "reward_std": 2.7088942527770996, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.138185977935791, "rewards/no_repetition_reward_func": -0.32908739149570465, "rewards/verse_reward_func": 0.0, "step": 1983 }, { "completion_length": 256.0, "epoch": 15.872, "grad_norm": 1.921875, "kl": 4.167138338088989, "learning_rate": 4.3137963093679945e-05, "loss": 0.1667, "reward": 2.6023213863372803, "reward_std": 2.9206383228302, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.898052215576172, "rewards/no_repetition_reward_func": -0.2957307696342468, "rewards/verse_reward_func": 0.0, "step": 1984 }, { "completion_length": 256.0, "epoch": 15.88, "grad_norm": 2.671875, "kl": 4.572241544723511, "learning_rate": 4.3128351143737335e-05, "loss": 0.1829, "reward": 2.0053515434265137, "reward_std": 2.8076164722442627, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2722856998443604, "rewards/no_repetition_reward_func": -0.2669341564178467, "rewards/verse_reward_func": 0.0, "step": 1985 }, { "completion_length": 238.703125, "epoch": 15.888, "grad_norm": 3.734375, "kl": 3.203515887260437, "learning_rate": 4.3118733539050244e-05, "loss": 0.1281, "reward": 2.488276481628418, "reward_std": 3.1856768131256104, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8000574111938477, "rewards/no_repetition_reward_func": -0.3117811381816864, "rewards/verse_reward_func": 0.0, "step": 1986 }, { "completion_length": 247.59375, "epoch": 15.896, "grad_norm": 219.0, "kl": 13.467916488647461, "learning_rate": 4.310911028261867e-05, "loss": 0.5387, "reward": 1.5924710631370544, "reward_std": 2.5694520473480225, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8591606616973877, "rewards/no_repetition_reward_func": -0.25106464326381683, "rewards/verse_reward_func": -0.015625, "step": 1987 }, { "completion_length": 251.015625, "epoch": 15.904, "grad_norm": 3.578125, "kl": 3.9665470123291016, "learning_rate": 4.3099481377444384e-05, "loss": 0.1587, "reward": 2.5437588691711426, "reward_std": 2.924347996711731, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.823539137840271, "rewards/no_repetition_reward_func": -0.2719678580760956, "rewards/verse_reward_func": -0.0078125, "step": 1988 }, { "completion_length": 242.375, "epoch": 15.912, "grad_norm": 1.921875, "kl": 4.62802791595459, "learning_rate": 4.308984682653092e-05, "loss": 0.1851, "reward": 1.862101972103119, "reward_std": 2.8860541582107544, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1436949968338013, "rewards/no_repetition_reward_func": -0.2737804651260376, "rewards/verse_reward_func": -0.0078125, "step": 1989 }, { "completion_length": 250.546875, "epoch": 15.92, "grad_norm": 1.4140625, "kl": 4.3254714012146, "learning_rate": 4.3080206632883554e-05, "loss": 0.173, "reward": 2.1868484020233154, "reward_std": 3.004486918449402, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5066983699798584, "rewards/no_repetition_reward_func": -0.3042248636484146, "rewards/verse_reward_func": -0.015625, "step": 1990 }, { "completion_length": 252.8125, "epoch": 15.928, "grad_norm": 2.375, "kl": 4.114288330078125, "learning_rate": 4.307056079950934e-05, "loss": 0.1646, "reward": 1.930272102355957, "reward_std": 2.7537083625793457, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.211776375770569, "rewards/no_repetition_reward_func": -0.2815043479204178, "rewards/verse_reward_func": 0.0, "step": 1991 }, { "completion_length": 256.0, "epoch": 15.936, "grad_norm": 1.703125, "kl": 3.8989473581314087, "learning_rate": 4.306090932941708e-05, "loss": 0.156, "reward": 2.20322847366333, "reward_std": 2.9499322175979614, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4669398069381714, "rewards/no_repetition_reward_func": -0.2637113109230995, "rewards/verse_reward_func": 0.0, "step": 1992 }, { "completion_length": 250.09375, "epoch": 15.943999999999999, "grad_norm": 5.21875, "kl": 5.543377161026001, "learning_rate": 4.305125222561736e-05, "loss": 0.2217, "reward": 2.1621336936950684, "reward_std": 2.7380876541137695, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.427311658859253, "rewards/no_repetition_reward_func": -0.26517799496650696, "rewards/verse_reward_func": 0.0, "step": 1993 }, { "completion_length": 243.15625, "epoch": 15.952, "grad_norm": 3.96875, "kl": 4.756883144378662, "learning_rate": 4.304158949112247e-05, "loss": 0.1903, "reward": 2.7938389778137207, "reward_std": 3.241580843925476, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.082369089126587, "rewards/no_repetition_reward_func": -0.28853006660938263, "rewards/verse_reward_func": 0.0, "step": 1994 }, { "completion_length": 255.21875, "epoch": 15.96, "grad_norm": 2.25, "kl": 4.033013224601746, "learning_rate": 4.303192112894652e-05, "loss": 0.1613, "reward": 2.5403307676315308, "reward_std": 3.1592081785202026, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.831570625305176, "rewards/no_repetition_reward_func": -0.29124005138874054, "rewards/verse_reward_func": 0.0, "step": 1995 }, { "completion_length": 232.265625, "epoch": 15.968, "grad_norm": 2.625, "kl": 3.7388107776641846, "learning_rate": 4.302224714210532e-05, "loss": 0.1496, "reward": 2.417954206466675, "reward_std": 3.0184186697006226, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.689143419265747, "rewards/no_repetition_reward_func": -0.24775183200836182, "rewards/verse_reward_func": -0.0234375, "step": 1996 }, { "completion_length": 239.875, "epoch": 15.975999999999999, "grad_norm": 5.40625, "kl": 5.001095294952393, "learning_rate": 4.301256753361649e-05, "loss": 0.2, "reward": 2.043823540210724, "reward_std": 2.7438331842422485, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.305271625518799, "rewards/no_repetition_reward_func": -0.2536356747150421, "rewards/verse_reward_func": -0.0078125, "step": 1997 }, { "completion_length": 253.328125, "epoch": 15.984, "grad_norm": 4.53125, "kl": 5.212447881698608, "learning_rate": 4.3002882306499345e-05, "loss": 0.2085, "reward": 1.4110397696495056, "reward_std": 2.173193097114563, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.694887936115265, "rewards/no_repetition_reward_func": -0.2838480770587921, "rewards/verse_reward_func": 0.0, "step": 1998 }, { "completion_length": 254.859375, "epoch": 15.992, "grad_norm": 1.4921875, "kl": 4.926250219345093, "learning_rate": 4.2993191463774997e-05, "loss": 0.1971, "reward": 2.497469425201416, "reward_std": 2.976551055908203, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7778931856155396, "rewards/no_repetition_reward_func": -0.2804238945245743, "rewards/verse_reward_func": 0.0, "step": 1999 }, { "completion_length": 256.0, "epoch": 16.0, "grad_norm": 2.640625, "kl": 4.554437637329102, "learning_rate": 4.2983495008466276e-05, "loss": 0.1822, "reward": 2.3587318658828735, "reward_std": 2.9353495836257935, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.616880416870117, "rewards/no_repetition_reward_func": -0.25814834237098694, "rewards/verse_reward_func": 0.0, "step": 2000 }, { "completion_length": 250.296875, "epoch": 16.008, "grad_norm": 3.171875, "kl": 4.086626052856445, "learning_rate": 4.297379294359781e-05, "loss": 0.1635, "reward": 2.8751347064971924, "reward_std": 3.535637855529785, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.163716197013855, "rewards/no_repetition_reward_func": -0.2885814607143402, "rewards/verse_reward_func": 0.0, "step": 2001 }, { "completion_length": 245.390625, "epoch": 16.016, "grad_norm": 3.484375, "kl": 4.414862871170044, "learning_rate": 4.296408527219592e-05, "loss": 0.1766, "reward": 2.2594146728515625, "reward_std": 3.08645761013031, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.558635115623474, "rewards/no_repetition_reward_func": -0.2914083003997803, "rewards/verse_reward_func": -0.0078125, "step": 2002 }, { "completion_length": 245.1875, "epoch": 16.024, "grad_norm": 1.671875, "kl": 4.391742944717407, "learning_rate": 4.295437199728871e-05, "loss": 0.1757, "reward": 2.0620169639587402, "reward_std": 3.08146333694458, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.302562117576599, "rewards/no_repetition_reward_func": -0.23273248970508575, "rewards/verse_reward_func": -0.0078125, "step": 2003 }, { "completion_length": 256.0, "epoch": 16.032, "grad_norm": 1.984375, "kl": 4.093780279159546, "learning_rate": 4.294465312190603e-05, "loss": 0.1638, "reward": 2.737395405769348, "reward_std": 3.0245018005371094, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0026605129241943, "rewards/no_repetition_reward_func": -0.2652650475502014, "rewards/verse_reward_func": 0.0, "step": 2004 }, { "completion_length": 248.421875, "epoch": 16.04, "grad_norm": 2.078125, "kl": 4.834210991859436, "learning_rate": 4.293492864907947e-05, "loss": 0.1934, "reward": 2.265858769416809, "reward_std": 3.0050714015960693, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.532714068889618, "rewards/no_repetition_reward_func": -0.26685551553964615, "rewards/verse_reward_func": 0.0, "step": 2005 }, { "completion_length": 256.0, "epoch": 16.048, "grad_norm": 2.828125, "kl": 4.388290762901306, "learning_rate": 4.292519858184236e-05, "loss": 0.1755, "reward": 2.82116436958313, "reward_std": 3.15070378780365, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.097386956214905, "rewards/no_repetition_reward_func": -0.2684101313352585, "rewards/verse_reward_func": -0.0078125, "step": 2006 }, { "completion_length": 244.359375, "epoch": 16.056, "grad_norm": 2.484375, "kl": 5.320013761520386, "learning_rate": 4.291546292322979e-05, "loss": 0.2128, "reward": 2.150602102279663, "reward_std": 3.0160661935806274, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.396867036819458, "rewards/no_repetition_reward_func": -0.24626502394676208, "rewards/verse_reward_func": 0.0, "step": 2007 }, { "completion_length": 249.8125, "epoch": 16.064, "grad_norm": 2.4375, "kl": 4.435844302177429, "learning_rate": 4.290572167627859e-05, "loss": 0.1774, "reward": 2.558750808238983, "reward_std": 3.096150755882263, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.881718635559082, "rewards/no_repetition_reward_func": -0.30734288692474365, "rewards/verse_reward_func": -0.015625, "step": 2008 }, { "completion_length": 247.421875, "epoch": 16.072, "grad_norm": 5.96875, "kl": 5.353910446166992, "learning_rate": 4.289597484402732e-05, "loss": 0.2142, "reward": 2.0279675126075745, "reward_std": 2.561111569404602, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.267630100250244, "rewards/no_repetition_reward_func": -0.23184999823570251, "rewards/verse_reward_func": -0.0078125, "step": 2009 }, { "completion_length": 250.859375, "epoch": 16.08, "grad_norm": 3.578125, "kl": 3.433050751686096, "learning_rate": 4.2886222429516296e-05, "loss": 0.1373, "reward": 3.37116539478302, "reward_std": 3.3479437828063965, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.6854294538497925, "rewards/no_repetition_reward_func": -0.314264178276062, "rewards/verse_reward_func": 0.0, "step": 2010 }, { "completion_length": 246.234375, "epoch": 16.088, "grad_norm": 3.78125, "kl": 5.765387296676636, "learning_rate": 4.287646443578758e-05, "loss": 0.2306, "reward": 2.470949113368988, "reward_std": 2.8514513969421387, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.76553213596344, "rewards/no_repetition_reward_func": -0.28677038848400116, "rewards/verse_reward_func": -0.0078125, "step": 2011 }, { "completion_length": 251.609375, "epoch": 16.096, "grad_norm": 1.28125, "kl": 4.928059101104736, "learning_rate": 4.2866700865884954e-05, "loss": 0.1971, "reward": 2.7888697385787964, "reward_std": 2.9226471185684204, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.055670738220215, "rewards/no_repetition_reward_func": -0.2668011337518692, "rewards/verse_reward_func": 0.0, "step": 2012 }, { "completion_length": 255.28125, "epoch": 16.104, "grad_norm": 4.0, "kl": 4.420851230621338, "learning_rate": 4.285693172285396e-05, "loss": 0.1768, "reward": 2.5564465522766113, "reward_std": 2.7298468351364136, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.841257333755493, "rewards/no_repetition_reward_func": -0.2848109155893326, "rewards/verse_reward_func": 0.0, "step": 2013 }, { "completion_length": 253.703125, "epoch": 16.112, "grad_norm": 4.25, "kl": 4.9153172969818115, "learning_rate": 4.2847157009741856e-05, "loss": 0.1966, "reward": 2.4379292726516724, "reward_std": 2.4852548241615295, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7024906873703003, "rewards/no_repetition_reward_func": -0.2645612135529518, "rewards/verse_reward_func": 0.0, "step": 2014 }, { "completion_length": 251.4375, "epoch": 16.12, "grad_norm": 3.015625, "kl": 5.073746204376221, "learning_rate": 4.283737672959766e-05, "loss": 0.2029, "reward": 3.1052740812301636, "reward_std": 3.15091073513031, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.404602527618408, "rewards/no_repetition_reward_func": -0.291516050696373, "rewards/verse_reward_func": -0.0078125, "step": 2015 }, { "completion_length": 250.0625, "epoch": 16.128, "grad_norm": 1.703125, "kl": 5.306354522705078, "learning_rate": 4.2827590885472125e-05, "loss": 0.2123, "reward": 2.4128137826919556, "reward_std": 3.034698486328125, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.682408928871155, "rewards/no_repetition_reward_func": -0.2617823928594589, "rewards/verse_reward_func": -0.0078125, "step": 2016 }, { "completion_length": 245.9375, "epoch": 16.136, "grad_norm": 2.546875, "kl": 4.188196182250977, "learning_rate": 4.281779948041772e-05, "loss": 0.1675, "reward": 2.5709891319274902, "reward_std": 3.2484936714172363, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8593764305114746, "rewards/no_repetition_reward_func": -0.2883875221014023, "rewards/verse_reward_func": 0.0, "step": 2017 }, { "completion_length": 254.140625, "epoch": 16.144, "grad_norm": 2.59375, "kl": 4.977764129638672, "learning_rate": 4.2808002517488667e-05, "loss": 0.1991, "reward": 2.5786439180374146, "reward_std": 3.159010887145996, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.867642879486084, "rewards/no_repetition_reward_func": -0.27337392419576645, "rewards/verse_reward_func": -0.015625, "step": 2018 }, { "completion_length": 255.609375, "epoch": 16.152, "grad_norm": 1.515625, "kl": 4.272296190261841, "learning_rate": 4.279819999974091e-05, "loss": 0.1709, "reward": 2.536271572113037, "reward_std": 2.9628504514694214, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.78044331073761, "rewards/no_repetition_reward_func": -0.24417173862457275, "rewards/verse_reward_func": 0.0, "step": 2019 }, { "completion_length": 246.078125, "epoch": 16.16, "grad_norm": 2.625, "kl": 4.147577285766602, "learning_rate": 4.278839193023214e-05, "loss": 0.1659, "reward": 3.1951863765716553, "reward_std": 3.091111660003662, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4897125959396362, "rewards/no_repetition_reward_func": -0.2867136597633362, "rewards/verse_reward_func": -0.0078125, "step": 2020 }, { "completion_length": 252.390625, "epoch": 16.168, "grad_norm": 6.28125, "kl": 5.474232912063599, "learning_rate": 4.2778578312021754e-05, "loss": 0.219, "reward": 2.369078040122986, "reward_std": 2.9196895360946655, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.635619640350342, "rewards/no_repetition_reward_func": -0.2665414810180664, "rewards/verse_reward_func": 0.0, "step": 2021 }, { "completion_length": 252.328125, "epoch": 16.176, "grad_norm": 6.875, "kl": 6.119222164154053, "learning_rate": 4.2768759148170915e-05, "loss": 0.2448, "reward": 1.6328664422035217, "reward_std": 2.408880352973938, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8931422233581543, "rewards/no_repetition_reward_func": -0.26027578115463257, "rewards/verse_reward_func": 0.0, "step": 2022 }, { "completion_length": 251.609375, "epoch": 16.184, "grad_norm": 3.984375, "kl": 5.407779216766357, "learning_rate": 4.2758934441742496e-05, "loss": 0.2163, "reward": 1.9965678453445435, "reward_std": 2.817782163619995, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2448076605796814, "rewards/no_repetition_reward_func": -0.24823985993862152, "rewards/verse_reward_func": 0.0, "step": 2023 }, { "completion_length": 252.375, "epoch": 16.192, "grad_norm": 3.5, "kl": 5.571403980255127, "learning_rate": 4.274910419580108e-05, "loss": 0.2229, "reward": 1.9665269255638123, "reward_std": 2.566101908683777, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2773196697235107, "rewards/no_repetition_reward_func": -0.30298013985157013, "rewards/verse_reward_func": -0.0078125, "step": 2024 }, { "completion_length": 252.328125, "epoch": 16.2, "grad_norm": 2.171875, "kl": 4.281393527984619, "learning_rate": 4.273926841341302e-05, "loss": 0.1713, "reward": 2.4084794521331787, "reward_std": 2.717624306678772, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6794949769973755, "rewards/no_repetition_reward_func": -0.26320305466651917, "rewards/verse_reward_func": -0.0078125, "step": 2025 }, { "completion_length": 251.59375, "epoch": 16.208, "grad_norm": 1.578125, "kl": 4.790597438812256, "learning_rate": 4.272942709764638e-05, "loss": 0.1916, "reward": 2.2454808354377747, "reward_std": 2.830866575241089, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.543243646621704, "rewards/no_repetition_reward_func": -0.2821379154920578, "rewards/verse_reward_func": -0.015625, "step": 2026 }, { "completion_length": 246.359375, "epoch": 16.216, "grad_norm": 2.578125, "kl": 3.195361852645874, "learning_rate": 4.2719580251570915e-05, "loss": 0.1278, "reward": 2.77241849899292, "reward_std": 3.1359097957611084, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0431437492370605, "rewards/no_repetition_reward_func": -0.2629126012325287, "rewards/verse_reward_func": -0.0078125, "step": 2027 }, { "completion_length": 247.734375, "epoch": 16.224, "grad_norm": 6.375, "kl": 3.3876793384552, "learning_rate": 4.270972787825815e-05, "loss": 0.1355, "reward": 2.88943088054657, "reward_std": 3.4244437217712402, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.209802031517029, "rewards/no_repetition_reward_func": -0.31255868077278137, "rewards/verse_reward_func": -0.0078125, "step": 2028 }, { "completion_length": 254.515625, "epoch": 16.232, "grad_norm": 3.078125, "kl": 3.649744153022766, "learning_rate": 4.269986998078132e-05, "loss": 0.146, "reward": 2.383755922317505, "reward_std": 2.8266366720199585, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.666361093521118, "rewards/no_repetition_reward_func": -0.2826051712036133, "rewards/verse_reward_func": 0.0, "step": 2029 }, { "completion_length": 247.03125, "epoch": 16.24, "grad_norm": 2.796875, "kl": 3.8680763244628906, "learning_rate": 4.2690006562215384e-05, "loss": 0.1547, "reward": 2.3626118898391724, "reward_std": 3.229231357574463, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6331076622009277, "rewards/no_repetition_reward_func": -0.27049580961465836, "rewards/verse_reward_func": 0.0, "step": 2030 }, { "completion_length": 253.03125, "epoch": 16.248, "grad_norm": 4.6875, "kl": 3.0274815559387207, "learning_rate": 4.268013762563702e-05, "loss": 0.1211, "reward": 2.8118960857391357, "reward_std": 3.3725428581237793, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1634823083877563, "rewards/no_repetition_reward_func": -0.33596134185791016, "rewards/verse_reward_func": -0.015625, "step": 2031 }, { "completion_length": 249.984375, "epoch": 16.256, "grad_norm": 2.140625, "kl": 4.511402606964111, "learning_rate": 4.267026317412461e-05, "loss": 0.1805, "reward": 1.9874294996261597, "reward_std": 3.1439127922058105, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2489556670188904, "rewards/no_repetition_reward_func": -0.26152610033750534, "rewards/verse_reward_func": 0.0, "step": 2032 }, { "completion_length": 247.421875, "epoch": 16.264, "grad_norm": 3.234375, "kl": 3.142000436782837, "learning_rate": 4.266038321075831e-05, "loss": 0.1257, "reward": 2.630218744277954, "reward_std": 2.963623285293579, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.912332057952881, "rewards/no_repetition_reward_func": -0.28211329877376556, "rewards/verse_reward_func": 0.0, "step": 2033 }, { "completion_length": 252.984375, "epoch": 16.272, "grad_norm": 1.546875, "kl": 4.833336353302002, "learning_rate": 4.265049773861991e-05, "loss": 0.1933, "reward": 2.20418381690979, "reward_std": 3.2023708820343018, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.507301688194275, "rewards/no_repetition_reward_func": -0.295305535197258, "rewards/verse_reward_func": -0.0078125, "step": 2034 }, { "completion_length": 245.609375, "epoch": 16.28, "grad_norm": 4.5, "kl": 4.70029091835022, "learning_rate": 4.264060676079302e-05, "loss": 0.188, "reward": 2.499764919281006, "reward_std": 3.025909185409546, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8600825667381287, "rewards/no_repetition_reward_func": -0.3446926772594452, "rewards/verse_reward_func": -0.015625, "step": 2035 }, { "completion_length": 243.0, "epoch": 16.288, "grad_norm": 4.65625, "kl": 4.615906119346619, "learning_rate": 4.263071028036288e-05, "loss": 0.1846, "reward": 3.1310107707977295, "reward_std": 3.0606343746185303, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3862364292144775, "rewards/no_repetition_reward_func": -0.25522561371326447, "rewards/verse_reward_func": 0.0, "step": 2036 }, { "completion_length": 253.625, "epoch": 16.296, "grad_norm": 4.0625, "kl": 5.269867420196533, "learning_rate": 4.26208083004165e-05, "loss": 0.2108, "reward": 2.809082269668579, "reward_std": 2.8221702575683594, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0979068279266357, "rewards/no_repetition_reward_func": -0.2888248860836029, "rewards/verse_reward_func": 0.0, "step": 2037 }, { "completion_length": 234.703125, "epoch": 16.304, "grad_norm": 6.6875, "kl": 6.450574159622192, "learning_rate": 4.261090082404258e-05, "loss": 0.258, "reward": 2.0574368238449097, "reward_std": 3.095715641975403, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.323627471923828, "rewards/no_repetition_reward_func": -0.25056570768356323, "rewards/verse_reward_func": -0.015625, "step": 2038 }, { "completion_length": 247.671875, "epoch": 16.312, "grad_norm": 3.671875, "kl": 5.169969081878662, "learning_rate": 4.260098785433154e-05, "loss": 0.2068, "reward": 2.0839508175849915, "reward_std": 2.740188479423523, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3455851078033447, "rewards/no_repetition_reward_func": -0.2616342604160309, "rewards/verse_reward_func": 0.0, "step": 2039 }, { "completion_length": 246.703125, "epoch": 16.32, "grad_norm": 2.15625, "kl": 5.372858047485352, "learning_rate": 4.259106939437551e-05, "loss": 0.2149, "reward": 2.4662697315216064, "reward_std": 3.058826208114624, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.75318706035614, "rewards/no_repetition_reward_func": -0.279104620218277, "rewards/verse_reward_func": -0.0078125, "step": 2040 }, { "completion_length": 254.28125, "epoch": 16.328, "grad_norm": 2.0625, "kl": 4.019784927368164, "learning_rate": 4.258114544726835e-05, "loss": 0.1608, "reward": 3.0463876724243164, "reward_std": 3.1270869970321655, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.31401526927948, "rewards/no_repetition_reward_func": -0.26762740314006805, "rewards/verse_reward_func": 0.0, "step": 2041 }, { "completion_length": 244.609375, "epoch": 16.336, "grad_norm": 2.75, "kl": 4.574379920959473, "learning_rate": 4.2571216016105614e-05, "loss": 0.183, "reward": 2.6236209869384766, "reward_std": 3.093941330909729, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9227583408355713, "rewards/no_repetition_reward_func": -0.29913730919361115, "rewards/verse_reward_func": 0.0, "step": 2042 }, { "completion_length": 252.65625, "epoch": 16.344, "grad_norm": 2.21875, "kl": 3.9945971965789795, "learning_rate": 4.256128110398457e-05, "loss": 0.1598, "reward": 3.1972328424453735, "reward_std": 3.1127134561538696, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4986839294433594, "rewards/no_repetition_reward_func": -0.3014509379863739, "rewards/verse_reward_func": 0.0, "step": 2043 }, { "completion_length": 242.390625, "epoch": 16.352, "grad_norm": 4.09375, "kl": 4.868911027908325, "learning_rate": 4.2551340714004203e-05, "loss": 0.1948, "reward": 1.875517725944519, "reward_std": 2.5547244548797607, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.132022976875305, "rewards/no_repetition_reward_func": -0.24869263172149658, "rewards/verse_reward_func": -0.0078125, "step": 2044 }, { "completion_length": 246.921875, "epoch": 16.36, "grad_norm": 2.625, "kl": 4.4144710302352905, "learning_rate": 4.254139484926519e-05, "loss": 0.1766, "reward": 2.525757074356079, "reward_std": 3.0347644090652466, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.780967354774475, "rewards/no_repetition_reward_func": -0.2473979890346527, "rewards/verse_reward_func": -0.0078125, "step": 2045 }, { "completion_length": 246.40625, "epoch": 16.368, "grad_norm": 3.28125, "kl": 4.2652587890625, "learning_rate": 4.253144351286994e-05, "loss": 0.1706, "reward": 1.6022335290908813, "reward_std": 2.7358654737472534, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.873840868473053, "rewards/no_repetition_reward_func": -0.263794869184494, "rewards/verse_reward_func": -0.0078125, "step": 2046 }, { "completion_length": 249.28125, "epoch": 16.376, "grad_norm": 2.5, "kl": 3.858295440673828, "learning_rate": 4.252148670792254e-05, "loss": 0.1543, "reward": 2.5449153184890747, "reward_std": 2.8586437702178955, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8284837007522583, "rewards/no_repetition_reward_func": -0.2679433077573776, "rewards/verse_reward_func": -0.015625, "step": 2047 }, { "completion_length": 250.765625, "epoch": 16.384, "grad_norm": 1.734375, "kl": 4.366656303405762, "learning_rate": 4.2511524437528825e-05, "loss": 0.1747, "reward": 1.8639637231826782, "reward_std": 2.8828846216201782, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1320101022720337, "rewards/no_repetition_reward_func": -0.26023387908935547, "rewards/verse_reward_func": -0.0078125, "step": 2048 }, { "completion_length": 252.890625, "epoch": 16.392, "grad_norm": 2.40625, "kl": 4.1549341678619385, "learning_rate": 4.250155670479628e-05, "loss": 0.1662, "reward": 2.010650396347046, "reward_std": 2.931458830833435, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2785940766334534, "rewards/no_repetition_reward_func": -0.26794371008872986, "rewards/verse_reward_func": 0.0, "step": 2049 }, { "completion_length": 249.15625, "epoch": 16.4, "grad_norm": 2.171875, "kl": 5.195744514465332, "learning_rate": 4.249158351283414e-05, "loss": 0.2078, "reward": 1.999165117740631, "reward_std": 2.899580717086792, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.253142476081848, "rewards/no_repetition_reward_func": -0.2461647391319275, "rewards/verse_reward_func": -0.0078125, "step": 2050 }, { "completion_length": 251.90625, "epoch": 16.408, "grad_norm": 1.34375, "kl": 4.266171216964722, "learning_rate": 4.248160486475331e-05, "loss": 0.1706, "reward": 2.126065254211426, "reward_std": 2.5809249877929688, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3990688920021057, "rewards/no_repetition_reward_func": -0.2651911675930023, "rewards/verse_reward_func": -0.0078125, "step": 2051 }, { "completion_length": 255.6875, "epoch": 16.416, "grad_norm": 2.234375, "kl": 3.545332193374634, "learning_rate": 4.247162076366643e-05, "loss": 0.1418, "reward": 2.9184563159942627, "reward_std": 2.821559429168701, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.198183536529541, "rewards/no_repetition_reward_func": -0.27972738444805145, "rewards/verse_reward_func": 0.0, "step": 2052 }, { "completion_length": 254.265625, "epoch": 16.424, "grad_norm": 3.5625, "kl": 3.493695855140686, "learning_rate": 4.2461631212687816e-05, "loss": 0.1397, "reward": 3.2468258142471313, "reward_std": 3.1272717714309692, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.530661463737488, "rewards/no_repetition_reward_func": -0.2838355153799057, "rewards/verse_reward_func": 0.0, "step": 2053 }, { "completion_length": 237.34375, "epoch": 16.432, "grad_norm": 3.109375, "kl": 3.5717915296554565, "learning_rate": 4.245163621493349e-05, "loss": 0.1429, "reward": 2.854575276374817, "reward_std": 2.9577949047088623, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1720741987228394, "rewards/no_repetition_reward_func": -0.29406142234802246, "rewards/verse_reward_func": -0.0234375, "step": 2054 }, { "completion_length": 254.25, "epoch": 16.44, "grad_norm": 2.015625, "kl": 4.269031763076782, "learning_rate": 4.244163577352116e-05, "loss": 0.1708, "reward": 2.4511032104492188, "reward_std": 2.9097940921783447, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7201571464538574, "rewards/no_repetition_reward_func": -0.2690541595220566, "rewards/verse_reward_func": 0.0, "step": 2055 }, { "completion_length": 249.40625, "epoch": 16.448, "grad_norm": 2.265625, "kl": 4.232261657714844, "learning_rate": 4.2431629891570266e-05, "loss": 0.1693, "reward": 2.5612733364105225, "reward_std": 2.713319420814514, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8185524940490723, "rewards/no_repetition_reward_func": -0.24946680665016174, "rewards/verse_reward_func": -0.0078125, "step": 2056 }, { "completion_length": 256.0, "epoch": 16.456, "grad_norm": 2.625, "kl": 4.752001047134399, "learning_rate": 4.242161857220193e-05, "loss": 0.1901, "reward": 2.723758578300476, "reward_std": 2.8846943378448486, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.039431154727936, "rewards/no_repetition_reward_func": -0.30786000192165375, "rewards/verse_reward_func": -0.0078125, "step": 2057 }, { "completion_length": 256.0, "epoch": 16.464, "grad_norm": 1.5390625, "kl": 5.186808347702026, "learning_rate": 4.241160181853894e-05, "loss": 0.2075, "reward": 2.734494209289551, "reward_std": 3.0267571210861206, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0017892122268677, "rewards/no_repetition_reward_func": -0.26729491353034973, "rewards/verse_reward_func": 0.0, "step": 2058 }, { "completion_length": 249.09375, "epoch": 16.472, "grad_norm": 3.796875, "kl": 3.997673988342285, "learning_rate": 4.240157963370582e-05, "loss": 0.1599, "reward": 2.696718215942383, "reward_std": 2.660806894302368, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9834682941436768, "rewards/no_repetition_reward_func": -0.28675025701522827, "rewards/verse_reward_func": 0.0, "step": 2059 }, { "completion_length": 248.359375, "epoch": 16.48, "grad_norm": 1.859375, "kl": 3.884015917778015, "learning_rate": 4.2391552020828775e-05, "loss": 0.1554, "reward": 3.076121687889099, "reward_std": 3.089428663253784, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3583149909973145, "rewards/no_repetition_reward_func": -0.28219321370124817, "rewards/verse_reward_func": 0.0, "step": 2060 }, { "completion_length": 256.0, "epoch": 16.488, "grad_norm": 3.78125, "kl": 4.159807443618774, "learning_rate": 4.238151898303569e-05, "loss": 0.1664, "reward": 3.6045920848846436, "reward_std": 3.2429648637771606, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.9294577836990356, "rewards/no_repetition_reward_func": -0.3248656839132309, "rewards/verse_reward_func": 0.0, "step": 2061 }, { "completion_length": 256.0, "epoch": 16.496, "grad_norm": 3.109375, "kl": 5.207328796386719, "learning_rate": 4.237148052345616e-05, "loss": 0.2083, "reward": 2.315606415271759, "reward_std": 2.6427769660949707, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5863360166549683, "rewards/no_repetition_reward_func": -0.27072930335998535, "rewards/verse_reward_func": 0.0, "step": 2062 }, { "completion_length": 249.890625, "epoch": 16.504, "grad_norm": 2.890625, "kl": 4.323992490768433, "learning_rate": 4.236143664522146e-05, "loss": 0.173, "reward": 3.08819580078125, "reward_std": 2.8888285160064697, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.379940390586853, "rewards/no_repetition_reward_func": -0.291744664311409, "rewards/verse_reward_func": 0.0, "step": 2063 }, { "completion_length": 252.265625, "epoch": 16.512, "grad_norm": 2.375, "kl": 4.981703162193298, "learning_rate": 4.2351387351464565e-05, "loss": 0.1993, "reward": 2.4848493337631226, "reward_std": 3.2073421478271484, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7888686656951904, "rewards/no_repetition_reward_func": -0.3040192127227783, "rewards/verse_reward_func": 0.0, "step": 2064 }, { "completion_length": 249.71875, "epoch": 16.52, "grad_norm": 4.90625, "kl": 6.496856451034546, "learning_rate": 4.234133264532012e-05, "loss": 0.2599, "reward": 1.7318256497383118, "reward_std": 2.934008836746216, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.974881887435913, "rewards/no_repetition_reward_func": -0.24305617064237595, "rewards/verse_reward_func": 0.0, "step": 2065 }, { "completion_length": 256.0, "epoch": 16.528, "grad_norm": 2.453125, "kl": 5.200654745101929, "learning_rate": 4.2331272529924495e-05, "loss": 0.208, "reward": 2.590374708175659, "reward_std": 2.4853951930999756, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.911047577857971, "rewards/no_repetition_reward_func": -0.32067298889160156, "rewards/verse_reward_func": 0.0, "step": 2066 }, { "completion_length": 249.765625, "epoch": 16.536, "grad_norm": 3.34375, "kl": 5.277480125427246, "learning_rate": 4.232120700841571e-05, "loss": 0.2111, "reward": 2.0285441279411316, "reward_std": 2.9200117588043213, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2898577451705933, "rewards/no_repetition_reward_func": -0.2535010576248169, "rewards/verse_reward_func": -0.0078125, "step": 2067 }, { "completion_length": 253.0, "epoch": 16.544, "grad_norm": 2.015625, "kl": 4.523921489715576, "learning_rate": 4.231113608393348e-05, "loss": 0.181, "reward": 2.696680784225464, "reward_std": 3.208040714263916, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9998703002929688, "rewards/no_repetition_reward_func": -0.2953769117593765, "rewards/verse_reward_func": -0.0078125, "step": 2068 }, { "completion_length": 252.25, "epoch": 16.552, "grad_norm": 2.515625, "kl": 4.637538433074951, "learning_rate": 4.230105975961921e-05, "loss": 0.1855, "reward": 2.2172257900238037, "reward_std": 2.952393651008606, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5116140842437744, "rewards/no_repetition_reward_func": -0.29438838362693787, "rewards/verse_reward_func": 0.0, "step": 2069 }, { "completion_length": 256.0, "epoch": 16.56, "grad_norm": 3.78125, "kl": 5.690377235412598, "learning_rate": 4.2290978038616e-05, "loss": 0.2276, "reward": 2.0709436535835266, "reward_std": 2.847243070602417, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.337841033935547, "rewards/no_repetition_reward_func": -0.2668973505496979, "rewards/verse_reward_func": 0.0, "step": 2070 }, { "completion_length": 250.296875, "epoch": 16.568, "grad_norm": 3.375, "kl": 3.815871000289917, "learning_rate": 4.2280890924068625e-05, "loss": 0.1526, "reward": 3.0611979961395264, "reward_std": 3.2761528491973877, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3549485206604004, "rewards/no_repetition_reward_func": -0.2859380692243576, "rewards/verse_reward_func": -0.0078125, "step": 2071 }, { "completion_length": 253.046875, "epoch": 16.576, "grad_norm": 2.609375, "kl": 3.8118810653686523, "learning_rate": 4.2270798419123534e-05, "loss": 0.1525, "reward": 2.3662081956863403, "reward_std": 2.7776939868927, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6532983779907227, "rewards/no_repetition_reward_func": -0.2870900332927704, "rewards/verse_reward_func": 0.0, "step": 2072 }, { "completion_length": 247.765625, "epoch": 16.584, "grad_norm": 1.8046875, "kl": 3.9163992404937744, "learning_rate": 4.226070052692886e-05, "loss": 0.1567, "reward": 2.4268110394477844, "reward_std": 3.137030005455017, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.731924891471863, "rewards/no_repetition_reward_func": -0.297301709651947, "rewards/verse_reward_func": -0.0078125, "step": 2073 }, { "completion_length": 246.21875, "epoch": 16.592, "grad_norm": 2.09375, "kl": 4.852595090866089, "learning_rate": 4.225059725063444e-05, "loss": 0.1941, "reward": 2.6075077056884766, "reward_std": 3.189648151397705, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8634860515594482, "rewards/no_repetition_reward_func": -0.24816596508026123, "rewards/verse_reward_func": -0.0078125, "step": 2074 }, { "completion_length": 249.6875, "epoch": 16.6, "grad_norm": 2.171875, "kl": 3.9553208351135254, "learning_rate": 4.224048859339175e-05, "loss": 0.1582, "reward": 2.4616575241088867, "reward_std": 2.898295760154724, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.72109055519104, "rewards/no_repetition_reward_func": -0.25162068754434586, "rewards/verse_reward_func": -0.0078125, "step": 2075 }, { "completion_length": 250.703125, "epoch": 16.608, "grad_norm": 3.46875, "kl": 5.9399120807647705, "learning_rate": 4.223037455835397e-05, "loss": 0.2376, "reward": 1.7262810468673706, "reward_std": 2.9260313510894775, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9901602864265442, "rewards/no_repetition_reward_func": -0.25606685876846313, "rewards/verse_reward_func": -0.0078125, "step": 2076 }, { "completion_length": 240.6875, "epoch": 16.616, "grad_norm": 2.109375, "kl": 4.391205072402954, "learning_rate": 4.2220255148675956e-05, "loss": 0.1756, "reward": 2.251049518585205, "reward_std": 2.970975160598755, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.493998408317566, "rewards/no_repetition_reward_func": -0.23513632267713547, "rewards/verse_reward_func": -0.0078125, "step": 2077 }, { "completion_length": 254.515625, "epoch": 16.624, "grad_norm": 3.203125, "kl": 4.689785003662109, "learning_rate": 4.221013036751424e-05, "loss": 0.1876, "reward": 2.5841790437698364, "reward_std": 3.1239975690841675, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.877399206161499, "rewards/no_repetition_reward_func": -0.2932201325893402, "rewards/verse_reward_func": 0.0, "step": 2078 }, { "completion_length": 252.21875, "epoch": 16.632, "grad_norm": 3.03125, "kl": 3.5313881635665894, "learning_rate": 4.220000021802702e-05, "loss": 0.1413, "reward": 2.9031591415405273, "reward_std": 2.929560661315918, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1792889833450317, "rewards/no_repetition_reward_func": -0.2761297821998596, "rewards/verse_reward_func": 0.0, "step": 2079 }, { "completion_length": 251.109375, "epoch": 16.64, "grad_norm": 4.5625, "kl": 5.972508907318115, "learning_rate": 4.218986470337419e-05, "loss": 0.2389, "reward": 1.7110028266906738, "reward_std": 2.755354881286621, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9638479948043823, "rewards/no_repetition_reward_func": -0.24503262341022491, "rewards/verse_reward_func": -0.0078125, "step": 2080 }, { "completion_length": 251.453125, "epoch": 16.648, "grad_norm": 2.609375, "kl": 3.8204874992370605, "learning_rate": 4.217972382671729e-05, "loss": 0.1528, "reward": 2.247734785079956, "reward_std": 2.86251699924469, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.551078677177429, "rewards/no_repetition_reward_func": -0.2877188175916672, "rewards/verse_reward_func": -0.015625, "step": 2081 }, { "completion_length": 248.25, "epoch": 16.656, "grad_norm": 2.203125, "kl": 3.677204966545105, "learning_rate": 4.2169577591219545e-05, "loss": 0.1471, "reward": 2.641665458679199, "reward_std": 2.8010603189468384, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.928171157836914, "rewards/no_repetition_reward_func": -0.2865055352449417, "rewards/verse_reward_func": 0.0, "step": 2082 }, { "completion_length": 246.703125, "epoch": 16.664, "grad_norm": 3.125, "kl": 3.6886640787124634, "learning_rate": 4.2159426000045854e-05, "loss": 0.1475, "reward": 2.546074330806732, "reward_std": 3.1941720247268677, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.823378801345825, "rewards/no_repetition_reward_func": -0.27730458974838257, "rewards/verse_reward_func": 0.0, "step": 2083 }, { "completion_length": 241.921875, "epoch": 16.672, "grad_norm": 2.40625, "kl": 3.239431858062744, "learning_rate": 4.2149269056362794e-05, "loss": 0.1296, "reward": 2.901359438896179, "reward_std": 2.997981548309326, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.231445074081421, "rewards/no_repetition_reward_func": -0.3066481202840805, "rewards/verse_reward_func": -0.0234375, "step": 2084 }, { "completion_length": 252.484375, "epoch": 16.68, "grad_norm": 2.015625, "kl": 4.408324241638184, "learning_rate": 4.213910676333859e-05, "loss": 0.1763, "reward": 2.5051286220550537, "reward_std": 3.320035457611084, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8391408920288086, "rewards/no_repetition_reward_func": -0.3261997401714325, "rewards/verse_reward_func": -0.0078125, "step": 2085 }, { "completion_length": 245.53125, "epoch": 16.688, "grad_norm": 1.90625, "kl": 3.8089364767074585, "learning_rate": 4.212893912414316e-05, "loss": 0.1524, "reward": 2.1606476306915283, "reward_std": 2.8117634057998657, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.453681707382202, "rewards/no_repetition_reward_func": -0.27740897238254547, "rewards/verse_reward_func": -0.015625, "step": 2086 }, { "completion_length": 252.609375, "epoch": 16.696, "grad_norm": 1.3671875, "kl": 4.341447114944458, "learning_rate": 4.2118766141948066e-05, "loss": 0.1737, "reward": 2.283588171005249, "reward_std": 2.8908944129943848, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.556026816368103, "rewards/no_repetition_reward_func": -0.27243854105472565, "rewards/verse_reward_func": 0.0, "step": 2087 }, { "completion_length": 253.984375, "epoch": 16.704, "grad_norm": 2.75, "kl": 3.97412371635437, "learning_rate": 4.2108587819926554e-05, "loss": 0.159, "reward": 2.774699091911316, "reward_std": 3.3356415033340454, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1234833002090454, "rewards/no_repetition_reward_func": -0.3487842381000519, "rewards/verse_reward_func": 0.0, "step": 2088 }, { "completion_length": 255.9375, "epoch": 16.712, "grad_norm": 2.109375, "kl": 4.2533674240112305, "learning_rate": 4.209840416125353e-05, "loss": 0.1701, "reward": 3.0480926036834717, "reward_std": 2.9063085317611694, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3259332180023193, "rewards/no_repetition_reward_func": -0.2778404951095581, "rewards/verse_reward_func": 0.0, "step": 2089 }, { "completion_length": 254.578125, "epoch": 16.72, "grad_norm": 3.5625, "kl": 4.4359118938446045, "learning_rate": 4.208821516910557e-05, "loss": 0.1774, "reward": 2.89801561832428, "reward_std": 3.451295495033264, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.237781524658203, "rewards/no_repetition_reward_func": -0.33976611495018005, "rewards/verse_reward_func": 0.0, "step": 2090 }, { "completion_length": 249.890625, "epoch": 16.728, "grad_norm": 2.1875, "kl": 3.827446460723877, "learning_rate": 4.20780208466609e-05, "loss": 0.1531, "reward": 2.8213703632354736, "reward_std": 3.1072285175323486, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.091143012046814, "rewards/no_repetition_reward_func": -0.26977264881134033, "rewards/verse_reward_func": 0.0, "step": 2091 }, { "completion_length": 255.59375, "epoch": 16.736, "grad_norm": 2.546875, "kl": 5.019176959991455, "learning_rate": 4.206782119709942e-05, "loss": 0.2008, "reward": 2.483715295791626, "reward_std": 2.9098875522613525, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.788218855857849, "rewards/no_repetition_reward_func": -0.30450354516506195, "rewards/verse_reward_func": 0.0, "step": 2092 }, { "completion_length": 254.671875, "epoch": 16.744, "grad_norm": 5.625, "kl": 5.510411262512207, "learning_rate": 4.2057616223602684e-05, "loss": 0.2204, "reward": 2.268204629421234, "reward_std": 2.9646143913269043, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.560528516769409, "rewards/no_repetition_reward_func": -0.29232390224933624, "rewards/verse_reward_func": 0.0, "step": 2093 }, { "completion_length": 250.8125, "epoch": 16.752, "grad_norm": 3.046875, "kl": 4.4534752368927, "learning_rate": 4.204740592935392e-05, "loss": 0.1781, "reward": 3.1412923336029053, "reward_std": 3.1474286317825317, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.402976155281067, "rewards/no_repetition_reward_func": -0.26168377697467804, "rewards/verse_reward_func": 0.0, "step": 2094 }, { "completion_length": 252.34375, "epoch": 16.76, "grad_norm": 4.9375, "kl": 5.62320876121521, "learning_rate": 4.2037190317538e-05, "loss": 0.2249, "reward": 2.361557722091675, "reward_std": 2.281059145927429, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6255250573158264, "rewards/no_repetition_reward_func": -0.25615496933460236, "rewards/verse_reward_func": -0.0078125, "step": 2095 }, { "completion_length": 252.890625, "epoch": 16.768, "grad_norm": 2.171875, "kl": 5.608487606048584, "learning_rate": 4.202696939134146e-05, "loss": 0.2243, "reward": 2.992198944091797, "reward_std": 3.2224396467208862, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.297128677368164, "rewards/no_repetition_reward_func": -0.3049294799566269, "rewards/verse_reward_func": 0.0, "step": 2096 }, { "completion_length": 249.34375, "epoch": 16.776, "grad_norm": 2.984375, "kl": 4.669370651245117, "learning_rate": 4.2016743153952505e-05, "loss": 0.1868, "reward": 2.8275163173675537, "reward_std": 2.9847073554992676, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.126006603240967, "rewards/no_repetition_reward_func": -0.29067784547805786, "rewards/verse_reward_func": -0.0078125, "step": 2097 }, { "completion_length": 256.0, "epoch": 16.784, "grad_norm": 7.15625, "kl": 6.463136672973633, "learning_rate": 4.200651160856098e-05, "loss": 0.2585, "reward": 2.50049090385437, "reward_std": 2.854138493537903, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8045494556427, "rewards/no_repetition_reward_func": -0.2962457239627838, "rewards/verse_reward_func": -0.0078125, "step": 2098 }, { "completion_length": 243.234375, "epoch": 16.792, "grad_norm": 3.09375, "kl": 4.9562554359436035, "learning_rate": 4.19962747583584e-05, "loss": 0.1983, "reward": 2.6100181341171265, "reward_std": 3.014990210533142, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.904729127883911, "rewards/no_repetition_reward_func": -0.27127355337142944, "rewards/verse_reward_func": -0.0234375, "step": 2099 }, { "completion_length": 249.0625, "epoch": 16.8, "grad_norm": 2.1875, "kl": 5.16352653503418, "learning_rate": 4.198603260653792e-05, "loss": 0.2065, "reward": 2.88774037361145, "reward_std": 3.3270784616470337, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1778796911239624, "rewards/no_repetition_reward_func": -0.2901392877101898, "rewards/verse_reward_func": 0.0, "step": 2100 }, { "completion_length": 246.484375, "epoch": 16.808, "grad_norm": 6.0625, "kl": 5.680513858795166, "learning_rate": 4.197578515629435e-05, "loss": 0.2272, "reward": 2.109585762023926, "reward_std": 2.0511703491210938, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3940394520759583, "rewards/no_repetition_reward_func": -0.2688286006450653, "rewards/verse_reward_func": -0.015625, "step": 2101 }, { "completion_length": 250.453125, "epoch": 16.816, "grad_norm": 1.5625, "kl": 4.26495885848999, "learning_rate": 4.196553241082418e-05, "loss": 0.1706, "reward": 2.365383267402649, "reward_std": 3.0220444202423096, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.719957709312439, "rewards/no_repetition_reward_func": -0.3389492779970169, "rewards/verse_reward_func": -0.015625, "step": 2102 }, { "completion_length": 249.703125, "epoch": 16.824, "grad_norm": 3.0, "kl": 4.1745641231536865, "learning_rate": 4.1955274373325506e-05, "loss": 0.167, "reward": 2.2847520112991333, "reward_std": 2.787240505218506, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.538102626800537, "rewards/no_repetition_reward_func": -0.2533506006002426, "rewards/verse_reward_func": 0.0, "step": 2103 }, { "completion_length": 255.5625, "epoch": 16.832, "grad_norm": 3.15625, "kl": 3.9303137063980103, "learning_rate": 4.194501104699812e-05, "loss": 0.1572, "reward": 2.806238293647766, "reward_std": 3.3359835147857666, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1095874309539795, "rewards/no_repetition_reward_func": -0.28772400319576263, "rewards/verse_reward_func": -0.015625, "step": 2104 }, { "completion_length": 255.8125, "epoch": 16.84, "grad_norm": 2.390625, "kl": 3.149811267852783, "learning_rate": 4.193474243504343e-05, "loss": 0.126, "reward": 2.6938036680221558, "reward_std": 3.0043888092041016, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.014058828353882, "rewards/no_repetition_reward_func": -0.3046301007270813, "rewards/verse_reward_func": -0.015625, "step": 2105 }, { "completion_length": 250.40625, "epoch": 16.848, "grad_norm": 3.140625, "kl": 3.845616579055786, "learning_rate": 4.192446854066452e-05, "loss": 0.1538, "reward": 2.591640591621399, "reward_std": 3.013952136039734, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.860838770866394, "rewards/no_repetition_reward_func": -0.26919813454151154, "rewards/verse_reward_func": 0.0, "step": 2106 }, { "completion_length": 251.625, "epoch": 16.856, "grad_norm": 3.90625, "kl": 3.4016449451446533, "learning_rate": 4.1914189367066094e-05, "loss": 0.1361, "reward": 2.9363166093826294, "reward_std": 3.1915974617004395, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2367578744888306, "rewards/no_repetition_reward_func": -0.30044125020504, "rewards/verse_reward_func": 0.0, "step": 2107 }, { "completion_length": 252.25, "epoch": 16.864, "grad_norm": 2.328125, "kl": 3.691403865814209, "learning_rate": 4.1903904917454516e-05, "loss": 0.1477, "reward": 2.8031781911849976, "reward_std": 3.199105978012085, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.119254946708679, "rewards/no_repetition_reward_func": -0.3160766661167145, "rewards/verse_reward_func": 0.0, "step": 2108 }, { "completion_length": 251.015625, "epoch": 16.872, "grad_norm": 2.5625, "kl": 4.142445683479309, "learning_rate": 4.18936151950378e-05, "loss": 0.1657, "reward": 2.4618499279022217, "reward_std": 3.1364868879318237, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7355493307113647, "rewards/no_repetition_reward_func": -0.2658868134021759, "rewards/verse_reward_func": -0.0078125, "step": 2109 }, { "completion_length": 252.40625, "epoch": 16.88, "grad_norm": 2.28125, "kl": 4.572130441665649, "learning_rate": 4.188332020302561e-05, "loss": 0.1829, "reward": 2.110969662666321, "reward_std": 2.903277635574341, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3864612579345703, "rewards/no_repetition_reward_func": -0.2754916846752167, "rewards/verse_reward_func": 0.0, "step": 2110 }, { "completion_length": 245.09375, "epoch": 16.888, "grad_norm": 1.703125, "kl": 4.016703128814697, "learning_rate": 4.187301994462924e-05, "loss": 0.1607, "reward": 2.0937612056732178, "reward_std": 2.2362653017044067, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.368022918701172, "rewards/no_repetition_reward_func": -0.2586364895105362, "rewards/verse_reward_func": -0.015625, "step": 2111 }, { "completion_length": 251.78125, "epoch": 16.896, "grad_norm": 2.0625, "kl": 5.038103818893433, "learning_rate": 4.1862714423061624e-05, "loss": 0.2015, "reward": 1.5326982736587524, "reward_std": 2.396694779396057, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.751484990119934, "rewards/no_repetition_reward_func": -0.21097415685653687, "rewards/verse_reward_func": -0.0078125, "step": 2112 }, { "completion_length": 252.8125, "epoch": 16.904, "grad_norm": 2.078125, "kl": 4.820409297943115, "learning_rate": 4.185240364153734e-05, "loss": 0.1928, "reward": 2.052635073661804, "reward_std": 2.9578652381896973, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3505613803863525, "rewards/no_repetition_reward_func": -0.2979263588786125, "rewards/verse_reward_func": 0.0, "step": 2113 }, { "completion_length": 249.796875, "epoch": 16.912, "grad_norm": 3.953125, "kl": 3.79077684879303, "learning_rate": 4.184208760327263e-05, "loss": 0.1516, "reward": 3.484455108642578, "reward_std": 3.327366352081299, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.8084518909454346, "rewards/no_repetition_reward_func": -0.3239968866109848, "rewards/verse_reward_func": 0.0, "step": 2114 }, { "completion_length": 247.484375, "epoch": 16.92, "grad_norm": 2.5, "kl": 4.545894384384155, "learning_rate": 4.183176631148534e-05, "loss": 0.1818, "reward": 2.2222540378570557, "reward_std": 2.866396188735962, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4855085611343384, "rewards/no_repetition_reward_func": -0.2632545381784439, "rewards/verse_reward_func": 0.0, "step": 2115 }, { "completion_length": 256.0, "epoch": 16.928, "grad_norm": 2.1875, "kl": 3.8625255823135376, "learning_rate": 4.1821439769395e-05, "loss": 0.1545, "reward": 2.8111300468444824, "reward_std": 2.9493215084075928, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0960744619369507, "rewards/no_repetition_reward_func": -0.28494448959827423, "rewards/verse_reward_func": 0.0, "step": 2116 }, { "completion_length": 254.765625, "epoch": 16.936, "grad_norm": 4.21875, "kl": 4.2986204624176025, "learning_rate": 4.181110798022271e-05, "loss": 0.1719, "reward": 3.0101312398910522, "reward_std": 2.830516815185547, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.306038022041321, "rewards/no_repetition_reward_func": -0.2802818566560745, "rewards/verse_reward_func": -0.015625, "step": 2117 }, { "completion_length": 256.0, "epoch": 16.944, "grad_norm": 2.625, "kl": 5.1632184982299805, "learning_rate": 4.180077094719128e-05, "loss": 0.2065, "reward": 2.6624226570129395, "reward_std": 3.1277769804000854, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9581990242004395, "rewards/no_repetition_reward_func": -0.2957761883735657, "rewards/verse_reward_func": 0.0, "step": 2118 }, { "completion_length": 247.703125, "epoch": 16.951999999999998, "grad_norm": 2.953125, "kl": 4.378296613693237, "learning_rate": 4.179042867352511e-05, "loss": 0.1751, "reward": 2.7963714599609375, "reward_std": 2.8036248683929443, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.054713726043701, "rewards/no_repetition_reward_func": -0.2583421319723129, "rewards/verse_reward_func": 0.0, "step": 2119 }, { "completion_length": 241.3125, "epoch": 16.96, "grad_norm": 3.28125, "kl": 5.09554386138916, "learning_rate": 4.178008116245024e-05, "loss": 0.2038, "reward": 2.0615645051002502, "reward_std": 2.78755259513855, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3366751670837402, "rewards/no_repetition_reward_func": -0.27511073648929596, "rewards/verse_reward_func": 0.0, "step": 2120 }, { "completion_length": 253.78125, "epoch": 16.968, "grad_norm": 1.8515625, "kl": 4.971356153488159, "learning_rate": 4.176972841719435e-05, "loss": 0.1989, "reward": 2.245313048362732, "reward_std": 2.717431426048279, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4747246503829956, "rewards/no_repetition_reward_func": -0.22941157221794128, "rewards/verse_reward_func": 0.0, "step": 2121 }, { "completion_length": 249.25, "epoch": 16.976, "grad_norm": 2.734375, "kl": 3.791730761528015, "learning_rate": 4.1759370440986775e-05, "loss": 0.1517, "reward": 3.006233334541321, "reward_std": 3.047988772392273, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.293795347213745, "rewards/no_repetition_reward_func": -0.27974940091371536, "rewards/verse_reward_func": -0.0078125, "step": 2122 }, { "completion_length": 253.671875, "epoch": 16.984, "grad_norm": 8.5625, "kl": 5.9479405879974365, "learning_rate": 4.174900723705845e-05, "loss": 0.2379, "reward": 1.9616936445236206, "reward_std": 2.498745083808899, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.214458703994751, "rewards/no_repetition_reward_func": -0.24495256692171097, "rewards/verse_reward_func": -0.0078125, "step": 2123 }, { "completion_length": 253.4375, "epoch": 16.992, "grad_norm": 3.03125, "kl": 4.207118988037109, "learning_rate": 4.1738638808641936e-05, "loss": 0.1683, "reward": 2.6392014026641846, "reward_std": 3.2620620727539062, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9167155027389526, "rewards/no_repetition_reward_func": -0.27751418948173523, "rewards/verse_reward_func": 0.0, "step": 2124 }, { "completion_length": 242.6875, "epoch": 17.0, "grad_norm": 2.578125, "kl": 4.485718131065369, "learning_rate": 4.172826515897146e-05, "loss": 0.1794, "reward": 2.4705634117126465, "reward_std": 2.712360978126526, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7346761226654053, "rewards/no_repetition_reward_func": -0.25630033016204834, "rewards/verse_reward_func": -0.0078125, "step": 2125 }, { "completion_length": 247.875, "epoch": 17.008, "grad_norm": 2.421875, "kl": 4.3876261711120605, "learning_rate": 4.171788629128284e-05, "loss": 0.1755, "reward": 2.675535798072815, "reward_std": 3.380781650543213, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.978708267211914, "rewards/no_repetition_reward_func": -0.3031724989414215, "rewards/verse_reward_func": 0.0, "step": 2126 }, { "completion_length": 251.703125, "epoch": 17.016, "grad_norm": 4.125, "kl": 5.246678829193115, "learning_rate": 4.170750220881354e-05, "loss": 0.2099, "reward": 2.2531200647354126, "reward_std": 3.379201292991638, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6110414266586304, "rewards/no_repetition_reward_func": -0.357921302318573, "rewards/verse_reward_func": 0.0, "step": 2127 }, { "completion_length": 249.25, "epoch": 17.024, "grad_norm": 4.0625, "kl": 4.118128180503845, "learning_rate": 4.169711291480266e-05, "loss": 0.1647, "reward": 2.698296546936035, "reward_std": 2.938825488090515, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.964276671409607, "rewards/no_repetition_reward_func": -0.26598016917705536, "rewards/verse_reward_func": 0.0, "step": 2128 }, { "completion_length": 249.5, "epoch": 17.032, "grad_norm": 1.8515625, "kl": 4.1175127029418945, "learning_rate": 4.168671841249091e-05, "loss": 0.1647, "reward": 2.8805909156799316, "reward_std": 3.2003650665283203, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1882333755493164, "rewards/no_repetition_reward_func": -0.30764253437519073, "rewards/verse_reward_func": 0.0, "step": 2129 }, { "completion_length": 250.953125, "epoch": 17.04, "grad_norm": 1.75, "kl": 4.906309604644775, "learning_rate": 4.1676318705120616e-05, "loss": 0.1963, "reward": 2.1147918105125427, "reward_std": 2.9196784496307373, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4121406078338623, "rewards/no_repetition_reward_func": -0.29734864830970764, "rewards/verse_reward_func": 0.0, "step": 2130 }, { "completion_length": 252.15625, "epoch": 17.048, "grad_norm": 2.8125, "kl": 4.644791126251221, "learning_rate": 4.166591379593575e-05, "loss": 0.1858, "reward": 2.5603045225143433, "reward_std": 2.8469573259353638, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8270792961120605, "rewards/no_repetition_reward_func": -0.2511497661471367, "rewards/verse_reward_func": -0.015625, "step": 2131 }, { "completion_length": 247.6875, "epoch": 17.056, "grad_norm": 9.0, "kl": 6.098752498626709, "learning_rate": 4.16555036881819e-05, "loss": 0.244, "reward": 2.263812780380249, "reward_std": 2.8431448936462402, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5377233028411865, "rewards/no_repetition_reward_func": -0.26609787344932556, "rewards/verse_reward_func": -0.0078125, "step": 2132 }, { "completion_length": 252.9375, "epoch": 17.064, "grad_norm": 1.8671875, "kl": 4.692458868026733, "learning_rate": 4.1645088385106266e-05, "loss": 0.1877, "reward": 2.327327847480774, "reward_std": 3.0487676858901978, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.610887885093689, "rewards/no_repetition_reward_func": -0.2757475972175598, "rewards/verse_reward_func": -0.0078125, "step": 2133 }, { "completion_length": 256.0, "epoch": 17.072, "grad_norm": 2.0, "kl": 5.36226224899292, "learning_rate": 4.1634667889957676e-05, "loss": 0.2145, "reward": 2.366718053817749, "reward_std": 2.997954845428467, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.687317728996277, "rewards/no_repetition_reward_func": -0.31278713047504425, "rewards/verse_reward_func": -0.0078125, "step": 2134 }, { "completion_length": 251.3125, "epoch": 17.08, "grad_norm": 2.765625, "kl": 4.505186319351196, "learning_rate": 4.162424220598658e-05, "loss": 0.1802, "reward": 2.868725061416626, "reward_std": 3.3862143754959106, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1874842643737793, "rewards/no_repetition_reward_func": -0.30313409864902496, "rewards/verse_reward_func": -0.015625, "step": 2135 }, { "completion_length": 253.890625, "epoch": 17.088, "grad_norm": 1.2421875, "kl": 4.768741846084595, "learning_rate": 4.161381133644505e-05, "loss": 0.1907, "reward": 2.7360000610351562, "reward_std": 3.0222702026367188, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0327926874160767, "rewards/no_repetition_reward_func": -0.2889802008867264, "rewards/verse_reward_func": -0.0078125, "step": 2136 }, { "completion_length": 255.421875, "epoch": 17.096, "grad_norm": 3.0, "kl": 3.873328447341919, "learning_rate": 4.160337528458676e-05, "loss": 0.1549, "reward": 3.1816935539245605, "reward_std": 3.397389054298401, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.443312168121338, "rewards/no_repetition_reward_func": -0.26161856949329376, "rewards/verse_reward_func": 0.0, "step": 2137 }, { "completion_length": 245.96875, "epoch": 17.104, "grad_norm": 3.875, "kl": 5.0496156215667725, "learning_rate": 4.1592934053667004e-05, "loss": 0.202, "reward": 2.500556230545044, "reward_std": 2.8549180030822754, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7618443965911865, "rewards/no_repetition_reward_func": -0.2612880542874336, "rewards/verse_reward_func": 0.0, "step": 2138 }, { "completion_length": 251.859375, "epoch": 17.112, "grad_norm": 3.203125, "kl": 5.745846748352051, "learning_rate": 4.1582487646942706e-05, "loss": 0.2298, "reward": 2.0572420358657837, "reward_std": 2.649729371070862, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.276799440383911, "rewards/no_repetition_reward_func": -0.21174488216638565, "rewards/verse_reward_func": -0.0078125, "step": 2139 }, { "completion_length": 241.703125, "epoch": 17.12, "grad_norm": 4.4375, "kl": 5.591476917266846, "learning_rate": 4.157203606767238e-05, "loss": 0.2237, "reward": 2.2521450519561768, "reward_std": 2.792680501937866, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.521549701690674, "rewards/no_repetition_reward_func": -0.26940469443798065, "rewards/verse_reward_func": 0.0, "step": 2140 }, { "completion_length": 250.96875, "epoch": 17.128, "grad_norm": 3.3125, "kl": 4.40018367767334, "learning_rate": 4.156157931911619e-05, "loss": 0.176, "reward": 2.9696109294891357, "reward_std": 2.993193507194519, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2463767528533936, "rewards/no_repetition_reward_func": -0.26895324885845184, "rewards/verse_reward_func": -0.0078125, "step": 2141 }, { "completion_length": 250.84375, "epoch": 17.136, "grad_norm": 1.8984375, "kl": 5.086517333984375, "learning_rate": 4.155111740453588e-05, "loss": 0.2035, "reward": 2.2721132040023804, "reward_std": 3.05912983417511, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.563849151134491, "rewards/no_repetition_reward_func": -0.2917358875274658, "rewards/verse_reward_func": 0.0, "step": 2142 }, { "completion_length": 252.796875, "epoch": 17.144, "grad_norm": 2.21875, "kl": 4.473219156265259, "learning_rate": 4.154065032719481e-05, "loss": 0.1789, "reward": 2.8314473628997803, "reward_std": 3.2401185035705566, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.098798632621765, "rewards/no_repetition_reward_func": -0.2595387175679207, "rewards/verse_reward_func": -0.0078125, "step": 2143 }, { "completion_length": 247.65625, "epoch": 17.152, "grad_norm": 1.375, "kl": 4.956548452377319, "learning_rate": 4.1530178090357976e-05, "loss": 0.1983, "reward": 2.02340430021286, "reward_std": 2.805241584777832, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.306469678878784, "rewards/no_repetition_reward_func": -0.27525292336940765, "rewards/verse_reward_func": -0.0078125, "step": 2144 }, { "completion_length": 252.34375, "epoch": 17.16, "grad_norm": 2.375, "kl": 4.824646472930908, "learning_rate": 4.1519700697291944e-05, "loss": 0.193, "reward": 2.529452681541443, "reward_std": 2.800560235977173, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8130170106887817, "rewards/no_repetition_reward_func": -0.2757517993450165, "rewards/verse_reward_func": -0.0078125, "step": 2145 }, { "completion_length": 253.171875, "epoch": 17.168, "grad_norm": 1.9765625, "kl": 5.465849161148071, "learning_rate": 4.150921815126493e-05, "loss": 0.2186, "reward": 2.1860474944114685, "reward_std": 3.097660779953003, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4449095726013184, "rewards/no_repetition_reward_func": -0.2588620036840439, "rewards/verse_reward_func": 0.0, "step": 2146 }, { "completion_length": 252.96875, "epoch": 17.176, "grad_norm": 2.859375, "kl": 4.041864991188049, "learning_rate": 4.149873045554671e-05, "loss": 0.1617, "reward": 2.9560835361480713, "reward_std": 3.2485716342926025, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.269108772277832, "rewards/no_repetition_reward_func": -0.3130252808332443, "rewards/verse_reward_func": 0.0, "step": 2147 }, { "completion_length": 248.40625, "epoch": 17.184, "grad_norm": 2.78125, "kl": 4.752201080322266, "learning_rate": 4.148823761340871e-05, "loss": 0.1901, "reward": 2.3584271669387817, "reward_std": 2.595867156982422, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6124629974365234, "rewards/no_repetition_reward_func": -0.25403590500354767, "rewards/verse_reward_func": 0.0, "step": 2148 }, { "completion_length": 256.0, "epoch": 17.192, "grad_norm": 2.890625, "kl": 3.5842678546905518, "learning_rate": 4.1477739628123934e-05, "loss": 0.1434, "reward": 2.89387583732605, "reward_std": 3.09299099445343, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1985397338867188, "rewards/no_repetition_reward_func": -0.3046637624502182, "rewards/verse_reward_func": 0.0, "step": 2149 }, { "completion_length": 250.8125, "epoch": 17.2, "grad_norm": 3.828125, "kl": 4.699519872665405, "learning_rate": 4.146723650296701e-05, "loss": 0.188, "reward": 2.05034202337265, "reward_std": 2.677911162376404, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2981892824172974, "rewards/no_repetition_reward_func": -0.24784750491380692, "rewards/verse_reward_func": 0.0, "step": 2150 }, { "completion_length": 254.5625, "epoch": 17.208, "grad_norm": 3.1875, "kl": 4.405236005783081, "learning_rate": 4.145672824121416e-05, "loss": 0.1762, "reward": 2.7540119886398315, "reward_std": 3.1159571409225464, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0750060081481934, "rewards/no_repetition_reward_func": -0.32099398970603943, "rewards/verse_reward_func": 0.0, "step": 2151 }, { "completion_length": 246.828125, "epoch": 17.216, "grad_norm": 2.3125, "kl": 4.24078106880188, "learning_rate": 4.144621484614319e-05, "loss": 0.1696, "reward": 2.3880739212036133, "reward_std": 3.1021538972854614, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6582528352737427, "rewards/no_repetition_reward_func": -0.262366458773613, "rewards/verse_reward_func": -0.0078125, "step": 2152 }, { "completion_length": 247.734375, "epoch": 17.224, "grad_norm": 3.8125, "kl": 3.495063304901123, "learning_rate": 4.1435696321033554e-05, "loss": 0.1398, "reward": 3.1647188663482666, "reward_std": 3.0393251180648804, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4635560512542725, "rewards/no_repetition_reward_func": -0.298837274312973, "rewards/verse_reward_func": 0.0, "step": 2153 }, { "completion_length": 254.125, "epoch": 17.232, "grad_norm": 3.3125, "kl": 4.219391345977783, "learning_rate": 4.142517266916625e-05, "loss": 0.1688, "reward": 3.0091960430145264, "reward_std": 3.168182134628296, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2924469709396362, "rewards/no_repetition_reward_func": -0.28325097262859344, "rewards/verse_reward_func": 0.0, "step": 2154 }, { "completion_length": 254.484375, "epoch": 17.24, "grad_norm": 2.28125, "kl": 4.791657567024231, "learning_rate": 4.1414643893823914e-05, "loss": 0.1917, "reward": 2.4755464792251587, "reward_std": 3.0037275552749634, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7878177165985107, "rewards/no_repetition_reward_func": -0.30445875227451324, "rewards/verse_reward_func": -0.0078125, "step": 2155 }, { "completion_length": 248.5, "epoch": 17.248, "grad_norm": 4.03125, "kl": 4.325294256210327, "learning_rate": 4.140410999829076e-05, "loss": 0.173, "reward": 3.0842278003692627, "reward_std": 3.344544529914856, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4157145023345947, "rewards/no_repetition_reward_func": -0.33148662745952606, "rewards/verse_reward_func": 0.0, "step": 2156 }, { "completion_length": 256.0, "epoch": 17.256, "grad_norm": 2.984375, "kl": 5.006987810134888, "learning_rate": 4.139357098585262e-05, "loss": 0.2003, "reward": 2.8537248373031616, "reward_std": 2.7133806943893433, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.104186773300171, "rewards/no_repetition_reward_func": -0.2504620924592018, "rewards/verse_reward_func": 0.0, "step": 2157 }, { "completion_length": 256.0, "epoch": 17.264, "grad_norm": 2.046875, "kl": 4.894481182098389, "learning_rate": 4.1383026859796905e-05, "loss": 0.1958, "reward": 2.4120118618011475, "reward_std": 3.114297389984131, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.685348629951477, "rewards/no_repetition_reward_func": -0.25771181285381317, "rewards/verse_reward_func": -0.015625, "step": 2158 }, { "completion_length": 245.375, "epoch": 17.272, "grad_norm": 4.0625, "kl": 6.26292610168457, "learning_rate": 4.137247762341262e-05, "loss": 0.2505, "reward": 1.8038525581359863, "reward_std": 2.7345162630081177, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0856791734695435, "rewards/no_repetition_reward_func": -0.2740141749382019, "rewards/verse_reward_func": -0.0078125, "step": 2159 }, { "completion_length": 251.84375, "epoch": 17.28, "grad_norm": 2.203125, "kl": 5.578222274780273, "learning_rate": 4.136192327999037e-05, "loss": 0.2231, "reward": 1.8333059549331665, "reward_std": 2.8729244470596313, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1088715195655823, "rewards/no_repetition_reward_func": -0.2755655348300934, "rewards/verse_reward_func": 0.0, "step": 2160 }, { "completion_length": 254.484375, "epoch": 17.288, "grad_norm": 2.296875, "kl": 3.6717034578323364, "learning_rate": 4.135136383282237e-05, "loss": 0.1469, "reward": 3.071879744529724, "reward_std": 2.909413695335388, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3559728860855103, "rewards/no_repetition_reward_func": -0.2840932756662369, "rewards/verse_reward_func": 0.0, "step": 2161 }, { "completion_length": 249.234375, "epoch": 17.296, "grad_norm": 1.6484375, "kl": 4.749706149101257, "learning_rate": 4.1340799285202376e-05, "loss": 0.19, "reward": 2.5516043305397034, "reward_std": 2.821066379547119, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.840725600719452, "rewards/no_repetition_reward_func": -0.281308576464653, "rewards/verse_reward_func": -0.0078125, "step": 2162 }, { "completion_length": 255.234375, "epoch": 17.304, "grad_norm": 2.109375, "kl": 4.243583679199219, "learning_rate": 4.13302296404258e-05, "loss": 0.1697, "reward": 2.72853946685791, "reward_std": 3.2818257808685303, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.035010814666748, "rewards/no_repetition_reward_func": -0.2986588776111603, "rewards/verse_reward_func": -0.0078125, "step": 2163 }, { "completion_length": 253.703125, "epoch": 17.312, "grad_norm": 2.703125, "kl": 6.457730293273926, "learning_rate": 4.131965490178959e-05, "loss": 0.2583, "reward": 2.095037341117859, "reward_std": 2.8992598056793213, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3306955099105835, "rewards/no_repetition_reward_func": -0.23565802723169327, "rewards/verse_reward_func": 0.0, "step": 2164 }, { "completion_length": 246.703125, "epoch": 17.32, "grad_norm": 2.265625, "kl": 4.786019802093506, "learning_rate": 4.130907507259233e-05, "loss": 0.1914, "reward": 2.407847046852112, "reward_std": 2.9781404733657837, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.716954469680786, "rewards/no_repetition_reward_func": -0.3012949377298355, "rewards/verse_reward_func": -0.0078125, "step": 2165 }, { "completion_length": 248.328125, "epoch": 17.328, "grad_norm": 6.1875, "kl": 4.542564511299133, "learning_rate": 4.129849015613415e-05, "loss": 0.1817, "reward": 1.9924708604812622, "reward_std": 2.37640643119812, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.207657814025879, "rewards/no_repetition_reward_func": -0.21518690884113312, "rewards/verse_reward_func": 0.0, "step": 2166 }, { "completion_length": 247.515625, "epoch": 17.336, "grad_norm": 1.75, "kl": 4.255354881286621, "learning_rate": 4.1287900155716784e-05, "loss": 0.1702, "reward": 2.4392127990722656, "reward_std": 3.09393048286438, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6979615688323975, "rewards/no_repetition_reward_func": -0.25874873995780945, "rewards/verse_reward_func": 0.0, "step": 2167 }, { "completion_length": 254.921875, "epoch": 17.344, "grad_norm": 2.3125, "kl": 4.068496465682983, "learning_rate": 4.127730507464356e-05, "loss": 0.1627, "reward": 2.801608443260193, "reward_std": 3.183234691619873, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.08173930644989, "rewards/no_repetition_reward_func": -0.28013060986995697, "rewards/verse_reward_func": 0.0, "step": 2168 }, { "completion_length": 249.390625, "epoch": 17.352, "grad_norm": 2.390625, "kl": 4.467053174972534, "learning_rate": 4.126670491621938e-05, "loss": 0.1787, "reward": 1.6719983220100403, "reward_std": 2.5389684438705444, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9393301010131836, "rewards/no_repetition_reward_func": -0.2595193386077881, "rewards/verse_reward_func": -0.0078125, "step": 2169 }, { "completion_length": 256.0, "epoch": 17.36, "grad_norm": 1.9453125, "kl": 3.593240261077881, "learning_rate": 4.125609968375072e-05, "loss": 0.1437, "reward": 2.820358395576477, "reward_std": 2.632313370704651, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1117606163024902, "rewards/no_repetition_reward_func": -0.2914021909236908, "rewards/verse_reward_func": 0.0, "step": 2170 }, { "completion_length": 248.4375, "epoch": 17.368, "grad_norm": 6.53125, "kl": 5.130595684051514, "learning_rate": 4.124548938054568e-05, "loss": 0.2052, "reward": 2.259803354740143, "reward_std": 3.041944742202759, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.598373055458069, "rewards/no_repetition_reward_func": -0.3385697901248932, "rewards/verse_reward_func": 0.0, "step": 2171 }, { "completion_length": 251.265625, "epoch": 17.376, "grad_norm": 2.390625, "kl": 4.262453198432922, "learning_rate": 4.123487400991388e-05, "loss": 0.1705, "reward": 2.643121600151062, "reward_std": 3.1638561487197876, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9471932649612427, "rewards/no_repetition_reward_func": -0.28844648599624634, "rewards/verse_reward_func": -0.015625, "step": 2172 }, { "completion_length": 246.125, "epoch": 17.384, "grad_norm": 4.75, "kl": 4.354579329490662, "learning_rate": 4.122425357516658e-05, "loss": 0.1742, "reward": 2.784691095352173, "reward_std": 3.2079843282699585, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0686213970184326, "rewards/no_repetition_reward_func": -0.2839304357767105, "rewards/verse_reward_func": 0.0, "step": 2173 }, { "completion_length": 256.0, "epoch": 17.392, "grad_norm": 2.046875, "kl": 4.6328699588775635, "learning_rate": 4.121362807961658e-05, "loss": 0.1853, "reward": 2.534272313117981, "reward_std": 2.8585199117660522, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.797175168991089, "rewards/no_repetition_reward_func": -0.26290301978588104, "rewards/verse_reward_func": 0.0, "step": 2174 }, { "completion_length": 248.0, "epoch": 17.4, "grad_norm": 1.703125, "kl": 4.517661690711975, "learning_rate": 4.1202997526578276e-05, "loss": 0.1807, "reward": 2.4036121368408203, "reward_std": 2.976213574409485, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6662062406539917, "rewards/no_repetition_reward_func": -0.25478144735097885, "rewards/verse_reward_func": -0.0078125, "step": 2175 }, { "completion_length": 240.078125, "epoch": 17.408, "grad_norm": 2.046875, "kl": 4.2620580196380615, "learning_rate": 4.119236191936764e-05, "loss": 0.1705, "reward": 2.2464520931243896, "reward_std": 3.110324263572693, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5170648097991943, "rewards/no_repetition_reward_func": -0.26280028373003006, "rewards/verse_reward_func": -0.0078125, "step": 2176 }, { "completion_length": 250.4375, "epoch": 17.416, "grad_norm": 2.640625, "kl": 3.767265796661377, "learning_rate": 4.118172126130221e-05, "loss": 0.1507, "reward": 2.4724828600883484, "reward_std": 3.041109561920166, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7734568119049072, "rewards/no_repetition_reward_func": -0.28534914553165436, "rewards/verse_reward_func": -0.015625, "step": 2177 }, { "completion_length": 252.390625, "epoch": 17.424, "grad_norm": 2.0, "kl": 4.4818620681762695, "learning_rate": 4.117107555570111e-05, "loss": 0.1793, "reward": 2.621318221092224, "reward_std": 3.1676387786865234, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8935601711273193, "rewards/no_repetition_reward_func": -0.2722420394420624, "rewards/verse_reward_func": 0.0, "step": 2178 }, { "completion_length": 245.078125, "epoch": 17.432, "grad_norm": 2.1875, "kl": 4.020500898361206, "learning_rate": 4.116042480588505e-05, "loss": 0.1608, "reward": 2.5208966732025146, "reward_std": 2.988616704940796, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7898306846618652, "rewards/no_repetition_reward_func": -0.26893408596515656, "rewards/verse_reward_func": 0.0, "step": 2179 }, { "completion_length": 256.0, "epoch": 17.44, "grad_norm": 3.71875, "kl": 4.86724853515625, "learning_rate": 4.1149769015176275e-05, "loss": 0.1947, "reward": 1.8640042543411255, "reward_std": 2.68819260597229, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1679563522338867, "rewards/no_repetition_reward_func": -0.30395203828811646, "rewards/verse_reward_func": 0.0, "step": 2180 }, { "completion_length": 254.296875, "epoch": 17.448, "grad_norm": 3.1875, "kl": 4.351797580718994, "learning_rate": 4.113910818689864e-05, "loss": 0.1741, "reward": 3.027562975883484, "reward_std": 3.239531993865967, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.314828872680664, "rewards/no_repetition_reward_func": -0.2872658520936966, "rewards/verse_reward_func": 0.0, "step": 2181 }, { "completion_length": 255.0, "epoch": 17.456, "grad_norm": 1.2734375, "kl": 4.618480205535889, "learning_rate": 4.112844232437757e-05, "loss": 0.1847, "reward": 1.9580370783805847, "reward_std": 2.6155306100845337, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.230980157852173, "rewards/no_repetition_reward_func": -0.26513054966926575, "rewards/verse_reward_func": -0.0078125, "step": 2182 }, { "completion_length": 252.359375, "epoch": 17.464, "grad_norm": 2.578125, "kl": 4.140326499938965, "learning_rate": 4.1117771430940035e-05, "loss": 0.1656, "reward": 2.9289554357528687, "reward_std": 3.2195258140563965, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2447078227996826, "rewards/no_repetition_reward_func": -0.307939812541008, "rewards/verse_reward_func": -0.0078125, "step": 2183 }, { "completion_length": 256.0, "epoch": 17.472, "grad_norm": 3.265625, "kl": 6.07608962059021, "learning_rate": 4.1107095509914584e-05, "loss": 0.243, "reward": 2.433380365371704, "reward_std": 3.0355910062789917, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7276424169540405, "rewards/no_repetition_reward_func": -0.2786368280649185, "rewards/verse_reward_func": -0.015625, "step": 2184 }, { "completion_length": 253.328125, "epoch": 17.48, "grad_norm": 2.1875, "kl": 4.587411880493164, "learning_rate": 4.109641456463135e-05, "loss": 0.1835, "reward": 2.3178000450134277, "reward_std": 2.8729575872421265, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5809643268585205, "rewards/no_repetition_reward_func": -0.24753927439451218, "rewards/verse_reward_func": -0.015625, "step": 2185 }, { "completion_length": 252.375, "epoch": 17.488, "grad_norm": 3.671875, "kl": 4.067588806152344, "learning_rate": 4.108572859842201e-05, "loss": 0.1627, "reward": 2.5925278663635254, "reward_std": 2.64117169380188, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.842682719230652, "rewards/no_repetition_reward_func": -0.250154510140419, "rewards/verse_reward_func": 0.0, "step": 2186 }, { "completion_length": 242.265625, "epoch": 17.496, "grad_norm": 1.7265625, "kl": 4.808663845062256, "learning_rate": 4.107503761461983e-05, "loss": 0.1923, "reward": 2.3780952095985413, "reward_std": 2.8922311067581177, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.642038106918335, "rewards/no_repetition_reward_func": -0.25613050907850266, "rewards/verse_reward_func": -0.0078125, "step": 2187 }, { "completion_length": 252.3125, "epoch": 17.504, "grad_norm": 2.75, "kl": 4.289061784744263, "learning_rate": 4.106434161655962e-05, "loss": 0.1716, "reward": 2.8154284954071045, "reward_std": 3.1813822984695435, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.104027509689331, "rewards/no_repetition_reward_func": -0.28859902918338776, "rewards/verse_reward_func": 0.0, "step": 2188 }, { "completion_length": 250.609375, "epoch": 17.512, "grad_norm": 3.3125, "kl": 5.566397428512573, "learning_rate": 4.105364060757776e-05, "loss": 0.2227, "reward": 2.4456793069839478, "reward_std": 3.020794630050659, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.744419574737549, "rewards/no_repetition_reward_func": -0.2909277230501175, "rewards/verse_reward_func": -0.0078125, "step": 2189 }, { "completion_length": 243.21875, "epoch": 17.52, "grad_norm": 2.140625, "kl": 5.561991214752197, "learning_rate": 4.104293459101222e-05, "loss": 0.2225, "reward": 2.6687055826187134, "reward_std": 3.2324787378311157, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.942288398742676, "rewards/no_repetition_reward_func": -0.26577039062976837, "rewards/verse_reward_func": -0.0078125, "step": 2190 }, { "completion_length": 251.953125, "epoch": 17.528, "grad_norm": 3.046875, "kl": 4.487979888916016, "learning_rate": 4.1032223570202474e-05, "loss": 0.1795, "reward": 3.1298965215682983, "reward_std": 3.355760931968689, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4689074754714966, "rewards/no_repetition_reward_func": -0.3311983346939087, "rewards/verse_reward_func": -0.0078125, "step": 2191 }, { "completion_length": 252.296875, "epoch": 17.536, "grad_norm": 2.484375, "kl": 4.151376962661743, "learning_rate": 4.1021507548489625e-05, "loss": 0.1661, "reward": 2.4465125799179077, "reward_std": 3.125458598136902, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.707392930984497, "rewards/no_repetition_reward_func": -0.2608804702758789, "rewards/verse_reward_func": 0.0, "step": 2192 }, { "completion_length": 255.46875, "epoch": 17.544, "grad_norm": 5.53125, "kl": 4.854943037033081, "learning_rate": 4.1010786529216284e-05, "loss": 0.1942, "reward": 2.0159390568733215, "reward_std": 2.381387710571289, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.253080368041992, "rewards/no_repetition_reward_func": -0.23714125901460648, "rewards/verse_reward_func": 0.0, "step": 2193 }, { "completion_length": 253.875, "epoch": 17.552, "grad_norm": 1.8984375, "kl": 4.671477556228638, "learning_rate": 4.1000060515726647e-05, "loss": 0.1869, "reward": 3.1366329193115234, "reward_std": 3.2373944520950317, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4373767375946045, "rewards/no_repetition_reward_func": -0.29293105006217957, "rewards/verse_reward_func": -0.0078125, "step": 2194 }, { "completion_length": 250.3125, "epoch": 17.56, "grad_norm": 3.21875, "kl": 4.898998975753784, "learning_rate": 4.098932951136645e-05, "loss": 0.196, "reward": 2.03059720993042, "reward_std": 2.680599808692932, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.284151077270508, "rewards/no_repetition_reward_func": -0.25355391204357147, "rewards/verse_reward_func": 0.0, "step": 2195 }, { "completion_length": 246.4375, "epoch": 17.568, "grad_norm": 1.6171875, "kl": 4.541055679321289, "learning_rate": 4.097859351948301e-05, "loss": 0.1816, "reward": 2.357798933982849, "reward_std": 3.1661354303359985, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.649786949157715, "rewards/no_repetition_reward_func": -0.2841756343841553, "rewards/verse_reward_func": -0.0078125, "step": 2196 }, { "completion_length": 248.625, "epoch": 17.576, "grad_norm": 3.28125, "kl": 3.967406988143921, "learning_rate": 4.0967852543425175e-05, "loss": 0.1587, "reward": 2.893718123435974, "reward_std": 2.780806541442871, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1604944467544556, "rewards/no_repetition_reward_func": -0.26677630096673965, "rewards/verse_reward_func": 0.0, "step": 2197 }, { "completion_length": 250.9375, "epoch": 17.584, "grad_norm": 5.53125, "kl": 4.990006685256958, "learning_rate": 4.095710658654337e-05, "loss": 0.1996, "reward": 2.1337908506393433, "reward_std": 2.947660803794861, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4195241928100586, "rewards/no_repetition_reward_func": -0.2857333570718765, "rewards/verse_reward_func": 0.0, "step": 2198 }, { "completion_length": 250.984375, "epoch": 17.592, "grad_norm": 2.4375, "kl": 4.164749622344971, "learning_rate": 4.094635565218955e-05, "loss": 0.1666, "reward": 2.4716851711273193, "reward_std": 2.8675143718719482, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.717555284500122, "rewards/no_repetition_reward_func": -0.2458701729774475, "rewards/verse_reward_func": 0.0, "step": 2199 }, { "completion_length": 247.625, "epoch": 17.6, "grad_norm": 2.328125, "kl": 4.912242412567139, "learning_rate": 4.093559974371725e-05, "loss": 0.1965, "reward": 2.476871132850647, "reward_std": 2.6941877603530884, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7763708233833313, "rewards/no_repetition_reward_func": -0.29168701171875, "rewards/verse_reward_func": -0.0078125, "step": 2200 }, { "completion_length": 254.859375, "epoch": 17.608, "grad_norm": 3.421875, "kl": 3.814639687538147, "learning_rate": 4.0924838864481516e-05, "loss": 0.1526, "reward": 3.668330192565918, "reward_std": 3.2776745557785034, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 4.00974702835083, "rewards/no_repetition_reward_func": -0.3257918804883957, "rewards/verse_reward_func": -0.015625, "step": 2201 }, { "completion_length": 243.34375, "epoch": 17.616, "grad_norm": 2.734375, "kl": 4.968204498291016, "learning_rate": 4.0914073017838996e-05, "loss": 0.1987, "reward": 2.075056791305542, "reward_std": 2.6093562841415405, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.297365665435791, "rewards/no_repetition_reward_func": -0.22230901569128036, "rewards/verse_reward_func": 0.0, "step": 2202 }, { "completion_length": 245.984375, "epoch": 17.624, "grad_norm": 2.5625, "kl": 4.087998867034912, "learning_rate": 4.090330220714785e-05, "loss": 0.1635, "reward": 2.890721559524536, "reward_std": 3.2534185647964478, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.174811840057373, "rewards/no_repetition_reward_func": -0.27627770602703094, "rewards/verse_reward_func": -0.0078125, "step": 2203 }, { "completion_length": 246.9375, "epoch": 17.632, "grad_norm": 3.484375, "kl": 3.7636239528656006, "learning_rate": 4.0892526435767795e-05, "loss": 0.1505, "reward": 2.6786874532699585, "reward_std": 2.911014199256897, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9305026531219482, "rewards/no_repetition_reward_func": -0.25181519985198975, "rewards/verse_reward_func": 0.0, "step": 2204 }, { "completion_length": 240.609375, "epoch": 17.64, "grad_norm": 2.890625, "kl": 4.595811367034912, "learning_rate": 4.088174570706011e-05, "loss": 0.1838, "reward": 2.2768309116363525, "reward_std": 2.9825648069381714, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.531467318534851, "rewards/no_repetition_reward_func": -0.25463642179965973, "rewards/verse_reward_func": 0.0, "step": 2205 }, { "completion_length": 256.0, "epoch": 17.648, "grad_norm": 2.875, "kl": 4.52730131149292, "learning_rate": 4.0870960024387596e-05, "loss": 0.1811, "reward": 2.5326489210128784, "reward_std": 3.088709592819214, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8063493967056274, "rewards/no_repetition_reward_func": -0.27370060980319977, "rewards/verse_reward_func": 0.0, "step": 2206 }, { "completion_length": 255.859375, "epoch": 17.656, "grad_norm": 2.40625, "kl": 4.821739435195923, "learning_rate": 4.0860169391114625e-05, "loss": 0.1929, "reward": 2.931147575378418, "reward_std": 3.2746537923812866, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.254456400871277, "rewards/no_repetition_reward_func": -0.315496489405632, "rewards/verse_reward_func": -0.0078125, "step": 2207 }, { "completion_length": 252.6875, "epoch": 17.664, "grad_norm": 2.296875, "kl": 4.5804736614227295, "learning_rate": 4.084937381060708e-05, "loss": 0.1832, "reward": 2.548119366168976, "reward_std": 3.0568037033081055, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8387749195098877, "rewards/no_repetition_reward_func": -0.2828429341316223, "rewards/verse_reward_func": -0.0078125, "step": 2208 }, { "completion_length": 254.640625, "epoch": 17.672, "grad_norm": 3.53125, "kl": 5.958988666534424, "learning_rate": 4.083857328623243e-05, "loss": 0.2384, "reward": 2.1023523807525635, "reward_std": 2.8245768547058105, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3839762210845947, "rewards/no_repetition_reward_func": -0.26599907875061035, "rewards/verse_reward_func": -0.015625, "step": 2209 }, { "completion_length": 250.53125, "epoch": 17.68, "grad_norm": 3.484375, "kl": 5.326699495315552, "learning_rate": 4.082776782135964e-05, "loss": 0.2131, "reward": 2.3871558904647827, "reward_std": 2.5662025213241577, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.671810030937195, "rewards/no_repetition_reward_func": -0.2846542149782181, "rewards/verse_reward_func": 0.0, "step": 2210 }, { "completion_length": 243.8125, "epoch": 17.688, "grad_norm": 3.046875, "kl": 4.430163741111755, "learning_rate": 4.0816957419359264e-05, "loss": 0.1772, "reward": 2.3722957372665405, "reward_std": 2.8727389574050903, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6654542684555054, "rewards/no_repetition_reward_func": -0.2853460907936096, "rewards/verse_reward_func": -0.0078125, "step": 2211 }, { "completion_length": 252.59375, "epoch": 17.696, "grad_norm": 1.7890625, "kl": 4.84326434135437, "learning_rate": 4.080614208360336e-05, "loss": 0.1937, "reward": 2.9801716804504395, "reward_std": 3.08680522441864, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.254770517349243, "rewards/no_repetition_reward_func": -0.27459900081157684, "rewards/verse_reward_func": 0.0, "step": 2212 }, { "completion_length": 254.1875, "epoch": 17.704, "grad_norm": 1.7421875, "kl": 4.191472887992859, "learning_rate": 4.079532181746553e-05, "loss": 0.1677, "reward": 2.7489681243896484, "reward_std": 3.1277339458465576, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.062174439430237, "rewards/no_repetition_reward_func": -0.31320634484291077, "rewards/verse_reward_func": 0.0, "step": 2213 }, { "completion_length": 249.78125, "epoch": 17.712, "grad_norm": 3.796875, "kl": 3.9699116945266724, "learning_rate": 4.078449662432093e-05, "loss": 0.1588, "reward": 2.73721182346344, "reward_std": 3.3029983043670654, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0546408891677856, "rewards/no_repetition_reward_func": -0.3096165508031845, "rewards/verse_reward_func": -0.0078125, "step": 2214 }, { "completion_length": 253.328125, "epoch": 17.72, "grad_norm": 4.65625, "kl": 4.411782145500183, "learning_rate": 4.077366650754624e-05, "loss": 0.1765, "reward": 2.2037988901138306, "reward_std": 3.201733708381653, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5068734884262085, "rewards/no_repetition_reward_func": -0.30307458341121674, "rewards/verse_reward_func": 0.0, "step": 2215 }, { "completion_length": 249.1875, "epoch": 17.728, "grad_norm": 4.21875, "kl": 5.383717060089111, "learning_rate": 4.076283147051968e-05, "loss": 0.2153, "reward": 2.569001317024231, "reward_std": 3.2151808738708496, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.886982798576355, "rewards/no_repetition_reward_func": -0.3101690411567688, "rewards/verse_reward_func": -0.0078125, "step": 2216 }, { "completion_length": 249.421875, "epoch": 17.736, "grad_norm": 5.09375, "kl": 3.4062159061431885, "learning_rate": 4.075199151662101e-05, "loss": 0.1362, "reward": 2.9090102910995483, "reward_std": 3.167207956314087, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2193918228149414, "rewards/no_repetition_reward_func": -0.31038153171539307, "rewards/verse_reward_func": 0.0, "step": 2217 }, { "completion_length": 241.46875, "epoch": 17.744, "grad_norm": 2.078125, "kl": 4.690842866897583, "learning_rate": 4.0741146649231504e-05, "loss": 0.1876, "reward": 2.3846485018730164, "reward_std": 3.032559633255005, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6503910422325134, "rewards/no_repetition_reward_func": -0.25793005526065826, "rewards/verse_reward_func": -0.0078125, "step": 2218 }, { "completion_length": 244.796875, "epoch": 17.752, "grad_norm": 3.59375, "kl": 5.604577541351318, "learning_rate": 4.073029687173399e-05, "loss": 0.2242, "reward": 1.7402013540267944, "reward_std": 2.5997601747512817, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.024929106235504, "rewards/no_repetition_reward_func": -0.2691027969121933, "rewards/verse_reward_func": -0.015625, "step": 2219 }, { "completion_length": 242.53125, "epoch": 17.76, "grad_norm": 1.84375, "kl": 4.411088943481445, "learning_rate": 4.071944218751282e-05, "loss": 0.1764, "reward": 2.228161334991455, "reward_std": 2.9456875324249268, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4976091384887695, "rewards/no_repetition_reward_func": -0.26944783329963684, "rewards/verse_reward_func": 0.0, "step": 2220 }, { "completion_length": 249.75, "epoch": 17.768, "grad_norm": 2.140625, "kl": 4.651659250259399, "learning_rate": 4.070858259995387e-05, "loss": 0.1861, "reward": 2.253384590148926, "reward_std": 3.113759160041809, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.518585979938507, "rewards/no_repetition_reward_func": -0.26520147174596786, "rewards/verse_reward_func": 0.0, "step": 2221 }, { "completion_length": 246.203125, "epoch": 17.776, "grad_norm": 4.71875, "kl": 5.5438151359558105, "learning_rate": 4.069771811244457e-05, "loss": 0.2218, "reward": 1.9323875308036804, "reward_std": 2.809817314147949, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2298319935798645, "rewards/no_repetition_reward_func": -0.2661946266889572, "rewards/verse_reward_func": -0.03125, "step": 2222 }, { "completion_length": 243.5625, "epoch": 17.784, "grad_norm": 2.953125, "kl": 4.434919834136963, "learning_rate": 4.068684872837384e-05, "loss": 0.1774, "reward": 2.5880560874938965, "reward_std": 2.7068467140197754, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8549153804779053, "rewards/no_repetition_reward_func": -0.2668594568967819, "rewards/verse_reward_func": 0.0, "step": 2223 }, { "completion_length": 244.53125, "epoch": 17.792, "grad_norm": 2.328125, "kl": 4.009086847305298, "learning_rate": 4.067597445113216e-05, "loss": 0.1604, "reward": 2.630966901779175, "reward_std": 2.7562659978866577, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9166436195373535, "rewards/no_repetition_reward_func": -0.27005159854888916, "rewards/verse_reward_func": -0.015625, "step": 2224 }, { "completion_length": 245.609375, "epoch": 17.8, "grad_norm": 1.703125, "kl": 4.4740424156188965, "learning_rate": 4.066509528411152e-05, "loss": 0.179, "reward": 2.331758141517639, "reward_std": 2.9698641300201416, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.578770339488983, "rewards/no_repetition_reward_func": -0.24701237678527832, "rewards/verse_reward_func": 0.0, "step": 2225 }, { "completion_length": 255.921875, "epoch": 17.808, "grad_norm": 4.21875, "kl": 4.432789206504822, "learning_rate": 4.065421123070543e-05, "loss": 0.1773, "reward": 2.8381354808807373, "reward_std": 3.05876088142395, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1111562252044678, "rewards/no_repetition_reward_func": -0.27302053570747375, "rewards/verse_reward_func": 0.0, "step": 2226 }, { "completion_length": 249.4375, "epoch": 17.816, "grad_norm": 1.421875, "kl": 4.739309310913086, "learning_rate": 4.064332229430895e-05, "loss": 0.1896, "reward": 2.146334409713745, "reward_std": 2.91771137714386, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4119741916656494, "rewards/no_repetition_reward_func": -0.2578272819519043, "rewards/verse_reward_func": -0.0078125, "step": 2227 }, { "completion_length": 249.296875, "epoch": 17.824, "grad_norm": 1.7734375, "kl": 4.316218852996826, "learning_rate": 4.063242847831864e-05, "loss": 0.1726, "reward": 2.8338454961776733, "reward_std": 3.110709547996521, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.086434245109558, "rewards/no_repetition_reward_func": -0.25258877128362656, "rewards/verse_reward_func": 0.0, "step": 2228 }, { "completion_length": 252.8125, "epoch": 17.832, "grad_norm": 2.875, "kl": 4.217219829559326, "learning_rate": 4.062152978613258e-05, "loss": 0.1687, "reward": 2.510995864868164, "reward_std": 2.706445574760437, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.774619460105896, "rewards/no_repetition_reward_func": -0.25581105053424835, "rewards/verse_reward_func": -0.0078125, "step": 2229 }, { "completion_length": 248.90625, "epoch": 17.84, "grad_norm": 2.15625, "kl": 4.615151882171631, "learning_rate": 4.0610626221150394e-05, "loss": 0.1846, "reward": 1.9208794236183167, "reward_std": 2.668100118637085, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1707791090011597, "rewards/no_repetition_reward_func": -0.24208717793226242, "rewards/verse_reward_func": -0.0078125, "step": 2230 }, { "completion_length": 256.0, "epoch": 17.848, "grad_norm": 2.71875, "kl": 5.283535003662109, "learning_rate": 4.0599717786773204e-05, "loss": 0.2113, "reward": 1.8595112562179565, "reward_std": 2.876099467277527, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1396409273147583, "rewards/no_repetition_reward_func": -0.28012967109680176, "rewards/verse_reward_func": 0.0, "step": 2231 }, { "completion_length": 248.015625, "epoch": 17.856, "grad_norm": 2.90625, "kl": 3.885978937149048, "learning_rate": 4.058880448640367e-05, "loss": 0.1554, "reward": 1.8645960092544556, "reward_std": 2.6956710815429688, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0842167735099792, "rewards/no_repetition_reward_func": -0.21962089836597443, "rewards/verse_reward_func": 0.0, "step": 2232 }, { "completion_length": 253.09375, "epoch": 17.864, "grad_norm": 2.40625, "kl": 3.0338969230651855, "learning_rate": 4.057788632344593e-05, "loss": 0.1214, "reward": 3.169241786003113, "reward_std": 2.852250337600708, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.459825336933136, "rewards/no_repetition_reward_func": -0.2905834764242172, "rewards/verse_reward_func": 0.0, "step": 2233 }, { "completion_length": 249.34375, "epoch": 17.872, "grad_norm": 2.5, "kl": 3.4335319995880127, "learning_rate": 4.0566963301305705e-05, "loss": 0.1373, "reward": 2.5057109594345093, "reward_std": 2.7950998544692993, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7764264345169067, "rewards/no_repetition_reward_func": -0.25509047508239746, "rewards/verse_reward_func": -0.015625, "step": 2234 }, { "completion_length": 248.640625, "epoch": 17.88, "grad_norm": 2.09375, "kl": 3.2244080305099487, "learning_rate": 4.055603542339016e-05, "loss": 0.129, "reward": 2.512787342071533, "reward_std": 2.879202127456665, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7789158821105957, "rewards/no_repetition_reward_func": -0.25831620395183563, "rewards/verse_reward_func": -0.0078125, "step": 2235 }, { "completion_length": 252.953125, "epoch": 17.888, "grad_norm": 2.8125, "kl": 4.326337575912476, "learning_rate": 4.054510269310803e-05, "loss": 0.1731, "reward": 2.2249504923820496, "reward_std": 3.02669620513916, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5225524306297302, "rewards/no_repetition_reward_func": -0.29760199785232544, "rewards/verse_reward_func": 0.0, "step": 2236 }, { "completion_length": 250.71875, "epoch": 17.896, "grad_norm": 2.546875, "kl": 3.85943067073822, "learning_rate": 4.053416511386954e-05, "loss": 0.1544, "reward": 2.784965395927429, "reward_std": 3.3844425678253174, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0492048263549805, "rewards/no_repetition_reward_func": -0.2642393708229065, "rewards/verse_reward_func": 0.0, "step": 2237 }, { "completion_length": 252.96875, "epoch": 17.904, "grad_norm": 3.609375, "kl": 3.5775688886642456, "learning_rate": 4.0523222689086414e-05, "loss": 0.1431, "reward": 2.9692031145095825, "reward_std": 2.9251257181167603, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.237362265586853, "rewards/no_repetition_reward_func": -0.2681591808795929, "rewards/verse_reward_func": 0.0, "step": 2238 }, { "completion_length": 251.1875, "epoch": 17.912, "grad_norm": 2.421875, "kl": 3.97685968875885, "learning_rate": 4.051227542217192e-05, "loss": 0.1591, "reward": 2.393462896347046, "reward_std": 3.0463346242904663, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6489503383636475, "rewards/no_repetition_reward_func": -0.25548726320266724, "rewards/verse_reward_func": 0.0, "step": 2239 }, { "completion_length": 249.609375, "epoch": 17.92, "grad_norm": 1.7578125, "kl": 4.147407293319702, "learning_rate": 4.050132331654082e-05, "loss": 0.1659, "reward": 1.9881330728530884, "reward_std": 2.994964599609375, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.222405791282654, "rewards/no_repetition_reward_func": -0.23427274823188782, "rewards/verse_reward_func": 0.0, "step": 2240 }, { "completion_length": 251.296875, "epoch": 17.928, "grad_norm": 2.734375, "kl": 3.4605177640914917, "learning_rate": 4.0490366375609376e-05, "loss": 0.1384, "reward": 3.229568362236023, "reward_std": 2.7610933780670166, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.510312557220459, "rewards/no_repetition_reward_func": -0.28074416518211365, "rewards/verse_reward_func": 0.0, "step": 2241 }, { "completion_length": 251.78125, "epoch": 17.936, "grad_norm": 3.203125, "kl": 5.823463678359985, "learning_rate": 4.047940460279537e-05, "loss": 0.2329, "reward": 2.2871516942977905, "reward_std": 2.988090991973877, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5591307878494263, "rewards/no_repetition_reward_func": -0.26416662335395813, "rewards/verse_reward_func": -0.0078125, "step": 2242 }, { "completion_length": 256.0, "epoch": 17.944, "grad_norm": 1.6015625, "kl": 4.637423992156982, "learning_rate": 4.0468438001518084e-05, "loss": 0.1855, "reward": 2.8119969367980957, "reward_std": 2.8872427940368652, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.113661527633667, "rewards/no_repetition_reward_func": -0.30166465044021606, "rewards/verse_reward_func": 0.0, "step": 2243 }, { "completion_length": 248.890625, "epoch": 17.951999999999998, "grad_norm": 3.421875, "kl": 5.717351913452148, "learning_rate": 4.045746657519831e-05, "loss": 0.2287, "reward": 2.352181315422058, "reward_std": 2.916493773460388, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.635905385017395, "rewards/no_repetition_reward_func": -0.27591174840927124, "rewards/verse_reward_func": -0.0078125, "step": 2244 }, { "completion_length": 253.359375, "epoch": 17.96, "grad_norm": 4.28125, "kl": 5.564746141433716, "learning_rate": 4.044649032725836e-05, "loss": 0.2226, "reward": 2.345354676246643, "reward_std": 2.7298412322998047, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.619290351867676, "rewards/no_repetition_reward_func": -0.273935467004776, "rewards/verse_reward_func": 0.0, "step": 2245 }, { "completion_length": 253.84375, "epoch": 17.968, "grad_norm": 2.953125, "kl": 4.750872373580933, "learning_rate": 4.043550926112203e-05, "loss": 0.19, "reward": 2.9358103275299072, "reward_std": 2.9150733947753906, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.221926689147949, "rewards/no_repetition_reward_func": -0.28611643612384796, "rewards/verse_reward_func": 0.0, "step": 2246 }, { "completion_length": 252.921875, "epoch": 17.976, "grad_norm": 6.3125, "kl": 6.395753860473633, "learning_rate": 4.042452338021461e-05, "loss": 0.2558, "reward": 2.1133334636688232, "reward_std": 3.074994921684265, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4056400060653687, "rewards/no_repetition_reward_func": -0.2844940721988678, "rewards/verse_reward_func": -0.0078125, "step": 2247 }, { "completion_length": 253.5625, "epoch": 17.984, "grad_norm": 5.8125, "kl": 5.276507377624512, "learning_rate": 4.041353268796293e-05, "loss": 0.2111, "reward": 2.574878692626953, "reward_std": 2.8367773294448853, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8510533571243286, "rewards/no_repetition_reward_func": -0.27617472410202026, "rewards/verse_reward_func": 0.0, "step": 2248 }, { "completion_length": 255.859375, "epoch": 17.992, "grad_norm": 6.78125, "kl": 6.183892488479614, "learning_rate": 4.0402537187795274e-05, "loss": 0.2474, "reward": 2.511523485183716, "reward_std": 2.3774415254592896, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8174126148223877, "rewards/no_repetition_reward_func": -0.2980765849351883, "rewards/verse_reward_func": -0.0078125, "step": 2249 }, { "completion_length": 229.1875, "epoch": 18.0, "grad_norm": 3.546875, "kl": 4.779600143432617, "learning_rate": 4.039153688314145e-05, "loss": 0.1912, "reward": 2.5388309955596924, "reward_std": 2.869447112083435, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.825224280357361, "rewards/no_repetition_reward_func": -0.2785806804895401, "rewards/verse_reward_func": -0.0078125, "step": 2250 }, { "completion_length": 249.25, "epoch": 18.008, "grad_norm": 3.59375, "kl": 4.461413383483887, "learning_rate": 4.0380531777432794e-05, "loss": 0.1785, "reward": 3.1562836170196533, "reward_std": 3.1725059747695923, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.454642176628113, "rewards/no_repetition_reward_func": -0.2983586937189102, "rewards/verse_reward_func": 0.0, "step": 2251 }, { "completion_length": 247.5625, "epoch": 18.016, "grad_norm": 4.3125, "kl": 5.733757019042969, "learning_rate": 4.036952187410208e-05, "loss": 0.2294, "reward": 1.7137696743011475, "reward_std": 2.63175368309021, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9816293716430664, "rewards/no_repetition_reward_func": -0.26004738360643387, "rewards/verse_reward_func": -0.0078125, "step": 2252 }, { "completion_length": 255.59375, "epoch": 18.024, "grad_norm": 1.7578125, "kl": 5.070538759231567, "learning_rate": 4.035850717658362e-05, "loss": 0.2028, "reward": 2.453192949295044, "reward_std": 3.3152501583099365, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.732113242149353, "rewards/no_repetition_reward_func": -0.27892033755779266, "rewards/verse_reward_func": 0.0, "step": 2253 }, { "completion_length": 249.6875, "epoch": 18.032, "grad_norm": 2.234375, "kl": 5.535755634307861, "learning_rate": 4.0347487688313194e-05, "loss": 0.2214, "reward": 2.1603140234947205, "reward_std": 2.754939079284668, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4181305170059204, "rewards/no_repetition_reward_func": -0.25781650096178055, "rewards/verse_reward_func": 0.0, "step": 2254 }, { "completion_length": 256.0, "epoch": 18.04, "grad_norm": 4.40625, "kl": 4.557355642318726, "learning_rate": 4.033646341272811e-05, "loss": 0.1823, "reward": 3.0682687759399414, "reward_std": 3.4065663814544678, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4311851263046265, "rewards/no_repetition_reward_func": -0.35510386526584625, "rewards/verse_reward_func": -0.0078125, "step": 2255 }, { "completion_length": 255.265625, "epoch": 18.048, "grad_norm": 2.640625, "kl": 4.205412149429321, "learning_rate": 4.032543435326714e-05, "loss": 0.1682, "reward": 3.169613242149353, "reward_std": 3.313624382019043, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.494184374809265, "rewards/no_repetition_reward_func": -0.3245711922645569, "rewards/verse_reward_func": 0.0, "step": 2256 }, { "completion_length": 251.828125, "epoch": 18.056, "grad_norm": 3.8125, "kl": 5.342960357666016, "learning_rate": 4.031440051337056e-05, "loss": 0.2137, "reward": 1.6739038228988647, "reward_std": 2.543051242828369, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9486516118049622, "rewards/no_repetition_reward_func": -0.266935370862484, "rewards/verse_reward_func": -0.0078125, "step": 2257 }, { "completion_length": 247.40625, "epoch": 18.064, "grad_norm": 1.9921875, "kl": 4.510850191116333, "learning_rate": 4.030336189648014e-05, "loss": 0.1804, "reward": 2.2022452354431152, "reward_std": 2.837560534477234, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4709514379501343, "rewards/no_repetition_reward_func": -0.2608936280012131, "rewards/verse_reward_func": -0.0078125, "step": 2258 }, { "completion_length": 249.578125, "epoch": 18.072, "grad_norm": 2.5625, "kl": 4.671927452087402, "learning_rate": 4.029231850603914e-05, "loss": 0.1869, "reward": 2.058213472366333, "reward_std": 2.5562045574188232, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.31795197725296, "rewards/no_repetition_reward_func": -0.2519260346889496, "rewards/verse_reward_func": -0.0078125, "step": 2259 }, { "completion_length": 251.5625, "epoch": 18.08, "grad_norm": 2.453125, "kl": 3.6855413913726807, "learning_rate": 4.028127034549229e-05, "loss": 0.1474, "reward": 2.766045331954956, "reward_std": 2.9288944005966187, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0406681299209595, "rewards/no_repetition_reward_func": -0.27462299168109894, "rewards/verse_reward_func": 0.0, "step": 2260 }, { "completion_length": 250.734375, "epoch": 18.088, "grad_norm": 3.515625, "kl": 3.6939573287963867, "learning_rate": 4.027021741828584e-05, "loss": 0.1478, "reward": 2.3569568395614624, "reward_std": 2.7374656200408936, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6113996505737305, "rewards/no_repetition_reward_func": -0.2544429078698158, "rewards/verse_reward_func": 0.0, "step": 2261 }, { "completion_length": 252.46875, "epoch": 18.096, "grad_norm": 2.75, "kl": 3.947713017463684, "learning_rate": 4.0259159727867504e-05, "loss": 0.1579, "reward": 2.663640022277832, "reward_std": 3.2419973611831665, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9328243732452393, "rewards/no_repetition_reward_func": -0.26918433606624603, "rewards/verse_reward_func": 0.0, "step": 2262 }, { "completion_length": 254.453125, "epoch": 18.104, "grad_norm": 3.78125, "kl": 3.68610680103302, "learning_rate": 4.024809727768648e-05, "loss": 0.1474, "reward": 2.9250051975250244, "reward_std": 3.2521272897720337, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2281700372695923, "rewards/no_repetition_reward_func": -0.3031647801399231, "rewards/verse_reward_func": 0.0, "step": 2263 }, { "completion_length": 250.546875, "epoch": 18.112, "grad_norm": 1.6171875, "kl": 3.96231746673584, "learning_rate": 4.023703007119347e-05, "loss": 0.1585, "reward": 2.4633071422576904, "reward_std": 3.126283884048462, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7193567752838135, "rewards/no_repetition_reward_func": -0.2560497522354126, "rewards/verse_reward_func": 0.0, "step": 2264 }, { "completion_length": 249.609375, "epoch": 18.12, "grad_norm": 2.359375, "kl": 4.234004974365234, "learning_rate": 4.022595811184064e-05, "loss": 0.1694, "reward": 2.5927380323410034, "reward_std": 2.757033944129944, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8835405111312866, "rewards/no_repetition_reward_func": -0.2908025160431862, "rewards/verse_reward_func": 0.0, "step": 2265 }, { "completion_length": 252.40625, "epoch": 18.128, "grad_norm": 1.78125, "kl": 4.89944863319397, "learning_rate": 4.021488140308165e-05, "loss": 0.196, "reward": 2.515543222427368, "reward_std": 3.163578748703003, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8165032863616943, "rewards/no_repetition_reward_func": -0.30096006393432617, "rewards/verse_reward_func": 0.0, "step": 2266 }, { "completion_length": 256.0, "epoch": 18.136, "grad_norm": 3.40625, "kl": 3.806256890296936, "learning_rate": 4.020379994837164e-05, "loss": 0.1523, "reward": 2.6738219261169434, "reward_std": 3.07261323928833, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.975980520248413, "rewards/no_repetition_reward_func": -0.3021586090326309, "rewards/verse_reward_func": 0.0, "step": 2267 }, { "completion_length": 252.09375, "epoch": 18.144, "grad_norm": 2.890625, "kl": 5.020161390304565, "learning_rate": 4.019271375116722e-05, "loss": 0.2008, "reward": 2.578460693359375, "reward_std": 3.262411952018738, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9025933742523193, "rewards/no_repetition_reward_func": -0.31632019579410553, "rewards/verse_reward_func": -0.0078125, "step": 2268 }, { "completion_length": 251.25, "epoch": 18.152, "grad_norm": 1.765625, "kl": 5.065810918807983, "learning_rate": 4.0181622814926504e-05, "loss": 0.2026, "reward": 2.866317391395569, "reward_std": 3.1893911361694336, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1668286323547363, "rewards/no_repetition_reward_func": -0.3005111962556839, "rewards/verse_reward_func": 0.0, "step": 2269 }, { "completion_length": 254.265625, "epoch": 18.16, "grad_norm": 3.1875, "kl": 4.0643439292907715, "learning_rate": 4.017052714310906e-05, "loss": 0.1626, "reward": 3.4515947103500366, "reward_std": 3.2500704526901245, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.8078137636184692, "rewards/no_repetition_reward_func": -0.35621920228004456, "rewards/verse_reward_func": 0.0, "step": 2270 }, { "completion_length": 240.78125, "epoch": 18.168, "grad_norm": 2.71875, "kl": 5.7969560623168945, "learning_rate": 4.015942673917593e-05, "loss": 0.2319, "reward": 2.0967178344726562, "reward_std": 2.4765231013298035, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.342277765274048, "rewards/no_repetition_reward_func": -0.23774760216474533, "rewards/verse_reward_func": -0.0078125, "step": 2271 }, { "completion_length": 250.625, "epoch": 18.176, "grad_norm": 3.15625, "kl": 6.118791818618774, "learning_rate": 4.0148321606589656e-05, "loss": 0.2448, "reward": 2.234833240509033, "reward_std": 3.057141661643982, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4923975467681885, "rewards/no_repetition_reward_func": -0.2497517317533493, "rewards/verse_reward_func": -0.0078125, "step": 2272 }, { "completion_length": 242.390625, "epoch": 18.184, "grad_norm": 2.78125, "kl": 5.515336990356445, "learning_rate": 4.013721174881425e-05, "loss": 0.2206, "reward": 2.79047691822052, "reward_std": 3.247870683670044, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.093569755554199, "rewards/no_repetition_reward_func": -0.30309295654296875, "rewards/verse_reward_func": 0.0, "step": 2273 }, { "completion_length": 249.125, "epoch": 18.192, "grad_norm": 5.40625, "kl": 4.578294038772583, "learning_rate": 4.012609716931517e-05, "loss": 0.1831, "reward": 2.4032673835754395, "reward_std": 2.8071855306625366, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.67240309715271, "rewards/no_repetition_reward_func": -0.2691357135772705, "rewards/verse_reward_func": 0.0, "step": 2274 }, { "completion_length": 241.28125, "epoch": 18.2, "grad_norm": 5.90625, "kl": 6.012111186981201, "learning_rate": 4.011497787155938e-05, "loss": 0.2405, "reward": 2.17077499628067, "reward_std": 2.498119592666626, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4155389070510864, "rewards/no_repetition_reward_func": -0.24476391077041626, "rewards/verse_reward_func": 0.0, "step": 2275 }, { "completion_length": 253.578125, "epoch": 18.208, "grad_norm": 2.75, "kl": 4.355277180671692, "learning_rate": 4.01038538590153e-05, "loss": 0.1742, "reward": 2.7517932653427124, "reward_std": 3.040894389152527, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0471607446670532, "rewards/no_repetition_reward_func": -0.29536762833595276, "rewards/verse_reward_func": 0.0, "step": 2276 }, { "completion_length": 242.53125, "epoch": 18.216, "grad_norm": 2.578125, "kl": 5.216010808944702, "learning_rate": 4.009272513515281e-05, "loss": 0.2086, "reward": 2.1386988162994385, "reward_std": 2.976863980293274, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4112062454223633, "rewards/no_repetition_reward_func": -0.2568823769688606, "rewards/verse_reward_func": -0.015625, "step": 2277 }, { "completion_length": 236.078125, "epoch": 18.224, "grad_norm": 3.875, "kl": 4.07985520362854, "learning_rate": 4.00815917034433e-05, "loss": 0.1632, "reward": 3.1586368083953857, "reward_std": 3.1111855506896973, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.459468364715576, "rewards/no_repetition_reward_func": -0.29301898181438446, "rewards/verse_reward_func": -0.0078125, "step": 2278 }, { "completion_length": 251.65625, "epoch": 18.232, "grad_norm": 2.53125, "kl": 3.705253839492798, "learning_rate": 4.007045356735959e-05, "loss": 0.1482, "reward": 2.982000231742859, "reward_std": 3.0663141012191772, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2974005937576294, "rewards/no_repetition_reward_func": -0.29977528750896454, "rewards/verse_reward_func": -0.015625, "step": 2279 }, { "completion_length": 251.078125, "epoch": 18.24, "grad_norm": 4.6875, "kl": 3.5311156511306763, "learning_rate": 4.005931073037596e-05, "loss": 0.1412, "reward": 2.6578099727630615, "reward_std": 3.285459041595459, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.953603148460388, "rewards/no_repetition_reward_func": -0.28798067569732666, "rewards/verse_reward_func": -0.0078125, "step": 2280 }, { "completion_length": 256.0, "epoch": 18.248, "grad_norm": 5.03125, "kl": 3.070462942123413, "learning_rate": 4.0048163195968214e-05, "loss": 0.1228, "reward": 2.873164415359497, "reward_std": 3.3254700899124146, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.181216835975647, "rewards/no_repetition_reward_func": -0.3080523610115051, "rewards/verse_reward_func": 0.0, "step": 2281 }, { "completion_length": 249.6875, "epoch": 18.256, "grad_norm": 2.78125, "kl": 4.022772192955017, "learning_rate": 4.003701096761355e-05, "loss": 0.1609, "reward": 2.1229106187820435, "reward_std": 2.8858048915863037, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3855363726615906, "rewards/no_repetition_reward_func": -0.2626258209347725, "rewards/verse_reward_func": 0.0, "step": 2282 }, { "completion_length": 243.40625, "epoch": 18.264, "grad_norm": 2.046875, "kl": 3.7355384826660156, "learning_rate": 4.0025854048790677e-05, "loss": 0.1494, "reward": 2.6733508110046387, "reward_std": 2.803427815437317, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.914710760116577, "rewards/no_repetition_reward_func": -0.24135982245206833, "rewards/verse_reward_func": 0.0, "step": 2283 }, { "completion_length": 256.0, "epoch": 18.272, "grad_norm": 2.75, "kl": 3.966935634613037, "learning_rate": 4.001469244297975e-05, "loss": 0.1587, "reward": 2.532896876335144, "reward_std": 2.8315480947494507, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.803192138671875, "rewards/no_repetition_reward_func": -0.27029525488615036, "rewards/verse_reward_func": 0.0, "step": 2284 }, { "completion_length": 249.71875, "epoch": 18.28, "grad_norm": 2.421875, "kl": 4.918416261672974, "learning_rate": 4.000352615366239e-05, "loss": 0.1967, "reward": 2.661984920501709, "reward_std": 3.0009028911590576, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.968892216682434, "rewards/no_repetition_reward_func": -0.3069072961807251, "rewards/verse_reward_func": 0.0, "step": 2285 }, { "completion_length": 253.0, "epoch": 18.288, "grad_norm": 3.53125, "kl": 4.869149684906006, "learning_rate": 3.999235518432168e-05, "loss": 0.1948, "reward": 2.3470130562782288, "reward_std": 2.901216506958008, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6337238550186157, "rewards/no_repetition_reward_func": -0.28671079874038696, "rewards/verse_reward_func": 0.0, "step": 2286 }, { "completion_length": 249.734375, "epoch": 18.296, "grad_norm": 2.984375, "kl": 4.998676061630249, "learning_rate": 3.9981179538442146e-05, "loss": 0.1999, "reward": 2.2845075130462646, "reward_std": 2.954659104347229, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5274243354797363, "rewards/no_repetition_reward_func": -0.2429167926311493, "rewards/verse_reward_func": 0.0, "step": 2287 }, { "completion_length": 251.9375, "epoch": 18.304, "grad_norm": 2.53125, "kl": 4.140599846839905, "learning_rate": 3.996999921950981e-05, "loss": 0.1656, "reward": 3.134469985961914, "reward_std": 3.0244734287261963, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4132957458496094, "rewards/no_repetition_reward_func": -0.2788259983062744, "rewards/verse_reward_func": 0.0, "step": 2288 }, { "completion_length": 249.328125, "epoch": 18.312, "grad_norm": 1.6328125, "kl": 4.305943965911865, "learning_rate": 3.9958814231012115e-05, "loss": 0.1722, "reward": 2.9487605690956116, "reward_std": 2.647127389907837, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.219752550125122, "rewards/no_repetition_reward_func": -0.26317956298589706, "rewards/verse_reward_func": -0.0078125, "step": 2289 }, { "completion_length": 251.765625, "epoch": 18.32, "grad_norm": 1.375, "kl": 5.395660400390625, "learning_rate": 3.9947624576437975e-05, "loss": 0.2158, "reward": 2.3964370489120483, "reward_std": 2.733886957168579, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6518014669418335, "rewards/no_repetition_reward_func": -0.2553645819425583, "rewards/verse_reward_func": 0.0, "step": 2290 }, { "completion_length": 253.0625, "epoch": 18.328, "grad_norm": 1.3046875, "kl": 4.6750710010528564, "learning_rate": 3.993643025927776e-05, "loss": 0.187, "reward": 2.8043274879455566, "reward_std": 2.9663928747177124, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.074309468269348, "rewards/no_repetition_reward_func": -0.26998215168714523, "rewards/verse_reward_func": 0.0, "step": 2291 }, { "completion_length": 248.78125, "epoch": 18.336, "grad_norm": 3.53125, "kl": 4.142567157745361, "learning_rate": 3.99252312830233e-05, "loss": 0.1657, "reward": 2.9158241748809814, "reward_std": 2.9964699745178223, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2167627811431885, "rewards/no_repetition_reward_func": -0.30093860626220703, "rewards/verse_reward_func": 0.0, "step": 2292 }, { "completion_length": 249.734375, "epoch": 18.344, "grad_norm": 3.515625, "kl": 5.575353622436523, "learning_rate": 3.9914027651167866e-05, "loss": 0.223, "reward": 2.1109776496887207, "reward_std": 2.514023184776306, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3672003149986267, "rewards/no_repetition_reward_func": -0.2562226355075836, "rewards/verse_reward_func": 0.0, "step": 2293 }, { "completion_length": 248.890625, "epoch": 18.352, "grad_norm": 3.578125, "kl": 4.490458965301514, "learning_rate": 3.990281936720619e-05, "loss": 0.1796, "reward": 2.9002597332000732, "reward_std": 3.087238073348999, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.213162899017334, "rewards/no_repetition_reward_func": -0.30509060621261597, "rewards/verse_reward_func": -0.0078125, "step": 2294 }, { "completion_length": 256.0, "epoch": 18.36, "grad_norm": 7.625, "kl": 5.734419822692871, "learning_rate": 3.989160643463445e-05, "loss": 0.2294, "reward": 1.6402832865715027, "reward_std": 2.2658531665802, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9120718836784363, "rewards/no_repetition_reward_func": -0.2717885226011276, "rewards/verse_reward_func": 0.0, "step": 2295 }, { "completion_length": 254.328125, "epoch": 18.368, "grad_norm": 4.40625, "kl": 4.129114031791687, "learning_rate": 3.988038885695028e-05, "loss": 0.1652, "reward": 3.0133447647094727, "reward_std": 3.3689017295837402, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3500614166259766, "rewards/no_repetition_reward_func": -0.33671659231185913, "rewards/verse_reward_func": 0.0, "step": 2296 }, { "completion_length": 252.453125, "epoch": 18.376, "grad_norm": 4.28125, "kl": 2.8627229928970337, "learning_rate": 3.986916663765275e-05, "loss": 0.1145, "reward": 3.3266079425811768, "reward_std": 3.33126437664032, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.6402522325515747, "rewards/no_repetition_reward_func": -0.3058321177959442, "rewards/verse_reward_func": -0.0078125, "step": 2297 }, { "completion_length": 253.171875, "epoch": 18.384, "grad_norm": 2.796875, "kl": 4.233923435211182, "learning_rate": 3.985793978024239e-05, "loss": 0.1694, "reward": 2.681282639503479, "reward_std": 3.077781915664673, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9456679821014404, "rewards/no_repetition_reward_func": -0.26438526809215546, "rewards/verse_reward_func": 0.0, "step": 2298 }, { "completion_length": 255.359375, "epoch": 18.392, "grad_norm": 1.5234375, "kl": 4.032816767692566, "learning_rate": 3.984670828822118e-05, "loss": 0.1613, "reward": 2.645945906639099, "reward_std": 3.0490529537200928, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9407403469085693, "rewards/no_repetition_reward_func": -0.29479438066482544, "rewards/verse_reward_func": 0.0, "step": 2299 }, { "completion_length": 254.46875, "epoch": 18.4, "grad_norm": 1.8515625, "kl": 4.62942910194397, "learning_rate": 3.983547216509254e-05, "loss": 0.1852, "reward": 2.869823455810547, "reward_std": 3.016662359237671, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1306707859039307, "rewards/no_repetition_reward_func": -0.2608473151922226, "rewards/verse_reward_func": 0.0, "step": 2300 }, { "completion_length": 254.328125, "epoch": 18.408, "grad_norm": 2.15625, "kl": 4.013220310211182, "learning_rate": 3.9824231414361324e-05, "loss": 0.1605, "reward": 2.6253836154937744, "reward_std": 2.8604609966278076, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.936345338821411, "rewards/no_repetition_reward_func": -0.31096151471138, "rewards/verse_reward_func": 0.0, "step": 2301 }, { "completion_length": 251.34375, "epoch": 18.416, "grad_norm": 1.6640625, "kl": 4.20117712020874, "learning_rate": 3.981298603953385e-05, "loss": 0.168, "reward": 2.4310109615325928, "reward_std": 2.9530560970306396, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7304153442382812, "rewards/no_repetition_reward_func": -0.2994043231010437, "rewards/verse_reward_func": 0.0, "step": 2302 }, { "completion_length": 249.3125, "epoch": 18.424, "grad_norm": 2.703125, "kl": 5.411336898803711, "learning_rate": 3.980173604411786e-05, "loss": 0.2165, "reward": 2.6775258779525757, "reward_std": 3.1270127296447754, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.935454487800598, "rewards/no_repetition_reward_func": -0.2501162737607956, "rewards/verse_reward_func": -0.0078125, "step": 2303 }, { "completion_length": 251.71875, "epoch": 18.432, "grad_norm": 3.1875, "kl": 5.459472179412842, "learning_rate": 3.979048143162255e-05, "loss": 0.2184, "reward": 1.8615427017211914, "reward_std": 2.7754253149032593, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1157236099243164, "rewards/no_repetition_reward_func": -0.2463683858513832, "rewards/verse_reward_func": -0.0078125, "step": 2304 }, { "completion_length": 249.546875, "epoch": 18.44, "grad_norm": 2.390625, "kl": 4.905409574508667, "learning_rate": 3.977922220555855e-05, "loss": 0.1962, "reward": 3.2837787866592407, "reward_std": 3.2851576805114746, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.59954035282135, "rewards/no_repetition_reward_func": -0.31576158106327057, "rewards/verse_reward_func": 0.0, "step": 2305 }, { "completion_length": 254.125, "epoch": 18.448, "grad_norm": 2.171875, "kl": 4.372055292129517, "learning_rate": 3.976795836943793e-05, "loss": 0.1749, "reward": 3.304523229598999, "reward_std": 3.180543065071106, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.616202473640442, "rewards/no_repetition_reward_func": -0.3116793632507324, "rewards/verse_reward_func": 0.0, "step": 2306 }, { "completion_length": 247.40625, "epoch": 18.456, "grad_norm": 1.8671875, "kl": 4.903334856033325, "learning_rate": 3.9756689926774196e-05, "loss": 0.1961, "reward": 2.3022229075431824, "reward_std": 3.0490269660949707, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5905362367630005, "rewards/no_repetition_reward_func": -0.2883130982518196, "rewards/verse_reward_func": 0.0, "step": 2307 }, { "completion_length": 245.25, "epoch": 18.464, "grad_norm": 4.40625, "kl": 5.566983699798584, "learning_rate": 3.97454168810823e-05, "loss": 0.2227, "reward": 2.6720099449157715, "reward_std": 2.8461105823516846, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9767255783081055, "rewards/no_repetition_reward_func": -0.2969030737876892, "rewards/verse_reward_func": -0.0078125, "step": 2308 }, { "completion_length": 252.796875, "epoch": 18.472, "grad_norm": 4.3125, "kl": 3.174337863922119, "learning_rate": 3.973413923587862e-05, "loss": 0.127, "reward": 3.4654061794281006, "reward_std": 3.189800977706909, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.785583019256592, "rewards/no_repetition_reward_func": -0.3201768100261688, "rewards/verse_reward_func": 0.0, "step": 2309 }, { "completion_length": 252.84375, "epoch": 18.48, "grad_norm": 3.71875, "kl": 4.048848628997803, "learning_rate": 3.9722856994680966e-05, "loss": 0.162, "reward": 2.9206188917160034, "reward_std": 3.2380576133728027, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2040735483169556, "rewards/no_repetition_reward_func": -0.28345464169979095, "rewards/verse_reward_func": 0.0, "step": 2310 }, { "completion_length": 244.15625, "epoch": 18.488, "grad_norm": 3.765625, "kl": 5.442103862762451, "learning_rate": 3.9711570161008596e-05, "loss": 0.2177, "reward": 1.8107166290283203, "reward_std": 2.5516605377197266, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.061761260032654, "rewards/no_repetition_reward_func": -0.2354196310043335, "rewards/verse_reward_func": -0.015625, "step": 2311 }, { "completion_length": 256.0, "epoch": 18.496, "grad_norm": 3.484375, "kl": 7.014396905899048, "learning_rate": 3.970027873838219e-05, "loss": 0.2806, "reward": 1.9919592142105103, "reward_std": 3.075474262237549, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2471068501472473, "rewards/no_repetition_reward_func": -0.2473350465297699, "rewards/verse_reward_func": -0.0078125, "step": 2312 }, { "completion_length": 251.65625, "epoch": 18.504, "grad_norm": 4.0, "kl": 4.144210338592529, "learning_rate": 3.9688982730323865e-05, "loss": 0.1658, "reward": 3.037562608718872, "reward_std": 3.0840336084365845, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3549360036849976, "rewards/no_repetition_reward_func": -0.30956099927425385, "rewards/verse_reward_func": -0.0078125, "step": 2313 }, { "completion_length": 247.765625, "epoch": 18.512, "grad_norm": 3.4375, "kl": 4.407976388931274, "learning_rate": 3.967768214035715e-05, "loss": 0.1763, "reward": 2.526479959487915, "reward_std": 2.869503974914551, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8100290298461914, "rewards/no_repetition_reward_func": -0.2835491746664047, "rewards/verse_reward_func": 0.0, "step": 2314 }, { "completion_length": 249.859375, "epoch": 18.52, "grad_norm": 2.484375, "kl": 4.1485679149627686, "learning_rate": 3.966637697200703e-05, "loss": 0.1659, "reward": 2.436046838760376, "reward_std": 2.7659417390823364, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7381409406661987, "rewards/no_repetition_reward_func": -0.3020942807197571, "rewards/verse_reward_func": 0.0, "step": 2315 }, { "completion_length": 246.171875, "epoch": 18.528, "grad_norm": 2.234375, "kl": 4.673894166946411, "learning_rate": 3.965506722879991e-05, "loss": 0.187, "reward": 2.60582435131073, "reward_std": 2.8333606719970703, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8825764656066895, "rewards/no_repetition_reward_func": -0.27675221860408783, "rewards/verse_reward_func": 0.0, "step": 2316 }, { "completion_length": 252.578125, "epoch": 18.536, "grad_norm": 2.78125, "kl": 4.210357904434204, "learning_rate": 3.964375291426361e-05, "loss": 0.1684, "reward": 2.7629624605178833, "reward_std": 2.8743656873703003, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0517690181732178, "rewards/no_repetition_reward_func": -0.28880637884140015, "rewards/verse_reward_func": 0.0, "step": 2317 }, { "completion_length": 252.078125, "epoch": 18.544, "grad_norm": 2.984375, "kl": 4.163542747497559, "learning_rate": 3.963243403192739e-05, "loss": 0.1665, "reward": 3.079049587249756, "reward_std": 2.7781715393066406, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3448184728622437, "rewards/no_repetition_reward_func": -0.2657689303159714, "rewards/verse_reward_func": 0.0, "step": 2318 }, { "completion_length": 256.0, "epoch": 18.552, "grad_norm": 3.984375, "kl": 3.891822099685669, "learning_rate": 3.962111058532192e-05, "loss": 0.1557, "reward": 2.931944489479065, "reward_std": 3.2926357984542847, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2416863441467285, "rewards/no_repetition_reward_func": -0.30974188446998596, "rewards/verse_reward_func": 0.0, "step": 2319 }, { "completion_length": 249.875, "epoch": 18.56, "grad_norm": 3.859375, "kl": 5.528583526611328, "learning_rate": 3.960978257797931e-05, "loss": 0.2211, "reward": 1.568440318107605, "reward_std": 2.562896728515625, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.856436848640442, "rewards/no_repetition_reward_func": -0.28799647092819214, "rewards/verse_reward_func": 0.0, "step": 2320 }, { "completion_length": 245.25, "epoch": 18.568, "grad_norm": 2.34375, "kl": 4.59578275680542, "learning_rate": 3.9598450013433075e-05, "loss": 0.1838, "reward": 2.3996493816375732, "reward_std": 3.040003776550293, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6801598072052, "rewards/no_repetition_reward_func": -0.27269798517227173, "rewards/verse_reward_func": -0.0078125, "step": 2321 }, { "completion_length": 247.46875, "epoch": 18.576, "grad_norm": 2.203125, "kl": 4.668794870376587, "learning_rate": 3.9587112895218184e-05, "loss": 0.1868, "reward": 2.164377450942993, "reward_std": 2.7474048137664795, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.444103479385376, "rewards/no_repetition_reward_func": -0.27191346883773804, "rewards/verse_reward_func": -0.0078125, "step": 2322 }, { "completion_length": 252.265625, "epoch": 18.584, "grad_norm": 2.046875, "kl": 3.9434726238250732, "learning_rate": 3.957577122687098e-05, "loss": 0.1577, "reward": 2.609360694885254, "reward_std": 3.2172528505325317, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9115973711013794, "rewards/no_repetition_reward_func": -0.2944243475794792, "rewards/verse_reward_func": -0.0078125, "step": 2323 }, { "completion_length": 247.859375, "epoch": 18.592, "grad_norm": 4.65625, "kl": 3.3142101764678955, "learning_rate": 3.9564425011929265e-05, "loss": 0.1326, "reward": 2.9468178749084473, "reward_std": 3.3688567876815796, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2775800228118896, "rewards/no_repetition_reward_func": -0.32294946908950806, "rewards/verse_reward_func": -0.0078125, "step": 2324 }, { "completion_length": 250.375, "epoch": 18.6, "grad_norm": 2.046875, "kl": 4.466644048690796, "learning_rate": 3.955307425393224e-05, "loss": 0.1787, "reward": 2.5726068019866943, "reward_std": 3.2662487030029297, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8552504777908325, "rewards/no_repetition_reward_func": -0.28264372050762177, "rewards/verse_reward_func": 0.0, "step": 2325 }, { "completion_length": 256.0, "epoch": 18.608, "grad_norm": 2.0, "kl": 4.779887914657593, "learning_rate": 3.954171895642052e-05, "loss": 0.1912, "reward": 2.468783974647522, "reward_std": 2.8512200117111206, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.74561607837677, "rewards/no_repetition_reward_func": -0.26901975274086, "rewards/verse_reward_func": -0.0078125, "step": 2326 }, { "completion_length": 242.375, "epoch": 18.616, "grad_norm": 2.15625, "kl": 4.503003358840942, "learning_rate": 3.953035912293616e-05, "loss": 0.1801, "reward": 2.0794219970703125, "reward_std": 2.9490926265716553, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3431142568588257, "rewards/no_repetition_reward_func": -0.2558796852827072, "rewards/verse_reward_func": -0.0078125, "step": 2327 }, { "completion_length": 247.671875, "epoch": 18.624, "grad_norm": 4.28125, "kl": 3.6045570373535156, "learning_rate": 3.951899475702259e-05, "loss": 0.1442, "reward": 2.781038761138916, "reward_std": 3.0796797275543213, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.073127031326294, "rewards/no_repetition_reward_func": -0.28427571058273315, "rewards/verse_reward_func": -0.0078125, "step": 2328 }, { "completion_length": 253.140625, "epoch": 18.632, "grad_norm": 1.734375, "kl": 4.323079347610474, "learning_rate": 3.950762586222468e-05, "loss": 0.1729, "reward": 2.730276584625244, "reward_std": 3.3880324363708496, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0163941383361816, "rewards/no_repetition_reward_func": -0.2861174941062927, "rewards/verse_reward_func": 0.0, "step": 2329 }, { "completion_length": 256.0, "epoch": 18.64, "grad_norm": 1.296875, "kl": 5.020974159240723, "learning_rate": 3.9496252442088733e-05, "loss": 0.2008, "reward": 2.5841922760009766, "reward_std": 3.0676138401031494, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8765244483947754, "rewards/no_repetition_reward_func": -0.28451962769031525, "rewards/verse_reward_func": -0.0078125, "step": 2330 }, { "completion_length": 253.09375, "epoch": 18.648, "grad_norm": 3.359375, "kl": 5.5804407596588135, "learning_rate": 3.948487450016242e-05, "loss": 0.2232, "reward": 2.1899980306625366, "reward_std": 3.2257015705108643, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4990872144699097, "rewards/no_repetition_reward_func": -0.2934640496969223, "rewards/verse_reward_func": -0.015625, "step": 2331 }, { "completion_length": 247.328125, "epoch": 18.656, "grad_norm": 5.59375, "kl": 5.5505077838897705, "learning_rate": 3.947349203999484e-05, "loss": 0.222, "reward": 2.190966844558716, "reward_std": 2.9151276350021362, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4637383222579956, "rewards/no_repetition_reward_func": -0.26495908200740814, "rewards/verse_reward_func": -0.0078125, "step": 2332 }, { "completion_length": 251.875, "epoch": 18.664, "grad_norm": 1.703125, "kl": 5.329030752182007, "learning_rate": 3.946210506513651e-05, "loss": 0.2132, "reward": 3.0643287897109985, "reward_std": 3.1659129858016968, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.362362504005432, "rewards/no_repetition_reward_func": -0.2902212291955948, "rewards/verse_reward_func": -0.0078125, "step": 2333 }, { "completion_length": 256.0, "epoch": 18.672, "grad_norm": 6.28125, "kl": 4.844520807266235, "learning_rate": 3.945071357913935e-05, "loss": 0.1938, "reward": 2.608189344406128, "reward_std": 2.718052625656128, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.880050301551819, "rewards/no_repetition_reward_func": -0.27186090499162674, "rewards/verse_reward_func": 0.0, "step": 2334 }, { "completion_length": 251.875, "epoch": 18.68, "grad_norm": 3.625, "kl": 4.958564400672913, "learning_rate": 3.943931758555669e-05, "loss": 0.1983, "reward": 2.3179420828819275, "reward_std": 2.8120687007904053, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6461876034736633, "rewards/no_repetition_reward_func": -0.3126205801963806, "rewards/verse_reward_func": -0.015625, "step": 2335 }, { "completion_length": 245.484375, "epoch": 18.688, "grad_norm": 5.28125, "kl": 5.508277416229248, "learning_rate": 3.942791708794326e-05, "loss": 0.2203, "reward": 2.1578623056411743, "reward_std": 2.8470616340637207, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.40069317817688, "rewards/no_repetition_reward_func": -0.24283071607351303, "rewards/verse_reward_func": 0.0, "step": 2336 }, { "completion_length": 256.0, "epoch": 18.696, "grad_norm": 1.8046875, "kl": 4.670703291893005, "learning_rate": 3.9416512089855184e-05, "loss": 0.1868, "reward": 2.9646759033203125, "reward_std": 3.194557785987854, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.264892578125, "rewards/no_repetition_reward_func": -0.30021654069423676, "rewards/verse_reward_func": 0.0, "step": 2337 }, { "completion_length": 249.828125, "epoch": 18.704, "grad_norm": 2.96875, "kl": 3.932090640068054, "learning_rate": 3.940510259485002e-05, "loss": 0.1573, "reward": 2.291827917098999, "reward_std": 2.9057507514953613, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.574987530708313, "rewards/no_repetition_reward_func": -0.27534736692905426, "rewards/verse_reward_func": -0.0078125, "step": 2338 }, { "completion_length": 251.359375, "epoch": 18.712, "grad_norm": 1.8515625, "kl": 3.757925868034363, "learning_rate": 3.939368860648669e-05, "loss": 0.1503, "reward": 3.160076141357422, "reward_std": 2.8499069213867188, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4276334047317505, "rewards/no_repetition_reward_func": -0.26755744218826294, "rewards/verse_reward_func": 0.0, "step": 2339 }, { "completion_length": 256.0, "epoch": 18.72, "grad_norm": 4.375, "kl": 3.037665605545044, "learning_rate": 3.938227012832557e-05, "loss": 0.1215, "reward": 3.283557176589966, "reward_std": 3.008602738380432, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.5772377252578735, "rewards/no_repetition_reward_func": -0.29368047416210175, "rewards/verse_reward_func": 0.0, "step": 2340 }, { "completion_length": 252.296875, "epoch": 18.728, "grad_norm": 4.3125, "kl": 3.9191704988479614, "learning_rate": 3.937084716392838e-05, "loss": 0.1568, "reward": 2.7574667930603027, "reward_std": 3.3119136095046997, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1065276861190796, "rewards/no_repetition_reward_func": -0.3412483036518097, "rewards/verse_reward_func": -0.0078125, "step": 2341 }, { "completion_length": 249.78125, "epoch": 18.736, "grad_norm": 3.625, "kl": 4.6984477043151855, "learning_rate": 3.9359419716858274e-05, "loss": 0.1879, "reward": 2.8506643772125244, "reward_std": 3.1818153858184814, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.135258436203003, "rewards/no_repetition_reward_func": -0.2767813205718994, "rewards/verse_reward_func": -0.0078125, "step": 2342 }, { "completion_length": 245.21875, "epoch": 18.744, "grad_norm": 4.125, "kl": 3.4861698150634766, "learning_rate": 3.93479877906798e-05, "loss": 0.1394, "reward": 3.492279052734375, "reward_std": 3.3334635496139526, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.818957567214966, "rewards/no_repetition_reward_func": -0.32667872309684753, "rewards/verse_reward_func": 0.0, "step": 2343 }, { "completion_length": 252.4375, "epoch": 18.752, "grad_norm": 3.09375, "kl": 4.036119103431702, "learning_rate": 3.933655138895889e-05, "loss": 0.1614, "reward": 2.8089709281921387, "reward_std": 3.325024127960205, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.134114623069763, "rewards/no_repetition_reward_func": -0.32514360547065735, "rewards/verse_reward_func": 0.0, "step": 2344 }, { "completion_length": 250.328125, "epoch": 18.76, "grad_norm": 4.0625, "kl": 5.8988189697265625, "learning_rate": 3.932511051526289e-05, "loss": 0.236, "reward": 1.6613972783088684, "reward_std": 2.3941463232040405, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9462676048278809, "rewards/no_repetition_reward_func": -0.28487031161785126, "rewards/verse_reward_func": 0.0, "step": 2345 }, { "completion_length": 247.3125, "epoch": 18.768, "grad_norm": 2.65625, "kl": 4.496132850646973, "learning_rate": 3.931366517316052e-05, "loss": 0.1798, "reward": 2.6865108013153076, "reward_std": 3.105563759803772, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9568698406219482, "rewards/no_repetition_reward_func": -0.2703590840101242, "rewards/verse_reward_func": 0.0, "step": 2346 }, { "completion_length": 252.734375, "epoch": 18.776, "grad_norm": 2.171875, "kl": 5.013957500457764, "learning_rate": 3.930221536622191e-05, "loss": 0.2006, "reward": 2.8582820892333984, "reward_std": 3.4180697202682495, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1068174839019775, "rewards/no_repetition_reward_func": -0.24072282761335373, "rewards/verse_reward_func": -0.0078125, "step": 2347 }, { "completion_length": 252.90625, "epoch": 18.784, "grad_norm": 1.4921875, "kl": 4.348979473114014, "learning_rate": 3.9290761098018585e-05, "loss": 0.174, "reward": 2.9305918216705322, "reward_std": 3.198329448699951, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1939905881881714, "rewards/no_repetition_reward_func": -0.2633987069129944, "rewards/verse_reward_func": 0.0, "step": 2348 }, { "completion_length": 255.15625, "epoch": 18.792, "grad_norm": 4.4375, "kl": 6.371293783187866, "learning_rate": 3.927930237212345e-05, "loss": 0.2549, "reward": 2.069201648235321, "reward_std": 2.521351933479309, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3472135066986084, "rewards/no_repetition_reward_func": -0.270199254155159, "rewards/verse_reward_func": -0.0078125, "step": 2349 }, { "completion_length": 253.734375, "epoch": 18.8, "grad_norm": 2.140625, "kl": 4.6327431201934814, "learning_rate": 3.92678391921108e-05, "loss": 0.1853, "reward": 3.5658910274505615, "reward_std": 3.2272685766220093, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.8922781944274902, "rewards/no_repetition_reward_func": -0.31857483088970184, "rewards/verse_reward_func": -0.0078125, "step": 2350 }, { "completion_length": 253.53125, "epoch": 18.808, "grad_norm": 3.453125, "kl": 6.09805965423584, "learning_rate": 3.925637156155633e-05, "loss": 0.2439, "reward": 2.8595486879348755, "reward_std": 3.4157408475875854, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1636404991149902, "rewards/no_repetition_reward_func": -0.3040916323661804, "rewards/verse_reward_func": 0.0, "step": 2351 }, { "completion_length": 248.828125, "epoch": 18.816, "grad_norm": 2.96875, "kl": 4.966061353683472, "learning_rate": 3.924489948403711e-05, "loss": 0.1986, "reward": 3.010325074195862, "reward_std": 3.2461167573928833, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3621875047683716, "rewards/no_repetition_reward_func": -0.3440498858690262, "rewards/verse_reward_func": -0.0078125, "step": 2352 }, { "completion_length": 256.0, "epoch": 18.824, "grad_norm": 6.0625, "kl": 7.159048557281494, "learning_rate": 3.9233422963131616e-05, "loss": 0.2864, "reward": 2.217911720275879, "reward_std": 2.808537721633911, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.513531982898712, "rewards/no_repetition_reward_func": -0.29562026262283325, "rewards/verse_reward_func": 0.0, "step": 2353 }, { "completion_length": 246.546875, "epoch": 18.832, "grad_norm": 5.75, "kl": 8.086720943450928, "learning_rate": 3.922194200241969e-05, "loss": 0.3235, "reward": 1.9570005536079407, "reward_std": 3.178880453109741, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.250922441482544, "rewards/no_repetition_reward_func": -0.27048441767692566, "rewards/verse_reward_func": -0.0234375, "step": 2354 }, { "completion_length": 256.0, "epoch": 18.84, "grad_norm": 2.71875, "kl": 4.053266882896423, "learning_rate": 3.9210456605482576e-05, "loss": 0.1621, "reward": 3.3603806495666504, "reward_std": 3.0764836072921753, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.679222583770752, "rewards/no_repetition_reward_func": -0.31884221732616425, "rewards/verse_reward_func": 0.0, "step": 2355 }, { "completion_length": 253.109375, "epoch": 18.848, "grad_norm": 2.28125, "kl": 5.8633716106414795, "learning_rate": 3.919896677590289e-05, "loss": 0.2345, "reward": 2.8449270725250244, "reward_std": 2.972219228744507, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1247429847717285, "rewards/no_repetition_reward_func": -0.27981579303741455, "rewards/verse_reward_func": 0.0, "step": 2356 }, { "completion_length": 249.734375, "epoch": 18.856, "grad_norm": 2.484375, "kl": 5.795509099960327, "learning_rate": 3.918747251726463e-05, "loss": 0.2318, "reward": 2.5400906801223755, "reward_std": 3.0436254739761353, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.830810070037842, "rewards/no_repetition_reward_func": -0.28290678560733795, "rewards/verse_reward_func": -0.0078125, "step": 2357 }, { "completion_length": 249.75, "epoch": 18.864, "grad_norm": 3.65625, "kl": 5.302897930145264, "learning_rate": 3.9175973833153186e-05, "loss": 0.2121, "reward": 2.637457013130188, "reward_std": 3.4205989837646484, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.962156295776367, "rewards/no_repetition_reward_func": -0.3090742975473404, "rewards/verse_reward_func": -0.015625, "step": 2358 }, { "completion_length": 256.0, "epoch": 18.872, "grad_norm": 7.375, "kl": 6.6974382400512695, "learning_rate": 3.9164470727155314e-05, "loss": 0.2679, "reward": 1.7515899538993835, "reward_std": 2.4580748081207275, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0532166957855225, "rewards/no_repetition_reward_func": -0.29381421208381653, "rewards/verse_reward_func": -0.0078125, "step": 2359 }, { "completion_length": 238.46875, "epoch": 18.88, "grad_norm": 2.453125, "kl": 5.138902902603149, "learning_rate": 3.915296320285917e-05, "loss": 0.2056, "reward": 2.079463839530945, "reward_std": 2.9699639081954956, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.334181070327759, "rewards/no_repetition_reward_func": -0.2469048872590065, "rewards/verse_reward_func": -0.0078125, "step": 2360 }, { "completion_length": 247.5, "epoch": 18.888, "grad_norm": 4.40625, "kl": 3.6143628358840942, "learning_rate": 3.914145126385426e-05, "loss": 0.1446, "reward": 3.17507541179657, "reward_std": 3.108330249786377, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4991332292556763, "rewards/no_repetition_reward_func": -0.3240576535463333, "rewards/verse_reward_func": 0.0, "step": 2361 }, { "completion_length": 253.03125, "epoch": 18.896, "grad_norm": 1.640625, "kl": 5.417354345321655, "learning_rate": 3.91299349137315e-05, "loss": 0.2167, "reward": 2.366302013397217, "reward_std": 3.0847431421279907, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6436057090759277, "rewards/no_repetition_reward_func": -0.2773035243153572, "rewards/verse_reward_func": 0.0, "step": 2362 }, { "completion_length": 253.109375, "epoch": 18.904, "grad_norm": 3.671875, "kl": 5.527652740478516, "learning_rate": 3.911841415608315e-05, "loss": 0.2211, "reward": 2.551238775253296, "reward_std": 2.9698463678359985, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.805214762687683, "rewards/no_repetition_reward_func": -0.253976047039032, "rewards/verse_reward_func": 0.0, "step": 2363 }, { "completion_length": 252.265625, "epoch": 18.912, "grad_norm": 3.453125, "kl": 5.142015695571899, "learning_rate": 3.9106888994502864e-05, "loss": 0.2057, "reward": 2.138382077217102, "reward_std": 2.7662761211395264, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4529918432235718, "rewards/no_repetition_reward_func": -0.29898475110530853, "rewards/verse_reward_func": -0.015625, "step": 2364 }, { "completion_length": 256.0, "epoch": 18.92, "grad_norm": 2.40625, "kl": 4.476930618286133, "learning_rate": 3.909535943258567e-05, "loss": 0.1791, "reward": 2.031323194503784, "reward_std": 2.4691033363342285, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3133209943771362, "rewards/no_repetition_reward_func": -0.2819976657629013, "rewards/verse_reward_func": 0.0, "step": 2365 }, { "completion_length": 251.1875, "epoch": 18.928, "grad_norm": 2.453125, "kl": 3.6105350255966187, "learning_rate": 3.908382547392796e-05, "loss": 0.1444, "reward": 3.156471848487854, "reward_std": 2.946348190307617, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4279319047927856, "rewards/no_repetition_reward_func": -0.26364748179912567, "rewards/verse_reward_func": -0.0078125, "step": 2366 }, { "completion_length": 255.546875, "epoch": 18.936, "grad_norm": 5.78125, "kl": 3.5539718866348267, "learning_rate": 3.907228712212751e-05, "loss": 0.1422, "reward": 3.0605145692825317, "reward_std": 3.0422098636627197, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3813217878341675, "rewards/no_repetition_reward_func": -0.32080718874931335, "rewards/verse_reward_func": 0.0, "step": 2367 }, { "completion_length": 252.875, "epoch": 18.944, "grad_norm": 4.625, "kl": 3.5973836183547974, "learning_rate": 3.9060744380783435e-05, "loss": 0.1439, "reward": 2.7045563459396362, "reward_std": 3.5014222860336304, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.025775909423828, "rewards/no_repetition_reward_func": -0.32121968269348145, "rewards/verse_reward_func": 0.0, "step": 2368 }, { "completion_length": 252.140625, "epoch": 18.951999999999998, "grad_norm": 2.109375, "kl": 4.980719804763794, "learning_rate": 3.9049197253496264e-05, "loss": 0.1992, "reward": 1.7768189311027527, "reward_std": 2.92545747756958, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0475562810897827, "rewards/no_repetition_reward_func": -0.2629248946905136, "rewards/verse_reward_func": -0.0078125, "step": 2369 }, { "completion_length": 253.28125, "epoch": 18.96, "grad_norm": 4.34375, "kl": 3.0974233150482178, "learning_rate": 3.903764574386786e-05, "loss": 0.1239, "reward": 3.6442681550979614, "reward_std": 3.2442911863327026, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.9831478595733643, "rewards/no_repetition_reward_func": -0.3310672342777252, "rewards/verse_reward_func": -0.0078125, "step": 2370 }, { "completion_length": 248.421875, "epoch": 18.968, "grad_norm": 3.109375, "kl": 3.968507409095764, "learning_rate": 3.902608985550147e-05, "loss": 0.1587, "reward": 1.633641541004181, "reward_std": 2.661769151687622, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9236931204795837, "rewards/no_repetition_reward_func": -0.2822391092777252, "rewards/verse_reward_func": -0.0078125, "step": 2371 }, { "completion_length": 255.28125, "epoch": 18.976, "grad_norm": 2.5625, "kl": 4.413487672805786, "learning_rate": 3.9014529592001705e-05, "loss": 0.1765, "reward": 1.7220697402954102, "reward_std": 2.6254959106445312, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9922294616699219, "rewards/no_repetition_reward_func": -0.25453466176986694, "rewards/verse_reward_func": -0.015625, "step": 2372 }, { "completion_length": 253.953125, "epoch": 18.984, "grad_norm": 2.921875, "kl": 4.513641119003296, "learning_rate": 3.900296495697453e-05, "loss": 0.1805, "reward": 2.0773908495903015, "reward_std": 2.9952456951141357, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3976171016693115, "rewards/no_repetition_reward_func": -0.30460117757320404, "rewards/verse_reward_func": -0.015625, "step": 2373 }, { "completion_length": 243.828125, "epoch": 18.992, "grad_norm": 2.8125, "kl": 3.048968553543091, "learning_rate": 3.899139595402729e-05, "loss": 0.122, "reward": 2.57305645942688, "reward_std": 3.013240694999695, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8650994300842285, "rewards/no_repetition_reward_func": -0.2842304855585098, "rewards/verse_reward_func": -0.0078125, "step": 2374 }, { "completion_length": 256.0, "epoch": 19.0, "grad_norm": 2.109375, "kl": 3.9829485416412354, "learning_rate": 3.897982258676867e-05, "loss": 0.1593, "reward": 2.472282290458679, "reward_std": 3.107578754425049, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7651621103286743, "rewards/no_repetition_reward_func": -0.2928798794746399, "rewards/verse_reward_func": 0.0, "step": 2375 }, { "completion_length": 250.375, "epoch": 19.008, "grad_norm": 3.125, "kl": 3.871223568916321, "learning_rate": 3.896824485880874e-05, "loss": 0.1548, "reward": 3.0045108795166016, "reward_std": 3.266538381576538, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.340010404586792, "rewards/no_repetition_reward_func": -0.32768726348876953, "rewards/verse_reward_func": -0.0078125, "step": 2376 }, { "completion_length": 256.0, "epoch": 19.016, "grad_norm": 3.046875, "kl": 3.956605911254883, "learning_rate": 3.895666277375892e-05, "loss": 0.1583, "reward": 2.329959273338318, "reward_std": 2.949242353439331, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6051825284957886, "rewards/no_repetition_reward_func": -0.27522316575050354, "rewards/verse_reward_func": 0.0, "step": 2377 }, { "completion_length": 256.0, "epoch": 19.024, "grad_norm": 2.125, "kl": 5.003918886184692, "learning_rate": 3.894507633523199e-05, "loss": 0.2002, "reward": 2.4689568281173706, "reward_std": 3.4205265045166016, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7413958311080933, "rewards/no_repetition_reward_func": -0.2724389284849167, "rewards/verse_reward_func": 0.0, "step": 2378 }, { "completion_length": 252.921875, "epoch": 19.032, "grad_norm": 2.5625, "kl": 5.122969388961792, "learning_rate": 3.8933485546842094e-05, "loss": 0.2049, "reward": 2.2108630537986755, "reward_std": 2.82697069644928, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4932515621185303, "rewards/no_repetition_reward_func": -0.28238844871520996, "rewards/verse_reward_func": 0.0, "step": 2379 }, { "completion_length": 249.140625, "epoch": 19.04, "grad_norm": 2.125, "kl": 4.043543577194214, "learning_rate": 3.8921890412204705e-05, "loss": 0.1617, "reward": 2.3439407348632812, "reward_std": 2.9069948196411133, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6105140447616577, "rewards/no_repetition_reward_func": -0.2587607800960541, "rewards/verse_reward_func": -0.0078125, "step": 2380 }, { "completion_length": 247.78125, "epoch": 19.048, "grad_norm": 2.546875, "kl": 4.564141750335693, "learning_rate": 3.891029093493669e-05, "loss": 0.1826, "reward": 3.0599437952041626, "reward_std": 3.3181207180023193, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.404944658279419, "rewards/no_repetition_reward_func": -0.3293757438659668, "rewards/verse_reward_func": -0.015625, "step": 2381 }, { "completion_length": 240.75, "epoch": 19.056, "grad_norm": 1.3359375, "kl": 5.2582597732543945, "learning_rate": 3.889868711865624e-05, "loss": 0.2103, "reward": 2.3037514686584473, "reward_std": 3.3451876640319824, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6002241373062134, "rewards/no_repetition_reward_func": -0.28866029530763626, "rewards/verse_reward_func": -0.0078125, "step": 2382 }, { "completion_length": 249.671875, "epoch": 19.064, "grad_norm": 1.4765625, "kl": 5.033370494842529, "learning_rate": 3.8887078966982925e-05, "loss": 0.2013, "reward": 2.8251324892044067, "reward_std": 3.0954742431640625, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0947872400283813, "rewards/no_repetition_reward_func": -0.269654780626297, "rewards/verse_reward_func": 0.0, "step": 2383 }, { "completion_length": 256.0, "epoch": 19.072, "grad_norm": 2.734375, "kl": 6.337474584579468, "learning_rate": 3.887546648353765e-05, "loss": 0.2535, "reward": 2.242149829864502, "reward_std": 2.8914886713027954, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.544477701187134, "rewards/no_repetition_reward_func": -0.2945152819156647, "rewards/verse_reward_func": -0.0078125, "step": 2384 }, { "completion_length": 247.484375, "epoch": 19.08, "grad_norm": 8.5, "kl": 4.418201684951782, "learning_rate": 3.8863849671942685e-05, "loss": 0.1767, "reward": 2.803465962409973, "reward_std": 3.136575222015381, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1038928031921387, "rewards/no_repetition_reward_func": -0.2926144450902939, "rewards/verse_reward_func": -0.0078125, "step": 2385 }, { "completion_length": 250.34375, "epoch": 19.088, "grad_norm": 1.8671875, "kl": 5.614689588546753, "learning_rate": 3.885222853582163e-05, "loss": 0.2246, "reward": 1.9762799739837646, "reward_std": 2.8220367431640625, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.216152310371399, "rewards/no_repetition_reward_func": -0.2398722618818283, "rewards/verse_reward_func": 0.0, "step": 2386 }, { "completion_length": 252.203125, "epoch": 19.096, "grad_norm": 72.0, "kl": 12.419540166854858, "learning_rate": 3.8840603078799445e-05, "loss": 0.4968, "reward": 2.2049976587295532, "reward_std": 2.759711742401123, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4814422726631165, "rewards/no_repetition_reward_func": -0.276444673538208, "rewards/verse_reward_func": 0.0, "step": 2387 }, { "completion_length": 250.96875, "epoch": 19.104, "grad_norm": 1.921875, "kl": 5.070712089538574, "learning_rate": 3.8828973304502446e-05, "loss": 0.2028, "reward": 2.6555827856063843, "reward_std": 3.058837890625, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9951462149620056, "rewards/no_repetition_reward_func": -0.3317507207393646, "rewards/verse_reward_func": -0.0078125, "step": 2388 }, { "completion_length": 255.390625, "epoch": 19.112, "grad_norm": 3.984375, "kl": 4.287745237350464, "learning_rate": 3.881733921655829e-05, "loss": 0.1715, "reward": 2.5816102027893066, "reward_std": 2.775912642478943, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9058042764663696, "rewards/no_repetition_reward_func": -0.31638170778751373, "rewards/verse_reward_func": -0.0078125, "step": 2389 }, { "completion_length": 247.390625, "epoch": 19.12, "grad_norm": 1.53125, "kl": 5.168323278427124, "learning_rate": 3.880570081859597e-05, "loss": 0.2067, "reward": 2.076113998889923, "reward_std": 2.781480550765991, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.320968508720398, "rewards/no_repetition_reward_func": -0.24485443532466888, "rewards/verse_reward_func": 0.0, "step": 2390 }, { "completion_length": 247.359375, "epoch": 19.128, "grad_norm": 1.953125, "kl": 3.925888776779175, "learning_rate": 3.879405811424583e-05, "loss": 0.157, "reward": 2.719944953918457, "reward_std": 3.1040902137756348, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.025743246078491, "rewards/no_repetition_reward_func": -0.3057984709739685, "rewards/verse_reward_func": 0.0, "step": 2391 }, { "completion_length": 251.390625, "epoch": 19.136, "grad_norm": 2.5, "kl": 4.390908718109131, "learning_rate": 3.8782411107139564e-05, "loss": 0.1756, "reward": 2.844651937484741, "reward_std": 3.0561580657958984, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.149897575378418, "rewards/no_repetition_reward_func": -0.297433003783226, "rewards/verse_reward_func": -0.0078125, "step": 2392 }, { "completion_length": 250.875, "epoch": 19.144, "grad_norm": 3.734375, "kl": 5.535202264785767, "learning_rate": 3.87707598009102e-05, "loss": 0.2214, "reward": 1.6423057317733765, "reward_std": 2.822591185569763, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.891141414642334, "rewards/no_repetition_reward_func": -0.2332107275724411, "rewards/verse_reward_func": -0.015625, "step": 2393 }, { "completion_length": 254.90625, "epoch": 19.152, "grad_norm": 4.9375, "kl": 4.296781778335571, "learning_rate": 3.875910419919211e-05, "loss": 0.1719, "reward": 2.931102752685547, "reward_std": 3.1186522245407104, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.246506929397583, "rewards/no_repetition_reward_func": -0.31540414690971375, "rewards/verse_reward_func": 0.0, "step": 2394 }, { "completion_length": 252.984375, "epoch": 19.16, "grad_norm": 2.234375, "kl": 3.748315691947937, "learning_rate": 3.8747444305621e-05, "loss": 0.1499, "reward": 3.015622854232788, "reward_std": 3.092779278755188, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.273138999938965, "rewards/no_repetition_reward_func": -0.2575160712003708, "rewards/verse_reward_func": 0.0, "step": 2395 }, { "completion_length": 247.34375, "epoch": 19.168, "grad_norm": 2.203125, "kl": 4.625586271286011, "learning_rate": 3.873578012383393e-05, "loss": 0.185, "reward": 2.393413782119751, "reward_std": 3.115650177001953, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7056888341903687, "rewards/no_repetition_reward_func": -0.31227511167526245, "rewards/verse_reward_func": 0.0, "step": 2396 }, { "completion_length": 251.90625, "epoch": 19.176, "grad_norm": 2.46875, "kl": 5.780324935913086, "learning_rate": 3.872411165746927e-05, "loss": 0.2312, "reward": 1.9377458691596985, "reward_std": 2.9221572875976562, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.221142053604126, "rewards/no_repetition_reward_func": -0.2833961397409439, "rewards/verse_reward_func": 0.0, "step": 2397 }, { "completion_length": 252.625, "epoch": 19.184, "grad_norm": 1.203125, "kl": 5.1230738162994385, "learning_rate": 3.871243891016676e-05, "loss": 0.2049, "reward": 2.5220947265625, "reward_std": 3.2021580934524536, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.804266095161438, "rewards/no_repetition_reward_func": -0.2821713238954544, "rewards/verse_reward_func": 0.0, "step": 2398 }, { "completion_length": 253.90625, "epoch": 19.192, "grad_norm": 2.9375, "kl": 4.937232971191406, "learning_rate": 3.870076188556746e-05, "loss": 0.1975, "reward": 2.03878515958786, "reward_std": 2.8361353874206543, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.307198643684387, "rewards/no_repetition_reward_func": -0.2684134542942047, "rewards/verse_reward_func": 0.0, "step": 2399 }, { "completion_length": 252.9375, "epoch": 19.2, "grad_norm": 2.015625, "kl": 5.177260875701904, "learning_rate": 3.868908058731376e-05, "loss": 0.2071, "reward": 2.145944118499756, "reward_std": 2.798878312110901, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4614734649658203, "rewards/no_repetition_reward_func": -0.3155294358730316, "rewards/verse_reward_func": 0.0, "step": 2400 }, { "completion_length": 253.59375, "epoch": 19.208, "grad_norm": 3.796875, "kl": 3.195793628692627, "learning_rate": 3.867739501904938e-05, "loss": 0.1278, "reward": 3.5255900621414185, "reward_std": 3.4680490493774414, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.87366259098053, "rewards/no_repetition_reward_func": -0.3480726182460785, "rewards/verse_reward_func": 0.0, "step": 2401 }, { "completion_length": 246.296875, "epoch": 19.216, "grad_norm": 3.546875, "kl": 4.097120761871338, "learning_rate": 3.8665705184419386e-05, "loss": 0.1639, "reward": 2.8523333072662354, "reward_std": 3.181929349899292, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.13077449798584, "rewards/no_repetition_reward_func": -0.2784412056207657, "rewards/verse_reward_func": 0.0, "step": 2402 }, { "completion_length": 251.8125, "epoch": 19.224, "grad_norm": 2.265625, "kl": 4.5173094272613525, "learning_rate": 3.865401108707017e-05, "loss": 0.1807, "reward": 2.2882784605026245, "reward_std": 3.0082032680511475, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.612156867980957, "rewards/no_repetition_reward_func": -0.31606587767601013, "rewards/verse_reward_func": -0.0078125, "step": 2403 }, { "completion_length": 238.453125, "epoch": 19.232, "grad_norm": 3.625, "kl": 3.6935195922851562, "learning_rate": 3.864231273064944e-05, "loss": 0.1477, "reward": 2.1907927989959717, "reward_std": 2.5926380157470703, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5001235008239746, "rewards/no_repetition_reward_func": -0.27808067947626114, "rewards/verse_reward_func": -0.03125, "step": 2404 }, { "completion_length": 243.40625, "epoch": 19.24, "grad_norm": 1.7265625, "kl": 4.399570941925049, "learning_rate": 3.8630610118806254e-05, "loss": 0.176, "reward": 2.202229619026184, "reward_std": 3.0417490005493164, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4715077877044678, "rewards/no_repetition_reward_func": -0.2692782133817673, "rewards/verse_reward_func": 0.0, "step": 2405 }, { "completion_length": 252.71875, "epoch": 19.248, "grad_norm": 2.796875, "kl": 4.258141756057739, "learning_rate": 3.861890325519098e-05, "loss": 0.1703, "reward": 3.1238373517990112, "reward_std": 3.072588562965393, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4793128967285156, "rewards/no_repetition_reward_func": -0.33985067903995514, "rewards/verse_reward_func": -0.015625, "step": 2406 }, { "completion_length": 250.359375, "epoch": 19.256, "grad_norm": 2.34375, "kl": 5.692086696624756, "learning_rate": 3.8607192143455326e-05, "loss": 0.2277, "reward": 1.8525612354278564, "reward_std": 2.747217059135437, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.109224319458008, "rewards/no_repetition_reward_func": -0.2566632032394409, "rewards/verse_reward_func": 0.0, "step": 2407 }, { "completion_length": 241.859375, "epoch": 19.264, "grad_norm": 1.6328125, "kl": 4.39554762840271, "learning_rate": 3.859547678725231e-05, "loss": 0.1758, "reward": 2.403843641281128, "reward_std": 3.101594567298889, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6781431436538696, "rewards/no_repetition_reward_func": -0.26648691296577454, "rewards/verse_reward_func": -0.0078125, "step": 2408 }, { "completion_length": 248.921875, "epoch": 19.272, "grad_norm": 8.125, "kl": 4.7620813846588135, "learning_rate": 3.858375719023629e-05, "loss": 0.1905, "reward": 1.3076627850532532, "reward_std": 2.192988157272339, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.5502089262008667, "rewards/no_repetition_reward_func": -0.24254614859819412, "rewards/verse_reward_func": 0.0, "step": 2409 }, { "completion_length": 243.6875, "epoch": 19.28, "grad_norm": 5.28125, "kl": 4.942929029464722, "learning_rate": 3.8572033356062943e-05, "loss": 0.1977, "reward": 2.0455381274223328, "reward_std": 2.721994400024414, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.355522632598877, "rewards/no_repetition_reward_func": -0.30998457968235016, "rewards/verse_reward_func": 0.0, "step": 2410 }, { "completion_length": 254.546875, "epoch": 19.288, "grad_norm": 2.0625, "kl": 4.7986133098602295, "learning_rate": 3.856030528838925e-05, "loss": 0.1919, "reward": 2.665735125541687, "reward_std": 2.885435104370117, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9701337814331055, "rewards/no_repetition_reward_func": -0.29658613353967667, "rewards/verse_reward_func": -0.0078125, "step": 2411 }, { "completion_length": 245.609375, "epoch": 19.296, "grad_norm": 1.796875, "kl": 4.495667099952698, "learning_rate": 3.854857299087353e-05, "loss": 0.1798, "reward": 2.207573652267456, "reward_std": 2.627901792526245, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4279879331588745, "rewards/no_repetition_reward_func": -0.2204144150018692, "rewards/verse_reward_func": 0.0, "step": 2412 }, { "completion_length": 253.03125, "epoch": 19.304, "grad_norm": 2.890625, "kl": 4.962618350982666, "learning_rate": 3.853683646717543e-05, "loss": 0.1985, "reward": 1.9096946716308594, "reward_std": 2.812321662902832, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.21574330329895, "rewards/no_repetition_reward_func": -0.29042352735996246, "rewards/verse_reward_func": -0.015625, "step": 2413 }, { "completion_length": 255.625, "epoch": 19.312, "grad_norm": 3.28125, "kl": 4.689586162567139, "learning_rate": 3.852509572095588e-05, "loss": 0.1876, "reward": 2.7329416275024414, "reward_std": 3.059487223625183, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0179848670959473, "rewards/no_repetition_reward_func": -0.28504350036382675, "rewards/verse_reward_func": 0.0, "step": 2414 }, { "completion_length": 253.46875, "epoch": 19.32, "grad_norm": 2.109375, "kl": 5.187913656234741, "learning_rate": 3.851335075587718e-05, "loss": 0.2075, "reward": 2.098264455795288, "reward_std": 2.707901954650879, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.374450206756592, "rewards/no_repetition_reward_func": -0.2761857509613037, "rewards/verse_reward_func": 0.0, "step": 2415 }, { "completion_length": 252.703125, "epoch": 19.328, "grad_norm": 2.6875, "kl": 4.196899652481079, "learning_rate": 3.85016015756029e-05, "loss": 0.1679, "reward": 2.9557392597198486, "reward_std": 2.9658313989639282, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.257159113883972, "rewards/no_repetition_reward_func": -0.30141983926296234, "rewards/verse_reward_func": 0.0, "step": 2416 }, { "completion_length": 250.515625, "epoch": 19.336, "grad_norm": 2.75, "kl": 3.9653854370117188, "learning_rate": 3.848984818379793e-05, "loss": 0.1586, "reward": 2.503294348716736, "reward_std": 2.7913284301757812, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8035765886306763, "rewards/no_repetition_reward_func": -0.29246971011161804, "rewards/verse_reward_func": -0.0078125, "step": 2417 }, { "completion_length": 251.515625, "epoch": 19.344, "grad_norm": 3.71875, "kl": 3.9349464178085327, "learning_rate": 3.84780905841285e-05, "loss": 0.1574, "reward": 2.311024308204651, "reward_std": 3.0783685445785522, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6482656002044678, "rewards/no_repetition_reward_func": -0.3372412621974945, "rewards/verse_reward_func": 0.0, "step": 2418 }, { "completion_length": 240.875, "epoch": 19.352, "grad_norm": 2.15625, "kl": 3.901139497756958, "learning_rate": 3.846632878026214e-05, "loss": 0.156, "reward": 2.2545154094696045, "reward_std": 3.0650116205215454, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5204802751541138, "rewards/no_repetition_reward_func": -0.2581523656845093, "rewards/verse_reward_func": -0.0078125, "step": 2419 }, { "completion_length": 253.46875, "epoch": 19.36, "grad_norm": 2.59375, "kl": 4.237576365470886, "learning_rate": 3.8454562775867684e-05, "loss": 0.1695, "reward": 2.473580241203308, "reward_std": 3.1518197059631348, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.762731194496155, "rewards/no_repetition_reward_func": -0.2891509681940079, "rewards/verse_reward_func": 0.0, "step": 2420 }, { "completion_length": 249.4375, "epoch": 19.368, "grad_norm": 3.46875, "kl": 3.9950008392333984, "learning_rate": 3.8442792574615275e-05, "loss": 0.1598, "reward": 2.3790897130966187, "reward_std": 3.003344774246216, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7033771276474, "rewards/no_repetition_reward_func": -0.3086623549461365, "rewards/verse_reward_func": -0.015625, "step": 2421 }, { "completion_length": 251.859375, "epoch": 19.376, "grad_norm": 4.15625, "kl": 5.104958534240723, "learning_rate": 3.843101818017637e-05, "loss": 0.2042, "reward": 1.9553574919700623, "reward_std": 2.785863995552063, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.22374165058136, "rewards/no_repetition_reward_func": -0.26838408410549164, "rewards/verse_reward_func": 0.0, "step": 2422 }, { "completion_length": 251.234375, "epoch": 19.384, "grad_norm": 4.96875, "kl": 4.386731863021851, "learning_rate": 3.841923959622375e-05, "loss": 0.1755, "reward": 2.253082573413849, "reward_std": 2.9530028104782104, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5206627249717712, "rewards/no_repetition_reward_func": -0.26757998764514923, "rewards/verse_reward_func": 0.0, "step": 2423 }, { "completion_length": 247.53125, "epoch": 19.392, "grad_norm": 2.359375, "kl": 4.06579852104187, "learning_rate": 3.840745682643147e-05, "loss": 0.1626, "reward": 2.4061243534088135, "reward_std": 2.896989941596985, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.658211827278137, "rewards/no_repetition_reward_func": -0.24427519738674164, "rewards/verse_reward_func": -0.0078125, "step": 2424 }, { "completion_length": 256.0, "epoch": 19.4, "grad_norm": 2.203125, "kl": 4.769468307495117, "learning_rate": 3.8395669874474915e-05, "loss": 0.1908, "reward": 2.472085118293762, "reward_std": 3.126389741897583, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.758436679840088, "rewards/no_repetition_reward_func": -0.28635160624980927, "rewards/verse_reward_func": 0.0, "step": 2425 }, { "completion_length": 251.21875, "epoch": 19.408, "grad_norm": 2.828125, "kl": 5.701293468475342, "learning_rate": 3.8383878744030776e-05, "loss": 0.2281, "reward": 2.0399287939071655, "reward_std": 2.8866575956344604, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.333493947982788, "rewards/no_repetition_reward_func": -0.2935652434825897, "rewards/verse_reward_func": 0.0, "step": 2426 }, { "completion_length": 249.671875, "epoch": 19.416, "grad_norm": 2.046875, "kl": 4.170648097991943, "learning_rate": 3.837208343877703e-05, "loss": 0.1668, "reward": 2.334060311317444, "reward_std": 2.7460389137268066, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5944615602493286, "rewards/no_repetition_reward_func": -0.2525886595249176, "rewards/verse_reward_func": -0.0078125, "step": 2427 }, { "completion_length": 250.296875, "epoch": 19.424, "grad_norm": 2.515625, "kl": 4.650768876075745, "learning_rate": 3.836028396239297e-05, "loss": 0.186, "reward": 2.6462103128433228, "reward_std": 2.6176241636276245, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9113452434539795, "rewards/no_repetition_reward_func": -0.26513491570949554, "rewards/verse_reward_func": 0.0, "step": 2428 }, { "completion_length": 242.578125, "epoch": 19.432, "grad_norm": 2.171875, "kl": 4.264769077301025, "learning_rate": 3.834848031855919e-05, "loss": 0.1706, "reward": 2.5367393493652344, "reward_std": 2.809635043144226, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8021732568740845, "rewards/no_repetition_reward_func": -0.26543380320072174, "rewards/verse_reward_func": 0.0, "step": 2429 }, { "completion_length": 249.25, "epoch": 19.44, "grad_norm": 5.28125, "kl": 5.343470573425293, "learning_rate": 3.8336672510957574e-05, "loss": 0.2137, "reward": 1.689126431941986, "reward_std": 2.387776255607605, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9467322826385498, "rewards/no_repetition_reward_func": -0.24979326128959656, "rewards/verse_reward_func": -0.0078125, "step": 2430 }, { "completion_length": 253.71875, "epoch": 19.448, "grad_norm": 4.09375, "kl": 3.926680564880371, "learning_rate": 3.83248605432713e-05, "loss": 0.1571, "reward": 2.902620553970337, "reward_std": 3.2013018131256104, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1767053604125977, "rewards/no_repetition_reward_func": -0.27408474683761597, "rewards/verse_reward_func": 0.0, "step": 2431 }, { "completion_length": 243.71875, "epoch": 19.456, "grad_norm": 2.65625, "kl": 5.497242450714111, "learning_rate": 3.8313044419184873e-05, "loss": 0.2199, "reward": 2.146754026412964, "reward_std": 2.8814942836761475, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4424866437911987, "rewards/no_repetition_reward_func": -0.28792019188404083, "rewards/verse_reward_func": -0.0078125, "step": 2432 }, { "completion_length": 245.96875, "epoch": 19.464, "grad_norm": 3.28125, "kl": 4.936975955963135, "learning_rate": 3.830122414238406e-05, "loss": 0.1975, "reward": 1.8504250049591064, "reward_std": 2.9139838218688965, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.101551115512848, "rewards/no_repetition_reward_func": -0.2511261850595474, "rewards/verse_reward_func": 0.0, "step": 2433 }, { "completion_length": 247.25, "epoch": 19.472, "grad_norm": 2.578125, "kl": 4.236417889595032, "learning_rate": 3.828939971655595e-05, "loss": 0.1695, "reward": 2.0569997429847717, "reward_std": 2.544069766998291, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3124172687530518, "rewards/no_repetition_reward_func": -0.24760501831769943, "rewards/verse_reward_func": -0.0078125, "step": 2434 }, { "completion_length": 243.3125, "epoch": 19.48, "grad_norm": 1.7109375, "kl": 4.896556854248047, "learning_rate": 3.827757114538892e-05, "loss": 0.1959, "reward": 1.994144856929779, "reward_std": 2.6916840076446533, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2318274974823, "rewards/no_repetition_reward_func": -0.2220577895641327, "rewards/verse_reward_func": -0.015625, "step": 2435 }, { "completion_length": 252.328125, "epoch": 19.488, "grad_norm": 3.25, "kl": 4.234266757965088, "learning_rate": 3.826573843257262e-05, "loss": 0.1694, "reward": 2.5399824380874634, "reward_std": 3.1354708671569824, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8478082418441772, "rewards/no_repetition_reward_func": -0.30001330375671387, "rewards/verse_reward_func": -0.0078125, "step": 2436 }, { "completion_length": 249.5625, "epoch": 19.496, "grad_norm": 3.78125, "kl": 3.861445426940918, "learning_rate": 3.8253901581798016e-05, "loss": 0.1545, "reward": 2.6387479305267334, "reward_std": 3.1512718200683594, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.933220148086548, "rewards/no_repetition_reward_func": -0.29447224736213684, "rewards/verse_reward_func": 0.0, "step": 2437 }, { "completion_length": 254.265625, "epoch": 19.504, "grad_norm": 1.8671875, "kl": 4.224627137184143, "learning_rate": 3.824206059675736e-05, "loss": 0.169, "reward": 2.2606446743011475, "reward_std": 3.11668860912323, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5480376482009888, "rewards/no_repetition_reward_func": -0.2873929589986801, "rewards/verse_reward_func": 0.0, "step": 2438 }, { "completion_length": 249.953125, "epoch": 19.512, "grad_norm": 4.71875, "kl": 3.9906668663024902, "learning_rate": 3.823021548114417e-05, "loss": 0.1596, "reward": 3.1017630100250244, "reward_std": 2.8011789321899414, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.389840245246887, "rewards/no_repetition_reward_func": -0.2880772650241852, "rewards/verse_reward_func": 0.0, "step": 2439 }, { "completion_length": 238.21875, "epoch": 19.52, "grad_norm": 3.578125, "kl": 3.8205970525741577, "learning_rate": 3.821836623865329e-05, "loss": 0.1528, "reward": 2.6835062503814697, "reward_std": 3.153773784637451, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9679092168807983, "rewards/no_repetition_reward_func": -0.2844030484557152, "rewards/verse_reward_func": 0.0, "step": 2440 }, { "completion_length": 254.75, "epoch": 19.528, "grad_norm": 3.4375, "kl": 3.7913776636123657, "learning_rate": 3.820651287298084e-05, "loss": 0.1517, "reward": 3.2213693857192993, "reward_std": 3.1824045181274414, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.571267247200012, "rewards/no_repetition_reward_func": -0.349897637963295, "rewards/verse_reward_func": 0.0, "step": 2441 }, { "completion_length": 249.765625, "epoch": 19.536, "grad_norm": 3.4375, "kl": 4.290755033493042, "learning_rate": 3.81946553878242e-05, "loss": 0.1716, "reward": 2.3312078714370728, "reward_std": 3.0213814973831177, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.647369623184204, "rewards/no_repetition_reward_func": -0.3083494305610657, "rewards/verse_reward_func": -0.0078125, "step": 2442 }, { "completion_length": 245.140625, "epoch": 19.544, "grad_norm": 3.28125, "kl": 3.890289545059204, "learning_rate": 3.8182793786882065e-05, "loss": 0.1556, "reward": 2.541450262069702, "reward_std": 3.1877485513687134, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8478827476501465, "rewards/no_repetition_reward_func": -0.3064323663711548, "rewards/verse_reward_func": 0.0, "step": 2443 }, { "completion_length": 238.34375, "epoch": 19.552, "grad_norm": 3.21875, "kl": 4.3029221296310425, "learning_rate": 3.8170928073854396e-05, "loss": 0.1721, "reward": 2.637252449989319, "reward_std": 3.057868242263794, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.908121347427368, "rewards/no_repetition_reward_func": -0.26305635273456573, "rewards/verse_reward_func": -0.0078125, "step": 2444 }, { "completion_length": 254.390625, "epoch": 19.56, "grad_norm": 4.1875, "kl": 5.6191253662109375, "learning_rate": 3.8159058252442446e-05, "loss": 0.2248, "reward": 1.817229449748993, "reward_std": 2.77210533618927, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0934964418411255, "rewards/no_repetition_reward_func": -0.2684545964002609, "rewards/verse_reward_func": -0.0078125, "step": 2445 }, { "completion_length": 251.59375, "epoch": 19.568, "grad_norm": 1.9140625, "kl": 4.32275652885437, "learning_rate": 3.814718432634876e-05, "loss": 0.1729, "reward": 1.6215834021568298, "reward_std": 2.6104618310928345, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.8715805411338806, "rewards/no_repetition_reward_func": -0.2343720942735672, "rewards/verse_reward_func": -0.015625, "step": 2446 }, { "completion_length": 252.25, "epoch": 19.576, "grad_norm": 4.90625, "kl": 3.0850807428359985, "learning_rate": 3.813530629927714e-05, "loss": 0.1234, "reward": 2.6773701906204224, "reward_std": 3.2170077562332153, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9834041595458984, "rewards/no_repetition_reward_func": -0.30603379011154175, "rewards/verse_reward_func": 0.0, "step": 2447 }, { "completion_length": 253.984375, "epoch": 19.584, "grad_norm": 2.375, "kl": 4.31839656829834, "learning_rate": 3.8123424174932674e-05, "loss": 0.1727, "reward": 3.235581398010254, "reward_std": 3.181338906288147, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.56340754032135, "rewards/no_repetition_reward_func": -0.32001355290412903, "rewards/verse_reward_func": -0.0078125, "step": 2448 }, { "completion_length": 243.84375, "epoch": 19.592, "grad_norm": 1.8515625, "kl": 4.629213333129883, "learning_rate": 3.811153795702174e-05, "loss": 0.1852, "reward": 2.388851761817932, "reward_std": 2.9593777656555176, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.675053358078003, "rewards/no_repetition_reward_func": -0.2783891260623932, "rewards/verse_reward_func": -0.0078125, "step": 2449 }, { "completion_length": 250.75, "epoch": 19.6, "grad_norm": 1.8671875, "kl": 4.5462329387664795, "learning_rate": 3.8099647649251986e-05, "loss": 0.1818, "reward": 2.1745779514312744, "reward_std": 2.686591386795044, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4204235076904297, "rewards/no_repetition_reward_func": -0.24584555625915527, "rewards/verse_reward_func": 0.0, "step": 2450 }, { "completion_length": 252.328125, "epoch": 19.608, "grad_norm": 4.15625, "kl": 4.754650831222534, "learning_rate": 3.808775325533232e-05, "loss": 0.1902, "reward": 2.3336222171783447, "reward_std": 3.054073214530945, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6513466835021973, "rewards/no_repetition_reward_func": -0.31772445142269135, "rewards/verse_reward_func": 0.0, "step": 2451 }, { "completion_length": 246.59375, "epoch": 19.616, "grad_norm": 4.03125, "kl": 6.501791000366211, "learning_rate": 3.8075854778972955e-05, "loss": 0.2601, "reward": 1.846511960029602, "reward_std": 2.923835277557373, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1005821228027344, "rewards/no_repetition_reward_func": -0.25407011061906815, "rewards/verse_reward_func": 0.0, "step": 2452 }, { "completion_length": 250.78125, "epoch": 19.624, "grad_norm": 5.1875, "kl": 4.8738343715667725, "learning_rate": 3.806395222388536e-05, "loss": 0.195, "reward": 2.508421778678894, "reward_std": 3.0472190380096436, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.790554642677307, "rewards/no_repetition_reward_func": -0.2821328192949295, "rewards/verse_reward_func": 0.0, "step": 2453 }, { "completion_length": 245.828125, "epoch": 19.632, "grad_norm": 6.65625, "kl": 5.0266053676605225, "learning_rate": 3.805204559378227e-05, "loss": 0.2011, "reward": 1.903687596321106, "reward_std": 2.6681487560272217, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1565961837768555, "rewards/no_repetition_reward_func": -0.2450961098074913, "rewards/verse_reward_func": -0.0078125, "step": 2454 }, { "completion_length": 256.0, "epoch": 19.64, "grad_norm": 2.453125, "kl": 4.01382303237915, "learning_rate": 3.80401348923777e-05, "loss": 0.1606, "reward": 2.6665866374969482, "reward_std": 3.2331795692443848, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9721685647964478, "rewards/no_repetition_reward_func": -0.3055819123983383, "rewards/verse_reward_func": 0.0, "step": 2455 }, { "completion_length": 250.4375, "epoch": 19.648, "grad_norm": 1.4296875, "kl": 4.872669219970703, "learning_rate": 3.802822012338694e-05, "loss": 0.1949, "reward": 2.923089861869812, "reward_std": 3.3089780807495117, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.220598578453064, "rewards/no_repetition_reward_func": -0.29750870168209076, "rewards/verse_reward_func": 0.0, "step": 2456 }, { "completion_length": 255.375, "epoch": 19.656, "grad_norm": 1.6953125, "kl": 5.2006916999816895, "learning_rate": 3.8016301290526534e-05, "loss": 0.208, "reward": 2.6986236572265625, "reward_std": 3.3666844367980957, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.986414670944214, "rewards/no_repetition_reward_func": -0.2877911329269409, "rewards/verse_reward_func": 0.0, "step": 2457 }, { "completion_length": 255.890625, "epoch": 19.664, "grad_norm": 3.65625, "kl": 4.671751022338867, "learning_rate": 3.8004378397514315e-05, "loss": 0.1869, "reward": 2.2674542665481567, "reward_std": 2.8099582195281982, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5268566608428955, "rewards/no_repetition_reward_func": -0.2594023048877716, "rewards/verse_reward_func": 0.0, "step": 2458 }, { "completion_length": 252.40625, "epoch": 19.672, "grad_norm": 3.4375, "kl": 4.083723783493042, "learning_rate": 3.799245144806937e-05, "loss": 0.1633, "reward": 2.938588857650757, "reward_std": 3.054056406021118, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.234588146209717, "rewards/no_repetition_reward_func": -0.29599934816360474, "rewards/verse_reward_func": 0.0, "step": 2459 }, { "completion_length": 247.046875, "epoch": 19.68, "grad_norm": 3.609375, "kl": 4.116396188735962, "learning_rate": 3.798052044591204e-05, "loss": 0.1647, "reward": 2.859673857688904, "reward_std": 2.962303042411804, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.161802291870117, "rewards/no_repetition_reward_func": -0.302128404378891, "rewards/verse_reward_func": 0.0, "step": 2460 }, { "completion_length": 244.96875, "epoch": 19.688, "grad_norm": 4.125, "kl": 3.844008207321167, "learning_rate": 3.796858539476394e-05, "loss": 0.1538, "reward": 2.791731834411621, "reward_std": 3.250115394592285, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.07887065410614, "rewards/no_repetition_reward_func": -0.2871389091014862, "rewards/verse_reward_func": 0.0, "step": 2461 }, { "completion_length": 256.0, "epoch": 19.696, "grad_norm": 3.21875, "kl": 4.935750961303711, "learning_rate": 3.7956646298347956e-05, "loss": 0.1974, "reward": 2.808248996734619, "reward_std": 3.2349976301193237, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1029399633407593, "rewards/no_repetition_reward_func": -0.2946910262107849, "rewards/verse_reward_func": 0.0, "step": 2462 }, { "completion_length": 249.421875, "epoch": 19.704, "grad_norm": 2.0, "kl": 4.923263072967529, "learning_rate": 3.7944703160388234e-05, "loss": 0.1969, "reward": 2.3657662868499756, "reward_std": 3.197966694831848, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.669213056564331, "rewards/no_repetition_reward_func": -0.30344685912132263, "rewards/verse_reward_func": 0.0, "step": 2463 }, { "completion_length": 252.625, "epoch": 19.712, "grad_norm": 1.9296875, "kl": 4.491645812988281, "learning_rate": 3.793275598461017e-05, "loss": 0.1797, "reward": 2.7372156381607056, "reward_std": 3.053502678871155, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.004207491874695, "rewards/no_repetition_reward_func": -0.2669918090105057, "rewards/verse_reward_func": 0.0, "step": 2464 }, { "completion_length": 252.8125, "epoch": 19.72, "grad_norm": 2.515625, "kl": 4.184690356254578, "learning_rate": 3.792080477474043e-05, "loss": 0.1674, "reward": 3.297636389732361, "reward_std": 3.2653087377548218, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.639716625213623, "rewards/no_repetition_reward_func": -0.32645532488822937, "rewards/verse_reward_func": -0.015625, "step": 2465 }, { "completion_length": 249.859375, "epoch": 19.728, "grad_norm": 4.9375, "kl": 5.728889465332031, "learning_rate": 3.790884953450692e-05, "loss": 0.2292, "reward": 2.2060694694519043, "reward_std": 3.1516993045806885, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.516193151473999, "rewards/no_repetition_reward_func": -0.30231115221977234, "rewards/verse_reward_func": -0.0078125, "step": 2466 }, { "completion_length": 252.5, "epoch": 19.736, "grad_norm": 2.8125, "kl": 5.152135848999023, "learning_rate": 3.789689026763883e-05, "loss": 0.2061, "reward": 2.4420822858810425, "reward_std": 2.8299028873443604, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7148754596710205, "rewards/no_repetition_reward_func": -0.2727932929992676, "rewards/verse_reward_func": 0.0, "step": 2467 }, { "completion_length": 255.890625, "epoch": 19.744, "grad_norm": 2.609375, "kl": 5.435340881347656, "learning_rate": 3.788492697786658e-05, "loss": 0.2174, "reward": 2.228224515914917, "reward_std": 2.6341371536254883, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5235615968704224, "rewards/no_repetition_reward_func": -0.29533714056015015, "rewards/verse_reward_func": 0.0, "step": 2468 }, { "completion_length": 256.0, "epoch": 19.752, "grad_norm": 3.296875, "kl": 6.446285724639893, "learning_rate": 3.7872959668921884e-05, "loss": 0.2579, "reward": 2.115436375141144, "reward_std": 2.987046003341675, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.389530062675476, "rewards/no_repetition_reward_func": -0.2662811279296875, "rewards/verse_reward_func": -0.0078125, "step": 2469 }, { "completion_length": 249.203125, "epoch": 19.76, "grad_norm": 3.421875, "kl": 5.294046640396118, "learning_rate": 3.786098834453766e-05, "loss": 0.2118, "reward": 1.752849042415619, "reward_std": 2.490428924560547, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9813569784164429, "rewards/no_repetition_reward_func": -0.22069554030895233, "rewards/verse_reward_func": -0.0078125, "step": 2470 }, { "completion_length": 253.875, "epoch": 19.768, "grad_norm": 2.359375, "kl": 3.862309694290161, "learning_rate": 3.7849013008448115e-05, "loss": 0.1545, "reward": 3.111785650253296, "reward_std": 3.07194185256958, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4007747173309326, "rewards/no_repetition_reward_func": -0.28898924589157104, "rewards/verse_reward_func": 0.0, "step": 2471 }, { "completion_length": 253.796875, "epoch": 19.776, "grad_norm": 1.765625, "kl": 5.306143045425415, "learning_rate": 3.783703366438868e-05, "loss": 0.2122, "reward": 2.9431899785995483, "reward_std": 3.433154344558716, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.23696506023407, "rewards/no_repetition_reward_func": -0.2937749773263931, "rewards/verse_reward_func": 0.0, "step": 2472 }, { "completion_length": 243.21875, "epoch": 19.784, "grad_norm": 2.046875, "kl": 4.862072944641113, "learning_rate": 3.782505031609607e-05, "loss": 0.1945, "reward": 2.459527850151062, "reward_std": 2.9731855392456055, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.758760094642639, "rewards/no_repetition_reward_func": -0.2836073487997055, "rewards/verse_reward_func": -0.015625, "step": 2473 }, { "completion_length": 253.84375, "epoch": 19.792, "grad_norm": 2.078125, "kl": 5.196453809738159, "learning_rate": 3.78130629673082e-05, "loss": 0.2079, "reward": 1.9586570262908936, "reward_std": 2.661607503890991, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1718279123306274, "rewards/no_repetition_reward_func": -0.2131708413362503, "rewards/verse_reward_func": 0.0, "step": 2474 }, { "completion_length": 252.15625, "epoch": 19.8, "grad_norm": 2.5625, "kl": 5.083586692810059, "learning_rate": 3.780107162176429e-05, "loss": 0.2033, "reward": 2.6579054594039917, "reward_std": 3.2995927333831787, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.937095284461975, "rewards/no_repetition_reward_func": -0.2791898399591446, "rewards/verse_reward_func": 0.0, "step": 2475 }, { "completion_length": 249.796875, "epoch": 19.808, "grad_norm": 2.25, "kl": 3.8069647550582886, "learning_rate": 3.778907628320477e-05, "loss": 0.1523, "reward": 3.320319890975952, "reward_std": 3.2716790437698364, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.6346781253814697, "rewards/no_repetition_reward_func": -0.3143582344055176, "rewards/verse_reward_func": 0.0, "step": 2476 }, { "completion_length": 246.109375, "epoch": 19.816, "grad_norm": 2.28125, "kl": 4.619163990020752, "learning_rate": 3.777707695537133e-05, "loss": 0.1848, "reward": 1.7887014746665955, "reward_std": 2.7815146446228027, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1086423993110657, "rewards/no_repetition_reward_func": -0.312128484249115, "rewards/verse_reward_func": -0.0078125, "step": 2477 }, { "completion_length": 248.25, "epoch": 19.824, "grad_norm": 2.828125, "kl": 4.646752595901489, "learning_rate": 3.776507364200689e-05, "loss": 0.1859, "reward": 2.424764394760132, "reward_std": 2.6732966899871826, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7097960710525513, "rewards/no_repetition_reward_func": -0.28503160178661346, "rewards/verse_reward_func": 0.0, "step": 2478 }, { "completion_length": 254.234375, "epoch": 19.832, "grad_norm": 2.28125, "kl": 4.479369401931763, "learning_rate": 3.775306634685562e-05, "loss": 0.1792, "reward": 2.570875406265259, "reward_std": 3.262750267982483, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.880908727645874, "rewards/no_repetition_reward_func": -0.31003333628177643, "rewards/verse_reward_func": 0.0, "step": 2479 }, { "completion_length": 253.046875, "epoch": 19.84, "grad_norm": 6.5, "kl": 5.427735328674316, "learning_rate": 3.7741055073662946e-05, "loss": 0.2171, "reward": 1.8161431550979614, "reward_std": 2.7641358375549316, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0467326641082764, "rewards/no_repetition_reward_func": -0.23058954626321793, "rewards/verse_reward_func": 0.0, "step": 2480 }, { "completion_length": 251.515625, "epoch": 19.848, "grad_norm": 2.96875, "kl": 5.033045053482056, "learning_rate": 3.772903982617552e-05, "loss": 0.2013, "reward": 2.421074151992798, "reward_std": 2.729067325592041, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6967642307281494, "rewards/no_repetition_reward_func": -0.2756900042295456, "rewards/verse_reward_func": 0.0, "step": 2481 }, { "completion_length": 254.671875, "epoch": 19.856, "grad_norm": 5.5, "kl": 3.8597666025161743, "learning_rate": 3.771702060814123e-05, "loss": 0.1544, "reward": 3.061647415161133, "reward_std": 3.0906875133514404, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3688464164733887, "rewards/no_repetition_reward_func": -0.30719907581806183, "rewards/verse_reward_func": 0.0, "step": 2482 }, { "completion_length": 252.546875, "epoch": 19.864, "grad_norm": 3.953125, "kl": 3.806453824043274, "learning_rate": 3.770499742330922e-05, "loss": 0.1523, "reward": 2.4644827842712402, "reward_std": 3.0883233547210693, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7514359951019287, "rewards/no_repetition_reward_func": -0.28695307672023773, "rewards/verse_reward_func": 0.0, "step": 2483 }, { "completion_length": 243.5, "epoch": 19.872, "grad_norm": 3.46875, "kl": 4.407474994659424, "learning_rate": 3.769297027542985e-05, "loss": 0.1763, "reward": 2.138533353805542, "reward_std": 3.0709697008132935, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.403984308242798, "rewards/no_repetition_reward_func": -0.2576385736465454, "rewards/verse_reward_func": -0.0078125, "step": 2484 }, { "completion_length": 247.78125, "epoch": 19.88, "grad_norm": 3.3125, "kl": 3.7871068716049194, "learning_rate": 3.7680939168254733e-05, "loss": 0.1515, "reward": 2.8310115337371826, "reward_std": 3.015395998954773, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.141453266143799, "rewards/no_repetition_reward_func": -0.3104417473077774, "rewards/verse_reward_func": 0.0, "step": 2485 }, { "completion_length": 252.328125, "epoch": 19.888, "grad_norm": 4.125, "kl": 4.475067138671875, "learning_rate": 3.7668904105536706e-05, "loss": 0.179, "reward": 2.0731163024902344, "reward_std": 2.8274272680282593, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3466662764549255, "rewards/no_repetition_reward_func": -0.27354995906352997, "rewards/verse_reward_func": 0.0, "step": 2486 }, { "completion_length": 244.1875, "epoch": 19.896, "grad_norm": 3.703125, "kl": 4.06195592880249, "learning_rate": 3.765686509102985e-05, "loss": 0.1625, "reward": 2.6392059326171875, "reward_std": 3.1247740983963013, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9845701456069946, "rewards/no_repetition_reward_func": -0.3141142278909683, "rewards/verse_reward_func": -0.03125, "step": 2487 }, { "completion_length": 250.234375, "epoch": 19.904, "grad_norm": 2.09375, "kl": 4.21800422668457, "learning_rate": 3.764482212848948e-05, "loss": 0.1687, "reward": 2.3174670338630676, "reward_std": 2.8179608583450317, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6091853380203247, "rewards/no_repetition_reward_func": -0.2839057445526123, "rewards/verse_reward_func": -0.0078125, "step": 2488 }, { "completion_length": 247.109375, "epoch": 19.912, "grad_norm": 3.171875, "kl": 4.880640983581543, "learning_rate": 3.7632775221672115e-05, "loss": 0.1952, "reward": 1.7168945670127869, "reward_std": 2.8246898651123047, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.017392933368683, "rewards/no_repetition_reward_func": -0.3004983365535736, "rewards/verse_reward_func": 0.0, "step": 2489 }, { "completion_length": 256.0, "epoch": 19.92, "grad_norm": 4.5625, "kl": 3.305001735687256, "learning_rate": 3.762072437433555e-05, "loss": 0.1322, "reward": 3.0230677127838135, "reward_std": 3.0506314039230347, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.358176589012146, "rewards/no_repetition_reward_func": -0.33510904014110565, "rewards/verse_reward_func": 0.0, "step": 2490 }, { "completion_length": 249.59375, "epoch": 19.928, "grad_norm": 1.6875, "kl": 3.6587586402893066, "learning_rate": 3.760866959023877e-05, "loss": 0.1464, "reward": 2.8204562067985535, "reward_std": 2.769387722015381, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.072500228881836, "rewards/no_repetition_reward_func": -0.2520440146327019, "rewards/verse_reward_func": 0.0, "step": 2491 }, { "completion_length": 249.59375, "epoch": 19.936, "grad_norm": 1.3671875, "kl": 4.638121128082275, "learning_rate": 3.759661087314199e-05, "loss": 0.1855, "reward": 2.3361462354660034, "reward_std": 3.0658161640167236, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5959240198135376, "rewards/no_repetition_reward_func": -0.25977783650159836, "rewards/verse_reward_func": 0.0, "step": 2492 }, { "completion_length": 253.59375, "epoch": 19.944, "grad_norm": 1.65625, "kl": 4.297084331512451, "learning_rate": 3.7584548226806696e-05, "loss": 0.1719, "reward": 2.513193130493164, "reward_std": 2.8260542154312134, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.767236351966858, "rewards/no_repetition_reward_func": -0.25404320657253265, "rewards/verse_reward_func": 0.0, "step": 2493 }, { "completion_length": 256.0, "epoch": 19.951999999999998, "grad_norm": 1.8359375, "kl": 4.69388484954834, "learning_rate": 3.757248165499555e-05, "loss": 0.1878, "reward": 2.2124812602996826, "reward_std": 2.7960145473480225, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.50289523601532, "rewards/no_repetition_reward_func": -0.2747889310121536, "rewards/verse_reward_func": -0.015625, "step": 2494 }, { "completion_length": 252.578125, "epoch": 19.96, "grad_norm": 2.578125, "kl": 5.171160697937012, "learning_rate": 3.7560411161472456e-05, "loss": 0.2068, "reward": 2.656731128692627, "reward_std": 3.168222427368164, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.928546905517578, "rewards/no_repetition_reward_func": -0.27181585133075714, "rewards/verse_reward_func": 0.0, "step": 2495 }, { "completion_length": 251.3125, "epoch": 19.968, "grad_norm": 2.21875, "kl": 4.95780086517334, "learning_rate": 3.7548336750002544e-05, "loss": 0.1983, "reward": 1.839199185371399, "reward_std": 2.8981399536132812, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.109082520008087, "rewards/no_repetition_reward_func": -0.26988327503204346, "rewards/verse_reward_func": 0.0, "step": 2496 }, { "completion_length": 250.03125, "epoch": 19.976, "grad_norm": 1.84375, "kl": 4.533321976661682, "learning_rate": 3.753625842435216e-05, "loss": 0.1813, "reward": 2.363423705101013, "reward_std": 2.879490852355957, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6596450805664062, "rewards/no_repetition_reward_func": -0.28840896487236023, "rewards/verse_reward_func": -0.0078125, "step": 2497 }, { "completion_length": 254.359375, "epoch": 19.984, "grad_norm": 3.21875, "kl": 4.7815916538238525, "learning_rate": 3.752417618828888e-05, "loss": 0.1913, "reward": 2.9774210453033447, "reward_std": 2.4402753114700317, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2556304931640625, "rewards/no_repetition_reward_func": -0.2782094478607178, "rewards/verse_reward_func": 0.0, "step": 2498 }, { "completion_length": 245.953125, "epoch": 19.992, "grad_norm": 1.7890625, "kl": 5.112710237503052, "learning_rate": 3.751209004558149e-05, "loss": 0.2045, "reward": 2.5715954303741455, "reward_std": 3.0915600061416626, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.861931085586548, "rewards/no_repetition_reward_func": -0.29033559560775757, "rewards/verse_reward_func": 0.0, "step": 2499 }, { "completion_length": 256.0, "epoch": 20.0, "grad_norm": 1.125, "kl": 4.645780086517334, "learning_rate": 3.7500000000000003e-05, "loss": 0.1858, "reward": 2.4137760400772095, "reward_std": 3.2373321056365967, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.724105477333069, "rewards/no_repetition_reward_func": -0.3103293478488922, "rewards/verse_reward_func": 0.0, "step": 2500 }, { "completion_length": 250.515625, "epoch": 20.008, "grad_norm": 5.90625, "kl": 6.3195178508758545, "learning_rate": 3.748790605531565e-05, "loss": 0.2528, "reward": 1.6639041900634766, "reward_std": 2.487017512321472, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.914036750793457, "rewards/no_repetition_reward_func": -0.2501325383782387, "rewards/verse_reward_func": 0.0, "step": 2501 }, { "completion_length": 252.71875, "epoch": 20.016, "grad_norm": 6.4375, "kl": 6.627748012542725, "learning_rate": 3.7475808215300854e-05, "loss": 0.2651, "reward": 2.1109004616737366, "reward_std": 2.795762062072754, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3506943583488464, "rewards/no_repetition_reward_func": -0.23979389667510986, "rewards/verse_reward_func": 0.0, "step": 2502 }, { "completion_length": 246.859375, "epoch": 20.024, "grad_norm": 2.453125, "kl": 3.918997287750244, "learning_rate": 3.7463706483729296e-05, "loss": 0.1568, "reward": 2.785613775253296, "reward_std": 2.8191224336624146, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0595176219940186, "rewards/no_repetition_reward_func": -0.27390384674072266, "rewards/verse_reward_func": 0.0, "step": 2503 }, { "completion_length": 248.734375, "epoch": 20.032, "grad_norm": 2.9375, "kl": 4.362747430801392, "learning_rate": 3.7451600864375844e-05, "loss": 0.1745, "reward": 2.6429158449172974, "reward_std": 3.026262879371643, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.961502194404602, "rewards/no_repetition_reward_func": -0.31077398359775543, "rewards/verse_reward_func": -0.0078125, "step": 2504 }, { "completion_length": 252.78125, "epoch": 20.04, "grad_norm": 3.484375, "kl": 3.8734002113342285, "learning_rate": 3.7439491361016564e-05, "loss": 0.1549, "reward": 2.897261381149292, "reward_std": 3.183054804801941, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1771520376205444, "rewards/no_repetition_reward_func": -0.279890775680542, "rewards/verse_reward_func": 0.0, "step": 2505 }, { "completion_length": 252.25, "epoch": 20.048, "grad_norm": 1.5234375, "kl": 4.644927740097046, "learning_rate": 3.742737797742878e-05, "loss": 0.1858, "reward": 2.444152593612671, "reward_std": 2.9626115560531616, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.703352928161621, "rewards/no_repetition_reward_func": -0.2592001259326935, "rewards/verse_reward_func": 0.0, "step": 2506 }, { "completion_length": 247.375, "epoch": 20.056, "grad_norm": 2.015625, "kl": 4.063134789466858, "learning_rate": 3.741526071739097e-05, "loss": 0.1625, "reward": 2.627143144607544, "reward_std": 3.1905447244644165, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.951101303100586, "rewards/no_repetition_reward_func": -0.30833323299884796, "rewards/verse_reward_func": -0.015625, "step": 2507 }, { "completion_length": 245.8125, "epoch": 20.064, "grad_norm": 1.921875, "kl": 4.534357786178589, "learning_rate": 3.740313958468287e-05, "loss": 0.1814, "reward": 2.577694892883301, "reward_std": 2.8611936569213867, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.869460701942444, "rewards/no_repetition_reward_func": -0.2839531749486923, "rewards/verse_reward_func": -0.0078125, "step": 2508 }, { "completion_length": 255.671875, "epoch": 20.072, "grad_norm": 2.546875, "kl": 5.799417972564697, "learning_rate": 3.7391014583085385e-05, "loss": 0.232, "reward": 2.2706002593040466, "reward_std": 2.7569013833999634, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4834154844284058, "rewards/no_repetition_reward_func": -0.21281521022319794, "rewards/verse_reward_func": 0.0, "step": 2509 }, { "completion_length": 256.0, "epoch": 20.08, "grad_norm": 1.9921875, "kl": 6.046168804168701, "learning_rate": 3.7378885716380664e-05, "loss": 0.2418, "reward": 1.8530099987983704, "reward_std": 2.7730746269226074, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1334294080734253, "rewards/no_repetition_reward_func": -0.2804194688796997, "rewards/verse_reward_func": 0.0, "step": 2510 }, { "completion_length": 256.0, "epoch": 20.088, "grad_norm": 1.7578125, "kl": 4.816368341445923, "learning_rate": 3.736675298835203e-05, "loss": 0.1927, "reward": 3.124297857284546, "reward_std": 3.2362451553344727, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3989899158477783, "rewards/no_repetition_reward_func": -0.27469223737716675, "rewards/verse_reward_func": 0.0, "step": 2511 }, { "completion_length": 247.140625, "epoch": 20.096, "grad_norm": 2.359375, "kl": 4.886380910873413, "learning_rate": 3.7354616402784035e-05, "loss": 0.1955, "reward": 2.828717589378357, "reward_std": 3.0835903882980347, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0996071100234985, "rewards/no_repetition_reward_func": -0.2630769908428192, "rewards/verse_reward_func": -0.0078125, "step": 2512 }, { "completion_length": 256.0, "epoch": 20.104, "grad_norm": 3.125, "kl": 4.4229514598846436, "learning_rate": 3.734247596346242e-05, "loss": 0.1769, "reward": 2.2135674953460693, "reward_std": 2.2670019268989563, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.46645724773407, "rewards/no_repetition_reward_func": -0.25288983434438705, "rewards/verse_reward_func": 0.0, "step": 2513 }, { "completion_length": 249.59375, "epoch": 20.112, "grad_norm": 3.53125, "kl": 3.70813250541687, "learning_rate": 3.7330331674174125e-05, "loss": 0.1483, "reward": 3.2610023021698, "reward_std": 2.808486580848694, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.5135854482650757, "rewards/no_repetition_reward_func": -0.25258325040340424, "rewards/verse_reward_func": 0.0, "step": 2514 }, { "completion_length": 249.703125, "epoch": 20.12, "grad_norm": 3.890625, "kl": 4.079591512680054, "learning_rate": 3.731818353870729e-05, "loss": 0.1632, "reward": 3.0951547622680664, "reward_std": 3.1045689582824707, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4083369970321655, "rewards/no_repetition_reward_func": -0.3053695857524872, "rewards/verse_reward_func": -0.0078125, "step": 2515 }, { "completion_length": 244.71875, "epoch": 20.128, "grad_norm": 2.640625, "kl": 5.5174336433410645, "learning_rate": 3.7306031560851275e-05, "loss": 0.2207, "reward": 2.0445533990859985, "reward_std": 2.734795570373535, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3440545797348022, "rewards/no_repetition_reward_func": -0.2995012402534485, "rewards/verse_reward_func": 0.0, "step": 2516 }, { "completion_length": 248.21875, "epoch": 20.136, "grad_norm": 1.8046875, "kl": 4.940777778625488, "learning_rate": 3.729387574439662e-05, "loss": 0.1976, "reward": 2.50168776512146, "reward_std": 3.1893982887268066, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7784279584884644, "rewards/no_repetition_reward_func": -0.2689278721809387, "rewards/verse_reward_func": -0.0078125, "step": 2517 }, { "completion_length": 247.609375, "epoch": 20.144, "grad_norm": 4.21875, "kl": 4.143097877502441, "learning_rate": 3.7281716093135063e-05, "loss": 0.1657, "reward": 3.4364556074142456, "reward_std": 3.3795498609542847, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.8098357915878296, "rewards/no_repetition_reward_func": -0.37338006496429443, "rewards/verse_reward_func": 0.0, "step": 2518 }, { "completion_length": 252.703125, "epoch": 20.152, "grad_norm": 2.484375, "kl": 4.554168939590454, "learning_rate": 3.726955261085956e-05, "loss": 0.1822, "reward": 2.6877142190933228, "reward_std": 3.1634929180145264, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.996354579925537, "rewards/no_repetition_reward_func": -0.30864037573337555, "rewards/verse_reward_func": 0.0, "step": 2519 }, { "completion_length": 252.375, "epoch": 20.16, "grad_norm": 4.71875, "kl": 5.499161720275879, "learning_rate": 3.725738530136422e-05, "loss": 0.22, "reward": 2.091940402984619, "reward_std": 2.7302260398864746, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3592723608016968, "rewards/no_repetition_reward_func": -0.26733192056417465, "rewards/verse_reward_func": 0.0, "step": 2520 }, { "completion_length": 256.0, "epoch": 20.168, "grad_norm": 2.5625, "kl": 4.855573892593384, "learning_rate": 3.7245214168444386e-05, "loss": 0.1942, "reward": 2.6883453130722046, "reward_std": 3.205469250679016, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9661628007888794, "rewards/no_repetition_reward_func": -0.2778175622224808, "rewards/verse_reward_func": 0.0, "step": 2521 }, { "completion_length": 252.90625, "epoch": 20.176, "grad_norm": 2.34375, "kl": 4.9070188999176025, "learning_rate": 3.723303921589657e-05, "loss": 0.1963, "reward": 2.848165273666382, "reward_std": 2.93003249168396, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1201770305633545, "rewards/no_repetition_reward_func": -0.27201153337955475, "rewards/verse_reward_func": 0.0, "step": 2522 }, { "completion_length": 246.8125, "epoch": 20.184, "grad_norm": 2.703125, "kl": 4.892606258392334, "learning_rate": 3.722086044751849e-05, "loss": 0.1957, "reward": 2.452742040157318, "reward_std": 3.34965717792511, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7675607800483704, "rewards/no_repetition_reward_func": -0.31481875479221344, "rewards/verse_reward_func": 0.0, "step": 2523 }, { "completion_length": 252.875, "epoch": 20.192, "grad_norm": 1.8671875, "kl": 5.861100435256958, "learning_rate": 3.720867786710904e-05, "loss": 0.2344, "reward": 2.1469573974609375, "reward_std": 3.1133484840393066, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4481661319732666, "rewards/no_repetition_reward_func": -0.3012087047100067, "rewards/verse_reward_func": 0.0, "step": 2524 }, { "completion_length": 248.640625, "epoch": 20.2, "grad_norm": 4.8125, "kl": 5.158840894699097, "learning_rate": 3.719649147846832e-05, "loss": 0.2064, "reward": 2.530093193054199, "reward_std": 3.0171873569488525, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7851686477661133, "rewards/no_repetition_reward_func": -0.2550753206014633, "rewards/verse_reward_func": 0.0, "step": 2525 }, { "completion_length": 250.546875, "epoch": 20.208, "grad_norm": 3.5, "kl": 5.883092164993286, "learning_rate": 3.71843012853976e-05, "loss": 0.2353, "reward": 2.1520758867263794, "reward_std": 2.8787283897399902, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.407586991786957, "rewards/no_repetition_reward_func": -0.2555110454559326, "rewards/verse_reward_func": 0.0, "step": 2526 }, { "completion_length": 252.203125, "epoch": 20.216, "grad_norm": 3.46875, "kl": 4.423244476318359, "learning_rate": 3.717210729169935e-05, "loss": 0.1769, "reward": 3.1048340797424316, "reward_std": 3.335515022277832, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.433867573738098, "rewards/no_repetition_reward_func": -0.31340843439102173, "rewards/verse_reward_func": -0.015625, "step": 2527 }, { "completion_length": 252.859375, "epoch": 20.224, "grad_norm": 2.390625, "kl": 4.018190622329712, "learning_rate": 3.7159909501177226e-05, "loss": 0.1607, "reward": 2.677567958831787, "reward_std": 2.9869943857192993, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9692574739456177, "rewards/no_repetition_reward_func": -0.28387705981731415, "rewards/verse_reward_func": -0.0078125, "step": 2528 }, { "completion_length": 253.71875, "epoch": 20.232, "grad_norm": 2.703125, "kl": 4.011890411376953, "learning_rate": 3.7147707917636046e-05, "loss": 0.1605, "reward": 2.928339719772339, "reward_std": 3.3680427074432373, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2331442832946777, "rewards/no_repetition_reward_func": -0.30480462312698364, "rewards/verse_reward_func": 0.0, "step": 2529 }, { "completion_length": 252.390625, "epoch": 20.24, "grad_norm": 1.390625, "kl": 4.8078837394714355, "learning_rate": 3.713550254488185e-05, "loss": 0.1923, "reward": 2.6419758796691895, "reward_std": 2.918771982192993, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.887049913406372, "rewards/no_repetition_reward_func": -0.24507398903369904, "rewards/verse_reward_func": 0.0, "step": 2530 }, { "completion_length": 246.15625, "epoch": 20.248, "grad_norm": 2.84375, "kl": 4.049734830856323, "learning_rate": 3.712329338672182e-05, "loss": 0.162, "reward": 2.8574146032333374, "reward_std": 3.116011142730713, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.134942889213562, "rewards/no_repetition_reward_func": -0.2775281146168709, "rewards/verse_reward_func": 0.0, "step": 2531 }, { "completion_length": 250.0625, "epoch": 20.256, "grad_norm": 1.7734375, "kl": 4.629230976104736, "learning_rate": 3.711108044696436e-05, "loss": 0.1852, "reward": 2.4824737310409546, "reward_std": 2.9580743312835693, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7471346855163574, "rewards/no_repetition_reward_func": -0.26466110348701477, "rewards/verse_reward_func": 0.0, "step": 2532 }, { "completion_length": 245.1875, "epoch": 20.264, "grad_norm": 1.6328125, "kl": 4.232065796852112, "learning_rate": 3.7098863729419e-05, "loss": 0.1693, "reward": 2.238156735897064, "reward_std": 3.0477805137634277, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4973257780075073, "rewards/no_repetition_reward_func": -0.2513565272092819, "rewards/verse_reward_func": -0.0078125, "step": 2533 }, { "completion_length": 244.625, "epoch": 20.272, "grad_norm": 2.515625, "kl": 4.555538892745972, "learning_rate": 3.7086643237896504e-05, "loss": 0.1822, "reward": 2.328242540359497, "reward_std": 2.810738444328308, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5644211769104004, "rewards/no_repetition_reward_func": -0.23617874830961227, "rewards/verse_reward_func": 0.0, "step": 2534 }, { "completion_length": 245.328125, "epoch": 20.28, "grad_norm": 1.8125, "kl": 5.114596128463745, "learning_rate": 3.7074418976208766e-05, "loss": 0.2046, "reward": 1.9653655886650085, "reward_std": 3.1575801372528076, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2676544189453125, "rewards/no_repetition_reward_func": -0.29447630047798157, "rewards/verse_reward_func": -0.0078125, "step": 2535 }, { "completion_length": 245.8125, "epoch": 20.288, "grad_norm": 3.546875, "kl": 4.7547547817230225, "learning_rate": 3.706219094816891e-05, "loss": 0.1902, "reward": 3.0695031881332397, "reward_std": 3.4850891828536987, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.38940691947937, "rewards/no_repetition_reward_func": -0.31209133565425873, "rewards/verse_reward_func": -0.0078125, "step": 2536 }, { "completion_length": 250.09375, "epoch": 20.296, "grad_norm": 1.8203125, "kl": 4.981866836547852, "learning_rate": 3.704995915759117e-05, "loss": 0.1993, "reward": 2.4767428636550903, "reward_std": 3.0695794820785522, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.801514744758606, "rewards/no_repetition_reward_func": -0.3247719407081604, "rewards/verse_reward_func": 0.0, "step": 2537 }, { "completion_length": 247.09375, "epoch": 20.304, "grad_norm": 2.84375, "kl": 5.48285436630249, "learning_rate": 3.7037723608291015e-05, "loss": 0.2193, "reward": 2.612276077270508, "reward_std": 2.9819501638412476, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8910298347473145, "rewards/no_repetition_reward_func": -0.2709411680698395, "rewards/verse_reward_func": -0.0078125, "step": 2538 }, { "completion_length": 251.96875, "epoch": 20.312, "grad_norm": 2.828125, "kl": 5.6674089431762695, "learning_rate": 3.7025484304085034e-05, "loss": 0.2267, "reward": 2.057295262813568, "reward_std": 2.77639377117157, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3493419885635376, "rewards/no_repetition_reward_func": -0.2920469045639038, "rewards/verse_reward_func": 0.0, "step": 2539 }, { "completion_length": 245.109375, "epoch": 20.32, "grad_norm": 4.96875, "kl": 6.599164962768555, "learning_rate": 3.701324124879102e-05, "loss": 0.264, "reward": 2.241352677345276, "reward_std": 2.8779796361923218, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.532130241394043, "rewards/no_repetition_reward_func": -0.26733988523483276, "rewards/verse_reward_func": -0.0234375, "step": 2540 }, { "completion_length": 252.046875, "epoch": 20.328, "grad_norm": 2.953125, "kl": 5.2951743602752686, "learning_rate": 3.700099444622794e-05, "loss": 0.2118, "reward": 2.4034169912338257, "reward_std": 2.6579480171203613, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6502413749694824, "rewards/no_repetition_reward_func": -0.24682458490133286, "rewards/verse_reward_func": 0.0, "step": 2541 }, { "completion_length": 251.515625, "epoch": 20.336, "grad_norm": 1.6015625, "kl": 5.168342351913452, "learning_rate": 3.6988743900215894e-05, "loss": 0.2067, "reward": 1.9436513185501099, "reward_std": 2.911059856414795, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2304576635360718, "rewards/no_repetition_reward_func": -0.27899394929409027, "rewards/verse_reward_func": -0.0078125, "step": 2542 }, { "completion_length": 248.703125, "epoch": 20.344, "grad_norm": 3.359375, "kl": 4.017400145530701, "learning_rate": 3.69764896145762e-05, "loss": 0.1607, "reward": 2.5861639976501465, "reward_std": 3.4006131887435913, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9102598428726196, "rewards/no_repetition_reward_func": -0.3084709644317627, "rewards/verse_reward_func": -0.015625, "step": 2543 }, { "completion_length": 244.34375, "epoch": 20.352, "grad_norm": 2.09375, "kl": 5.076488971710205, "learning_rate": 3.696423159313129e-05, "loss": 0.2031, "reward": 2.091484487056732, "reward_std": 2.7731932401657104, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3961745500564575, "rewards/no_repetition_reward_func": -0.2812526375055313, "rewards/verse_reward_func": -0.0234375, "step": 2544 }, { "completion_length": 247.796875, "epoch": 20.36, "grad_norm": 2.5, "kl": 4.183829307556152, "learning_rate": 3.695196983970481e-05, "loss": 0.1674, "reward": 2.9516175985336304, "reward_std": 2.8867461681365967, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2545511722564697, "rewards/no_repetition_reward_func": -0.2873087003827095, "rewards/verse_reward_func": -0.015625, "step": 2545 }, { "completion_length": 246.828125, "epoch": 20.368, "grad_norm": 2.96875, "kl": 3.7595932483673096, "learning_rate": 3.693970435812153e-05, "loss": 0.1504, "reward": 2.920470952987671, "reward_std": 3.236953020095825, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.190054178237915, "rewards/no_repetition_reward_func": -0.2617708295583725, "rewards/verse_reward_func": -0.0078125, "step": 2546 }, { "completion_length": 248.859375, "epoch": 20.376, "grad_norm": 1.6953125, "kl": 5.039263725280762, "learning_rate": 3.6927435152207406e-05, "loss": 0.2016, "reward": 2.090929687023163, "reward_std": 3.0152560472488403, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3750652074813843, "rewards/no_repetition_reward_func": -0.2841353714466095, "rewards/verse_reward_func": 0.0, "step": 2547 }, { "completion_length": 252.890625, "epoch": 20.384, "grad_norm": 3.296875, "kl": 3.9016103744506836, "learning_rate": 3.6915162225789546e-05, "loss": 0.1561, "reward": 2.9094992876052856, "reward_std": 3.3324472904205322, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2036126852035522, "rewards/no_repetition_reward_func": -0.29411347210407257, "rewards/verse_reward_func": 0.0, "step": 2548 }, { "completion_length": 249.515625, "epoch": 20.392, "grad_norm": 3.734375, "kl": 2.991729736328125, "learning_rate": 3.690288558269623e-05, "loss": 0.1197, "reward": 3.285549759864807, "reward_std": 2.985168933868408, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.597399950027466, "rewards/no_repetition_reward_func": -0.3040376305580139, "rewards/verse_reward_func": -0.0078125, "step": 2549 }, { "completion_length": 256.0, "epoch": 20.4, "grad_norm": 4.46875, "kl": 3.1767985820770264, "learning_rate": 3.689060522675689e-05, "loss": 0.1271, "reward": 2.7885022163391113, "reward_std": 3.0856930017471313, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0704293251037598, "rewards/no_repetition_reward_func": -0.2819271981716156, "rewards/verse_reward_func": 0.0, "step": 2550 }, { "completion_length": 256.0, "epoch": 20.408, "grad_norm": 2.546875, "kl": 4.456199884414673, "learning_rate": 3.6878321161802104e-05, "loss": 0.1782, "reward": 3.198836088180542, "reward_std": 3.219378113746643, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.5175026655197144, "rewards/no_repetition_reward_func": -0.31085386872291565, "rewards/verse_reward_func": -0.0078125, "step": 2551 }, { "completion_length": 247.890625, "epoch": 20.416, "grad_norm": 2.0625, "kl": 5.470416784286499, "learning_rate": 3.686603339166362e-05, "loss": 0.2188, "reward": 2.0233289003372192, "reward_std": 2.658071756362915, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2944700717926025, "rewards/no_repetition_reward_func": -0.26332858949899673, "rewards/verse_reward_func": -0.0078125, "step": 2552 }, { "completion_length": 245.65625, "epoch": 20.424, "grad_norm": 2.328125, "kl": 4.921002388000488, "learning_rate": 3.685374192017436e-05, "loss": 0.1968, "reward": 2.6838057041168213, "reward_std": 3.030738115310669, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0014169216156006, "rewards/no_repetition_reward_func": -0.3097987025976181, "rewards/verse_reward_func": -0.0078125, "step": 2553 }, { "completion_length": 254.578125, "epoch": 20.432, "grad_norm": 2.8125, "kl": 4.837340593338013, "learning_rate": 3.6841446751168355e-05, "loss": 0.1935, "reward": 2.3379647731781006, "reward_std": 3.056857109069824, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6091268062591553, "rewards/no_repetition_reward_func": -0.2711622267961502, "rewards/verse_reward_func": 0.0, "step": 2554 }, { "completion_length": 253.6875, "epoch": 20.44, "grad_norm": 2.46875, "kl": 4.852558851242065, "learning_rate": 3.682914788848083e-05, "loss": 0.1941, "reward": 2.612269878387451, "reward_std": 3.046481132507324, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8973653316497803, "rewards/no_repetition_reward_func": -0.2850954085588455, "rewards/verse_reward_func": 0.0, "step": 2555 }, { "completion_length": 254.078125, "epoch": 20.448, "grad_norm": 1.625, "kl": 4.023199558258057, "learning_rate": 3.681684533594815e-05, "loss": 0.1609, "reward": 2.802934408187866, "reward_std": 3.058650493621826, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.105069160461426, "rewards/no_repetition_reward_func": -0.30213476717472076, "rewards/verse_reward_func": 0.0, "step": 2556 }, { "completion_length": 252.546875, "epoch": 20.456, "grad_norm": 8.625, "kl": 5.639863967895508, "learning_rate": 3.680453909740782e-05, "loss": 0.2256, "reward": 2.1377450227737427, "reward_std": 2.67378306388855, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.440791666507721, "rewards/no_repetition_reward_func": -0.2952342629432678, "rewards/verse_reward_func": -0.0078125, "step": 2557 }, { "completion_length": 251.59375, "epoch": 20.464, "grad_norm": 2.546875, "kl": 4.664900302886963, "learning_rate": 3.679222917669851e-05, "loss": 0.1866, "reward": 3.0496816635131836, "reward_std": 3.302490234375, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.347464680671692, "rewards/no_repetition_reward_func": -0.2977830767631531, "rewards/verse_reward_func": 0.0, "step": 2558 }, { "completion_length": 246.21875, "epoch": 20.472, "grad_norm": 4.71875, "kl": 6.034001111984253, "learning_rate": 3.6779915577660015e-05, "loss": 0.2414, "reward": 2.3657811880111694, "reward_std": 2.796314239501953, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6281020641326904, "rewards/no_repetition_reward_func": -0.26232099533081055, "rewards/verse_reward_func": 0.0, "step": 2559 }, { "completion_length": 245.703125, "epoch": 20.48, "grad_norm": 1.703125, "kl": 5.29583215713501, "learning_rate": 3.6767598304133324e-05, "loss": 0.2118, "reward": 2.3523833751678467, "reward_std": 2.4312528371810913, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6069507598876953, "rewards/no_repetition_reward_func": -0.25456732511520386, "rewards/verse_reward_func": 0.0, "step": 2560 }, { "completion_length": 253.140625, "epoch": 20.488, "grad_norm": 3.421875, "kl": 4.749976634979248, "learning_rate": 3.67552773599605e-05, "loss": 0.19, "reward": 3.2501471042633057, "reward_std": 3.4915741682052612, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.5912766456604004, "rewards/no_repetition_reward_func": -0.3255046308040619, "rewards/verse_reward_func": -0.015625, "step": 2561 }, { "completion_length": 253.109375, "epoch": 20.496, "grad_norm": 1.8203125, "kl": 5.183894157409668, "learning_rate": 3.674295274898485e-05, "loss": 0.2074, "reward": 2.9105114936828613, "reward_std": 2.9096968173980713, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2244588136672974, "rewards/no_repetition_reward_func": -0.31394752860069275, "rewards/verse_reward_func": 0.0, "step": 2562 }, { "completion_length": 253.109375, "epoch": 20.504, "grad_norm": 3.0, "kl": 5.477533340454102, "learning_rate": 3.673062447505072e-05, "loss": 0.2191, "reward": 2.439108371734619, "reward_std": 2.7300959825515747, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.707536458969116, "rewards/no_repetition_reward_func": -0.2684282213449478, "rewards/verse_reward_func": 0.0, "step": 2563 }, { "completion_length": 253.078125, "epoch": 20.512, "grad_norm": 4.65625, "kl": 5.899871587753296, "learning_rate": 3.6718292542003666e-05, "loss": 0.236, "reward": 2.2813937067985535, "reward_std": 2.788441300392151, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5682116746902466, "rewards/no_repetition_reward_func": -0.2868179380893707, "rewards/verse_reward_func": 0.0, "step": 2564 }, { "completion_length": 253.171875, "epoch": 20.52, "grad_norm": 7.125, "kl": 5.773971080780029, "learning_rate": 3.6705956953690364e-05, "loss": 0.231, "reward": 2.5325790643692017, "reward_std": 2.7985939979553223, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8128265142440796, "rewards/no_repetition_reward_func": -0.2802475020289421, "rewards/verse_reward_func": 0.0, "step": 2565 }, { "completion_length": 246.328125, "epoch": 20.528, "grad_norm": 2.171875, "kl": 5.434221267700195, "learning_rate": 3.6693617713958634e-05, "loss": 0.2174, "reward": 2.295655369758606, "reward_std": 3.1232104301452637, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5968949794769287, "rewards/no_repetition_reward_func": -0.285614475607872, "rewards/verse_reward_func": -0.015625, "step": 2566 }, { "completion_length": 249.578125, "epoch": 20.536, "grad_norm": 3.21875, "kl": 5.934787273406982, "learning_rate": 3.668127482665743e-05, "loss": 0.2374, "reward": 2.000828266143799, "reward_std": 2.657249093055725, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.255198299884796, "rewards/no_repetition_reward_func": -0.25436997413635254, "rewards/verse_reward_func": 0.0, "step": 2567 }, { "completion_length": 256.0, "epoch": 20.544, "grad_norm": 1.921875, "kl": 5.081005096435547, "learning_rate": 3.6668928295636854e-05, "loss": 0.2032, "reward": 2.34775173664093, "reward_std": 2.9197200536727905, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.618631362915039, "rewards/no_repetition_reward_func": -0.2708793953061104, "rewards/verse_reward_func": 0.0, "step": 2568 }, { "completion_length": 249.546875, "epoch": 20.552, "grad_norm": 1.546875, "kl": 4.533707141876221, "learning_rate": 3.665657812474812e-05, "loss": 0.1813, "reward": 2.1976948976516724, "reward_std": 2.8852062225341797, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4450751543045044, "rewards/no_repetition_reward_func": -0.24738018959760666, "rewards/verse_reward_func": 0.0, "step": 2569 }, { "completion_length": 248.46875, "epoch": 20.56, "grad_norm": 2.46875, "kl": 5.775630712509155, "learning_rate": 3.664422431784361e-05, "loss": 0.231, "reward": 1.9997017979621887, "reward_std": 2.840970754623413, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2511284947395325, "rewards/no_repetition_reward_func": -0.25142674893140793, "rewards/verse_reward_func": 0.0, "step": 2570 }, { "completion_length": 250.03125, "epoch": 20.568, "grad_norm": 5.46875, "kl": 3.3962115049362183, "learning_rate": 3.663186687877682e-05, "loss": 0.1358, "reward": 2.8836973905563354, "reward_std": 3.2579466104507446, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1950572729110718, "rewards/no_repetition_reward_func": -0.31136003136634827, "rewards/verse_reward_func": 0.0, "step": 2571 }, { "completion_length": 247.859375, "epoch": 20.576, "grad_norm": 2.28125, "kl": 4.8612189292907715, "learning_rate": 3.661950581140239e-05, "loss": 0.1944, "reward": 1.8095881938934326, "reward_std": 2.9948636293411255, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0441616773605347, "rewards/no_repetition_reward_func": -0.22676102817058563, "rewards/verse_reward_func": -0.0078125, "step": 2572 }, { "completion_length": 251.46875, "epoch": 20.584, "grad_norm": 4.78125, "kl": 4.094360589981079, "learning_rate": 3.6607141119576084e-05, "loss": 0.1638, "reward": 2.2179919481277466, "reward_std": 2.9942649602890015, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5107866525650024, "rewards/no_repetition_reward_func": -0.29279476404190063, "rewards/verse_reward_func": 0.0, "step": 2573 }, { "completion_length": 252.046875, "epoch": 20.592, "grad_norm": 3.25, "kl": 4.532434701919556, "learning_rate": 3.659477280715479e-05, "loss": 0.1813, "reward": 2.2097033262252808, "reward_std": 2.7018754482269287, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4872658252716064, "rewards/no_repetition_reward_func": -0.27756254374980927, "rewards/verse_reward_func": 0.0, "step": 2574 }, { "completion_length": 254.859375, "epoch": 20.6, "grad_norm": 7.0, "kl": 4.006133317947388, "learning_rate": 3.6582400877996546e-05, "loss": 0.1602, "reward": 3.2508511543273926, "reward_std": 3.259225606918335, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.5500195026397705, "rewards/no_repetition_reward_func": -0.2913559377193451, "rewards/verse_reward_func": -0.0078125, "step": 2575 }, { "completion_length": 246.59375, "epoch": 20.608, "grad_norm": 4.4375, "kl": 3.132355809211731, "learning_rate": 3.657002533596049e-05, "loss": 0.1253, "reward": 3.0604748725891113, "reward_std": 3.2392042875289917, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3709893226623535, "rewards/no_repetition_reward_func": -0.3027019798755646, "rewards/verse_reward_func": -0.0078125, "step": 2576 }, { "completion_length": 255.171875, "epoch": 20.616, "grad_norm": 4.0, "kl": 3.5976957082748413, "learning_rate": 3.655764618490692e-05, "loss": 0.1439, "reward": 2.677895665168762, "reward_std": 3.2054141759872437, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9830950498580933, "rewards/no_repetition_reward_func": -0.30519920587539673, "rewards/verse_reward_func": 0.0, "step": 2577 }, { "completion_length": 251.265625, "epoch": 20.624, "grad_norm": 3.953125, "kl": 3.2771075963974, "learning_rate": 3.654526342869724e-05, "loss": 0.1311, "reward": 3.2813068628311157, "reward_std": 3.2257890701293945, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.5671803951263428, "rewards/no_repetition_reward_func": -0.2858736366033554, "rewards/verse_reward_func": 0.0, "step": 2578 }, { "completion_length": 250.46875, "epoch": 20.632, "grad_norm": 1.84375, "kl": 3.9836602210998535, "learning_rate": 3.6532877071193974e-05, "loss": 0.1593, "reward": 3.088733196258545, "reward_std": 2.9738166332244873, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3714101314544678, "rewards/no_repetition_reward_func": -0.28267692029476166, "rewards/verse_reward_func": 0.0, "step": 2579 }, { "completion_length": 249.890625, "epoch": 20.64, "grad_norm": 3.375, "kl": 4.043083429336548, "learning_rate": 3.6520487116260776e-05, "loss": 0.1617, "reward": 2.8858487606048584, "reward_std": 3.274857997894287, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.198458194732666, "rewards/no_repetition_reward_func": -0.30479711294174194, "rewards/verse_reward_func": -0.0078125, "step": 2580 }, { "completion_length": 248.890625, "epoch": 20.648, "grad_norm": 1.8046875, "kl": 4.788072943687439, "learning_rate": 3.650809356776242e-05, "loss": 0.1915, "reward": 2.3412776589393616, "reward_std": 2.8959269523620605, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.637313485145569, "rewards/no_repetition_reward_func": -0.2882235199213028, "rewards/verse_reward_func": -0.0078125, "step": 2581 }, { "completion_length": 246.3125, "epoch": 20.656, "grad_norm": 2.625, "kl": 4.632530689239502, "learning_rate": 3.6495696429564823e-05, "loss": 0.1853, "reward": 2.8179476261138916, "reward_std": 3.000082015991211, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.097574472427368, "rewards/no_repetition_reward_func": -0.27962662279605865, "rewards/verse_reward_func": 0.0, "step": 2582 }, { "completion_length": 244.15625, "epoch": 20.664, "grad_norm": 1.9140625, "kl": 5.421109676361084, "learning_rate": 3.648329570553498e-05, "loss": 0.2168, "reward": 2.543831706047058, "reward_std": 2.9629825353622437, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.804764986038208, "rewards/no_repetition_reward_func": -0.25312085449695587, "rewards/verse_reward_func": -0.0078125, "step": 2583 }, { "completion_length": 247.328125, "epoch": 20.672, "grad_norm": 5.34375, "kl": 6.036077976226807, "learning_rate": 3.647089139954104e-05, "loss": 0.2414, "reward": 2.1956308484077454, "reward_std": 2.5808587670326233, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4871546030044556, "rewards/no_repetition_reward_func": -0.2837110906839371, "rewards/verse_reward_func": -0.0078125, "step": 2584 }, { "completion_length": 248.5, "epoch": 20.68, "grad_norm": 1.90625, "kl": 5.643099308013916, "learning_rate": 3.645848351545225e-05, "loss": 0.2257, "reward": 2.6893980503082275, "reward_std": 3.2122926712036133, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.983580231666565, "rewards/no_repetition_reward_func": -0.2785574197769165, "rewards/verse_reward_func": -0.015625, "step": 2585 }, { "completion_length": 243.5, "epoch": 20.688, "grad_norm": 3.78125, "kl": 6.456444501876831, "learning_rate": 3.644607205713898e-05, "loss": 0.2583, "reward": 1.9391737580299377, "reward_std": 3.0208919048309326, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2092918157577515, "rewards/no_repetition_reward_func": -0.26230549812316895, "rewards/verse_reward_func": -0.0078125, "step": 2586 }, { "completion_length": 251.671875, "epoch": 20.696, "grad_norm": 2.6875, "kl": 5.323865652084351, "learning_rate": 3.643365702847272e-05, "loss": 0.213, "reward": 2.6382994651794434, "reward_std": 2.868543267250061, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9277195930480957, "rewards/no_repetition_reward_func": -0.27379512786865234, "rewards/verse_reward_func": -0.015625, "step": 2587 }, { "completion_length": 249.4375, "epoch": 20.704, "grad_norm": 2.421875, "kl": 5.556635141372681, "learning_rate": 3.642123843332606e-05, "loss": 0.2223, "reward": 2.602445960044861, "reward_std": 2.9798526763916016, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8732247352600098, "rewards/no_repetition_reward_func": -0.27077876776456833, "rewards/verse_reward_func": 0.0, "step": 2588 }, { "completion_length": 255.03125, "epoch": 20.712, "grad_norm": 2.5625, "kl": 5.295403957366943, "learning_rate": 3.640881627557271e-05, "loss": 0.2118, "reward": 2.4304757118225098, "reward_std": 2.801243305206299, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.667265236377716, "rewards/no_repetition_reward_func": -0.23678958415985107, "rewards/verse_reward_func": 0.0, "step": 2589 }, { "completion_length": 250.03125, "epoch": 20.72, "grad_norm": 1.84375, "kl": 5.098846673965454, "learning_rate": 3.639639055908751e-05, "loss": 0.204, "reward": 2.018725335597992, "reward_std": 2.690093994140625, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.268270492553711, "rewards/no_repetition_reward_func": -0.24954508244991302, "rewards/verse_reward_func": 0.0, "step": 2590 }, { "completion_length": 256.0, "epoch": 20.728, "grad_norm": 5.21875, "kl": 5.518562316894531, "learning_rate": 3.638396128774636e-05, "loss": 0.2207, "reward": 2.1744974851608276, "reward_std": 2.9448800086975098, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.461490750312805, "rewards/no_repetition_reward_func": -0.2869934141635895, "rewards/verse_reward_func": 0.0, "step": 2591 }, { "completion_length": 249.265625, "epoch": 20.736, "grad_norm": 3.46875, "kl": 4.19843864440918, "learning_rate": 3.637152846542633e-05, "loss": 0.1679, "reward": 2.9242348670959473, "reward_std": 3.348129987716675, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.271736741065979, "rewards/no_repetition_reward_func": -0.33968931436538696, "rewards/verse_reward_func": -0.0078125, "step": 2592 }, { "completion_length": 255.34375, "epoch": 20.744, "grad_norm": 3.921875, "kl": 3.791491985321045, "learning_rate": 3.635909209600555e-05, "loss": 0.1517, "reward": 3.077221155166626, "reward_std": 3.422329902648926, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.396041750907898, "rewards/no_repetition_reward_func": -0.31882089376449585, "rewards/verse_reward_func": 0.0, "step": 2593 }, { "completion_length": 249.828125, "epoch": 20.752, "grad_norm": 4.25, "kl": 4.323318719863892, "learning_rate": 3.634665218336328e-05, "loss": 0.1729, "reward": 2.5400880575180054, "reward_std": 3.1028932332992554, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8530287742614746, "rewards/no_repetition_reward_func": -0.3051283210515976, "rewards/verse_reward_func": -0.0078125, "step": 2594 }, { "completion_length": 250.203125, "epoch": 20.76, "grad_norm": 1.71875, "kl": 3.7050116062164307, "learning_rate": 3.633420873137988e-05, "loss": 0.1482, "reward": 2.8381989002227783, "reward_std": 2.8167463541030884, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1328874826431274, "rewards/no_repetition_reward_func": -0.2868760675191879, "rewards/verse_reward_func": -0.0078125, "step": 2595 }, { "completion_length": 249.265625, "epoch": 20.768, "grad_norm": 2.890625, "kl": 5.115665435791016, "learning_rate": 3.632176174393682e-05, "loss": 0.2046, "reward": 3.2274736166000366, "reward_std": 3.341219663619995, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.5633679628372192, "rewards/no_repetition_reward_func": -0.3280818462371826, "rewards/verse_reward_func": -0.0078125, "step": 2596 }, { "completion_length": 250.8125, "epoch": 20.776, "grad_norm": 4.15625, "kl": 5.366989612579346, "learning_rate": 3.630931122491666e-05, "loss": 0.2147, "reward": 2.392080783843994, "reward_std": 3.303101897239685, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7091970443725586, "rewards/no_repetition_reward_func": -0.31711600720882416, "rewards/verse_reward_func": 0.0, "step": 2597 }, { "completion_length": 247.40625, "epoch": 20.784, "grad_norm": 4.78125, "kl": 5.300602197647095, "learning_rate": 3.629685717820307e-05, "loss": 0.212, "reward": 2.0492974519729614, "reward_std": 2.455216407775879, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.331913709640503, "rewards/no_repetition_reward_func": -0.27480390667915344, "rewards/verse_reward_func": -0.0078125, "step": 2598 }, { "completion_length": 253.328125, "epoch": 20.792, "grad_norm": 7.1875, "kl": 5.796117782592773, "learning_rate": 3.628439960768082e-05, "loss": 0.2318, "reward": 2.3978118300437927, "reward_std": 2.5852835178375244, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.636353313922882, "rewards/no_repetition_reward_func": -0.23854149132966995, "rewards/verse_reward_func": 0.0, "step": 2599 }, { "completion_length": 251.84375, "epoch": 20.8, "grad_norm": 9.75, "kl": 5.951037168502808, "learning_rate": 3.627193851723577e-05, "loss": 0.238, "reward": 2.3170148730278015, "reward_std": 2.8560750484466553, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.606684446334839, "rewards/no_repetition_reward_func": -0.2896697670221329, "rewards/verse_reward_func": 0.0, "step": 2600 }, { "completion_length": 247.296875, "epoch": 20.808, "grad_norm": 2.984375, "kl": 5.455433130264282, "learning_rate": 3.6259473910754904e-05, "loss": 0.2182, "reward": 2.4619977474212646, "reward_std": 3.091675281524658, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.747604489326477, "rewards/no_repetition_reward_func": -0.2543569654226303, "rewards/verse_reward_func": -0.03125, "step": 2601 }, { "completion_length": 250.703125, "epoch": 20.816, "grad_norm": 1.796875, "kl": 4.796811580657959, "learning_rate": 3.624700579212626e-05, "loss": 0.1919, "reward": 2.7235522270202637, "reward_std": 3.3802597522735596, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.011555552482605, "rewards/no_repetition_reward_func": -0.28019101917743683, "rewards/verse_reward_func": -0.0078125, "step": 2602 }, { "completion_length": 256.0, "epoch": 20.824, "grad_norm": 1.96875, "kl": 5.039072275161743, "learning_rate": 3.623453416523902e-05, "loss": 0.2016, "reward": 2.8988382816314697, "reward_std": 3.092995524406433, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.197623610496521, "rewards/no_repetition_reward_func": -0.29878534376621246, "rewards/verse_reward_func": 0.0, "step": 2603 }, { "completion_length": 248.046875, "epoch": 20.832, "grad_norm": 2.671875, "kl": 5.5943357944488525, "learning_rate": 3.622205903398342e-05, "loss": 0.2238, "reward": 2.105209231376648, "reward_std": 2.868560552597046, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3742696046829224, "rewards/no_repetition_reward_func": -0.26906047761440277, "rewards/verse_reward_func": 0.0, "step": 2604 }, { "completion_length": 248.953125, "epoch": 20.84, "grad_norm": 3.640625, "kl": 4.485939025878906, "learning_rate": 3.6209580402250815e-05, "loss": 0.1794, "reward": 2.4461541175842285, "reward_std": 2.8271169662475586, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.764804244041443, "rewards/no_repetition_reward_func": -0.30302511155605316, "rewards/verse_reward_func": -0.015625, "step": 2605 }, { "completion_length": 248.703125, "epoch": 20.848, "grad_norm": 3.5, "kl": 6.006974220275879, "learning_rate": 3.6197098273933634e-05, "loss": 0.2403, "reward": 2.153836488723755, "reward_std": 2.82759165763855, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4147772789001465, "rewards/no_repetition_reward_func": -0.2609408348798752, "rewards/verse_reward_func": 0.0, "step": 2606 }, { "completion_length": 254.0625, "epoch": 20.856, "grad_norm": 2.828125, "kl": 5.0029377937316895, "learning_rate": 3.618461265292541e-05, "loss": 0.2001, "reward": 2.4102851152420044, "reward_std": 2.80126953125, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7039666771888733, "rewards/no_repetition_reward_func": -0.29368162155151367, "rewards/verse_reward_func": 0.0, "step": 2607 }, { "completion_length": 256.0, "epoch": 20.864, "grad_norm": 2.8125, "kl": 4.930952787399292, "learning_rate": 3.617212354312076e-05, "loss": 0.1972, "reward": 2.942846655845642, "reward_std": 3.1919487714767456, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2274426221847534, "rewards/no_repetition_reward_func": -0.28459586203098297, "rewards/verse_reward_func": 0.0, "step": 2608 }, { "completion_length": 250.359375, "epoch": 20.872, "grad_norm": 4.1875, "kl": 4.116886854171753, "learning_rate": 3.61596309484154e-05, "loss": 0.1647, "reward": 3.0388457775115967, "reward_std": 3.3178915977478027, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3448179960250854, "rewards/no_repetition_reward_func": -0.30597205460071564, "rewards/verse_reward_func": 0.0, "step": 2609 }, { "completion_length": 248.515625, "epoch": 20.88, "grad_norm": 1.8671875, "kl": 5.1670544147491455, "learning_rate": 3.614713487270611e-05, "loss": 0.2067, "reward": 2.579315423965454, "reward_std": 3.14819872379303, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8903000354766846, "rewards/no_repetition_reward_func": -0.31098467111587524, "rewards/verse_reward_func": 0.0, "step": 2610 }, { "completion_length": 249.234375, "epoch": 20.888, "grad_norm": 3.90625, "kl": 3.829073429107666, "learning_rate": 3.613463531989076e-05, "loss": 0.1532, "reward": 2.9699991941452026, "reward_std": 2.949783682823181, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2767621278762817, "rewards/no_repetition_reward_func": -0.3067627102136612, "rewards/verse_reward_func": 0.0, "step": 2611 }, { "completion_length": 245.921875, "epoch": 20.896, "grad_norm": 3.0625, "kl": 4.337199449539185, "learning_rate": 3.6122132293868335e-05, "loss": 0.1735, "reward": 2.8750076293945312, "reward_std": 3.3133591413497925, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1657793521881104, "rewards/no_repetition_reward_func": -0.2829594910144806, "rewards/verse_reward_func": -0.0078125, "step": 2612 }, { "completion_length": 249.03125, "epoch": 20.904, "grad_norm": 2.578125, "kl": 4.652855634689331, "learning_rate": 3.6109625798538873e-05, "loss": 0.1861, "reward": 2.4663918018341064, "reward_std": 3.0268714427948, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7753541469573975, "rewards/no_repetition_reward_func": -0.3011496812105179, "rewards/verse_reward_func": -0.0078125, "step": 2613 }, { "completion_length": 255.359375, "epoch": 20.912, "grad_norm": 4.40625, "kl": 4.476404428482056, "learning_rate": 3.6097115837803505e-05, "loss": 0.1791, "reward": 3.0237550735473633, "reward_std": 3.521793842315674, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3167905807495117, "rewards/no_repetition_reward_func": -0.29303547739982605, "rewards/verse_reward_func": 0.0, "step": 2614 }, { "completion_length": 248.109375, "epoch": 20.92, "grad_norm": 4.03125, "kl": 4.7978620529174805, "learning_rate": 3.608460241556443e-05, "loss": 0.1919, "reward": 2.8065438270568848, "reward_std": 3.295551061630249, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.111691951751709, "rewards/no_repetition_reward_func": -0.29733559489250183, "rewards/verse_reward_func": -0.0078125, "step": 2615 }, { "completion_length": 252.15625, "epoch": 20.928, "grad_norm": 1.7421875, "kl": 5.2269110679626465, "learning_rate": 3.6072085535724956e-05, "loss": 0.2091, "reward": 2.127238094806671, "reward_std": 2.8561623096466064, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.372241973876953, "rewards/no_repetition_reward_func": -0.24500376731157303, "rewards/verse_reward_func": 0.0, "step": 2616 }, { "completion_length": 252.21875, "epoch": 20.936, "grad_norm": 3.21875, "kl": 6.51970911026001, "learning_rate": 3.6059565202189435e-05, "loss": 0.2608, "reward": 1.7968212366104126, "reward_std": 2.6583878993988037, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.063792884349823, "rewards/no_repetition_reward_func": -0.2669717073440552, "rewards/verse_reward_func": 0.0, "step": 2617 }, { "completion_length": 244.203125, "epoch": 20.944, "grad_norm": 2.234375, "kl": 5.044234275817871, "learning_rate": 3.604704141886332e-05, "loss": 0.2018, "reward": 2.208994150161743, "reward_std": 2.6215740442276, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.482428789138794, "rewards/no_repetition_reward_func": -0.25780951976776123, "rewards/verse_reward_func": -0.015625, "step": 2618 }, { "completion_length": 254.625, "epoch": 20.951999999999998, "grad_norm": 2.515625, "kl": 4.736703157424927, "learning_rate": 3.603451418965313e-05, "loss": 0.1895, "reward": 3.2342336177825928, "reward_std": 3.1199333667755127, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.5394421815872192, "rewards/no_repetition_reward_func": -0.3052087277173996, "rewards/verse_reward_func": 0.0, "step": 2619 }, { "completion_length": 256.0, "epoch": 20.96, "grad_norm": 2.171875, "kl": 5.194934129714966, "learning_rate": 3.602198351846647e-05, "loss": 0.2078, "reward": 2.1391900777816772, "reward_std": 3.0284503698349, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3943283557891846, "rewards/no_repetition_reward_func": -0.2551383301615715, "rewards/verse_reward_func": 0.0, "step": 2620 }, { "completion_length": 250.125, "epoch": 20.968, "grad_norm": 2.859375, "kl": 4.063441514968872, "learning_rate": 3.600944940921199e-05, "loss": 0.1625, "reward": 2.6045143604278564, "reward_std": 3.054546594619751, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9367810487747192, "rewards/no_repetition_reward_func": -0.3244541883468628, "rewards/verse_reward_func": -0.0078125, "step": 2621 }, { "completion_length": 246.375, "epoch": 20.976, "grad_norm": 2.546875, "kl": 4.822404861450195, "learning_rate": 3.5996911865799454e-05, "loss": 0.1929, "reward": 2.626461982727051, "reward_std": 2.6165958642959595, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9046900272369385, "rewards/no_repetition_reward_func": -0.27822817862033844, "rewards/verse_reward_func": 0.0, "step": 2622 }, { "completion_length": 248.015625, "epoch": 20.984, "grad_norm": 6.15625, "kl": 6.066302299499512, "learning_rate": 3.5984370892139666e-05, "loss": 0.2427, "reward": 2.0120432376861572, "reward_std": 2.81410813331604, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2725115418434143, "rewards/no_repetition_reward_func": -0.2526558190584183, "rewards/verse_reward_func": -0.0078125, "step": 2623 }, { "completion_length": 252.9375, "epoch": 20.992, "grad_norm": 6.5, "kl": 5.173980712890625, "learning_rate": 3.5971826492144504e-05, "loss": 0.207, "reward": 2.547613263130188, "reward_std": 3.0442616939544678, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8300728797912598, "rewards/no_repetition_reward_func": -0.2824595719575882, "rewards/verse_reward_func": 0.0, "step": 2624 }, { "completion_length": 243.5625, "epoch": 21.0, "grad_norm": 2.875, "kl": 3.9234384298324585, "learning_rate": 3.5959278669726935e-05, "loss": 0.1569, "reward": 2.8777291774749756, "reward_std": 3.1279126405715942, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.131678819656372, "rewards/no_repetition_reward_func": -0.2539495751261711, "rewards/verse_reward_func": 0.0, "step": 2625 }, { "completion_length": 250.796875, "epoch": 21.008, "grad_norm": 2.328125, "kl": 4.419697999954224, "learning_rate": 3.594672742880097e-05, "loss": 0.1768, "reward": 1.9443974494934082, "reward_std": 2.365017056465149, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1786495447158813, "rewards/no_repetition_reward_func": -0.23425229638814926, "rewards/verse_reward_func": 0.0, "step": 2626 }, { "completion_length": 255.015625, "epoch": 21.016, "grad_norm": 1.671875, "kl": 4.902500152587891, "learning_rate": 3.5934172773281696e-05, "loss": 0.1961, "reward": 2.70535409450531, "reward_std": 3.0845504999160767, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9859646558761597, "rewards/no_repetition_reward_func": -0.2727980464696884, "rewards/verse_reward_func": -0.0078125, "step": 2627 }, { "completion_length": 248.921875, "epoch": 21.024, "grad_norm": 3.28125, "kl": 3.933831572532654, "learning_rate": 3.592161470708526e-05, "loss": 0.1574, "reward": 2.8094829320907593, "reward_std": 3.377626657485962, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1249122619628906, "rewards/no_repetition_reward_func": -0.3076169192790985, "rewards/verse_reward_func": -0.0078125, "step": 2628 }, { "completion_length": 252.625, "epoch": 21.032, "grad_norm": 4.15625, "kl": 4.251689195632935, "learning_rate": 3.5909053234128895e-05, "loss": 0.1701, "reward": 3.2607951164245605, "reward_std": 3.2599751949310303, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.5744231939315796, "rewards/no_repetition_reward_func": -0.30581560730934143, "rewards/verse_reward_func": -0.0078125, "step": 2629 }, { "completion_length": 248.515625, "epoch": 21.04, "grad_norm": 8.5625, "kl": 3.694562554359436, "learning_rate": 3.5896488358330856e-05, "loss": 0.1478, "reward": 2.9913079738616943, "reward_std": 3.208927631378174, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2958996295928955, "rewards/no_repetition_reward_func": -0.29677924513816833, "rewards/verse_reward_func": -0.0078125, "step": 2630 }, { "completion_length": 247.75, "epoch": 21.048, "grad_norm": 3.359375, "kl": 4.094128370285034, "learning_rate": 3.588392008361049e-05, "loss": 0.1638, "reward": 2.5556472539901733, "reward_std": 3.2078899145126343, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.841815233230591, "rewards/no_repetition_reward_func": -0.2861679643392563, "rewards/verse_reward_func": 0.0, "step": 2631 }, { "completion_length": 243.375, "epoch": 21.056, "grad_norm": 2.640625, "kl": 4.105459690093994, "learning_rate": 3.5871348413888204e-05, "loss": 0.1642, "reward": 2.7493247985839844, "reward_std": 2.9318108558654785, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0177674293518066, "rewards/no_repetition_reward_func": -0.268442764878273, "rewards/verse_reward_func": 0.0, "step": 2632 }, { "completion_length": 255.375, "epoch": 21.064, "grad_norm": 2.71875, "kl": 4.686227083206177, "learning_rate": 3.585877335308546e-05, "loss": 0.1874, "reward": 3.0816385746002197, "reward_std": 3.053205728530884, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3662279844284058, "rewards/no_repetition_reward_func": -0.28458938002586365, "rewards/verse_reward_func": 0.0, "step": 2633 }, { "completion_length": 252.0625, "epoch": 21.072, "grad_norm": 3.078125, "kl": 4.809374809265137, "learning_rate": 3.5846194905124757e-05, "loss": 0.1924, "reward": 2.5634273290634155, "reward_std": 2.8762598037719727, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8654427528381348, "rewards/no_repetition_reward_func": -0.30201534926891327, "rewards/verse_reward_func": 0.0, "step": 2634 }, { "completion_length": 252.71875, "epoch": 21.08, "grad_norm": 2.90625, "kl": 4.3711957931518555, "learning_rate": 3.5833613073929684e-05, "loss": 0.1748, "reward": 2.7128039598464966, "reward_std": 3.253338575363159, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.021679997444153, "rewards/no_repetition_reward_func": -0.3010634332895279, "rewards/verse_reward_func": -0.0078125, "step": 2635 }, { "completion_length": 246.046875, "epoch": 21.088, "grad_norm": 2.265625, "kl": 5.493469715118408, "learning_rate": 3.582102786342485e-05, "loss": 0.2197, "reward": 2.2617504596710205, "reward_std": 2.9531136751174927, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.523513078689575, "rewards/no_repetition_reward_func": -0.26176267862319946, "rewards/verse_reward_func": 0.0, "step": 2636 }, { "completion_length": 249.953125, "epoch": 21.096, "grad_norm": 4.4375, "kl": 6.279629230499268, "learning_rate": 3.5808439277535964e-05, "loss": 0.2512, "reward": 2.0927082300186157, "reward_std": 2.8488006591796875, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.360255002975464, "rewards/no_repetition_reward_func": -0.25192148983478546, "rewards/verse_reward_func": -0.015625, "step": 2637 }, { "completion_length": 250.53125, "epoch": 21.104, "grad_norm": 5.03125, "kl": 6.079893112182617, "learning_rate": 3.5795847320189746e-05, "loss": 0.2432, "reward": 1.7305655479431152, "reward_std": 2.7325421571731567, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0061756372451782, "rewards/no_repetition_reward_func": -0.2756099998950958, "rewards/verse_reward_func": 0.0, "step": 2638 }, { "completion_length": 245.546875, "epoch": 21.112, "grad_norm": 4.28125, "kl": 5.842589855194092, "learning_rate": 3.5783251995313985e-05, "loss": 0.2337, "reward": 2.160114586353302, "reward_std": 2.8408061265945435, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4461092948913574, "rewards/no_repetition_reward_func": -0.2859945297241211, "rewards/verse_reward_func": 0.0, "step": 2639 }, { "completion_length": 247.109375, "epoch": 21.12, "grad_norm": 4.34375, "kl": 6.141192674636841, "learning_rate": 3.577065330683751e-05, "loss": 0.2456, "reward": 2.7229669094085693, "reward_std": 3.4328813552856445, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.012000560760498, "rewards/no_repetition_reward_func": -0.28122107684612274, "rewards/verse_reward_func": -0.0078125, "step": 2640 }, { "completion_length": 250.203125, "epoch": 21.128, "grad_norm": 1.859375, "kl": 4.9918437004089355, "learning_rate": 3.575805125869022e-05, "loss": 0.1997, "reward": 2.4594141244888306, "reward_std": 3.1614400148391724, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.764466166496277, "rewards/no_repetition_reward_func": -0.29723966121673584, "rewards/verse_reward_func": -0.0078125, "step": 2641 }, { "completion_length": 243.90625, "epoch": 21.136, "grad_norm": 2.28125, "kl": 4.541972875595093, "learning_rate": 3.574544585480305e-05, "loss": 0.1817, "reward": 3.048020362854004, "reward_std": 3.424705982208252, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3268494606018066, "rewards/no_repetition_reward_func": -0.27101656049489975, "rewards/verse_reward_func": -0.0078125, "step": 2642 }, { "completion_length": 250.890625, "epoch": 21.144, "grad_norm": 1.8203125, "kl": 5.449336051940918, "learning_rate": 3.573283709910798e-05, "loss": 0.218, "reward": 2.731094479560852, "reward_std": 3.3991702795028687, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0271724462509155, "rewards/no_repetition_reward_func": -0.28826554119586945, "rewards/verse_reward_func": -0.0078125, "step": 2643 }, { "completion_length": 249.03125, "epoch": 21.152, "grad_norm": 3.0625, "kl": 4.551589488983154, "learning_rate": 3.572022499553802e-05, "loss": 0.1821, "reward": 2.8282415866851807, "reward_std": 2.881666421890259, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.111616015434265, "rewards/no_repetition_reward_func": -0.28337429463863373, "rewards/verse_reward_func": 0.0, "step": 2644 }, { "completion_length": 248.328125, "epoch": 21.16, "grad_norm": 3.125, "kl": 5.230507493019104, "learning_rate": 3.570760954802726e-05, "loss": 0.2092, "reward": 2.007025718688965, "reward_std": 2.7017143964767456, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2911837100982666, "rewards/no_repetition_reward_func": -0.28415799140930176, "rewards/verse_reward_func": 0.0, "step": 2645 }, { "completion_length": 244.734375, "epoch": 21.168, "grad_norm": 2.484375, "kl": 4.940897464752197, "learning_rate": 3.569499076051081e-05, "loss": 0.1976, "reward": 2.4280906915664673, "reward_std": 3.1179628372192383, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7023357152938843, "rewards/no_repetition_reward_func": -0.274245023727417, "rewards/verse_reward_func": 0.0, "step": 2646 }, { "completion_length": 252.453125, "epoch": 21.176, "grad_norm": 1.84375, "kl": 5.93631911277771, "learning_rate": 3.568236863692482e-05, "loss": 0.2375, "reward": 1.9834197163581848, "reward_std": 3.1244972944259644, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.272048592567444, "rewards/no_repetition_reward_func": -0.2808164954185486, "rewards/verse_reward_func": -0.0078125, "step": 2647 }, { "completion_length": 250.734375, "epoch": 21.184, "grad_norm": 4.3125, "kl": 4.566388368606567, "learning_rate": 3.56697431812065e-05, "loss": 0.1827, "reward": 2.7400472164154053, "reward_std": 3.378239393234253, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.023655652999878, "rewards/no_repetition_reward_func": -0.28360848128795624, "rewards/verse_reward_func": 0.0, "step": 2648 }, { "completion_length": 249.90625, "epoch": 21.192, "grad_norm": 2.28125, "kl": 4.46029806137085, "learning_rate": 3.565711439729408e-05, "loss": 0.1784, "reward": 2.5641217827796936, "reward_std": 3.0016335248947144, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.84339439868927, "rewards/no_repetition_reward_func": -0.27146024256944656, "rewards/verse_reward_func": -0.0078125, "step": 2649 }, { "completion_length": 246.34375, "epoch": 21.2, "grad_norm": 4.78125, "kl": 4.533524990081787, "learning_rate": 3.564448228912682e-05, "loss": 0.1813, "reward": 2.7427759170532227, "reward_std": 3.0416438579559326, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.024930000305176, "rewards/no_repetition_reward_func": -0.26652903109788895, "rewards/verse_reward_func": -0.015625, "step": 2650 }, { "completion_length": 254.21875, "epoch": 21.208, "grad_norm": 3.46875, "kl": 5.5628626346588135, "learning_rate": 3.5631846860645044e-05, "loss": 0.2225, "reward": 1.7843484282493591, "reward_std": 2.6018357276916504, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0270289182662964, "rewards/no_repetition_reward_func": -0.24268054217100143, "rewards/verse_reward_func": 0.0, "step": 2651 }, { "completion_length": 250.796875, "epoch": 21.216, "grad_norm": 3.828125, "kl": 4.9910277128219604, "learning_rate": 3.56192081157901e-05, "loss": 0.1996, "reward": 2.276202440261841, "reward_std": 2.865975260734558, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5498546361923218, "rewards/no_repetition_reward_func": -0.2736522853374481, "rewards/verse_reward_func": 0.0, "step": 2652 }, { "completion_length": 250.484375, "epoch": 21.224, "grad_norm": 2.140625, "kl": 4.349950551986694, "learning_rate": 3.5606566058504375e-05, "loss": 0.174, "reward": 2.695328712463379, "reward_std": 3.07763934135437, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.007066011428833, "rewards/no_repetition_reward_func": -0.3039247840642929, "rewards/verse_reward_func": -0.0078125, "step": 2653 }, { "completion_length": 250.28125, "epoch": 21.232, "grad_norm": 3.015625, "kl": 3.8146800994873047, "learning_rate": 3.559392069273127e-05, "loss": 0.1526, "reward": 3.1988242864608765, "reward_std": 3.225839614868164, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4894237518310547, "rewards/no_repetition_reward_func": -0.2905993014574051, "rewards/verse_reward_func": 0.0, "step": 2654 }, { "completion_length": 256.0, "epoch": 21.24, "grad_norm": 1.390625, "kl": 4.830095529556274, "learning_rate": 3.5581272022415244e-05, "loss": 0.1932, "reward": 2.5821934938430786, "reward_std": 3.1843035221099854, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.861119508743286, "rewards/no_repetition_reward_func": -0.2789258509874344, "rewards/verse_reward_func": 0.0, "step": 2655 }, { "completion_length": 255.109375, "epoch": 21.248, "grad_norm": 3.546875, "kl": 3.644282817840576, "learning_rate": 3.5568620051501756e-05, "loss": 0.1458, "reward": 3.183902859687805, "reward_std": 3.162431001663208, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4859750270843506, "rewards/no_repetition_reward_func": -0.30207234621047974, "rewards/verse_reward_func": 0.0, "step": 2656 }, { "completion_length": 255.171875, "epoch": 21.256, "grad_norm": 1.359375, "kl": 4.8258044719696045, "learning_rate": 3.555596478393733e-05, "loss": 0.193, "reward": 2.584357738494873, "reward_std": 3.1747093200683594, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8903231620788574, "rewards/no_repetition_reward_func": -0.2981528788805008, "rewards/verse_reward_func": -0.0078125, "step": 2657 }, { "completion_length": 251.390625, "epoch": 21.264, "grad_norm": 2.203125, "kl": 4.4638237953186035, "learning_rate": 3.554330622366949e-05, "loss": 0.1786, "reward": 3.102433443069458, "reward_std": 3.1150110960006714, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.418126106262207, "rewards/no_repetition_reward_func": -0.31569287180900574, "rewards/verse_reward_func": 0.0, "step": 2658 }, { "completion_length": 255.25, "epoch": 21.272, "grad_norm": 2.546875, "kl": 4.7944159507751465, "learning_rate": 3.5530644374646815e-05, "loss": 0.1918, "reward": 2.064693033695221, "reward_std": 2.5287060737609863, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3121044635772705, "rewards/no_repetition_reward_func": -0.24741142988204956, "rewards/verse_reward_func": 0.0, "step": 2659 }, { "completion_length": 250.28125, "epoch": 21.28, "grad_norm": 1.90625, "kl": 5.105633735656738, "learning_rate": 3.551797924081887e-05, "loss": 0.2042, "reward": 2.35065758228302, "reward_std": 3.1673059463500977, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6298526525497437, "rewards/no_repetition_reward_func": -0.27919504046440125, "rewards/verse_reward_func": 0.0, "step": 2660 }, { "completion_length": 253.953125, "epoch": 21.288, "grad_norm": 2.1875, "kl": 5.176643133163452, "learning_rate": 3.5505310826136286e-05, "loss": 0.2071, "reward": 2.569870352745056, "reward_std": 3.5193995237350464, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.875909924507141, "rewards/no_repetition_reward_func": -0.29822708666324615, "rewards/verse_reward_func": -0.0078125, "step": 2661 }, { "completion_length": 253.5, "epoch": 21.296, "grad_norm": 2.890625, "kl": 5.221526622772217, "learning_rate": 3.5492639134550695e-05, "loss": 0.2089, "reward": 1.8504886031150818, "reward_std": 2.720352292060852, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1333160400390625, "rewards/no_repetition_reward_func": -0.2828274220228195, "rewards/verse_reward_func": 0.0, "step": 2662 }, { "completion_length": 254.03125, "epoch": 21.304, "grad_norm": 1.78125, "kl": 4.842368841171265, "learning_rate": 3.5479964170014746e-05, "loss": 0.1937, "reward": 2.6184608936309814, "reward_std": 2.960442066192627, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.961732506752014, "rewards/no_repetition_reward_func": -0.33545924723148346, "rewards/verse_reward_func": -0.0078125, "step": 2663 }, { "completion_length": 249.09375, "epoch": 21.312, "grad_norm": 3.328125, "kl": 4.684501886367798, "learning_rate": 3.546728593648213e-05, "loss": 0.1874, "reward": 2.082176625728607, "reward_std": 2.6043983697891235, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3698936700820923, "rewards/no_repetition_reward_func": -0.2799046188592911, "rewards/verse_reward_func": -0.0078125, "step": 2664 }, { "completion_length": 251.125, "epoch": 21.32, "grad_norm": 3.4375, "kl": 5.03705358505249, "learning_rate": 3.545460443790753e-05, "loss": 0.2015, "reward": 2.6602026224136353, "reward_std": 3.20892071723938, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.957267165184021, "rewards/no_repetition_reward_func": -0.29706454277038574, "rewards/verse_reward_func": 0.0, "step": 2665 }, { "completion_length": 252.234375, "epoch": 21.328, "grad_norm": 3.671875, "kl": 5.480250358581543, "learning_rate": 3.544191967824669e-05, "loss": 0.2192, "reward": 1.9212360382080078, "reward_std": 2.5891664028167725, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.189708411693573, "rewards/no_repetition_reward_func": -0.26847243309020996, "rewards/verse_reward_func": 0.0, "step": 2666 }, { "completion_length": 251.171875, "epoch": 21.336, "grad_norm": 3.046875, "kl": 5.591533899307251, "learning_rate": 3.542923166145633e-05, "loss": 0.2237, "reward": 2.327480435371399, "reward_std": 2.88411283493042, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6274383068084717, "rewards/no_repetition_reward_func": -0.2921454608440399, "rewards/verse_reward_func": -0.0078125, "step": 2667 }, { "completion_length": 252.703125, "epoch": 21.344, "grad_norm": 2.71875, "kl": 3.5045166015625, "learning_rate": 3.54165403914942e-05, "loss": 0.1402, "reward": 3.1108187437057495, "reward_std": 3.3613791465759277, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.444626569747925, "rewards/no_repetition_reward_func": -0.33380793035030365, "rewards/verse_reward_func": 0.0, "step": 2668 }, { "completion_length": 251.09375, "epoch": 21.352, "grad_norm": 4.5625, "kl": 4.752845287322998, "learning_rate": 3.540384587231906e-05, "loss": 0.1901, "reward": 2.6983379125595093, "reward_std": 2.8530479669570923, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.954367160797119, "rewards/no_repetition_reward_func": -0.25602930784225464, "rewards/verse_reward_func": 0.0, "step": 2669 }, { "completion_length": 254.0, "epoch": 21.36, "grad_norm": 3.828125, "kl": 4.257056474685669, "learning_rate": 3.53911481078907e-05, "loss": 0.1703, "reward": 2.15078604221344, "reward_std": 2.835730791091919, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4065712690353394, "rewards/no_repetition_reward_func": -0.255785308778286, "rewards/verse_reward_func": 0.0, "step": 2670 }, { "completion_length": 255.953125, "epoch": 21.368, "grad_norm": 3.984375, "kl": 4.217916965484619, "learning_rate": 3.5378447102169895e-05, "loss": 0.1687, "reward": 2.7814546823501587, "reward_std": 3.0726723670959473, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0539201498031616, "rewards/no_repetition_reward_func": -0.2724653482437134, "rewards/verse_reward_func": 0.0, "step": 2671 }, { "completion_length": 246.046875, "epoch": 21.376, "grad_norm": 5.875, "kl": 6.005558967590332, "learning_rate": 3.536574285911847e-05, "loss": 0.2402, "reward": 1.3969695568084717, "reward_std": 1.937573254108429, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.6595209240913391, "rewards/no_repetition_reward_func": -0.26255131512880325, "rewards/verse_reward_func": 0.0, "step": 2672 }, { "completion_length": 247.578125, "epoch": 21.384, "grad_norm": 3.09375, "kl": 3.274036169052124, "learning_rate": 3.535303538269922e-05, "loss": 0.131, "reward": 3.114245891571045, "reward_std": 2.970693588256836, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4358290433883667, "rewards/no_repetition_reward_func": -0.3137705475091934, "rewards/verse_reward_func": -0.0078125, "step": 2673 }, { "completion_length": 249.78125, "epoch": 21.392, "grad_norm": 2.5, "kl": 4.547175884246826, "learning_rate": 3.534032467687597e-05, "loss": 0.1819, "reward": 2.3844876289367676, "reward_std": 3.1485356092453003, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6575688123703003, "rewards/no_repetition_reward_func": -0.273080974817276, "rewards/verse_reward_func": 0.0, "step": 2674 }, { "completion_length": 245.671875, "epoch": 21.4, "grad_norm": 3.046875, "kl": 4.40234911441803, "learning_rate": 3.532761074561355e-05, "loss": 0.1761, "reward": 2.2795779705047607, "reward_std": 2.812758207321167, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.576169013977051, "rewards/no_repetition_reward_func": -0.288778617978096, "rewards/verse_reward_func": -0.0078125, "step": 2675 }, { "completion_length": 253.484375, "epoch": 21.408, "grad_norm": 3.375, "kl": 4.122230410575867, "learning_rate": 3.531489359287779e-05, "loss": 0.1649, "reward": 2.663666009902954, "reward_std": 2.8961987495422363, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9744114875793457, "rewards/no_repetition_reward_func": -0.3107454180717468, "rewards/verse_reward_func": 0.0, "step": 2676 }, { "completion_length": 250.5, "epoch": 21.416, "grad_norm": 3.296875, "kl": 4.757773160934448, "learning_rate": 3.5302173222635524e-05, "loss": 0.1903, "reward": 2.20634126663208, "reward_std": 2.861548066139221, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4727792739868164, "rewards/no_repetition_reward_func": -0.26643793284893036, "rewards/verse_reward_func": 0.0, "step": 2677 }, { "completion_length": 247.171875, "epoch": 21.424, "grad_norm": 3.0, "kl": 4.426108360290527, "learning_rate": 3.528944963885461e-05, "loss": 0.177, "reward": 2.07688307762146, "reward_std": 2.88293993473053, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3488277196884155, "rewards/no_repetition_reward_func": -0.2641322463750839, "rewards/verse_reward_func": -0.0078125, "step": 2678 }, { "completion_length": 248.140625, "epoch": 21.432, "grad_norm": 4.15625, "kl": 3.909531354904175, "learning_rate": 3.527672284550389e-05, "loss": 0.1564, "reward": 2.816018223762512, "reward_std": 3.0277082920074463, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0862321853637695, "rewards/no_repetition_reward_func": -0.2702140659093857, "rewards/verse_reward_func": 0.0, "step": 2679 }, { "completion_length": 245.0625, "epoch": 21.44, "grad_norm": 3.875, "kl": 4.1801981925964355, "learning_rate": 3.52639928465532e-05, "loss": 0.1672, "reward": 2.7520617246627808, "reward_std": 3.168421506881714, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0122915506362915, "rewards/no_repetition_reward_func": -0.26022979617118835, "rewards/verse_reward_func": 0.0, "step": 2680 }, { "completion_length": 251.390625, "epoch": 21.448, "grad_norm": 3.3125, "kl": 4.001266002655029, "learning_rate": 3.5251259645973394e-05, "loss": 0.1601, "reward": 3.0949511528015137, "reward_std": 3.192184090614319, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4256155490875244, "rewards/no_repetition_reward_func": -0.3228520005941391, "rewards/verse_reward_func": -0.0078125, "step": 2681 }, { "completion_length": 253.15625, "epoch": 21.456, "grad_norm": 2.703125, "kl": 4.136427044868469, "learning_rate": 3.523852324773631e-05, "loss": 0.1655, "reward": 2.949776291847229, "reward_std": 3.212481737136841, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2483174800872803, "rewards/no_repetition_reward_func": -0.29854103922843933, "rewards/verse_reward_func": 0.0, "step": 2682 }, { "completion_length": 251.109375, "epoch": 21.464, "grad_norm": 1.5546875, "kl": 4.455848693847656, "learning_rate": 3.5225783655814796e-05, "loss": 0.1782, "reward": 2.743180274963379, "reward_std": 3.1544610261917114, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0429344177246094, "rewards/no_repetition_reward_func": -0.2919416129589081, "rewards/verse_reward_func": -0.0078125, "step": 2683 }, { "completion_length": 255.96875, "epoch": 21.472, "grad_norm": 2.609375, "kl": 5.118905067443848, "learning_rate": 3.521304087418269e-05, "loss": 0.2048, "reward": 3.0042821168899536, "reward_std": 3.4681228399276733, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3149068355560303, "rewards/no_repetition_reward_func": -0.31062473356723785, "rewards/verse_reward_func": 0.0, "step": 2684 }, { "completion_length": 255.359375, "epoch": 21.48, "grad_norm": 1.6875, "kl": 5.873250722885132, "learning_rate": 3.5200294906814824e-05, "loss": 0.2349, "reward": 2.598334550857544, "reward_std": 3.0685967206954956, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.879473924636841, "rewards/no_repetition_reward_func": -0.2733268141746521, "rewards/verse_reward_func": -0.0078125, "step": 2685 }, { "completion_length": 250.609375, "epoch": 21.488, "grad_norm": 11.5, "kl": 7.956613540649414, "learning_rate": 3.5187545757687015e-05, "loss": 0.3183, "reward": 1.531522810459137, "reward_std": 2.2670650482177734, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.82299143075943, "rewards/no_repetition_reward_func": -0.29146865010261536, "rewards/verse_reward_func": 0.0, "step": 2686 }, { "completion_length": 249.828125, "epoch": 21.496, "grad_norm": 3.515625, "kl": 6.4892261028289795, "learning_rate": 3.517479343077611e-05, "loss": 0.2596, "reward": 2.4869980812072754, "reward_std": 3.321746587753296, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7973331212997437, "rewards/no_repetition_reward_func": -0.3025224804878235, "rewards/verse_reward_func": -0.0078125, "step": 2687 }, { "completion_length": 243.75, "epoch": 21.504, "grad_norm": 4.96875, "kl": 5.617842674255371, "learning_rate": 3.516203793005989e-05, "loss": 0.2247, "reward": 2.034395396709442, "reward_std": 2.5388216972351074, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.298748016357422, "rewards/no_repetition_reward_func": -0.25654011964797974, "rewards/verse_reward_func": -0.0078125, "step": 2688 }, { "completion_length": 249.40625, "epoch": 21.512, "grad_norm": 1.390625, "kl": 6.07291841506958, "learning_rate": 3.514927925951717e-05, "loss": 0.2429, "reward": 2.6741745471954346, "reward_std": 3.519699811935425, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9600569009780884, "rewards/no_repetition_reward_func": -0.2780698984861374, "rewards/verse_reward_func": -0.0078125, "step": 2689 }, { "completion_length": 250.84375, "epoch": 21.52, "grad_norm": 2.09375, "kl": 5.484033823013306, "learning_rate": 3.513651742312774e-05, "loss": 0.2194, "reward": 2.459504008293152, "reward_std": 3.095902919769287, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7463361024856567, "rewards/no_repetition_reward_func": -0.2790197432041168, "rewards/verse_reward_func": -0.0078125, "step": 2690 }, { "completion_length": 252.484375, "epoch": 21.528, "grad_norm": 4.0625, "kl": 5.165396213531494, "learning_rate": 3.512375242487236e-05, "loss": 0.2066, "reward": 1.8635209798812866, "reward_std": 2.895676374435425, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1235586404800415, "rewards/no_repetition_reward_func": -0.26003769785165787, "rewards/verse_reward_func": 0.0, "step": 2691 }, { "completion_length": 253.3125, "epoch": 21.536, "grad_norm": 6.90625, "kl": 6.152508974075317, "learning_rate": 3.511098426873283e-05, "loss": 0.2461, "reward": 1.9098873138427734, "reward_std": 2.6354066133499146, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1611937284469604, "rewards/no_repetition_reward_func": -0.25130636990070343, "rewards/verse_reward_func": 0.0, "step": 2692 }, { "completion_length": 247.703125, "epoch": 21.544, "grad_norm": 1.4375, "kl": 4.761345863342285, "learning_rate": 3.5098212958691854e-05, "loss": 0.1905, "reward": 2.419693946838379, "reward_std": 2.8586690425872803, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.690201163291931, "rewards/no_repetition_reward_func": -0.2705071493983269, "rewards/verse_reward_func": 0.0, "step": 2693 }, { "completion_length": 248.953125, "epoch": 21.552, "grad_norm": 4.90625, "kl": 5.520068645477295, "learning_rate": 3.50854384987332e-05, "loss": 0.2208, "reward": 2.4589977860450745, "reward_std": 2.4846450090408325, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.727825403213501, "rewards/no_repetition_reward_func": -0.2610151916742325, "rewards/verse_reward_func": -0.0078125, "step": 2694 }, { "completion_length": 249.5625, "epoch": 21.56, "grad_norm": 3.328125, "kl": 4.296879768371582, "learning_rate": 3.507266089284157e-05, "loss": 0.1719, "reward": 2.4427802562713623, "reward_std": 3.0828511714935303, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7244646549224854, "rewards/no_repetition_reward_func": -0.2816842794418335, "rewards/verse_reward_func": 0.0, "step": 2695 }, { "completion_length": 250.0625, "epoch": 21.568, "grad_norm": 2.421875, "kl": 3.9677116870880127, "learning_rate": 3.5059880145002654e-05, "loss": 0.1587, "reward": 2.571810483932495, "reward_std": 3.071771740913391, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.903540849685669, "rewards/no_repetition_reward_func": -0.33173027634620667, "rewards/verse_reward_func": 0.0, "step": 2696 }, { "completion_length": 256.0, "epoch": 21.576, "grad_norm": 4.625, "kl": 4.494335174560547, "learning_rate": 3.5047096259203135e-05, "loss": 0.1798, "reward": 2.6805777549743652, "reward_std": 3.118072748184204, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.98422110080719, "rewards/no_repetition_reward_func": -0.29583074152469635, "rewards/verse_reward_func": -0.0078125, "step": 2697 }, { "completion_length": 251.609375, "epoch": 21.584, "grad_norm": 2.734375, "kl": 3.4716821908950806, "learning_rate": 3.503430923943066e-05, "loss": 0.1389, "reward": 2.890663981437683, "reward_std": 2.861201524734497, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1996114253997803, "rewards/no_repetition_reward_func": -0.2933226227760315, "rewards/verse_reward_func": -0.015625, "step": 2698 }, { "completion_length": 247.859375, "epoch": 21.592, "grad_norm": 1.671875, "kl": 4.2865095138549805, "learning_rate": 3.5021519089673876e-05, "loss": 0.1715, "reward": 2.0498300790786743, "reward_std": 2.809962511062622, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3130516409873962, "rewards/no_repetition_reward_func": -0.2475965991616249, "rewards/verse_reward_func": -0.015625, "step": 2699 }, { "completion_length": 254.609375, "epoch": 21.6, "grad_norm": 3.359375, "kl": 3.969177722930908, "learning_rate": 3.5008725813922386e-05, "loss": 0.1588, "reward": 2.5807911157608032, "reward_std": 2.8141238689422607, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.873564124107361, "rewards/no_repetition_reward_func": -0.2849603444337845, "rewards/verse_reward_func": -0.0078125, "step": 2700 }, { "completion_length": 253.0625, "epoch": 21.608, "grad_norm": 2.03125, "kl": 4.569220542907715, "learning_rate": 3.4995929416166756e-05, "loss": 0.1828, "reward": 2.608228862285614, "reward_std": 3.1366238594055176, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9154398441314697, "rewards/no_repetition_reward_func": -0.2993985339999199, "rewards/verse_reward_func": -0.0078125, "step": 2701 }, { "completion_length": 250.25, "epoch": 21.616, "grad_norm": 1.875, "kl": 3.983293890953064, "learning_rate": 3.498312990039856e-05, "loss": 0.1593, "reward": 2.645170569419861, "reward_std": 2.739049792289734, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8970504999160767, "rewards/no_repetition_reward_func": -0.25188012421131134, "rewards/verse_reward_func": 0.0, "step": 2702 }, { "completion_length": 254.328125, "epoch": 21.624, "grad_norm": 2.796875, "kl": 4.11439847946167, "learning_rate": 3.497032727061034e-05, "loss": 0.1646, "reward": 2.5397257804870605, "reward_std": 3.354505777359009, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.852707862854004, "rewards/no_repetition_reward_func": -0.31298205256462097, "rewards/verse_reward_func": 0.0, "step": 2703 }, { "completion_length": 248.140625, "epoch": 21.632, "grad_norm": 1.90625, "kl": 4.511361360549927, "learning_rate": 3.495752153079557e-05, "loss": 0.1805, "reward": 2.023254632949829, "reward_std": 3.089200258255005, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.340646505355835, "rewards/no_repetition_reward_func": -0.30957941710948944, "rewards/verse_reward_func": -0.0078125, "step": 2704 }, { "completion_length": 252.75, "epoch": 21.64, "grad_norm": 1.4609375, "kl": 5.136857032775879, "learning_rate": 3.494471268494875e-05, "loss": 0.2055, "reward": 2.3498942852020264, "reward_std": 3.003011107444763, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6656018495559692, "rewards/no_repetition_reward_func": -0.3078950494527817, "rewards/verse_reward_func": -0.0078125, "step": 2705 }, { "completion_length": 240.046875, "epoch": 21.648, "grad_norm": 1.65625, "kl": 5.227049827575684, "learning_rate": 3.493190073706529e-05, "loss": 0.2091, "reward": 2.4117624759674072, "reward_std": 3.2527151107788086, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.68715763092041, "rewards/no_repetition_reward_func": -0.26758264750242233, "rewards/verse_reward_func": -0.0078125, "step": 2706 }, { "completion_length": 248.0625, "epoch": 21.656, "grad_norm": 2.453125, "kl": 5.074075698852539, "learning_rate": 3.491908569114164e-05, "loss": 0.203, "reward": 2.812162399291992, "reward_std": 3.4002054929733276, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.102999210357666, "rewards/no_repetition_reward_func": -0.2908366918563843, "rewards/verse_reward_func": 0.0, "step": 2707 }, { "completion_length": 256.0, "epoch": 21.664, "grad_norm": 5.90625, "kl": 5.3223114013671875, "learning_rate": 3.4906267551175124e-05, "loss": 0.2129, "reward": 2.4787209033966064, "reward_std": 3.0968685150146484, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.781050205230713, "rewards/no_repetition_reward_func": -0.30232924222946167, "rewards/verse_reward_func": 0.0, "step": 2708 }, { "completion_length": 251.609375, "epoch": 21.672, "grad_norm": 4.28125, "kl": 5.126291036605835, "learning_rate": 3.489344632116412e-05, "loss": 0.2051, "reward": 2.7206313610076904, "reward_std": 3.138688564300537, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0198750495910645, "rewards/no_repetition_reward_func": -0.29924361407756805, "rewards/verse_reward_func": 0.0, "step": 2709 }, { "completion_length": 249.625, "epoch": 21.68, "grad_norm": 3.453125, "kl": 5.908446788787842, "learning_rate": 3.488062200510791e-05, "loss": 0.2363, "reward": 2.077968657016754, "reward_std": 2.8911080360412598, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.354377031326294, "rewards/no_repetition_reward_func": -0.2607835531234741, "rewards/verse_reward_func": -0.015625, "step": 2710 }, { "completion_length": 252.515625, "epoch": 21.688, "grad_norm": 3.1875, "kl": 5.864310026168823, "learning_rate": 3.4867794607006784e-05, "loss": 0.2346, "reward": 2.455195128917694, "reward_std": 2.876625895500183, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7188758850097656, "rewards/no_repetition_reward_func": -0.2636808305978775, "rewards/verse_reward_func": 0.0, "step": 2711 }, { "completion_length": 242.65625, "epoch": 21.696, "grad_norm": 2.0625, "kl": 5.143085718154907, "learning_rate": 3.485496413086195e-05, "loss": 0.2057, "reward": 2.3996593952178955, "reward_std": 3.122380256652832, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.699162721633911, "rewards/no_repetition_reward_func": -0.2838784158229828, "rewards/verse_reward_func": -0.015625, "step": 2712 }, { "completion_length": 251.0625, "epoch": 21.704, "grad_norm": 2.03125, "kl": 4.6003618240356445, "learning_rate": 3.484213058067559e-05, "loss": 0.184, "reward": 2.701693058013916, "reward_std": 2.9838865995407104, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9911134243011475, "rewards/no_repetition_reward_func": -0.28942039608955383, "rewards/verse_reward_func": 0.0, "step": 2713 }, { "completion_length": 254.859375, "epoch": 21.712, "grad_norm": 4.21875, "kl": 4.704369306564331, "learning_rate": 3.482929396045087e-05, "loss": 0.1882, "reward": 3.2612804174423218, "reward_std": 3.4869483709335327, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.6137306690216064, "rewards/no_repetition_reward_func": -0.34463781118392944, "rewards/verse_reward_func": -0.0078125, "step": 2714 }, { "completion_length": 248.921875, "epoch": 21.72, "grad_norm": 3.0625, "kl": 4.257962942123413, "learning_rate": 3.481645427419188e-05, "loss": 0.1703, "reward": 2.493631601333618, "reward_std": 3.0263566970825195, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.797682285308838, "rewards/no_repetition_reward_func": -0.3040505349636078, "rewards/verse_reward_func": 0.0, "step": 2715 }, { "completion_length": 252.578125, "epoch": 21.728, "grad_norm": 2.0625, "kl": 5.345854997634888, "learning_rate": 3.4803611525903685e-05, "loss": 0.2138, "reward": 2.1089612245559692, "reward_std": 2.501000761985779, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.364847183227539, "rewards/no_repetition_reward_func": -0.24807336926460266, "rewards/verse_reward_func": -0.0078125, "step": 2716 }, { "completion_length": 250.46875, "epoch": 21.736, "grad_norm": 5.09375, "kl": 5.702582836151123, "learning_rate": 3.479076571959231e-05, "loss": 0.2281, "reward": 2.2637072801589966, "reward_std": 2.8901851177215576, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5601872205734253, "rewards/no_repetition_reward_func": -0.29647988080978394, "rewards/verse_reward_func": 0.0, "step": 2717 }, { "completion_length": 250.109375, "epoch": 21.744, "grad_norm": 3.96875, "kl": 4.623051404953003, "learning_rate": 3.477791685926471e-05, "loss": 0.1849, "reward": 3.120347499847412, "reward_std": 3.247213840484619, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4119337797164917, "rewards/no_repetition_reward_func": -0.2837739735841751, "rewards/verse_reward_func": -0.0078125, "step": 2718 }, { "completion_length": 252.359375, "epoch": 21.752, "grad_norm": 16.875, "kl": 6.5206499099731445, "learning_rate": 3.4765064948928814e-05, "loss": 0.2608, "reward": 3.0364203453063965, "reward_std": 3.0586416721343994, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3303239345550537, "rewards/no_repetition_reward_func": -0.2939034700393677, "rewards/verse_reward_func": 0.0, "step": 2719 }, { "completion_length": 246.1875, "epoch": 21.76, "grad_norm": 3.9375, "kl": 4.814740419387817, "learning_rate": 3.475220999259349e-05, "loss": 0.1926, "reward": 2.334904909133911, "reward_std": 2.9625792503356934, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6123201847076416, "rewards/no_repetition_reward_func": -0.2774154245853424, "rewards/verse_reward_func": 0.0, "step": 2720 }, { "completion_length": 250.96875, "epoch": 21.768, "grad_norm": 2.390625, "kl": 4.4925572872161865, "learning_rate": 3.473935199426858e-05, "loss": 0.1797, "reward": 2.617505192756653, "reward_std": 2.705155611038208, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9144002199172974, "rewards/no_repetition_reward_func": -0.2968951314687729, "rewards/verse_reward_func": 0.0, "step": 2721 }, { "completion_length": 245.3125, "epoch": 21.776, "grad_norm": 11.9375, "kl": 5.924263954162598, "learning_rate": 3.4726490957964834e-05, "loss": 0.237, "reward": 2.0513139963150024, "reward_std": 3.0322837829589844, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.322062909603119, "rewards/no_repetition_reward_func": -0.2551238089799881, "rewards/verse_reward_func": -0.015625, "step": 2722 }, { "completion_length": 251.859375, "epoch": 21.784, "grad_norm": 1.984375, "kl": 5.023334741592407, "learning_rate": 3.471362688769398e-05, "loss": 0.2009, "reward": 2.530726909637451, "reward_std": 3.0753313302993774, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8156224489212036, "rewards/no_repetition_reward_func": -0.28489553928375244, "rewards/verse_reward_func": 0.0, "step": 2723 }, { "completion_length": 250.734375, "epoch": 21.792, "grad_norm": 2.515625, "kl": 5.288913011550903, "learning_rate": 3.4700759787468695e-05, "loss": 0.2116, "reward": 3.0310211181640625, "reward_std": 2.572834610939026, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3269392251968384, "rewards/no_repetition_reward_func": -0.29591788351535797, "rewards/verse_reward_func": 0.0, "step": 2724 }, { "completion_length": 253.078125, "epoch": 21.8, "grad_norm": 1.8203125, "kl": 5.03266453742981, "learning_rate": 3.4687889661302576e-05, "loss": 0.2013, "reward": 2.359174609184265, "reward_std": 2.790994167327881, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.626866579055786, "rewards/no_repetition_reward_func": -0.2676919847726822, "rewards/verse_reward_func": 0.0, "step": 2725 }, { "completion_length": 255.09375, "epoch": 21.808, "grad_norm": 2.53125, "kl": 5.159397125244141, "learning_rate": 3.467501651321019e-05, "loss": 0.2064, "reward": 2.5147149562835693, "reward_std": 3.038466691970825, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7834399938583374, "rewards/no_repetition_reward_func": -0.2687249779701233, "rewards/verse_reward_func": 0.0, "step": 2726 }, { "completion_length": 250.328125, "epoch": 21.816, "grad_norm": 2.453125, "kl": 4.980376482009888, "learning_rate": 3.466214034720702e-05, "loss": 0.1992, "reward": 2.00734281539917, "reward_std": 2.63521945476532, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.291089653968811, "rewards/no_repetition_reward_func": -0.2837468385696411, "rewards/verse_reward_func": 0.0, "step": 2727 }, { "completion_length": 253.578125, "epoch": 21.824, "grad_norm": 2.40625, "kl": 5.4033355712890625, "learning_rate": 3.4649261167309526e-05, "loss": 0.2161, "reward": 1.9351129531860352, "reward_std": 2.8706074953079224, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1868560314178467, "rewards/no_repetition_reward_func": -0.2517430931329727, "rewards/verse_reward_func": 0.0, "step": 2728 }, { "completion_length": 249.109375, "epoch": 21.832, "grad_norm": 2.28125, "kl": 4.043223142623901, "learning_rate": 3.4636378977535075e-05, "loss": 0.1617, "reward": 2.2528269290924072, "reward_std": 2.446428060531616, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4880900382995605, "rewards/no_repetition_reward_func": -0.22745048999786377, "rewards/verse_reward_func": -0.0078125, "step": 2729 }, { "completion_length": 252.875, "epoch": 21.84, "grad_norm": 2.125, "kl": 4.227172613143921, "learning_rate": 3.462349378190199e-05, "loss": 0.1691, "reward": 1.9377825856208801, "reward_std": 2.770385265350342, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.223264694213867, "rewards/no_repetition_reward_func": -0.27766960859298706, "rewards/verse_reward_func": -0.0078125, "step": 2730 }, { "completion_length": 249.765625, "epoch": 21.848, "grad_norm": 4.84375, "kl": 2.8439489603042603, "learning_rate": 3.461060558442952e-05, "loss": 0.1138, "reward": 3.062549114227295, "reward_std": 3.0736606121063232, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.361386775970459, "rewards/no_repetition_reward_func": -0.29883745312690735, "rewards/verse_reward_func": 0.0, "step": 2731 }, { "completion_length": 246.90625, "epoch": 21.856, "grad_norm": 4.5625, "kl": 4.484829425811768, "learning_rate": 3.459771438913787e-05, "loss": 0.1794, "reward": 1.703876256942749, "reward_std": 2.3069549798965454, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9259673357009888, "rewards/no_repetition_reward_func": -0.2220909669995308, "rewards/verse_reward_func": 0.0, "step": 2732 }, { "completion_length": 253.03125, "epoch": 21.864, "grad_norm": 4.15625, "kl": 3.808169364929199, "learning_rate": 3.458482020004815e-05, "loss": 0.1523, "reward": 2.267799973487854, "reward_std": 3.098820686340332, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.573665499687195, "rewards/no_repetition_reward_func": -0.305865615606308, "rewards/verse_reward_func": 0.0, "step": 2733 }, { "completion_length": 253.796875, "epoch": 21.872, "grad_norm": 3.953125, "kl": 3.21604061126709, "learning_rate": 3.457192302118244e-05, "loss": 0.1286, "reward": 2.793545961380005, "reward_std": 3.36858332157135, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0719668865203857, "rewards/no_repetition_reward_func": -0.278421014547348, "rewards/verse_reward_func": 0.0, "step": 2734 }, { "completion_length": 252.359375, "epoch": 21.88, "grad_norm": 3.34375, "kl": 4.0617406368255615, "learning_rate": 3.455902285656373e-05, "loss": 0.1625, "reward": 2.8401501178741455, "reward_std": 3.5418671369552612, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1844617128372192, "rewards/no_repetition_reward_func": -0.3443116396665573, "rewards/verse_reward_func": 0.0, "step": 2735 }, { "completion_length": 252.15625, "epoch": 21.888, "grad_norm": 2.578125, "kl": 3.9682698249816895, "learning_rate": 3.454611971021593e-05, "loss": 0.1587, "reward": 2.4994919300079346, "reward_std": 2.891532778739929, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.753687620162964, "rewards/no_repetition_reward_func": -0.24638324975967407, "rewards/verse_reward_func": -0.0078125, "step": 2736 }, { "completion_length": 249.5625, "epoch": 21.896, "grad_norm": 2.375, "kl": 3.5907225608825684, "learning_rate": 3.453321358616393e-05, "loss": 0.1436, "reward": 2.1484609842300415, "reward_std": 2.5316171646118164, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4304096698760986, "rewards/no_repetition_reward_func": -0.2819487303495407, "rewards/verse_reward_func": 0.0, "step": 2737 }, { "completion_length": 251.015625, "epoch": 21.904, "grad_norm": 2.375, "kl": 3.344138979911804, "learning_rate": 3.452030448843347e-05, "loss": 0.1338, "reward": 2.664926528930664, "reward_std": 2.9889910221099854, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.928482174873352, "rewards/no_repetition_reward_func": -0.2557431757450104, "rewards/verse_reward_func": -0.0078125, "step": 2738 }, { "completion_length": 252.46875, "epoch": 21.912, "grad_norm": 1.5390625, "kl": 4.042092442512512, "learning_rate": 3.45073924210513e-05, "loss": 0.1617, "reward": 2.1746931672096252, "reward_std": 2.6508995294570923, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4255019426345825, "rewards/no_repetition_reward_func": -0.2429962456226349, "rewards/verse_reward_func": -0.0078125, "step": 2739 }, { "completion_length": 242.546875, "epoch": 21.92, "grad_norm": 3.203125, "kl": 4.188169479370117, "learning_rate": 3.4494477388045035e-05, "loss": 0.1675, "reward": 1.7656832933425903, "reward_std": 2.684933662414551, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0547179579734802, "rewards/no_repetition_reward_func": -0.27340956032276154, "rewards/verse_reward_func": -0.015625, "step": 2740 }, { "completion_length": 245.46875, "epoch": 21.928, "grad_norm": 2.0, "kl": 4.357338786125183, "learning_rate": 3.448155939344324e-05, "loss": 0.1743, "reward": 2.2472589015960693, "reward_std": 2.8305749893188477, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.52913635969162, "rewards/no_repetition_reward_func": -0.27406495809555054, "rewards/verse_reward_func": -0.0078125, "step": 2741 }, { "completion_length": 253.703125, "epoch": 21.936, "grad_norm": 3.8125, "kl": 4.4624924659729, "learning_rate": 3.4468638441275415e-05, "loss": 0.1785, "reward": 2.145301580429077, "reward_std": 2.770263433456421, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.390986919403076, "rewards/no_repetition_reward_func": -0.24568530172109604, "rewards/verse_reward_func": 0.0, "step": 2742 }, { "completion_length": 252.390625, "epoch": 21.944, "grad_norm": 3.90625, "kl": 5.1977763175964355, "learning_rate": 3.445571453557196e-05, "loss": 0.2079, "reward": 1.84543114900589, "reward_std": 2.7944713830947876, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.110028088092804, "rewards/no_repetition_reward_func": -0.26459700614213943, "rewards/verse_reward_func": 0.0, "step": 2743 }, { "completion_length": 251.4375, "epoch": 21.951999999999998, "grad_norm": 4.0625, "kl": 4.081314563751221, "learning_rate": 3.444278768036421e-05, "loss": 0.1633, "reward": 2.616711735725403, "reward_std": 3.177279233932495, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8853697776794434, "rewards/no_repetition_reward_func": -0.26865819096565247, "rewards/verse_reward_func": 0.0, "step": 2744 }, { "completion_length": 248.5625, "epoch": 21.96, "grad_norm": 4.28125, "kl": 4.362367391586304, "learning_rate": 3.442985787968442e-05, "loss": 0.1745, "reward": 2.609548330307007, "reward_std": 2.7190186977386475, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.877181053161621, "rewards/no_repetition_reward_func": -0.25982002913951874, "rewards/verse_reward_func": -0.0078125, "step": 2745 }, { "completion_length": 254.515625, "epoch": 21.968, "grad_norm": 2.859375, "kl": 5.03757905960083, "learning_rate": 3.4416925137565754e-05, "loss": 0.2015, "reward": 2.20046067237854, "reward_std": 2.6264586448669434, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.449631094932556, "rewards/no_repetition_reward_func": -0.24917051196098328, "rewards/verse_reward_func": 0.0, "step": 2746 }, { "completion_length": 248.078125, "epoch": 21.976, "grad_norm": 3.453125, "kl": 4.2966320514678955, "learning_rate": 3.440398945804229e-05, "loss": 0.1719, "reward": 2.56934654712677, "reward_std": 3.1947327852249146, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.847731113433838, "rewards/no_repetition_reward_func": -0.27057206630706787, "rewards/verse_reward_func": -0.0078125, "step": 2747 }, { "completion_length": 253.890625, "epoch": 21.984, "grad_norm": 3.609375, "kl": 4.545084714889526, "learning_rate": 3.439105084514905e-05, "loss": 0.1818, "reward": 2.6197198629379272, "reward_std": 3.14014995098114, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9044069051742554, "rewards/no_repetition_reward_func": -0.28468701243400574, "rewards/verse_reward_func": 0.0, "step": 2748 }, { "completion_length": 248.90625, "epoch": 21.992, "grad_norm": 3.859375, "kl": 4.630160689353943, "learning_rate": 3.437810930292195e-05, "loss": 0.1852, "reward": 2.4014145135879517, "reward_std": 2.5324000120162964, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.653851270675659, "rewards/no_repetition_reward_func": -0.25243697315454483, "rewards/verse_reward_func": 0.0, "step": 2749 }, { "completion_length": 244.875, "epoch": 22.0, "grad_norm": 2.796875, "kl": 4.82483983039856, "learning_rate": 3.436516483539781e-05, "loss": 0.193, "reward": 2.4623947143554688, "reward_std": 2.9587323665618896, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.752513289451599, "rewards/no_repetition_reward_func": -0.29011842608451843, "rewards/verse_reward_func": 0.0, "step": 2750 }, { "completion_length": 245.625, "epoch": 22.008, "grad_norm": 1.859375, "kl": 4.858478307723999, "learning_rate": 3.435221744661438e-05, "loss": 0.1943, "reward": 2.3816288709640503, "reward_std": 3.2548381090164185, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6713757514953613, "rewards/no_repetition_reward_func": -0.27412185072898865, "rewards/verse_reward_func": -0.015625, "step": 2751 }, { "completion_length": 252.5, "epoch": 22.016, "grad_norm": 2.453125, "kl": 4.472387790679932, "learning_rate": 3.433926714061032e-05, "loss": 0.1789, "reward": 2.71647310256958, "reward_std": 3.1445271968841553, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.049625277519226, "rewards/no_repetition_reward_func": -0.32533977925777435, "rewards/verse_reward_func": -0.0078125, "step": 2752 }, { "completion_length": 243.71875, "epoch": 22.024, "grad_norm": 1.5859375, "kl": 4.560039401054382, "learning_rate": 3.432631392142519e-05, "loss": 0.1824, "reward": 2.571096658706665, "reward_std": 3.2140337228775024, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8854719400405884, "rewards/no_repetition_reward_func": -0.3065629228949547, "rewards/verse_reward_func": -0.0078125, "step": 2753 }, { "completion_length": 246.703125, "epoch": 22.032, "grad_norm": 2.25, "kl": 4.550965309143066, "learning_rate": 3.431335779309947e-05, "loss": 0.182, "reward": 2.0306970477104187, "reward_std": 2.522837996482849, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.287715971469879, "rewards/no_repetition_reward_func": -0.2570188343524933, "rewards/verse_reward_func": 0.0, "step": 2754 }, { "completion_length": 256.0, "epoch": 22.04, "grad_norm": 2.640625, "kl": 4.740448951721191, "learning_rate": 3.430039875967454e-05, "loss": 0.1896, "reward": 2.771466374397278, "reward_std": 3.1427762508392334, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0903894901275635, "rewards/no_repetition_reward_func": -0.3189230412244797, "rewards/verse_reward_func": 0.0, "step": 2755 }, { "completion_length": 249.4375, "epoch": 22.048, "grad_norm": 2.0, "kl": 4.720620632171631, "learning_rate": 3.428743682519269e-05, "loss": 0.1888, "reward": 2.0054277777671814, "reward_std": 2.7035998106002808, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.248774528503418, "rewards/no_repetition_reward_func": -0.24334686994552612, "rewards/verse_reward_func": 0.0, "step": 2756 }, { "completion_length": 252.421875, "epoch": 22.056, "grad_norm": 3.03125, "kl": 5.00327205657959, "learning_rate": 3.427447199369711e-05, "loss": 0.2001, "reward": 2.3289974331855774, "reward_std": 2.7751561403274536, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.634157657623291, "rewards/no_repetition_reward_func": -0.297347754240036, "rewards/verse_reward_func": -0.0078125, "step": 2757 }, { "completion_length": 249.078125, "epoch": 22.064, "grad_norm": 1.71875, "kl": 4.929533958435059, "learning_rate": 3.4261504269231904e-05, "loss": 0.1972, "reward": 2.1315571069717407, "reward_std": 2.647746205329895, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3823494911193848, "rewards/no_repetition_reward_func": -0.2507924288511276, "rewards/verse_reward_func": 0.0, "step": 2758 }, { "completion_length": 249.09375, "epoch": 22.072, "grad_norm": 2.625, "kl": 3.9509804248809814, "learning_rate": 3.4248533655842066e-05, "loss": 0.158, "reward": 2.8500101566314697, "reward_std": 3.121552348136902, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1410118341445923, "rewards/no_repetition_reward_func": -0.29100191593170166, "rewards/verse_reward_func": 0.0, "step": 2759 }, { "completion_length": 251.96875, "epoch": 22.08, "grad_norm": 3.109375, "kl": 4.83384895324707, "learning_rate": 3.423556015757349e-05, "loss": 0.1934, "reward": 2.1095266342163086, "reward_std": 2.903302550315857, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4199421405792236, "rewards/no_repetition_reward_func": -0.31041552126407623, "rewards/verse_reward_func": 0.0, "step": 2760 }, { "completion_length": 252.78125, "epoch": 22.088, "grad_norm": 3.6875, "kl": 5.579763889312744, "learning_rate": 3.4222583778472996e-05, "loss": 0.2232, "reward": 2.1036932468414307, "reward_std": 2.439785599708557, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.358803629875183, "rewards/no_repetition_reward_func": -0.25511030852794647, "rewards/verse_reward_func": 0.0, "step": 2761 }, { "completion_length": 241.359375, "epoch": 22.096, "grad_norm": 2.03125, "kl": 4.634203910827637, "learning_rate": 3.4209604522588255e-05, "loss": 0.1854, "reward": 2.3048473596572876, "reward_std": 3.0739344358444214, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.60315477848053, "rewards/no_repetition_reward_func": -0.298307329416275, "rewards/verse_reward_func": 0.0, "step": 2762 }, { "completion_length": 249.84375, "epoch": 22.104, "grad_norm": 3.328125, "kl": 6.5090491771698, "learning_rate": 3.419662239396789e-05, "loss": 0.2604, "reward": 1.986690878868103, "reward_std": 2.5774916410446167, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.221747934818268, "rewards/no_repetition_reward_func": -0.23505698889493942, "rewards/verse_reward_func": 0.0, "step": 2763 }, { "completion_length": 247.390625, "epoch": 22.112, "grad_norm": 1.6015625, "kl": 4.698148727416992, "learning_rate": 3.418363739666137e-05, "loss": 0.1879, "reward": 2.2531096935272217, "reward_std": 2.863631248474121, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.541551351547241, "rewards/no_repetition_reward_func": -0.2728165239095688, "rewards/verse_reward_func": -0.015625, "step": 2764 }, { "completion_length": 252.546875, "epoch": 22.12, "grad_norm": 4.65625, "kl": 4.6735053062438965, "learning_rate": 3.417064953471911e-05, "loss": 0.1869, "reward": 2.6611104011535645, "reward_std": 2.940781831741333, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9592500925064087, "rewards/no_repetition_reward_func": -0.2981394827365875, "rewards/verse_reward_func": 0.0, "step": 2765 }, { "completion_length": 251.09375, "epoch": 22.128, "grad_norm": 2.078125, "kl": 4.915053129196167, "learning_rate": 3.415765881219236e-05, "loss": 0.1966, "reward": 2.064933180809021, "reward_std": 2.7279934883117676, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.306015729904175, "rewards/no_repetition_reward_func": -0.24108266830444336, "rewards/verse_reward_func": 0.0, "step": 2766 }, { "completion_length": 253.171875, "epoch": 22.136, "grad_norm": 5.90625, "kl": 5.47817325592041, "learning_rate": 3.414466523313332e-05, "loss": 0.2191, "reward": 2.0096627473831177, "reward_std": 2.9026944637298584, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2440985441207886, "rewards/no_repetition_reward_func": -0.23443593829870224, "rewards/verse_reward_func": 0.0, "step": 2767 }, { "completion_length": 247.90625, "epoch": 22.144, "grad_norm": 3.875, "kl": 4.0011632442474365, "learning_rate": 3.4131668801595027e-05, "loss": 0.16, "reward": 3.2087442874908447, "reward_std": 3.5354323387145996, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.493510603904724, "rewards/no_repetition_reward_func": -0.2769537717103958, "rewards/verse_reward_func": -0.0078125, "step": 2768 }, { "completion_length": 251.703125, "epoch": 22.152, "grad_norm": 2.734375, "kl": 4.274991273880005, "learning_rate": 3.411866952163146e-05, "loss": 0.171, "reward": 2.2435840368270874, "reward_std": 3.0197556018829346, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5427180528640747, "rewards/no_repetition_reward_func": -0.29913410544395447, "rewards/verse_reward_func": 0.0, "step": 2769 }, { "completion_length": 250.140625, "epoch": 22.16, "grad_norm": 3.3125, "kl": 3.8396689891815186, "learning_rate": 3.410566739729746e-05, "loss": 0.1536, "reward": 2.9686241149902344, "reward_std": 3.2765272855758667, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2611260414123535, "rewards/no_repetition_reward_func": -0.2846895158290863, "rewards/verse_reward_func": -0.0078125, "step": 2770 }, { "completion_length": 251.703125, "epoch": 22.168, "grad_norm": 3.28125, "kl": 3.8242335319519043, "learning_rate": 3.409266243264874e-05, "loss": 0.153, "reward": 3.0403518676757812, "reward_std": 2.71550190448761, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3344361782073975, "rewards/no_repetition_reward_func": -0.29408422112464905, "rewards/verse_reward_func": 0.0, "step": 2771 }, { "completion_length": 249.09375, "epoch": 22.176, "grad_norm": 4.125, "kl": 4.729024887084961, "learning_rate": 3.407965463174192e-05, "loss": 0.1892, "reward": 2.656785488128662, "reward_std": 3.291304588317871, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.952207088470459, "rewards/no_repetition_reward_func": -0.2876090630888939, "rewards/verse_reward_func": -0.0078125, "step": 2772 }, { "completion_length": 254.953125, "epoch": 22.184, "grad_norm": 2.171875, "kl": 4.571130752563477, "learning_rate": 3.4066643998634505e-05, "loss": 0.1828, "reward": 2.537906527519226, "reward_std": 3.1671277284622192, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8316181898117065, "rewards/no_repetition_reward_func": -0.2858991324901581, "rewards/verse_reward_func": -0.0078125, "step": 2773 }, { "completion_length": 251.21875, "epoch": 22.192, "grad_norm": 2.65625, "kl": 5.05556321144104, "learning_rate": 3.4053630537384885e-05, "loss": 0.2022, "reward": 2.32712459564209, "reward_std": 3.0081180334091187, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6018145084381104, "rewards/no_repetition_reward_func": -0.2746897339820862, "rewards/verse_reward_func": 0.0, "step": 2774 }, { "completion_length": 251.0, "epoch": 22.2, "grad_norm": 3.0, "kl": 5.176998138427734, "learning_rate": 3.4040614252052305e-05, "loss": 0.2071, "reward": 2.951568841934204, "reward_std": 3.4706801176071167, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2422282695770264, "rewards/no_repetition_reward_func": -0.282847061753273, "rewards/verse_reward_func": -0.0078125, "step": 2775 }, { "completion_length": 250.65625, "epoch": 22.208, "grad_norm": 3.375, "kl": 6.6779046058654785, "learning_rate": 3.402759514669694e-05, "loss": 0.2671, "reward": 2.122614622116089, "reward_std": 2.8845043182373047, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3918890953063965, "rewards/no_repetition_reward_func": -0.2614619731903076, "rewards/verse_reward_func": -0.0078125, "step": 2776 }, { "completion_length": 250.421875, "epoch": 22.216, "grad_norm": 3.328125, "kl": 5.4458723068237305, "learning_rate": 3.401457322537979e-05, "loss": 0.2178, "reward": 2.229120969772339, "reward_std": 2.9157938957214355, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5155922770500183, "rewards/no_repetition_reward_func": -0.2786589413881302, "rewards/verse_reward_func": -0.0078125, "step": 2777 }, { "completion_length": 249.453125, "epoch": 22.224, "grad_norm": 3.28125, "kl": 5.3219687938690186, "learning_rate": 3.400154849216278e-05, "loss": 0.2129, "reward": 1.6194607615470886, "reward_std": 2.8797852993011475, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9016326665878296, "rewards/no_repetition_reward_func": -0.26654694229364395, "rewards/verse_reward_func": -0.015625, "step": 2778 }, { "completion_length": 247.328125, "epoch": 22.232, "grad_norm": 1.578125, "kl": 4.943697452545166, "learning_rate": 3.398852095110868e-05, "loss": 0.1977, "reward": 2.817260980606079, "reward_std": 3.1122279167175293, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1012074947357178, "rewards/no_repetition_reward_func": -0.27613402903079987, "rewards/verse_reward_func": -0.0078125, "step": 2779 }, { "completion_length": 249.109375, "epoch": 22.24, "grad_norm": 2.09375, "kl": 4.437243700027466, "learning_rate": 3.397549060628116e-05, "loss": 0.1775, "reward": 2.875845432281494, "reward_std": 2.9499993324279785, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1653964519500732, "rewards/no_repetition_reward_func": -0.28955118358135223, "rewards/verse_reward_func": 0.0, "step": 2780 }, { "completion_length": 253.96875, "epoch": 22.248, "grad_norm": 3.96875, "kl": 5.274521827697754, "learning_rate": 3.396245746174473e-05, "loss": 0.211, "reward": 2.2712244987487793, "reward_std": 2.738180637359619, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5603792667388916, "rewards/no_repetition_reward_func": -0.2891547977924347, "rewards/verse_reward_func": 0.0, "step": 2781 }, { "completion_length": 252.21875, "epoch": 22.256, "grad_norm": 1.6796875, "kl": 4.915526390075684, "learning_rate": 3.394942152156482e-05, "loss": 0.1966, "reward": 2.1996909379959106, "reward_std": 2.914797306060791, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5006057024002075, "rewards/no_repetition_reward_func": -0.29310232400894165, "rewards/verse_reward_func": -0.0078125, "step": 2782 }, { "completion_length": 248.640625, "epoch": 22.264, "grad_norm": 2.90625, "kl": 3.91885769367218, "learning_rate": 3.39363827898077e-05, "loss": 0.1568, "reward": 2.88555645942688, "reward_std": 3.3617475032806396, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2029006481170654, "rewards/no_repetition_reward_func": -0.30953162908554077, "rewards/verse_reward_func": -0.0078125, "step": 2783 }, { "completion_length": 247.109375, "epoch": 22.272, "grad_norm": 2.78125, "kl": 5.407742261886597, "learning_rate": 3.392334127054051e-05, "loss": 0.2163, "reward": 1.8009673357009888, "reward_std": 2.746389389038086, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.0655155777931213, "rewards/no_repetition_reward_func": -0.25673580169677734, "rewards/verse_reward_func": -0.0078125, "step": 2784 }, { "completion_length": 256.0, "epoch": 22.28, "grad_norm": 3.796875, "kl": 3.3118234872817993, "learning_rate": 3.3910296967831266e-05, "loss": 0.1325, "reward": 3.723656177520752, "reward_std": 3.0660606622695923, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 4.093783020973206, "rewards/no_repetition_reward_func": -0.3701266050338745, "rewards/verse_reward_func": 0.0, "step": 2785 }, { "completion_length": 249.078125, "epoch": 22.288, "grad_norm": 4.09375, "kl": 3.738231897354126, "learning_rate": 3.389724988574887e-05, "loss": 0.1495, "reward": 3.1085259914398193, "reward_std": 3.348386526107788, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4310221672058105, "rewards/no_repetition_reward_func": -0.3146836459636688, "rewards/verse_reward_func": -0.0078125, "step": 2786 }, { "completion_length": 246.609375, "epoch": 22.296, "grad_norm": 2.84375, "kl": 4.603002071380615, "learning_rate": 3.388420002836307e-05, "loss": 0.1841, "reward": 2.556846499443054, "reward_std": 3.074911594390869, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8509600162506104, "rewards/no_repetition_reward_func": -0.27067597210407257, "rewards/verse_reward_func": -0.0234375, "step": 2787 }, { "completion_length": 241.484375, "epoch": 22.304, "grad_norm": 2.203125, "kl": 4.322221040725708, "learning_rate": 3.387114739974448e-05, "loss": 0.1729, "reward": 2.352898418903351, "reward_std": 2.7220855951309204, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6049901843070984, "rewards/no_repetition_reward_func": -0.2442794367671013, "rewards/verse_reward_func": -0.0078125, "step": 2788 }, { "completion_length": 253.0, "epoch": 22.312, "grad_norm": 6.03125, "kl": 5.9019505977630615, "learning_rate": 3.3858092003964594e-05, "loss": 0.2361, "reward": 1.7217688262462616, "reward_std": 2.1630667448043823, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9971641302108765, "rewards/no_repetition_reward_func": -0.27539536356925964, "rewards/verse_reward_func": 0.0, "step": 2789 }, { "completion_length": 249.828125, "epoch": 22.32, "grad_norm": 1.625, "kl": 4.8629608154296875, "learning_rate": 3.384503384509574e-05, "loss": 0.1945, "reward": 2.7656989097595215, "reward_std": 3.2352722883224487, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0448940992355347, "rewards/no_repetition_reward_func": -0.271382600069046, "rewards/verse_reward_func": -0.0078125, "step": 2790 }, { "completion_length": 249.59375, "epoch": 22.328, "grad_norm": 2.0625, "kl": 5.647979259490967, "learning_rate": 3.3831972927211135e-05, "loss": 0.2259, "reward": 2.1565194725990295, "reward_std": 3.060384750366211, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.459846019744873, "rewards/no_repetition_reward_func": -0.2955140769481659, "rewards/verse_reward_func": -0.0078125, "step": 2791 }, { "completion_length": 243.8125, "epoch": 22.336, "grad_norm": 2.015625, "kl": 5.583501815795898, "learning_rate": 3.381890925438486e-05, "loss": 0.2233, "reward": 1.9632230997085571, "reward_std": 2.886281132698059, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.203697443008423, "rewards/no_repetition_reward_func": -0.2404743954539299, "rewards/verse_reward_func": 0.0, "step": 2792 }, { "completion_length": 242.953125, "epoch": 22.344, "grad_norm": 1.9453125, "kl": 4.20197319984436, "learning_rate": 3.380584283069183e-05, "loss": 0.1681, "reward": 2.576211929321289, "reward_std": 3.1193877458572388, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8649888038635254, "rewards/no_repetition_reward_func": -0.2887769639492035, "rewards/verse_reward_func": 0.0, "step": 2793 }, { "completion_length": 248.265625, "epoch": 22.352, "grad_norm": 4.46875, "kl": 5.590630054473877, "learning_rate": 3.379277366020782e-05, "loss": 0.2236, "reward": 1.729519009590149, "reward_std": 2.534600615501404, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.968418836593628, "rewards/no_repetition_reward_func": -0.2388998121023178, "rewards/verse_reward_func": 0.0, "step": 2794 }, { "completion_length": 251.921875, "epoch": 22.36, "grad_norm": 2.203125, "kl": 4.845333576202393, "learning_rate": 3.3779701747009504e-05, "loss": 0.1938, "reward": 1.9945650100708008, "reward_std": 2.4400073289871216, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.243605852127075, "rewards/no_repetition_reward_func": -0.24904076009988785, "rewards/verse_reward_func": 0.0, "step": 2795 }, { "completion_length": 255.171875, "epoch": 22.368, "grad_norm": 1.2890625, "kl": 4.541144609451294, "learning_rate": 3.376662709517435e-05, "loss": 0.1816, "reward": 2.5347604751586914, "reward_std": 2.8679944276809692, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8127471208572388, "rewards/no_repetition_reward_func": -0.27798672020435333, "rewards/verse_reward_func": 0.0, "step": 2796 }, { "completion_length": 250.75, "epoch": 22.376, "grad_norm": 3.09375, "kl": 3.8982203006744385, "learning_rate": 3.375354970878073e-05, "loss": 0.1559, "reward": 3.0053333044052124, "reward_std": 3.1586188077926636, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.30403733253479, "rewards/no_repetition_reward_func": -0.29870426654815674, "rewards/verse_reward_func": 0.0, "step": 2797 }, { "completion_length": 250.375, "epoch": 22.384, "grad_norm": 5.34375, "kl": 3.854576349258423, "learning_rate": 3.374046959190786e-05, "loss": 0.1542, "reward": 2.9431705474853516, "reward_std": 3.0716593265533447, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2496126890182495, "rewards/no_repetition_reward_func": -0.30644217133522034, "rewards/verse_reward_func": 0.0, "step": 2798 }, { "completion_length": 256.0, "epoch": 22.392, "grad_norm": 2.125, "kl": 5.112916946411133, "learning_rate": 3.372738674863577e-05, "loss": 0.2045, "reward": 2.1745829582214355, "reward_std": 2.83764910697937, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4569785594940186, "rewards/no_repetition_reward_func": -0.28239551186561584, "rewards/verse_reward_func": 0.0, "step": 2799 }, { "completion_length": 253.984375, "epoch": 22.4, "grad_norm": 1.9765625, "kl": 4.270992040634155, "learning_rate": 3.3714301183045385e-05, "loss": 0.1708, "reward": 2.2665380239486694, "reward_std": 2.839469313621521, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5202053785324097, "rewards/no_repetition_reward_func": -0.253667414188385, "rewards/verse_reward_func": 0.0, "step": 2800 }, { "completion_length": 256.0, "epoch": 22.408, "grad_norm": 3.03125, "kl": 3.6067779064178467, "learning_rate": 3.370121289921845e-05, "loss": 0.1443, "reward": 2.748945474624634, "reward_std": 3.1593865156173706, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.017045259475708, "rewards/no_repetition_reward_func": -0.268099807202816, "rewards/verse_reward_func": 0.0, "step": 2801 }, { "completion_length": 246.203125, "epoch": 22.416, "grad_norm": 2.8125, "kl": 4.532712340354919, "learning_rate": 3.368812190123759e-05, "loss": 0.1813, "reward": 2.428742289543152, "reward_std": 3.1252723932266235, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7281399965286255, "rewards/no_repetition_reward_func": -0.2915852218866348, "rewards/verse_reward_func": -0.0078125, "step": 2802 }, { "completion_length": 247.1875, "epoch": 22.424, "grad_norm": 2.5, "kl": 4.3037919998168945, "learning_rate": 3.367502819318624e-05, "loss": 0.1722, "reward": 3.041495680809021, "reward_std": 3.2038021087646484, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.378035545349121, "rewards/no_repetition_reward_func": -0.32872718572616577, "rewards/verse_reward_func": -0.0078125, "step": 2803 }, { "completion_length": 255.4375, "epoch": 22.432, "grad_norm": 3.984375, "kl": 5.226150751113892, "learning_rate": 3.3661931779148707e-05, "loss": 0.209, "reward": 1.8845310807228088, "reward_std": 2.729324221611023, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1364338397979736, "rewards/no_repetition_reward_func": -0.25190284848213196, "rewards/verse_reward_func": 0.0, "step": 2804 }, { "completion_length": 249.953125, "epoch": 22.44, "grad_norm": 2.515625, "kl": 4.463900327682495, "learning_rate": 3.3648832663210124e-05, "loss": 0.1786, "reward": 2.9856125116348267, "reward_std": 3.237255573272705, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2540040016174316, "rewards/no_repetition_reward_func": -0.26839156448841095, "rewards/verse_reward_func": 0.0, "step": 2805 }, { "completion_length": 247.765625, "epoch": 22.448, "grad_norm": 3.484375, "kl": 6.119170188903809, "learning_rate": 3.363573084945648e-05, "loss": 0.2448, "reward": 2.1394442319869995, "reward_std": 2.9895052909851074, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.406069040298462, "rewards/no_repetition_reward_func": -0.2588123753666878, "rewards/verse_reward_func": -0.0078125, "step": 2806 }, { "completion_length": 247.09375, "epoch": 22.456, "grad_norm": 4.75, "kl": 3.9506949186325073, "learning_rate": 3.3622626341974594e-05, "loss": 0.158, "reward": 3.127257227897644, "reward_std": 3.2842037677764893, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.467918634414673, "rewards/no_repetition_reward_func": -0.3172239810228348, "rewards/verse_reward_func": -0.0234375, "step": 2807 }, { "completion_length": 242.234375, "epoch": 22.464, "grad_norm": 1.46875, "kl": 5.0581159591674805, "learning_rate": 3.360951914485215e-05, "loss": 0.2023, "reward": 2.4177186489105225, "reward_std": 3.0063745975494385, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.722818374633789, "rewards/no_repetition_reward_func": -0.2894747257232666, "rewards/verse_reward_func": -0.015625, "step": 2808 }, { "completion_length": 255.671875, "epoch": 22.472, "grad_norm": 3.484375, "kl": 6.7140586376190186, "learning_rate": 3.359640926217763e-05, "loss": 0.2686, "reward": 2.0088343620300293, "reward_std": 2.898525595664978, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3224101066589355, "rewards/no_repetition_reward_func": -0.31357572972774506, "rewards/verse_reward_func": 0.0, "step": 2809 }, { "completion_length": 249.875, "epoch": 22.48, "grad_norm": 3.5625, "kl": 4.108373761177063, "learning_rate": 3.3583296698040384e-05, "loss": 0.1643, "reward": 2.84937584400177, "reward_std": 2.87324595451355, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.109548568725586, "rewards/no_repetition_reward_func": -0.2601727694272995, "rewards/verse_reward_func": 0.0, "step": 2810 }, { "completion_length": 249.453125, "epoch": 22.488, "grad_norm": 4.4375, "kl": 4.737394094467163, "learning_rate": 3.35701814565306e-05, "loss": 0.1895, "reward": 2.568895101547241, "reward_std": 3.05113422870636, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.847628116607666, "rewards/no_repetition_reward_func": -0.2787330746650696, "rewards/verse_reward_func": 0.0, "step": 2811 }, { "completion_length": 245.390625, "epoch": 22.496, "grad_norm": 3.78125, "kl": 5.2043890953063965, "learning_rate": 3.355706354173928e-05, "loss": 0.2082, "reward": 2.4808309078216553, "reward_std": 3.0369341373443604, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.739256739616394, "rewards/no_repetition_reward_func": -0.2506130188703537, "rewards/verse_reward_func": -0.0078125, "step": 2812 }, { "completion_length": 250.34375, "epoch": 22.504, "grad_norm": 4.6875, "kl": 6.279245376586914, "learning_rate": 3.354394295775829e-05, "loss": 0.2512, "reward": 1.4867507219314575, "reward_std": 2.4599651098251343, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.7212620377540588, "rewards/no_repetition_reward_func": -0.23451130837202072, "rewards/verse_reward_func": 0.0, "step": 2813 }, { "completion_length": 252.984375, "epoch": 22.512, "grad_norm": 5.78125, "kl": 5.40388822555542, "learning_rate": 3.3530819708680286e-05, "loss": 0.2162, "reward": 2.32201886177063, "reward_std": 2.9273687601089478, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6110787391662598, "rewards/no_repetition_reward_func": -0.2812473922967911, "rewards/verse_reward_func": -0.0078125, "step": 2814 }, { "completion_length": 251.234375, "epoch": 22.52, "grad_norm": 2.375, "kl": 5.031213164329529, "learning_rate": 3.35176937985988e-05, "loss": 0.2012, "reward": 2.614893674850464, "reward_std": 2.6251124143600464, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.898335814476013, "rewards/no_repetition_reward_func": -0.2756297141313553, "rewards/verse_reward_func": -0.0078125, "step": 2815 }, { "completion_length": 250.265625, "epoch": 22.528, "grad_norm": 2.09375, "kl": 5.071358680725098, "learning_rate": 3.350456523160815e-05, "loss": 0.2029, "reward": 2.5109550952911377, "reward_std": 3.0338733196258545, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.776226043701172, "rewards/no_repetition_reward_func": -0.2652709484100342, "rewards/verse_reward_func": 0.0, "step": 2816 }, { "completion_length": 246.34375, "epoch": 22.536, "grad_norm": 1.8359375, "kl": 3.963864326477051, "learning_rate": 3.349143401180354e-05, "loss": 0.1586, "reward": 2.9258840084075928, "reward_std": 3.087047815322876, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2019046545028687, "rewards/no_repetition_reward_func": -0.2760205715894699, "rewards/verse_reward_func": 0.0, "step": 2817 }, { "completion_length": 252.46875, "epoch": 22.544, "grad_norm": 2.328125, "kl": 4.617338418960571, "learning_rate": 3.347830014328094e-05, "loss": 0.1847, "reward": 2.860585331916809, "reward_std": 2.997983932495117, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1185097694396973, "rewards/no_repetition_reward_func": -0.25792457163333893, "rewards/verse_reward_func": 0.0, "step": 2818 }, { "completion_length": 252.15625, "epoch": 22.552, "grad_norm": 2.046875, "kl": 3.6851868629455566, "learning_rate": 3.346516363013719e-05, "loss": 0.1474, "reward": 2.7748950719833374, "reward_std": 2.9529740810394287, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.095306634902954, "rewards/no_repetition_reward_func": -0.32041148841381073, "rewards/verse_reward_func": 0.0, "step": 2819 }, { "completion_length": 251.53125, "epoch": 22.56, "grad_norm": 2.453125, "kl": 4.1479082107543945, "learning_rate": 3.3452024476469934e-05, "loss": 0.1659, "reward": 2.8574423789978027, "reward_std": 2.811690092086792, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.136544704437256, "rewards/no_repetition_reward_func": -0.27910229563713074, "rewards/verse_reward_func": 0.0, "step": 2820 }, { "completion_length": 253.171875, "epoch": 22.568, "grad_norm": 2.546875, "kl": 4.377747178077698, "learning_rate": 3.343888268637765e-05, "loss": 0.1751, "reward": 2.8467103242874146, "reward_std": 3.077554941177368, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1399619579315186, "rewards/no_repetition_reward_func": -0.2932516187429428, "rewards/verse_reward_func": 0.0, "step": 2821 }, { "completion_length": 246.296875, "epoch": 22.576, "grad_norm": 4.5, "kl": 5.327047824859619, "learning_rate": 3.3425738263959615e-05, "loss": 0.2131, "reward": 2.562211751937866, "reward_std": 3.26343834400177, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8510740995407104, "rewards/no_repetition_reward_func": -0.2810498774051666, "rewards/verse_reward_func": -0.0078125, "step": 2822 }, { "completion_length": 253.703125, "epoch": 22.584, "grad_norm": 4.15625, "kl": 4.660847425460815, "learning_rate": 3.341259121331597e-05, "loss": 0.1864, "reward": 2.318153500556946, "reward_std": 2.9459917545318604, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.575658440589905, "rewards/no_repetition_reward_func": -0.25750498473644257, "rewards/verse_reward_func": 0.0, "step": 2823 }, { "completion_length": 249.328125, "epoch": 22.592, "grad_norm": 4.25, "kl": 4.6230149269104, "learning_rate": 3.339944153854764e-05, "loss": 0.1849, "reward": 2.447578191757202, "reward_std": 3.098610520362854, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7837870121002197, "rewards/no_repetition_reward_func": -0.3283964991569519, "rewards/verse_reward_func": -0.0078125, "step": 2824 }, { "completion_length": 251.84375, "epoch": 22.6, "grad_norm": 4.21875, "kl": 4.327915072441101, "learning_rate": 3.338628924375638e-05, "loss": 0.1731, "reward": 2.5640275478363037, "reward_std": 2.950805902481079, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.858295202255249, "rewards/no_repetition_reward_func": -0.29426784813404083, "rewards/verse_reward_func": 0.0, "step": 2825 }, { "completion_length": 250.59375, "epoch": 22.608, "grad_norm": 4.0, "kl": 4.072060823440552, "learning_rate": 3.3373134333044756e-05, "loss": 0.1629, "reward": 3.232539176940918, "reward_std": 2.936974883079529, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.5692864656448364, "rewards/no_repetition_reward_func": -0.3367473781108856, "rewards/verse_reward_func": 0.0, "step": 2826 }, { "completion_length": 246.34375, "epoch": 22.616, "grad_norm": 2.0, "kl": 4.4591710567474365, "learning_rate": 3.3359976810516164e-05, "loss": 0.1784, "reward": 2.7624354362487793, "reward_std": 3.3192323446273804, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0812885761260986, "rewards/no_repetition_reward_func": -0.30322808027267456, "rewards/verse_reward_func": -0.015625, "step": 2827 }, { "completion_length": 251.40625, "epoch": 22.624, "grad_norm": 4.9375, "kl": 5.404510498046875, "learning_rate": 3.334681668027481e-05, "loss": 0.2162, "reward": 2.8680810928344727, "reward_std": 3.1119651794433594, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.121616005897522, "rewards/no_repetition_reward_func": -0.25353483855724335, "rewards/verse_reward_func": 0.0, "step": 2828 }, { "completion_length": 248.265625, "epoch": 22.632, "grad_norm": 3.78125, "kl": 4.944680690765381, "learning_rate": 3.33336539464257e-05, "loss": 0.1978, "reward": 2.731067657470703, "reward_std": 2.7458890676498413, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0272626876831055, "rewards/no_repetition_reward_func": -0.28838253021240234, "rewards/verse_reward_func": -0.0078125, "step": 2829 }, { "completion_length": 249.984375, "epoch": 22.64, "grad_norm": 3.421875, "kl": 5.76235032081604, "learning_rate": 3.332048861307467e-05, "loss": 0.2305, "reward": 2.2690168619155884, "reward_std": 3.0219770669937134, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.56532621383667, "rewards/no_repetition_reward_func": -0.2963094860315323, "rewards/verse_reward_func": 0.0, "step": 2830 }, { "completion_length": 237.796875, "epoch": 22.648, "grad_norm": 7.78125, "kl": 5.084722518920898, "learning_rate": 3.3307320684328354e-05, "loss": 0.2034, "reward": 2.253013014793396, "reward_std": 2.9908642768859863, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5326151847839355, "rewards/no_repetition_reward_func": -0.27178965508937836, "rewards/verse_reward_func": -0.0078125, "step": 2831 }, { "completion_length": 244.15625, "epoch": 22.656, "grad_norm": 6.28125, "kl": 4.8206866979599, "learning_rate": 3.3294150164294204e-05, "loss": 0.1928, "reward": 2.7222758531570435, "reward_std": 3.1027276515960693, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.017064392566681, "rewards/no_repetition_reward_func": -0.2791634798049927, "rewards/verse_reward_func": -0.015625, "step": 2832 }, { "completion_length": 254.515625, "epoch": 22.664, "grad_norm": 2.390625, "kl": 5.545173406600952, "learning_rate": 3.328097705708047e-05, "loss": 0.2218, "reward": 2.5821285247802734, "reward_std": 3.2056336402893066, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.882086753845215, "rewards/no_repetition_reward_func": -0.29995840787887573, "rewards/verse_reward_func": 0.0, "step": 2833 }, { "completion_length": 250.5625, "epoch": 22.672, "grad_norm": 4.46875, "kl": 5.605589389801025, "learning_rate": 3.326780136679623e-05, "loss": 0.2242, "reward": 2.002321481704712, "reward_std": 3.188023567199707, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2680375576019287, "rewards/no_repetition_reward_func": -0.2657160609960556, "rewards/verse_reward_func": 0.0, "step": 2834 }, { "completion_length": 253.328125, "epoch": 22.68, "grad_norm": 3.0625, "kl": 3.5717475414276123, "learning_rate": 3.325462309755134e-05, "loss": 0.1429, "reward": 3.255751132965088, "reward_std": 2.9951928853988647, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.5712841749191284, "rewards/no_repetition_reward_func": -0.30772052705287933, "rewards/verse_reward_func": -0.0078125, "step": 2835 }, { "completion_length": 249.84375, "epoch": 22.688, "grad_norm": 4.8125, "kl": 5.293958902359009, "learning_rate": 3.324144225345649e-05, "loss": 0.2118, "reward": 1.8638139963150024, "reward_std": 2.466667592525482, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.111099421977997, "rewards/no_repetition_reward_func": -0.24728547036647797, "rewards/verse_reward_func": 0.0, "step": 2836 }, { "completion_length": 247.484375, "epoch": 22.696, "grad_norm": 1.4765625, "kl": 4.927963376045227, "learning_rate": 3.322825883862314e-05, "loss": 0.1971, "reward": 2.34497606754303, "reward_std": 2.8778750896453857, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.620155096054077, "rewards/no_repetition_reward_func": -0.2751789093017578, "rewards/verse_reward_func": 0.0, "step": 2837 }, { "completion_length": 242.109375, "epoch": 22.704, "grad_norm": 4.0625, "kl": 4.609142065048218, "learning_rate": 3.321507285716357e-05, "loss": 0.1844, "reward": 2.2967612743377686, "reward_std": 3.164729952812195, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6060367822647095, "rewards/no_repetition_reward_func": -0.3014630973339081, "rewards/verse_reward_func": -0.0078125, "step": 2838 }, { "completion_length": 253.078125, "epoch": 22.712, "grad_norm": 2.25, "kl": 5.095664739608765, "learning_rate": 3.320188431319088e-05, "loss": 0.2038, "reward": 2.1974210143089294, "reward_std": 2.751243472099304, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.456796884536743, "rewards/no_repetition_reward_func": -0.2593759000301361, "rewards/verse_reward_func": 0.0, "step": 2839 }, { "completion_length": 248.578125, "epoch": 22.72, "grad_norm": 3.171875, "kl": 4.704469203948975, "learning_rate": 3.318869321081892e-05, "loss": 0.1882, "reward": 2.6593973636627197, "reward_std": 3.4666978120803833, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.94881272315979, "rewards/no_repetition_reward_func": -0.28160277009010315, "rewards/verse_reward_func": -0.0078125, "step": 2840 }, { "completion_length": 253.0625, "epoch": 22.728, "grad_norm": 3.8125, "kl": 3.961555242538452, "learning_rate": 3.31754995541624e-05, "loss": 0.1585, "reward": 2.8648606538772583, "reward_std": 3.101278066635132, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.156077027320862, "rewards/no_repetition_reward_func": -0.28340400755405426, "rewards/verse_reward_func": -0.0078125, "step": 2841 }, { "completion_length": 247.96875, "epoch": 22.736, "grad_norm": 3.59375, "kl": 3.682844042778015, "learning_rate": 3.3162303347336764e-05, "loss": 0.1473, "reward": 3.113444685935974, "reward_std": 3.130784034729004, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4150925874710083, "rewards/no_repetition_reward_func": -0.3016479164361954, "rewards/verse_reward_func": 0.0, "step": 2842 }, { "completion_length": 253.671875, "epoch": 22.744, "grad_norm": 3.75, "kl": 4.442539215087891, "learning_rate": 3.31491045944583e-05, "loss": 0.1777, "reward": 2.497568726539612, "reward_std": 3.0233631134033203, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7797945737838745, "rewards/no_repetition_reward_func": -0.2822258844971657, "rewards/verse_reward_func": 0.0, "step": 2843 }, { "completion_length": 250.078125, "epoch": 22.752, "grad_norm": 1.5234375, "kl": 4.513769030570984, "learning_rate": 3.313590329964406e-05, "loss": 0.1806, "reward": 2.3070828914642334, "reward_std": 2.6531604528427124, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5996049642562866, "rewards/no_repetition_reward_func": -0.29252205789089203, "rewards/verse_reward_func": 0.0, "step": 2844 }, { "completion_length": 249.09375, "epoch": 22.76, "grad_norm": 4.71875, "kl": 3.680316686630249, "learning_rate": 3.312269946701191e-05, "loss": 0.1472, "reward": 2.8395718336105347, "reward_std": 3.1318325996398926, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.178872227668762, "rewards/no_repetition_reward_func": -0.3236752599477768, "rewards/verse_reward_func": -0.015625, "step": 2845 }, { "completion_length": 253.34375, "epoch": 22.768, "grad_norm": 2.296875, "kl": 5.638324022293091, "learning_rate": 3.31094931006805e-05, "loss": 0.2255, "reward": 1.8708564639091492, "reward_std": 2.446930706501007, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.09506493806839, "rewards/no_repetition_reward_func": -0.22420857846736908, "rewards/verse_reward_func": 0.0, "step": 2846 }, { "completion_length": 251.15625, "epoch": 22.776, "grad_norm": 4.40625, "kl": 4.548605680465698, "learning_rate": 3.309628420476926e-05, "loss": 0.1819, "reward": 3.428528070449829, "reward_std": 3.4061877727508545, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.7583647966384888, "rewards/no_repetition_reward_func": -0.32983657717704773, "rewards/verse_reward_func": 0.0, "step": 2847 }, { "completion_length": 245.21875, "epoch": 22.784, "grad_norm": 2.125, "kl": 4.050958871841431, "learning_rate": 3.3083072783398416e-05, "loss": 0.162, "reward": 2.578429937362671, "reward_std": 3.0678411722183228, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.84528124332428, "rewards/no_repetition_reward_func": -0.2668512836098671, "rewards/verse_reward_func": 0.0, "step": 2848 }, { "completion_length": 255.765625, "epoch": 22.792, "grad_norm": 2.5, "kl": 5.610403776168823, "learning_rate": 3.3069858840688994e-05, "loss": 0.2244, "reward": 1.9862035512924194, "reward_std": 2.693169593811035, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2519370317459106, "rewards/no_repetition_reward_func": -0.2579207941889763, "rewards/verse_reward_func": -0.0078125, "step": 2849 }, { "completion_length": 248.640625, "epoch": 22.8, "grad_norm": 2.625, "kl": 4.124446392059326, "learning_rate": 3.305664238076278e-05, "loss": 0.165, "reward": 3.408184289932251, "reward_std": 3.0631093978881836, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.7020190954208374, "rewards/no_repetition_reward_func": -0.2938346117734909, "rewards/verse_reward_func": 0.0, "step": 2850 }, { "completion_length": 248.65625, "epoch": 22.808, "grad_norm": 1.3984375, "kl": 5.299062967300415, "learning_rate": 3.3043423407742375e-05, "loss": 0.212, "reward": 2.325048565864563, "reward_std": 3.0928231477737427, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.623804211616516, "rewards/no_repetition_reward_func": -0.290942907333374, "rewards/verse_reward_func": -0.0078125, "step": 2851 }, { "completion_length": 245.4375, "epoch": 22.816, "grad_norm": 3.296875, "kl": 3.6745901107788086, "learning_rate": 3.3030201925751145e-05, "loss": 0.147, "reward": 2.7738003730773926, "reward_std": 3.0424948930740356, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.091549277305603, "rewards/no_repetition_reward_func": -0.30993637442588806, "rewards/verse_reward_func": -0.0078125, "step": 2852 }, { "completion_length": 256.0, "epoch": 22.824, "grad_norm": 2.5625, "kl": 5.018601417541504, "learning_rate": 3.301697793891324e-05, "loss": 0.2007, "reward": 2.736294388771057, "reward_std": 3.2965874671936035, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0407114028930664, "rewards/no_repetition_reward_func": -0.3044169098138809, "rewards/verse_reward_func": 0.0, "step": 2853 }, { "completion_length": 250.0625, "epoch": 22.832, "grad_norm": 2.4375, "kl": 5.772607088088989, "learning_rate": 3.300375145135361e-05, "loss": 0.2309, "reward": 2.15216600894928, "reward_std": 2.7373751401901245, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.437393546104431, "rewards/no_repetition_reward_func": -0.285227507352829, "rewards/verse_reward_func": 0.0, "step": 2854 }, { "completion_length": 251.046875, "epoch": 22.84, "grad_norm": 3.40625, "kl": 6.16949200630188, "learning_rate": 3.299052246719795e-05, "loss": 0.2468, "reward": 1.9516202807426453, "reward_std": 2.6602423191070557, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1886640787124634, "rewards/no_repetition_reward_func": -0.23704388737678528, "rewards/verse_reward_func": 0.0, "step": 2855 }, { "completion_length": 243.5, "epoch": 22.848, "grad_norm": 3.375, "kl": 5.35313868522644, "learning_rate": 3.297729099057277e-05, "loss": 0.2141, "reward": 2.483123779296875, "reward_std": 3.20319139957428, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7644219398498535, "rewards/no_repetition_reward_func": -0.28129808604717255, "rewards/verse_reward_func": 0.0, "step": 2856 }, { "completion_length": 249.5, "epoch": 22.856, "grad_norm": 4.1875, "kl": 5.747232437133789, "learning_rate": 3.296405702560532e-05, "loss": 0.2299, "reward": 2.7340474128723145, "reward_std": 3.1107473373413086, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0081326961517334, "rewards/no_repetition_reward_func": -0.27408522367477417, "rewards/verse_reward_func": 0.0, "step": 2857 }, { "completion_length": 244.875, "epoch": 22.864, "grad_norm": 2.4375, "kl": 4.087369203567505, "learning_rate": 3.295082057642367e-05, "loss": 0.1635, "reward": 2.6120041608810425, "reward_std": 3.012656569480896, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9362659454345703, "rewards/no_repetition_reward_func": -0.31644921004772186, "rewards/verse_reward_func": -0.0078125, "step": 2858 }, { "completion_length": 245.9375, "epoch": 22.872, "grad_norm": 3.484375, "kl": 6.244154214859009, "learning_rate": 3.293758164715663e-05, "loss": 0.2498, "reward": 1.8974847793579102, "reward_std": 2.7514086961746216, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1372318267822266, "rewards/no_repetition_reward_func": -0.23193451762199402, "rewards/verse_reward_func": -0.0078125, "step": 2859 }, { "completion_length": 249.375, "epoch": 22.88, "grad_norm": 1.4609375, "kl": 5.837256193161011, "learning_rate": 3.29243402419338e-05, "loss": 0.2335, "reward": 2.2946200370788574, "reward_std": 3.0881153345108032, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5964303016662598, "rewards/no_repetition_reward_func": -0.28618529438972473, "rewards/verse_reward_func": -0.015625, "step": 2860 }, { "completion_length": 252.421875, "epoch": 22.888, "grad_norm": 2.015625, "kl": 4.372730731964111, "learning_rate": 3.2911096364885544e-05, "loss": 0.1749, "reward": 3.021857976913452, "reward_std": 3.005382776260376, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3022481203079224, "rewards/no_repetition_reward_func": -0.2803901582956314, "rewards/verse_reward_func": 0.0, "step": 2861 }, { "completion_length": 252.6875, "epoch": 22.896, "grad_norm": 1.9921875, "kl": 5.106991291046143, "learning_rate": 3.2897850020143005e-05, "loss": 0.2043, "reward": 2.5008217096328735, "reward_std": 3.242276191711426, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.797531843185425, "rewards/no_repetition_reward_func": -0.29671019315719604, "rewards/verse_reward_func": 0.0, "step": 2862 }, { "completion_length": 246.140625, "epoch": 22.904, "grad_norm": 1.21875, "kl": 5.581990480422974, "learning_rate": 3.2884601211838085e-05, "loss": 0.2233, "reward": 2.274866819381714, "reward_std": 3.056201457977295, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5180087089538574, "rewards/no_repetition_reward_func": -0.24314196407794952, "rewards/verse_reward_func": 0.0, "step": 2863 }, { "completion_length": 251.09375, "epoch": 22.912, "grad_norm": 2.5, "kl": 5.094503402709961, "learning_rate": 3.287134994410347e-05, "loss": 0.2038, "reward": 2.5846920013427734, "reward_std": 2.935439109802246, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.895320177078247, "rewards/no_repetition_reward_func": -0.3106282204389572, "rewards/verse_reward_func": 0.0, "step": 2864 }, { "completion_length": 252.078125, "epoch": 22.92, "grad_norm": 2.09375, "kl": 4.44184935092926, "learning_rate": 3.28580962210726e-05, "loss": 0.1777, "reward": 2.5538100004196167, "reward_std": 3.0196369886398315, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.833162307739258, "rewards/no_repetition_reward_func": -0.2793523371219635, "rewards/verse_reward_func": 0.0, "step": 2865 }, { "completion_length": 251.8125, "epoch": 22.928, "grad_norm": 8.125, "kl": 4.662191390991211, "learning_rate": 3.2844840046879686e-05, "loss": 0.1865, "reward": 2.5991145968437195, "reward_std": 2.813862919807434, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8873730897903442, "rewards/no_repetition_reward_func": -0.28044593334198, "rewards/verse_reward_func": -0.0078125, "step": 2866 }, { "completion_length": 250.09375, "epoch": 22.936, "grad_norm": 4.84375, "kl": 5.660806179046631, "learning_rate": 3.283158142565971e-05, "loss": 0.2264, "reward": 2.056081771850586, "reward_std": 2.629735231399536, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.329821228981018, "rewards/no_repetition_reward_func": -0.26592691242694855, "rewards/verse_reward_func": -0.0078125, "step": 2867 }, { "completion_length": 251.234375, "epoch": 22.944, "grad_norm": 5.59375, "kl": 5.182812929153442, "learning_rate": 3.28183203615484e-05, "loss": 0.2073, "reward": 2.432847499847412, "reward_std": 2.9952374696731567, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7009371519088745, "rewards/no_repetition_reward_func": -0.26808983087539673, "rewards/verse_reward_func": 0.0, "step": 2868 }, { "completion_length": 251.78125, "epoch": 22.951999999999998, "grad_norm": 2.25, "kl": 4.38680100440979, "learning_rate": 3.280505685868226e-05, "loss": 0.1755, "reward": 2.8843882083892822, "reward_std": 2.8438539505004883, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.194772720336914, "rewards/no_repetition_reward_func": -0.31038467586040497, "rewards/verse_reward_func": 0.0, "step": 2869 }, { "completion_length": 249.3125, "epoch": 22.96, "grad_norm": 4.21875, "kl": 5.038759708404541, "learning_rate": 3.279179092119855e-05, "loss": 0.2016, "reward": 2.2138765454292297, "reward_std": 2.9967050552368164, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5111420154571533, "rewards/no_repetition_reward_func": -0.2972653955221176, "rewards/verse_reward_func": 0.0, "step": 2870 }, { "completion_length": 253.96875, "epoch": 22.968, "grad_norm": 2.796875, "kl": 5.536611795425415, "learning_rate": 3.277852255323529e-05, "loss": 0.2215, "reward": 2.176646113395691, "reward_std": 3.05713152885437, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4291142225265503, "rewards/no_repetition_reward_func": -0.25246816873550415, "rewards/verse_reward_func": 0.0, "step": 2871 }, { "completion_length": 251.6875, "epoch": 22.976, "grad_norm": 2.609375, "kl": 3.675793409347534, "learning_rate": 3.276525175893126e-05, "loss": 0.147, "reward": 2.605372905731201, "reward_std": 2.759589195251465, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9121159315109253, "rewards/no_repetition_reward_func": -0.29111775755882263, "rewards/verse_reward_func": -0.015625, "step": 2872 }, { "completion_length": 256.0, "epoch": 22.984, "grad_norm": 2.046875, "kl": 3.942355990409851, "learning_rate": 3.2751978542425995e-05, "loss": 0.1577, "reward": 2.900728702545166, "reward_std": 3.4128122329711914, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2036691904067993, "rewards/no_repetition_reward_func": -0.30294069647789, "rewards/verse_reward_func": 0.0, "step": 2873 }, { "completion_length": 252.109375, "epoch": 22.992, "grad_norm": 2.125, "kl": 3.5805262327194214, "learning_rate": 3.273870290785979e-05, "loss": 0.1432, "reward": 2.730155348777771, "reward_std": 3.1568905115127563, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.034708261489868, "rewards/no_repetition_reward_func": -0.29674041271209717, "rewards/verse_reward_func": -0.0078125, "step": 2874 }, { "completion_length": 242.9375, "epoch": 23.0, "grad_norm": 1.5, "kl": 4.508564233779907, "learning_rate": 3.272542485937369e-05, "loss": 0.1803, "reward": 2.613793969154358, "reward_std": 2.75009822845459, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.903220057487488, "rewards/no_repetition_reward_func": -0.2738010287284851, "rewards/verse_reward_func": -0.015625, "step": 2875 }, { "completion_length": 249.796875, "epoch": 23.008, "grad_norm": 2.5625, "kl": 4.785177707672119, "learning_rate": 3.271214440110948e-05, "loss": 0.1914, "reward": 2.606805920600891, "reward_std": 3.2048912048339844, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.896850347518921, "rewards/no_repetition_reward_func": -0.2900443822145462, "rewards/verse_reward_func": 0.0, "step": 2876 }, { "completion_length": 254.59375, "epoch": 23.016, "grad_norm": 5.375, "kl": 3.3854446411132812, "learning_rate": 3.269886153720972e-05, "loss": 0.1354, "reward": 3.5109918117523193, "reward_std": 3.3430514335632324, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.8377912044525146, "rewards/no_repetition_reward_func": -0.3267992287874222, "rewards/verse_reward_func": 0.0, "step": 2877 }, { "completion_length": 252.046875, "epoch": 23.024, "grad_norm": 2.96875, "kl": 3.6474868059158325, "learning_rate": 3.2685576271817716e-05, "loss": 0.1459, "reward": 2.3034189343452454, "reward_std": 2.7235631942749023, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5756351947784424, "rewards/no_repetition_reward_func": -0.2722163796424866, "rewards/verse_reward_func": 0.0, "step": 2878 }, { "completion_length": 253.0625, "epoch": 23.032, "grad_norm": 4.90625, "kl": 4.846370220184326, "learning_rate": 3.267228860907751e-05, "loss": 0.1939, "reward": 2.815461277961731, "reward_std": 3.0483336448669434, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0839626789093018, "rewards/no_repetition_reward_func": -0.2685014009475708, "rewards/verse_reward_func": 0.0, "step": 2879 }, { "completion_length": 252.0, "epoch": 23.04, "grad_norm": 2.484375, "kl": 5.837278842926025, "learning_rate": 3.2658998553133895e-05, "loss": 0.2335, "reward": 2.4810407161712646, "reward_std": 2.673998236656189, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7818543314933777, "rewards/no_repetition_reward_func": -0.30081379413604736, "rewards/verse_reward_func": 0.0, "step": 2880 }, { "completion_length": 256.0, "epoch": 23.048, "grad_norm": 3.0625, "kl": 4.497572422027588, "learning_rate": 3.2645706108132424e-05, "loss": 0.1799, "reward": 3.215235114097595, "reward_std": 3.146897792816162, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.512881636619568, "rewards/no_repetition_reward_func": -0.29764655232429504, "rewards/verse_reward_func": 0.0, "step": 2881 }, { "completion_length": 246.25, "epoch": 23.056, "grad_norm": 5.25, "kl": 4.6695640087127686, "learning_rate": 3.263241127821938e-05, "loss": 0.1868, "reward": 3.0448468923568726, "reward_std": 3.118504285812378, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3598012924194336, "rewards/no_repetition_reward_func": -0.31495456397533417, "rewards/verse_reward_func": 0.0, "step": 2882 }, { "completion_length": 253.15625, "epoch": 23.064, "grad_norm": 2.890625, "kl": 4.9578752517700195, "learning_rate": 3.2619114067541796e-05, "loss": 0.1983, "reward": 3.2186633348464966, "reward_std": 2.9927706718444824, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.5305861234664917, "rewards/no_repetition_reward_func": -0.3119228184223175, "rewards/verse_reward_func": 0.0, "step": 2883 }, { "completion_length": 244.828125, "epoch": 23.072, "grad_norm": 3.0625, "kl": 4.502011299133301, "learning_rate": 3.260581448024745e-05, "loss": 0.1801, "reward": 2.950634241104126, "reward_std": 3.351157784461975, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.235474944114685, "rewards/no_repetition_reward_func": -0.2848408371210098, "rewards/verse_reward_func": 0.0, "step": 2884 }, { "completion_length": 253.5, "epoch": 23.08, "grad_norm": 2.859375, "kl": 5.68350625038147, "learning_rate": 3.2592512520484856e-05, "loss": 0.2273, "reward": 2.4280983209609985, "reward_std": 2.714774966239929, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.699134111404419, "rewards/no_repetition_reward_func": -0.27103571593761444, "rewards/verse_reward_func": 0.0, "step": 2885 }, { "completion_length": 253.21875, "epoch": 23.088, "grad_norm": 2.15625, "kl": 5.0929529666900635, "learning_rate": 3.257920819240328e-05, "loss": 0.2037, "reward": 2.7914576530456543, "reward_std": 3.181488037109375, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.074176788330078, "rewards/no_repetition_reward_func": -0.28271911293268204, "rewards/verse_reward_func": 0.0, "step": 2886 }, { "completion_length": 252.171875, "epoch": 23.096, "grad_norm": 2.640625, "kl": 5.153334379196167, "learning_rate": 3.25659015001527e-05, "loss": 0.2061, "reward": 2.5462400913238525, "reward_std": 2.4683663845062256, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8373043537139893, "rewards/no_repetition_reward_func": -0.2910643517971039, "rewards/verse_reward_func": 0.0, "step": 2887 }, { "completion_length": 248.859375, "epoch": 23.104, "grad_norm": 1.984375, "kl": 4.560495138168335, "learning_rate": 3.2552592447883865e-05, "loss": 0.1824, "reward": 2.7853622436523438, "reward_std": 3.0246622562408447, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.072322726249695, "rewards/no_repetition_reward_func": -0.2791481018066406, "rewards/verse_reward_func": -0.0078125, "step": 2888 }, { "completion_length": 254.40625, "epoch": 23.112, "grad_norm": 2.234375, "kl": 6.214599370956421, "learning_rate": 3.253928103974823e-05, "loss": 0.2486, "reward": 2.541933536529541, "reward_std": 3.0759459733963013, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.826786160469055, "rewards/no_repetition_reward_func": -0.28485265374183655, "rewards/verse_reward_func": 0.0, "step": 2889 }, { "completion_length": 254.296875, "epoch": 23.12, "grad_norm": 4.0625, "kl": 6.242789030075073, "learning_rate": 3.2525967279898015e-05, "loss": 0.2497, "reward": 2.5916460752487183, "reward_std": 2.744908571243286, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.857449769973755, "rewards/no_repetition_reward_func": -0.25799138844013214, "rewards/verse_reward_func": -0.0078125, "step": 2890 }, { "completion_length": 250.5625, "epoch": 23.128, "grad_norm": 2.1875, "kl": 5.679834365844727, "learning_rate": 3.251265117248614e-05, "loss": 0.2272, "reward": 3.0079398155212402, "reward_std": 3.0643731355667114, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3345015048980713, "rewards/no_repetition_reward_func": -0.3187493085861206, "rewards/verse_reward_func": -0.0078125, "step": 2891 }, { "completion_length": 251.359375, "epoch": 23.136, "grad_norm": 4.71875, "kl": 6.2632246017456055, "learning_rate": 3.249933272166629e-05, "loss": 0.2505, "reward": 2.1546478271484375, "reward_std": 2.2087682485580444, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.395377516746521, "rewards/no_repetition_reward_func": -0.2407297044992447, "rewards/verse_reward_func": 0.0, "step": 2892 }, { "completion_length": 256.0, "epoch": 23.144, "grad_norm": 3.5, "kl": 4.749752640724182, "learning_rate": 3.248601193159287e-05, "loss": 0.19, "reward": 2.828648328781128, "reward_std": 2.8336349725723267, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1438517570495605, "rewards/no_repetition_reward_func": -0.31520339846611023, "rewards/verse_reward_func": 0.0, "step": 2893 }, { "completion_length": 246.609375, "epoch": 23.152, "grad_norm": 7.53125, "kl": 7.031430959701538, "learning_rate": 3.247268880642098e-05, "loss": 0.2813, "reward": 2.2557623982429504, "reward_std": 3.0077123641967773, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5224651098251343, "rewards/no_repetition_reward_func": -0.2667028456926346, "rewards/verse_reward_func": 0.0, "step": 2894 }, { "completion_length": 251.734375, "epoch": 23.16, "grad_norm": 7.5, "kl": 5.062967538833618, "learning_rate": 3.245936335030651e-05, "loss": 0.2025, "reward": 3.4371511936187744, "reward_std": 3.343305826187134, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.7460360527038574, "rewards/no_repetition_reward_func": -0.29325979202985764, "rewards/verse_reward_func": -0.015625, "step": 2895 }, { "completion_length": 250.59375, "epoch": 23.168, "grad_norm": 3.234375, "kl": 5.44281530380249, "learning_rate": 3.244603556740603e-05, "loss": 0.2177, "reward": 2.337759256362915, "reward_std": 2.815225839614868, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.631902813911438, "rewards/no_repetition_reward_func": -0.2863309532403946, "rewards/verse_reward_func": -0.0078125, "step": 2896 }, { "completion_length": 246.21875, "epoch": 23.176, "grad_norm": 2.171875, "kl": 4.998854398727417, "learning_rate": 3.243270546187687e-05, "loss": 0.2, "reward": 2.841051459312439, "reward_std": 3.054234743118286, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1049877405166626, "rewards/no_repetition_reward_func": -0.2639364153146744, "rewards/verse_reward_func": 0.0, "step": 2897 }, { "completion_length": 247.1875, "epoch": 23.184, "grad_norm": 2.296875, "kl": 4.632772207260132, "learning_rate": 3.241937303787703e-05, "loss": 0.1853, "reward": 2.673154354095459, "reward_std": 2.7591472864151, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.927351951599121, "rewards/no_repetition_reward_func": -0.2541976273059845, "rewards/verse_reward_func": 0.0, "step": 2898 }, { "completion_length": 250.546875, "epoch": 23.192, "grad_norm": 1.9765625, "kl": 5.065965175628662, "learning_rate": 3.240603829956531e-05, "loss": 0.2026, "reward": 2.4834941625595093, "reward_std": 3.0432465076446533, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7367361783981323, "rewards/no_repetition_reward_func": -0.24542949348688126, "rewards/verse_reward_func": -0.0078125, "step": 2899 }, { "completion_length": 251.4375, "epoch": 23.2, "grad_norm": 1.8359375, "kl": 3.991194725036621, "learning_rate": 3.239270125110117e-05, "loss": 0.1596, "reward": 3.0907187461853027, "reward_std": 3.160212516784668, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.433494806289673, "rewards/no_repetition_reward_func": -0.33496367931365967, "rewards/verse_reward_func": -0.0078125, "step": 2900 }, { "completion_length": 250.328125, "epoch": 23.208, "grad_norm": 1.5859375, "kl": 4.598675012588501, "learning_rate": 3.2379361896644816e-05, "loss": 0.1839, "reward": 2.3178882598876953, "reward_std": 2.765236735343933, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.586424469947815, "rewards/no_repetition_reward_func": -0.26072371006011963, "rewards/verse_reward_func": -0.0078125, "step": 2901 }, { "completion_length": 252.953125, "epoch": 23.216, "grad_norm": 3.734375, "kl": 4.425536394119263, "learning_rate": 3.236602024035716e-05, "loss": 0.177, "reward": 2.5771899223327637, "reward_std": 2.6443378925323486, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.823966145515442, "rewards/no_repetition_reward_func": -0.24677642434835434, "rewards/verse_reward_func": 0.0, "step": 2902 }, { "completion_length": 246.5625, "epoch": 23.224, "grad_norm": 4.90625, "kl": 3.684748411178589, "learning_rate": 3.235267628639987e-05, "loss": 0.1474, "reward": 2.0821120738983154, "reward_std": 2.78627347946167, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3647507429122925, "rewards/no_repetition_reward_func": -0.2826385051012039, "rewards/verse_reward_func": 0.0, "step": 2903 }, { "completion_length": 256.0, "epoch": 23.232, "grad_norm": 4.25, "kl": 3.526005506515503, "learning_rate": 3.2339330038935265e-05, "loss": 0.141, "reward": 2.622506618499756, "reward_std": 2.993556261062622, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.900503396987915, "rewards/no_repetition_reward_func": -0.27799682319164276, "rewards/verse_reward_func": 0.0, "step": 2904 }, { "completion_length": 248.75, "epoch": 23.24, "grad_norm": 2.40625, "kl": 4.109607934951782, "learning_rate": 3.2325981502126433e-05, "loss": 0.1644, "reward": 2.4852463603019714, "reward_std": 3.0230581760406494, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.813163638114929, "rewards/no_repetition_reward_func": -0.304479718208313, "rewards/verse_reward_func": -0.0234375, "step": 2905 }, { "completion_length": 245.828125, "epoch": 23.248, "grad_norm": 3.90625, "kl": 3.0715221166610718, "learning_rate": 3.2312630680137175e-05, "loss": 0.1229, "reward": 2.4260517358779907, "reward_std": 2.8733936548233032, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6738765239715576, "rewards/no_repetition_reward_func": -0.24782492220401764, "rewards/verse_reward_func": 0.0, "step": 2906 }, { "completion_length": 243.59375, "epoch": 23.256, "grad_norm": 2.21875, "kl": 4.738713383674622, "learning_rate": 3.229927757713196e-05, "loss": 0.1895, "reward": 2.445376396179199, "reward_std": 2.8721296787261963, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7580342292785645, "rewards/no_repetition_reward_func": -0.2892203629016876, "rewards/verse_reward_func": -0.0234375, "step": 2907 }, { "completion_length": 251.6875, "epoch": 23.264, "grad_norm": 2.3125, "kl": 4.38228440284729, "learning_rate": 3.228592219727602e-05, "loss": 0.1753, "reward": 2.2410742044448853, "reward_std": 3.0008692741394043, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5599082708358765, "rewards/no_repetition_reward_func": -0.31883394718170166, "rewards/verse_reward_func": 0.0, "step": 2908 }, { "completion_length": 248.1875, "epoch": 23.272, "grad_norm": 1.3359375, "kl": 4.062582015991211, "learning_rate": 3.227256454473526e-05, "loss": 0.1625, "reward": 2.651850700378418, "reward_std": 2.7681180238723755, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.895028531551361, "rewards/no_repetition_reward_func": -0.2431776076555252, "rewards/verse_reward_func": 0.0, "step": 2909 }, { "completion_length": 253.140625, "epoch": 23.28, "grad_norm": 4.6875, "kl": 3.794203519821167, "learning_rate": 3.225920462367632e-05, "loss": 0.1518, "reward": 2.7109934091567993, "reward_std": 2.8695391416549683, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9818243980407715, "rewards/no_repetition_reward_func": -0.2708311155438423, "rewards/verse_reward_func": 0.0, "step": 2910 }, { "completion_length": 250.09375, "epoch": 23.288, "grad_norm": 2.078125, "kl": 4.308750152587891, "learning_rate": 3.2245842438266526e-05, "loss": 0.1724, "reward": 2.830206274986267, "reward_std": 2.912667155265808, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.120707869529724, "rewards/no_repetition_reward_func": -0.2905017137527466, "rewards/verse_reward_func": 0.0, "step": 2911 }, { "completion_length": 246.65625, "epoch": 23.296, "grad_norm": 2.0625, "kl": 4.387230396270752, "learning_rate": 3.223247799267394e-05, "loss": 0.1755, "reward": 2.173493266105652, "reward_std": 3.066106081008911, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4651832580566406, "rewards/no_repetition_reward_func": -0.2682524472475052, "rewards/verse_reward_func": -0.0234375, "step": 2912 }, { "completion_length": 253.453125, "epoch": 23.304, "grad_norm": 2.953125, "kl": 3.642019510269165, "learning_rate": 3.221911129106728e-05, "loss": 0.1457, "reward": 3.8712621927261353, "reward_std": 2.995686888694763, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 4.233771562576294, "rewards/no_repetition_reward_func": -0.36250917613506317, "rewards/verse_reward_func": 0.0, "step": 2913 }, { "completion_length": 256.0, "epoch": 23.312, "grad_norm": 2.90625, "kl": 4.567452311515808, "learning_rate": 3.220574233761603e-05, "loss": 0.1827, "reward": 2.39154052734375, "reward_std": 3.070488691329956, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6990504264831543, "rewards/no_repetition_reward_func": -0.3075098544359207, "rewards/verse_reward_func": 0.0, "step": 2914 }, { "completion_length": 252.703125, "epoch": 23.32, "grad_norm": 1.375, "kl": 5.311326503753662, "learning_rate": 3.219237113649032e-05, "loss": 0.2125, "reward": 2.326733648777008, "reward_std": 3.1283047199249268, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6255552768707275, "rewards/no_repetition_reward_func": -0.2988217920064926, "rewards/verse_reward_func": 0.0, "step": 2915 }, { "completion_length": 255.9375, "epoch": 23.328, "grad_norm": 2.421875, "kl": 4.244477272033691, "learning_rate": 3.2178997691861014e-05, "loss": 0.1698, "reward": 3.418074369430542, "reward_std": 2.931931257247925, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.7230857610702515, "rewards/no_repetition_reward_func": -0.3050116002559662, "rewards/verse_reward_func": 0.0, "step": 2916 }, { "completion_length": 246.828125, "epoch": 23.336, "grad_norm": 2.875, "kl": 4.797587156295776, "learning_rate": 3.2165622007899676e-05, "loss": 0.1919, "reward": 2.6019808053970337, "reward_std": 2.9780389070510864, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.875980257987976, "rewards/no_repetition_reward_func": -0.2739994525909424, "rewards/verse_reward_func": 0.0, "step": 2917 }, { "completion_length": 247.421875, "epoch": 23.344, "grad_norm": 3.265625, "kl": 4.436721920967102, "learning_rate": 3.215224408877854e-05, "loss": 0.1775, "reward": 2.7892322540283203, "reward_std": 3.223597526550293, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0791614055633545, "rewards/no_repetition_reward_func": -0.28992924094200134, "rewards/verse_reward_func": 0.0, "step": 2918 }, { "completion_length": 249.921875, "epoch": 23.352, "grad_norm": 2.890625, "kl": 4.867849111557007, "learning_rate": 3.213886393867057e-05, "loss": 0.1947, "reward": 3.232826590538025, "reward_std": 3.2365365028381348, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.5592522621154785, "rewards/no_repetition_reward_func": -0.31861330568790436, "rewards/verse_reward_func": -0.0078125, "step": 2919 }, { "completion_length": 256.0, "epoch": 23.36, "grad_norm": 1.875, "kl": 5.099873185157776, "learning_rate": 3.21254815617494e-05, "loss": 0.204, "reward": 2.972576379776001, "reward_std": 3.242214798927307, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.286511540412903, "rewards/no_repetition_reward_func": -0.31393514573574066, "rewards/verse_reward_func": 0.0, "step": 2920 }, { "completion_length": 252.9375, "epoch": 23.368, "grad_norm": 2.984375, "kl": 5.556242227554321, "learning_rate": 3.21120969621894e-05, "loss": 0.2222, "reward": 2.3103238344192505, "reward_std": 2.9482311010360718, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.625493049621582, "rewards/no_repetition_reward_func": -0.2995441257953644, "rewards/verse_reward_func": -0.015625, "step": 2921 }, { "completion_length": 255.359375, "epoch": 23.376, "grad_norm": 8.6875, "kl": 6.786177635192871, "learning_rate": 3.209871014416557e-05, "loss": 0.2714, "reward": 2.1394487619400024, "reward_std": 2.710962653160095, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4231176376342773, "rewards/no_repetition_reward_func": -0.2836688160896301, "rewards/verse_reward_func": 0.0, "step": 2922 }, { "completion_length": 253.765625, "epoch": 23.384, "grad_norm": 3.53125, "kl": 3.683037042617798, "learning_rate": 3.208532111185365e-05, "loss": 0.1473, "reward": 3.5531762838363647, "reward_std": 3.0282657146453857, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.8591736555099487, "rewards/no_repetition_reward_func": -0.3059975653886795, "rewards/verse_reward_func": 0.0, "step": 2923 }, { "completion_length": 248.359375, "epoch": 23.392, "grad_norm": 3.53125, "kl": 6.70300030708313, "learning_rate": 3.207192986943006e-05, "loss": 0.2681, "reward": 2.054566740989685, "reward_std": 2.901571273803711, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.367965281009674, "rewards/no_repetition_reward_func": -0.2899608612060547, "rewards/verse_reward_func": -0.0234375, "step": 2924 }, { "completion_length": 246.671875, "epoch": 23.4, "grad_norm": 4.21875, "kl": 7.792448043823242, "learning_rate": 3.205853642107192e-05, "loss": 0.3117, "reward": 2.080842614173889, "reward_std": 2.7265613079071045, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.361764073371887, "rewards/no_repetition_reward_func": -0.2731090635061264, "rewards/verse_reward_func": -0.0078125, "step": 2925 }, { "completion_length": 253.0625, "epoch": 23.408, "grad_norm": 2.640625, "kl": 6.4222893714904785, "learning_rate": 3.204514077095699e-05, "loss": 0.2569, "reward": 2.4012720584869385, "reward_std": 2.983275532722473, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6821283102035522, "rewards/no_repetition_reward_func": -0.28085649013519287, "rewards/verse_reward_func": 0.0, "step": 2926 }, { "completion_length": 250.078125, "epoch": 23.416, "grad_norm": 6.375, "kl": 7.072510719299316, "learning_rate": 3.203174292326378e-05, "loss": 0.2829, "reward": 2.1671489477157593, "reward_std": 2.843383312225342, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.436126947402954, "rewards/no_repetition_reward_func": -0.2689779996871948, "rewards/verse_reward_func": 0.0, "step": 2927 }, { "completion_length": 244.890625, "epoch": 23.424, "grad_norm": 3.1875, "kl": 6.0562968254089355, "learning_rate": 3.2018342882171445e-05, "loss": 0.2423, "reward": 2.513888716697693, "reward_std": 3.0791990756988525, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8089256286621094, "rewards/no_repetition_reward_func": -0.2872244715690613, "rewards/verse_reward_func": -0.0078125, "step": 2928 }, { "completion_length": 248.796875, "epoch": 23.432, "grad_norm": 6.03125, "kl": 6.725503206253052, "learning_rate": 3.2004940651859844e-05, "loss": 0.269, "reward": 2.1248069405555725, "reward_std": 3.096023917198181, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4194225072860718, "rewards/no_repetition_reward_func": -0.2868030071258545, "rewards/verse_reward_func": -0.0078125, "step": 2929 }, { "completion_length": 252.1875, "epoch": 23.44, "grad_norm": 1.984375, "kl": 5.600532293319702, "learning_rate": 3.19915362365095e-05, "loss": 0.224, "reward": 3.131295144557953, "reward_std": 3.0124837160110474, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.423938512802124, "rewards/no_repetition_reward_func": -0.2848309874534607, "rewards/verse_reward_func": -0.0078125, "step": 2930 }, { "completion_length": 248.96875, "epoch": 23.448, "grad_norm": 2.46875, "kl": 5.645550489425659, "learning_rate": 3.197812964030164e-05, "loss": 0.2258, "reward": 2.418593406677246, "reward_std": 2.9717400074005127, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6835033893585205, "rewards/no_repetition_reward_func": -0.2649097964167595, "rewards/verse_reward_func": 0.0, "step": 2931 }, { "completion_length": 251.375, "epoch": 23.456, "grad_norm": 2.484375, "kl": 5.3776373863220215, "learning_rate": 3.196472086741815e-05, "loss": 0.2151, "reward": 2.047680377960205, "reward_std": 3.126461982727051, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3667640686035156, "rewards/no_repetition_reward_func": -0.31127113103866577, "rewards/verse_reward_func": -0.0078125, "step": 2932 }, { "completion_length": 248.328125, "epoch": 23.464, "grad_norm": 2.84375, "kl": 4.048956394195557, "learning_rate": 3.195130992204161e-05, "loss": 0.162, "reward": 2.6217763423919678, "reward_std": 2.8147716522216797, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9006154537200928, "rewards/no_repetition_reward_func": -0.27102671563625336, "rewards/verse_reward_func": -0.0078125, "step": 2933 }, { "completion_length": 255.96875, "epoch": 23.472, "grad_norm": 1.890625, "kl": 4.920663356781006, "learning_rate": 3.193789680835527e-05, "loss": 0.1968, "reward": 2.147469639778137, "reward_std": 2.7511298656463623, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.401240587234497, "rewards/no_repetition_reward_func": -0.25377099961042404, "rewards/verse_reward_func": 0.0, "step": 2934 }, { "completion_length": 250.390625, "epoch": 23.48, "grad_norm": 3.1875, "kl": 3.2651867866516113, "learning_rate": 3.192448153054306e-05, "loss": 0.1306, "reward": 2.7919936180114746, "reward_std": 2.871166467666626, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0384896993637085, "rewards/no_repetition_reward_func": -0.2464960440993309, "rewards/verse_reward_func": 0.0, "step": 2935 }, { "completion_length": 250.25, "epoch": 23.488, "grad_norm": 3.734375, "kl": 2.9016600847244263, "learning_rate": 3.191106409278959e-05, "loss": 0.1161, "reward": 3.334972858428955, "reward_std": 3.286835193634033, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.670320510864258, "rewards/no_repetition_reward_func": -0.33534786105155945, "rewards/verse_reward_func": 0.0, "step": 2936 }, { "completion_length": 256.0, "epoch": 23.496, "grad_norm": 3.703125, "kl": 2.9444658756256104, "learning_rate": 3.189764449928012e-05, "loss": 0.1178, "reward": 3.34978711605072, "reward_std": 3.2306026220321655, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.7104995250701904, "rewards/no_repetition_reward_func": -0.3607124090194702, "rewards/verse_reward_func": 0.0, "step": 2937 }, { "completion_length": 246.984375, "epoch": 23.504, "grad_norm": 2.671875, "kl": 4.224492788314819, "learning_rate": 3.1884222754200625e-05, "loss": 0.169, "reward": 2.290341377258301, "reward_std": 2.957060217857361, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5644410848617554, "rewards/no_repetition_reward_func": -0.25847454369068146, "rewards/verse_reward_func": -0.015625, "step": 2938 }, { "completion_length": 243.71875, "epoch": 23.512, "grad_norm": 2.234375, "kl": 3.6904410123825073, "learning_rate": 3.1870798861737705e-05, "loss": 0.1476, "reward": 2.468954563140869, "reward_std": 2.5299153327941895, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7258936166763306, "rewards/no_repetition_reward_func": -0.25693898648023605, "rewards/verse_reward_func": 0.0, "step": 2939 }, { "completion_length": 254.328125, "epoch": 23.52, "grad_norm": 2.0625, "kl": 4.005591511726379, "learning_rate": 3.185737282607867e-05, "loss": 0.1602, "reward": 2.4277628660202026, "reward_std": 2.8826671838760376, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.707442045211792, "rewards/no_repetition_reward_func": -0.2796793282032013, "rewards/verse_reward_func": 0.0, "step": 2940 }, { "completion_length": 254.53125, "epoch": 23.528, "grad_norm": 2.421875, "kl": 4.508047103881836, "learning_rate": 3.1843944651411456e-05, "loss": 0.1803, "reward": 3.0899306535720825, "reward_std": 3.386237144470215, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4138561487197876, "rewards/no_repetition_reward_func": -0.3239254802465439, "rewards/verse_reward_func": 0.0, "step": 2941 }, { "completion_length": 251.765625, "epoch": 23.536, "grad_norm": 3.015625, "kl": 4.732879638671875, "learning_rate": 3.183051434192471e-05, "loss": 0.1893, "reward": 2.710882604122162, "reward_std": 3.0109798908233643, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.0040920972824097, "rewards/no_repetition_reward_func": -0.2853972166776657, "rewards/verse_reward_func": -0.0078125, "step": 2942 }, { "completion_length": 247.390625, "epoch": 23.544, "grad_norm": 4.0, "kl": 4.07801628112793, "learning_rate": 3.181708190180771e-05, "loss": 0.1631, "reward": 2.778223752975464, "reward_std": 3.252761125564575, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.096638798713684, "rewards/no_repetition_reward_func": -0.31060245633125305, "rewards/verse_reward_func": -0.0078125, "step": 2943 }, { "completion_length": 248.296875, "epoch": 23.552, "grad_norm": 3.359375, "kl": 5.028027534484863, "learning_rate": 3.180364733525043e-05, "loss": 0.2011, "reward": 2.2446686029434204, "reward_std": 2.9468486309051514, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.495680332183838, "rewards/no_repetition_reward_func": -0.25101158767938614, "rewards/verse_reward_func": 0.0, "step": 2944 }, { "completion_length": 248.578125, "epoch": 23.56, "grad_norm": 2.859375, "kl": 5.251175403594971, "learning_rate": 3.179021064644347e-05, "loss": 0.21, "reward": 1.8819907307624817, "reward_std": 2.778993844985962, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.1547547578811646, "rewards/no_repetition_reward_func": -0.27276408672332764, "rewards/verse_reward_func": 0.0, "step": 2945 }, { "completion_length": 248.46875, "epoch": 23.568, "grad_norm": 2.078125, "kl": 4.4652910232543945, "learning_rate": 3.177677183957813e-05, "loss": 0.1786, "reward": 2.398245096206665, "reward_std": 3.1388930082321167, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.713927745819092, "rewards/no_repetition_reward_func": -0.300057590007782, "rewards/verse_reward_func": -0.015625, "step": 2946 }, { "completion_length": 252.484375, "epoch": 23.576, "grad_norm": 2.03125, "kl": 4.390887975692749, "learning_rate": 3.176333091884635e-05, "loss": 0.1756, "reward": 3.132864832878113, "reward_std": 3.269501566886902, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4676908254623413, "rewards/no_repetition_reward_func": -0.32701361179351807, "rewards/verse_reward_func": -0.0078125, "step": 2947 }, { "completion_length": 253.234375, "epoch": 23.584, "grad_norm": 2.5625, "kl": 4.642548084259033, "learning_rate": 3.174988788844072e-05, "loss": 0.1857, "reward": 3.0376826524734497, "reward_std": 3.220319986343384, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3379091024398804, "rewards/no_repetition_reward_func": -0.30022649466991425, "rewards/verse_reward_func": 0.0, "step": 2948 }, { "completion_length": 252.3125, "epoch": 23.592, "grad_norm": 2.84375, "kl": 4.400968551635742, "learning_rate": 3.173644275255451e-05, "loss": 0.176, "reward": 3.0072101354599, "reward_std": 3.0458039045333862, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3240902423858643, "rewards/no_repetition_reward_func": -0.31688034534454346, "rewards/verse_reward_func": 0.0, "step": 2949 }, { "completion_length": 252.90625, "epoch": 23.6, "grad_norm": 3.3125, "kl": 4.762156248092651, "learning_rate": 3.172299551538164e-05, "loss": 0.1905, "reward": 3.091443657875061, "reward_std": 3.0516204833984375, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3788561820983887, "rewards/no_repetition_reward_func": -0.2795998305082321, "rewards/verse_reward_func": -0.0078125, "step": 2950 }, { "completion_length": 250.734375, "epoch": 23.608, "grad_norm": 1.8984375, "kl": 5.114767551422119, "learning_rate": 3.170954618111669e-05, "loss": 0.2046, "reward": 2.899174451828003, "reward_std": 3.1909605264663696, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.202522397041321, "rewards/no_repetition_reward_func": -0.30334794521331787, "rewards/verse_reward_func": 0.0, "step": 2951 }, { "completion_length": 252.59375, "epoch": 23.616, "grad_norm": 2.484375, "kl": 5.35274600982666, "learning_rate": 3.169609475395486e-05, "loss": 0.2141, "reward": 2.361617386341095, "reward_std": 2.472975492477417, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.644015073776245, "rewards/no_repetition_reward_func": -0.2745853662490845, "rewards/verse_reward_func": -0.0078125, "step": 2952 }, { "completion_length": 245.375, "epoch": 23.624, "grad_norm": 2.5625, "kl": 4.963543891906738, "learning_rate": 3.1682641238092064e-05, "loss": 0.1985, "reward": 2.3556859493255615, "reward_std": 2.7483056783676147, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.623940110206604, "rewards/no_repetition_reward_func": -0.26044175028800964, "rewards/verse_reward_func": -0.0078125, "step": 2953 }, { "completion_length": 249.78125, "epoch": 23.632, "grad_norm": 1.8125, "kl": 5.809852838516235, "learning_rate": 3.166918563772481e-05, "loss": 0.2324, "reward": 2.5673182010650635, "reward_std": 3.200206160545349, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8917495012283325, "rewards/no_repetition_reward_func": -0.324431374669075, "rewards/verse_reward_func": 0.0, "step": 2954 }, { "completion_length": 247.15625, "epoch": 23.64, "grad_norm": 3.265625, "kl": 6.036417007446289, "learning_rate": 3.1655727957050285e-05, "loss": 0.2415, "reward": 2.9818379878997803, "reward_std": 3.277377128601074, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.2638763189315796, "rewards/no_repetition_reward_func": -0.28203848004341125, "rewards/verse_reward_func": 0.0, "step": 2955 }, { "completion_length": 251.671875, "epoch": 23.648, "grad_norm": 5.875, "kl": 6.6481781005859375, "learning_rate": 3.1642268200266317e-05, "loss": 0.2659, "reward": 2.729393482208252, "reward_std": 3.4128358364105225, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.026747226715088, "rewards/no_repetition_reward_func": -0.2895413190126419, "rewards/verse_reward_func": -0.0078125, "step": 2956 }, { "completion_length": 252.78125, "epoch": 23.656, "grad_norm": 5.5625, "kl": 5.14867639541626, "learning_rate": 3.162880637157139e-05, "loss": 0.2059, "reward": 2.568422317504883, "reward_std": 3.0564332008361816, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.854193329811096, "rewards/no_repetition_reward_func": -0.28577081859111786, "rewards/verse_reward_func": 0.0, "step": 2957 }, { "completion_length": 250.109375, "epoch": 23.664, "grad_norm": 4.4375, "kl": 5.301247835159302, "learning_rate": 3.1615342475164636e-05, "loss": 0.212, "reward": 2.02980637550354, "reward_std": 2.7996432781219482, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3144790530204773, "rewards/no_repetition_reward_func": -0.27686021476984024, "rewards/verse_reward_func": -0.0078125, "step": 2958 }, { "completion_length": 252.28125, "epoch": 23.672, "grad_norm": 2.5, "kl": 4.77146315574646, "learning_rate": 3.16018765152458e-05, "loss": 0.1909, "reward": 2.6023961305618286, "reward_std": 3.1513150930404663, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.907130241394043, "rewards/no_repetition_reward_func": -0.30473411083221436, "rewards/verse_reward_func": 0.0, "step": 2959 }, { "completion_length": 248.921875, "epoch": 23.68, "grad_norm": 4.84375, "kl": 6.645594835281372, "learning_rate": 3.158840849601532e-05, "loss": 0.2658, "reward": 1.5041192770004272, "reward_std": 2.4700201749801636, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.768092155456543, "rewards/no_repetition_reward_func": -0.2639729082584381, "rewards/verse_reward_func": 0.0, "step": 2960 }, { "completion_length": 247.5625, "epoch": 23.688, "grad_norm": 3.421875, "kl": 4.478116750717163, "learning_rate": 3.157493842167423e-05, "loss": 0.1791, "reward": 2.66098690032959, "reward_std": 3.1554254293441772, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.971148371696472, "rewards/no_repetition_reward_func": -0.2945364713668823, "rewards/verse_reward_func": -0.015625, "step": 2961 }, { "completion_length": 240.890625, "epoch": 23.696, "grad_norm": 8.5625, "kl": 7.4530274868011475, "learning_rate": 3.156146629642425e-05, "loss": 0.2981, "reward": 1.138789176940918, "reward_std": 1.8850326538085938, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.3719606399536133, "rewards/no_repetition_reward_func": -0.21754635125398636, "rewards/verse_reward_func": -0.015625, "step": 2962 }, { "completion_length": 251.828125, "epoch": 23.704, "grad_norm": 1.734375, "kl": 5.194585084915161, "learning_rate": 3.15479921244677e-05, "loss": 0.2078, "reward": 2.816285014152527, "reward_std": 3.339402437210083, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.1150243282318115, "rewards/no_repetition_reward_func": -0.2909267842769623, "rewards/verse_reward_func": -0.0078125, "step": 2963 }, { "completion_length": 253.34375, "epoch": 23.712, "grad_norm": 2.234375, "kl": 4.19823431968689, "learning_rate": 3.153451591000756e-05, "loss": 0.1679, "reward": 2.6210230588912964, "reward_std": 2.75546395778656, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.9110809564590454, "rewards/no_repetition_reward_func": -0.2900580167770386, "rewards/verse_reward_func": 0.0, "step": 2964 }, { "completion_length": 252.828125, "epoch": 23.72, "grad_norm": 1.9921875, "kl": 5.823908090591431, "learning_rate": 3.152103765724743e-05, "loss": 0.233, "reward": 2.3366535902023315, "reward_std": 3.169995665550232, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.589608669281006, "rewards/no_repetition_reward_func": -0.2451426461338997, "rewards/verse_reward_func": -0.0078125, "step": 2965 }, { "completion_length": 254.4375, "epoch": 23.728, "grad_norm": 3.3125, "kl": 5.736942291259766, "learning_rate": 3.150755737039157e-05, "loss": 0.2295, "reward": 1.902836799621582, "reward_std": 2.631964325904846, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.134510338306427, "rewards/no_repetition_reward_func": -0.22386101633310318, "rewards/verse_reward_func": -0.0078125, "step": 2966 }, { "completion_length": 250.984375, "epoch": 23.736, "grad_norm": 8.875, "kl": 5.94452428817749, "learning_rate": 3.149407505364486e-05, "loss": 0.2378, "reward": 2.0423758029937744, "reward_std": 2.4762723445892334, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.3168269991874695, "rewards/no_repetition_reward_func": -0.27445119619369507, "rewards/verse_reward_func": 0.0, "step": 2967 }, { "completion_length": 249.0625, "epoch": 23.744, "grad_norm": 1.9765625, "kl": 4.35528028011322, "learning_rate": 3.148059071121282e-05, "loss": 0.1742, "reward": 2.2350900173187256, "reward_std": 2.876222610473633, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5328011512756348, "rewards/no_repetition_reward_func": -0.28989867866039276, "rewards/verse_reward_func": -0.0078125, "step": 2968 }, { "completion_length": 251.796875, "epoch": 23.752, "grad_norm": 2.640625, "kl": 5.9742279052734375, "learning_rate": 3.146710434730159e-05, "loss": 0.239, "reward": 1.6595627665519714, "reward_std": 2.386244773864746, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.9213481545448303, "rewards/no_repetition_reward_func": -0.253973051905632, "rewards/verse_reward_func": -0.0078125, "step": 2969 }, { "completion_length": 242.5, "epoch": 23.76, "grad_norm": 1.9765625, "kl": 4.326176881790161, "learning_rate": 3.145361596611795e-05, "loss": 0.173, "reward": 2.8246397972106934, "reward_std": 2.8778666257858276, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.130977153778076, "rewards/no_repetition_reward_func": -0.29852481186389923, "rewards/verse_reward_func": -0.0078125, "step": 2970 }, { "completion_length": 244.390625, "epoch": 23.768, "grad_norm": 3.078125, "kl": 4.133926630020142, "learning_rate": 3.1440125571869306e-05, "loss": 0.1654, "reward": 3.1946282386779785, "reward_std": 3.3444888591766357, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4864777326583862, "rewards/no_repetition_reward_func": -0.2684119790792465, "rewards/verse_reward_func": -0.0234375, "step": 2971 }, { "completion_length": 256.0, "epoch": 23.776, "grad_norm": 2.34375, "kl": 4.2671003341674805, "learning_rate": 3.142663316876368e-05, "loss": 0.1707, "reward": 2.4852635860443115, "reward_std": 2.8562170267105103, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.7808340787887573, "rewards/no_repetition_reward_func": -0.2955704927444458, "rewards/verse_reward_func": 0.0, "step": 2972 }, { "completion_length": 250.046875, "epoch": 23.784, "grad_norm": 4.875, "kl": 3.6868479251861572, "learning_rate": 3.141313876100976e-05, "loss": 0.1475, "reward": 3.0617785453796387, "reward_std": 3.0284756422042847, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3884247541427612, "rewards/no_repetition_reward_func": -0.3266463428735733, "rewards/verse_reward_func": 0.0, "step": 2973 }, { "completion_length": 244.265625, "epoch": 23.792, "grad_norm": 1.4453125, "kl": 4.725560545921326, "learning_rate": 3.139964235281682e-05, "loss": 0.189, "reward": 2.629732370376587, "reward_std": 3.1465765237808228, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.904516577720642, "rewards/no_repetition_reward_func": -0.274784192442894, "rewards/verse_reward_func": 0.0, "step": 2974 }, { "completion_length": 253.40625, "epoch": 23.8, "grad_norm": 3.59375, "kl": 4.90722131729126, "learning_rate": 3.138614394839476e-05, "loss": 0.1963, "reward": 1.8978312015533447, "reward_std": 2.510649800300598, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.164921283721924, "rewards/no_repetition_reward_func": -0.2670901045203209, "rewards/verse_reward_func": 0.0, "step": 2975 }, { "completion_length": 248.921875, "epoch": 23.808, "grad_norm": 1.3515625, "kl": 4.664176940917969, "learning_rate": 3.137264355195413e-05, "loss": 0.1866, "reward": 2.2948999404907227, "reward_std": 2.6725995540618896, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5573720932006836, "rewards/no_repetition_reward_func": -0.2546597719192505, "rewards/verse_reward_func": -0.0078125, "step": 2976 }, { "completion_length": 240.8125, "epoch": 23.816, "grad_norm": 1.3671875, "kl": 4.449814319610596, "learning_rate": 3.135914116770609e-05, "loss": 0.178, "reward": 2.3148247599601746, "reward_std": 2.7487813234329224, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.575141429901123, "rewards/no_repetition_reward_func": -0.25250399112701416, "rewards/verse_reward_func": -0.0078125, "step": 2977 }, { "completion_length": 243.484375, "epoch": 23.824, "grad_norm": 2.046875, "kl": 4.953411817550659, "learning_rate": 3.134563679986238e-05, "loss": 0.1981, "reward": 2.3406269550323486, "reward_std": 3.2774808406829834, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6021323204040527, "rewards/no_repetition_reward_func": -0.25369296222925186, "rewards/verse_reward_func": -0.0078125, "step": 2978 }, { "completion_length": 251.171875, "epoch": 23.832, "grad_norm": 1.5234375, "kl": 5.103280544281006, "learning_rate": 3.133213045263543e-05, "loss": 0.2041, "reward": 2.002312421798706, "reward_std": 2.8208677768707275, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.2892906069755554, "rewards/no_repetition_reward_func": -0.27916572988033295, "rewards/verse_reward_func": -0.0078125, "step": 2979 }, { "completion_length": 246.890625, "epoch": 23.84, "grad_norm": 3.296875, "kl": 4.0988686084747314, "learning_rate": 3.1318622130238236e-05, "loss": 0.164, "reward": 3.150997281074524, "reward_std": 3.0029101371765137, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.403026580810547, "rewards/no_repetition_reward_func": -0.2520291581749916, "rewards/verse_reward_func": 0.0, "step": 2980 }, { "completion_length": 249.828125, "epoch": 23.848, "grad_norm": 2.75, "kl": 3.6284555196762085, "learning_rate": 3.1305111836884425e-05, "loss": 0.1451, "reward": 3.3576632738113403, "reward_std": 3.121533155441284, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.6778485774993896, "rewards/no_repetition_reward_func": -0.32018527388572693, "rewards/verse_reward_func": 0.0, "step": 2981 }, { "completion_length": 255.796875, "epoch": 23.856, "grad_norm": 3.46875, "kl": 3.887643814086914, "learning_rate": 3.129159957678824e-05, "loss": 0.1555, "reward": 3.0871453285217285, "reward_std": 3.1124805212020874, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.396891474723816, "rewards/no_repetition_reward_func": -0.30974605679512024, "rewards/verse_reward_func": 0.0, "step": 2982 }, { "completion_length": 246.015625, "epoch": 23.864, "grad_norm": 4.0, "kl": 4.961579322814941, "learning_rate": 3.127808535416454e-05, "loss": 0.1985, "reward": 2.5783982276916504, "reward_std": 2.9243184328079224, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.886088252067566, "rewards/no_repetition_reward_func": -0.29987749457359314, "rewards/verse_reward_func": -0.0078125, "step": 2983 }, { "completion_length": 252.234375, "epoch": 23.872, "grad_norm": 3.015625, "kl": 4.912227630615234, "learning_rate": 3.126456917322878e-05, "loss": 0.1965, "reward": 2.567390203475952, "reward_std": 3.127951502799988, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.8466131687164307, "rewards/no_repetition_reward_func": -0.27922289073467255, "rewards/verse_reward_func": 0.0, "step": 2984 }, { "completion_length": 254.765625, "epoch": 23.88, "grad_norm": 4.125, "kl": 5.368544578552246, "learning_rate": 3.1251051038197055e-05, "loss": 0.2147, "reward": 2.034441351890564, "reward_std": 2.8266249895095825, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.348509669303894, "rewards/no_repetition_reward_func": -0.3062557801604271, "rewards/verse_reward_func": -0.0078125, "step": 2985 }, { "completion_length": 245.984375, "epoch": 23.888, "grad_norm": 3.328125, "kl": 4.073462247848511, "learning_rate": 3.123753095328604e-05, "loss": 0.1629, "reward": 3.1744959354400635, "reward_std": 2.931050181388855, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.4896024465560913, "rewards/no_repetition_reward_func": -0.29948167502880096, "rewards/verse_reward_func": -0.015625, "step": 2986 }, { "completion_length": 248.46875, "epoch": 23.896, "grad_norm": 3.5, "kl": 5.224755764007568, "learning_rate": 3.1224008922713044e-05, "loss": 0.209, "reward": 2.3681883811950684, "reward_std": 2.987470269203186, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6202951669692993, "rewards/no_repetition_reward_func": -0.25210700184106827, "rewards/verse_reward_func": 0.0, "step": 2987 }, { "completion_length": 243.4375, "epoch": 23.904, "grad_norm": 3.359375, "kl": 3.7962793111801147, "learning_rate": 3.121048495069596e-05, "loss": 0.1519, "reward": 3.4263510704040527, "reward_std": 2.987104892730713, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.7620949745178223, "rewards/no_repetition_reward_func": -0.3279312700033188, "rewards/verse_reward_func": -0.0078125, "step": 2988 }, { "completion_length": 256.0, "epoch": 23.912, "grad_norm": 4.78125, "kl": 5.069756507873535, "learning_rate": 3.11969590414533e-05, "loss": 0.2028, "reward": 2.603839874267578, "reward_std": 3.1214765310287476, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.885442018508911, "rewards/no_repetition_reward_func": -0.28160223364830017, "rewards/verse_reward_func": 0.0, "step": 2989 }, { "completion_length": 252.78125, "epoch": 23.92, "grad_norm": 3.234375, "kl": 5.620175123214722, "learning_rate": 3.118343119920418e-05, "loss": 0.2248, "reward": 2.3021263480186462, "reward_std": 2.8267048597335815, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.5428558588027954, "rewards/no_repetition_reward_func": -0.24072961509227753, "rewards/verse_reward_func": 0.0, "step": 2990 }, { "completion_length": 251.765625, "epoch": 23.928, "grad_norm": 4.46875, "kl": 5.487448215484619, "learning_rate": 3.11699014281683e-05, "loss": 0.2195, "reward": 3.075846791267395, "reward_std": 3.493288278579712, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.3717575073242188, "rewards/no_repetition_reward_func": -0.2959108203649521, "rewards/verse_reward_func": 0.0, "step": 2991 }, { "completion_length": 251.859375, "epoch": 23.936, "grad_norm": 7.6875, "kl": 5.829293251037598, "learning_rate": 3.1156369732566006e-05, "loss": 0.2332, "reward": 2.2138837575912476, "reward_std": 2.6585992574691772, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.4947121143341064, "rewards/no_repetition_reward_func": -0.2808285355567932, "rewards/verse_reward_func": 0.0, "step": 2992 }, { "completion_length": 245.6875, "epoch": 23.944, "grad_norm": 3.984375, "kl": 4.289042711257935, "learning_rate": 3.114283611661818e-05, "loss": 0.1716, "reward": 3.2459163665771484, "reward_std": 3.3250977993011475, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 3.539999485015869, "rewards/no_repetition_reward_func": -0.29408323764801025, "rewards/verse_reward_func": 0.0, "step": 2993 }, { "completion_length": 254.75, "epoch": 23.951999999999998, "grad_norm": 4.4375, "kl": 5.672282934188843, "learning_rate": 3.1129300584546375e-05, "loss": 0.2269, "reward": 1.7567574381828308, "reward_std": 2.379573941230774, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 1.96185302734375, "rewards/no_repetition_reward_func": -0.2050955444574356, "rewards/verse_reward_func": 0.0, "step": 2994 }, { "completion_length": 256.0, "epoch": 23.96, "grad_norm": 1.8671875, "kl": 5.12778639793396, "learning_rate": 3.111576314057268e-05, "loss": 0.2051, "reward": 2.339706778526306, "reward_std": 2.8263076543807983, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6261974573135376, "rewards/no_repetition_reward_func": -0.270865797996521, "rewards/verse_reward_func": -0.015625, "step": 2995 }, { "completion_length": 243.484375, "epoch": 23.968, "grad_norm": 3.171875, "kl": 5.251835107803345, "learning_rate": 3.1102223788919824e-05, "loss": 0.2101, "reward": 2.305238664150238, "reward_std": 2.9614421129226685, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.581118583679199, "rewards/no_repetition_reward_func": -0.2602549344301224, "rewards/verse_reward_func": -0.015625, "step": 2996 }, { "completion_length": 248.65625, "epoch": 23.976, "grad_norm": 1.4921875, "kl": 5.458730459213257, "learning_rate": 3.10886825338111e-05, "loss": 0.2183, "reward": 2.388221025466919, "reward_std": 2.782841682434082, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.6294111013412476, "rewards/no_repetition_reward_func": -0.2333776131272316, "rewards/verse_reward_func": -0.0078125, "step": 2997 }, { "completion_length": 250.484375, "epoch": 23.984, "grad_norm": 1.6484375, "kl": 5.052369594573975, "learning_rate": 3.107513937947041e-05, "loss": 0.2021, "reward": 2.137341797351837, "reward_std": 2.7948638200759888, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.393793523311615, "rewards/no_repetition_reward_func": -0.25645167380571365, "rewards/verse_reward_func": 0.0, "step": 2998 }, { "completion_length": 244.515625, "epoch": 23.992, "grad_norm": 1.71875, "kl": 4.356399774551392, "learning_rate": 3.1061594330122246e-05, "loss": 0.1743, "reward": 2.6260764598846436, "reward_std": 2.9453532695770264, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.923661708831787, "rewards/no_repetition_reward_func": -0.297585129737854, "rewards/verse_reward_func": 0.0, "step": 2999 }, { "completion_length": 256.0, "epoch": 24.0, "grad_norm": 1.6875, "kl": 5.521050930023193, "learning_rate": 3.104804738999169e-05, "loss": 0.2208, "reward": 2.097045421600342, "reward_std": 2.955470561981201, "rewards/check_divine_comedy_plagiarism": 0.0, "rewards/endecasillabo_reward_func": 2.372663378715515, "rewards/no_repetition_reward_func": -0.2678053304553032, "rewards/verse_reward_func": -0.0078125, "step": 3000 } ], "logging_steps": 1, "max_steps": 6250, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }