diff --git "a/checkpoint-1000/trainer_state.json" "b/checkpoint-1000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1000/trainer_state.json" @@ -0,0 +1,15033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 442.78125, + "epoch": 0.008, + "grad_norm": 0.31640625, + "kl": 0.0, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.0, + "reward": 0.46993792057037354, + "reward_std": 0.8625249862670898, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 0.6264747679233551, + "rewards/no_repetition_reward_func": -0.04716183803975582, + "rewards/verse_reward_func": 0.0, + "step": 1 + }, + { + "completion_length": 461.078125, + "epoch": 0.016, + "grad_norm": 0.36328125, + "kl": 0.0, + "learning_rate": 8.000000000000001e-07, + "loss": -0.0, + "reward": 0.3795546889305115, + "reward_std": 0.748875766992569, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 0.5617085993289948, + "rewards/no_repetition_reward_func": -0.04152892902493477, + "rewards/verse_reward_func": 0.0, + "step": 2 + }, + { + "completion_length": 426.515625, + "epoch": 0.024, + "grad_norm": 0.70703125, + "kl": 0.0009579287725500762, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.0, + "reward": 0.42995311319828033, + "reward_std": 0.7965883165597916, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 0.6267977207899094, + "rewards/no_repetition_reward_func": -0.040594594553112984, + "rewards/verse_reward_func": 0.0, + "step": 3 + }, + { + "completion_length": 483.578125, + "epoch": 0.032, + "grad_norm": 0.3125, + "kl": 0.0010316886473447084, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.0, + "reward": 0.34902599453926086, + "reward_std": 0.5390940457582474, + "rewards/check_divine_comedy_plagiarism": -0.1875, + "rewards/endecasillabo_reward_func": 0.5808515548706055, + "rewards/no_repetition_reward_func": -0.044325582683086395, + "rewards/verse_reward_func": 0.0, + "step": 4 + }, + { + "completion_length": 426.0, + "epoch": 0.04, + "grad_norm": 0.53515625, + "kl": 0.000956712057814002, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0, + "reward": 0.44258762896060944, + "reward_std": 1.0057218968868256, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 0.6770601570606232, + "rewards/no_repetition_reward_func": -0.0469724927097559, + "rewards/verse_reward_func": -0.015625, + "step": 5 + }, + { + "completion_length": 450.609375, + "epoch": 0.048, + "grad_norm": 0.640625, + "kl": 0.0010197210358455777, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0, + "reward": 0.4333910793066025, + "reward_std": 0.6018790304660797, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 0.5409813523292542, + "rewards/no_repetition_reward_func": -0.03727777861058712, + "rewards/verse_reward_func": -0.0078125, + "step": 6 + }, + { + "completion_length": 484.75, + "epoch": 0.056, + "grad_norm": 0.341796875, + "kl": 0.0010506573598831892, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0, + "reward": 0.31627973914146423, + "reward_std": 0.43295249342918396, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 0.510929599404335, + "rewards/no_repetition_reward_func": -0.03839987888932228, + "rewards/verse_reward_func": 0.0, + "step": 7 + }, + { + "completion_length": 429.234375, + "epoch": 0.064, + "grad_norm": 0.58984375, + "kl": 0.0009898374555632472, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0, + "reward": 0.3031166270375252, + "reward_std": 0.5785702913999557, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 0.5378998965024948, + "rewards/no_repetition_reward_func": -0.04728328995406628, + "rewards/verse_reward_func": -0.015625, + "step": 8 + }, + { + "completion_length": 471.328125, + "epoch": 0.072, + "grad_norm": 0.34375, + "kl": 0.00097603106405586, + "learning_rate": 3.6e-06, + "loss": 0.0, + "reward": 0.23348665237426758, + "reward_std": 0.4668666422367096, + "rewards/check_divine_comedy_plagiarism": -0.203125, + "rewards/endecasillabo_reward_func": 0.48599667847156525, + "rewards/no_repetition_reward_func": -0.04938501678407192, + "rewards/verse_reward_func": 0.0, + "step": 9 + }, + { + "completion_length": 463.390625, + "epoch": 0.08, + "grad_norm": 0.39453125, + "kl": 0.0010067435214295983, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0, + "reward": 0.2730628699064255, + "reward_std": 0.519858330488205, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 0.4666738659143448, + "rewards/no_repetition_reward_func": -0.045173484832048416, + "rewards/verse_reward_func": -0.0078125, + "step": 10 + }, + { + "completion_length": 475.140625, + "epoch": 0.088, + "grad_norm": 0.34765625, + "kl": 0.0009860311402007937, + "learning_rate": 4.4e-06, + "loss": 0.0, + "reward": 0.2627307251095772, + "reward_std": 0.43047034740448, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 0.46805787086486816, + "rewards/no_repetition_reward_func": -0.049077119678258896, + "rewards/verse_reward_func": 0.0, + "step": 11 + }, + { + "completion_length": 473.09375, + "epoch": 0.096, + "grad_norm": 0.58203125, + "kl": 0.0010468198452144861, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0, + "reward": 0.3099640756845474, + "reward_std": 0.42746463418006897, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 0.5119439959526062, + "rewards/no_repetition_reward_func": -0.037917450070381165, + "rewards/verse_reward_func": -0.0078125, + "step": 12 + }, + { + "completion_length": 457.078125, + "epoch": 0.104, + "grad_norm": 0.4921875, + "kl": 0.0009915704722516239, + "learning_rate": 5.2e-06, + "loss": 0.0, + "reward": 0.37225714325904846, + "reward_std": 0.48906467854976654, + "rewards/check_divine_comedy_plagiarism": -0.1875, + "rewards/endecasillabo_reward_func": 0.6121878623962402, + "rewards/no_repetition_reward_func": -0.044618215411901474, + "rewards/verse_reward_func": -0.0078125, + "step": 13 + }, + { + "completion_length": 489.078125, + "epoch": 0.112, + "grad_norm": 0.306640625, + "kl": 0.0009826593450270593, + "learning_rate": 5.600000000000001e-06, + "loss": 0.0, + "reward": 0.2486378364264965, + "reward_std": 0.45466698706150055, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 0.4032948464155197, + "rewards/no_repetition_reward_func": -0.045282019302248955, + "rewards/verse_reward_func": 0.0, + "step": 14 + }, + { + "completion_length": 470.25, + "epoch": 0.12, + "grad_norm": 0.353515625, + "kl": 0.0010016491869464517, + "learning_rate": 6e-06, + "loss": 0.0, + "reward": 0.2742869704961777, + "reward_std": 0.5216390788555145, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 0.5065180063247681, + "rewards/no_repetition_reward_func": -0.044731052592396736, + "rewards/verse_reward_func": -0.015625, + "step": 15 + }, + { + "completion_length": 449.859375, + "epoch": 0.128, + "grad_norm": 0.48828125, + "kl": 0.0009753390913829207, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.0, + "reward": 0.2656503915786743, + "reward_std": 0.3852406293153763, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 0.47733578085899353, + "rewards/no_repetition_reward_func": -0.04762289486825466, + "rewards/verse_reward_func": -0.0078125, + "step": 16 + }, + { + "completion_length": 483.96875, + "epoch": 0.136, + "grad_norm": 0.32421875, + "kl": 0.0009817428654059768, + "learning_rate": 6.800000000000001e-06, + "loss": 0.0, + "reward": 0.3551361411809921, + "reward_std": 0.4344247132539749, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 0.46094539761543274, + "rewards/no_repetition_reward_func": -0.04330925829708576, + "rewards/verse_reward_func": 0.0, + "step": 17 + }, + { + "completion_length": 431.8125, + "epoch": 0.144, + "grad_norm": 0.90625, + "kl": 0.0010344594193156809, + "learning_rate": 7.2e-06, + "loss": 0.0, + "reward": 0.657691091299057, + "reward_std": 1.2022887468338013, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 0.8098189830780029, + "rewards/no_repetition_reward_func": -0.034940388053655624, + "rewards/verse_reward_func": -0.0234375, + "step": 18 + }, + { + "completion_length": 447.578125, + "epoch": 0.152, + "grad_norm": 1.515625, + "kl": 0.0009738605294842273, + "learning_rate": 7.6e-06, + "loss": 0.0, + "reward": 0.39017675817012787, + "reward_std": 0.7737232744693756, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 0.5691927373409271, + "rewards/no_repetition_reward_func": -0.046203507110476494, + "rewards/verse_reward_func": -0.0078125, + "step": 19 + }, + { + "completion_length": 440.09375, + "epoch": 0.16, + "grad_norm": 0.60546875, + "kl": 0.0009960970492102206, + "learning_rate": 8.000000000000001e-06, + "loss": 0.0, + "reward": 0.5408629775047302, + "reward_std": 0.953283280134201, + "rewards/check_divine_comedy_plagiarism": -0.296875, + "rewards/endecasillabo_reward_func": 0.8857129812240601, + "rewards/no_repetition_reward_func": -0.040162453427910805, + "rewards/verse_reward_func": -0.0078125, + "step": 20 + }, + { + "completion_length": 416.15625, + "epoch": 0.168, + "grad_norm": 2.390625, + "kl": 0.000912841351237148, + "learning_rate": 8.400000000000001e-06, + "loss": 0.0, + "reward": 0.32790185511112213, + "reward_std": 0.9803280830383301, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 0.5582650005817413, + "rewards/no_repetition_reward_func": -0.03505067713558674, + "rewards/verse_reward_func": -0.0390625, + "step": 21 + }, + { + "completion_length": 454.359375, + "epoch": 0.176, + "grad_norm": 0.44140625, + "kl": 0.0009786133305169642, + "learning_rate": 8.8e-06, + "loss": 0.0, + "reward": 0.5803987979888916, + "reward_std": 0.8945976495742798, + "rewards/check_divine_comedy_plagiarism": -0.21875, + "rewards/endecasillabo_reward_func": 0.8402650356292725, + "rewards/no_repetition_reward_func": -0.04111618548631668, + "rewards/verse_reward_func": 0.0, + "step": 22 + }, + { + "completion_length": 430.296875, + "epoch": 0.184, + "grad_norm": 0.47265625, + "kl": 0.0010353561374358833, + "learning_rate": 9.2e-06, + "loss": 0.0, + "reward": 0.3766457438468933, + "reward_std": 0.8662939071655273, + "rewards/check_divine_comedy_plagiarism": -0.1875, + "rewards/endecasillabo_reward_func": 0.622977077960968, + "rewards/no_repetition_reward_func": -0.043206335976719856, + "rewards/verse_reward_func": -0.015625, + "step": 23 + }, + { + "completion_length": 463.515625, + "epoch": 0.192, + "grad_norm": 0.80859375, + "kl": 0.0009999867179431021, + "learning_rate": 9.600000000000001e-06, + "loss": 0.0, + "reward": 0.30690987408161163, + "reward_std": 0.5106116235256195, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 0.48487940430641174, + "rewards/no_repetition_reward_func": -0.04515702649950981, + "rewards/verse_reward_func": -0.0078125, + "step": 24 + }, + { + "completion_length": 470.0, + "epoch": 0.2, + "grad_norm": 0.41796875, + "kl": 0.0009808654431253672, + "learning_rate": 1e-05, + "loss": 0.0, + "reward": 0.20252301543951035, + "reward_std": 0.39985404908657074, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 0.4198773056268692, + "rewards/no_repetition_reward_func": -0.04547928832471371, + "rewards/verse_reward_func": 0.0, + "step": 25 + }, + { + "completion_length": 441.78125, + "epoch": 0.208, + "grad_norm": 0.62109375, + "kl": 0.0010012995917350054, + "learning_rate": 1.04e-05, + "loss": 0.0, + "reward": 0.3315119594335556, + "reward_std": 0.516377717256546, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 0.5217783451080322, + "rewards/no_repetition_reward_func": -0.04182891733944416, + "rewards/verse_reward_func": -0.0078125, + "step": 26 + }, + { + "completion_length": 433.828125, + "epoch": 0.216, + "grad_norm": 0.359375, + "kl": 0.0009515023266430944, + "learning_rate": 1.08e-05, + "loss": 0.0, + "reward": 0.5474312603473663, + "reward_std": 0.8645108640193939, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 0.6717439889907837, + "rewards/no_repetition_reward_func": -0.04618772864341736, + "rewards/verse_reward_func": 0.0, + "step": 27 + }, + { + "completion_length": 444.421875, + "epoch": 0.224, + "grad_norm": 0.361328125, + "kl": 0.0009452465455979109, + "learning_rate": 1.1200000000000001e-05, + "loss": 0.0, + "reward": 0.4300694763660431, + "reward_std": 0.5645215660333633, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 0.6407122015953064, + "rewards/no_repetition_reward_func": -0.04658020846545696, + "rewards/verse_reward_func": -0.0078125, + "step": 28 + }, + { + "completion_length": 434.984375, + "epoch": 0.232, + "grad_norm": 0.470703125, + "kl": 0.0009487473289482296, + "learning_rate": 1.16e-05, + "loss": 0.0, + "reward": 0.7558678686618805, + "reward_std": 1.0926580131053925, + "rewards/check_divine_comedy_plagiarism": -0.21875, + "rewards/endecasillabo_reward_func": 1.0141460448503494, + "rewards/no_repetition_reward_func": -0.03952809423208237, + "rewards/verse_reward_func": 0.0, + "step": 29 + }, + { + "completion_length": 467.0625, + "epoch": 0.24, + "grad_norm": 0.380859375, + "kl": 0.0010360184824094176, + "learning_rate": 1.2e-05, + "loss": 0.0, + "reward": 0.3131437599658966, + "reward_std": 0.5231723785400391, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 0.5078757107257843, + "rewards/no_repetition_reward_func": -0.0384819321334362, + "rewards/verse_reward_func": 0.0, + "step": 30 + }, + { + "completion_length": 455.171875, + "epoch": 0.248, + "grad_norm": 0.61328125, + "kl": 0.0010010850965045393, + "learning_rate": 1.24e-05, + "loss": 0.0, + "reward": 0.4912389814853668, + "reward_std": 0.8905033022165298, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 0.6457822322845459, + "rewards/no_repetition_reward_func": -0.04516823962330818, + "rewards/verse_reward_func": 0.0, + "step": 31 + }, + { + "completion_length": 427.765625, + "epoch": 0.256, + "grad_norm": 0.578125, + "kl": 0.0010338124120607972, + "learning_rate": 1.2800000000000001e-05, + "loss": 0.0, + "reward": 0.33854614198207855, + "reward_std": 0.5102540701627731, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 0.5036978125572205, + "rewards/no_repetition_reward_func": -0.04796416498720646, + "rewards/verse_reward_func": -0.0234375, + "step": 32 + }, + { + "completion_length": 461.984375, + "epoch": 0.264, + "grad_norm": 0.462890625, + "kl": 0.0010262654977850616, + "learning_rate": 1.32e-05, + "loss": 0.0, + "reward": 0.31029289960861206, + "reward_std": 0.48135456442832947, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 0.416566863656044, + "rewards/no_repetition_reward_func": -0.04377397149801254, + "rewards/verse_reward_func": 0.0, + "step": 33 + }, + { + "completion_length": 461.703125, + "epoch": 0.272, + "grad_norm": 0.353515625, + "kl": 0.0009811387863010168, + "learning_rate": 1.3600000000000002e-05, + "loss": 0.0, + "reward": 0.25785423815250397, + "reward_std": 0.4621564447879791, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 0.4109620451927185, + "rewards/no_repetition_reward_func": -0.043732818216085434, + "rewards/verse_reward_func": 0.0, + "step": 34 + }, + { + "completion_length": 469.890625, + "epoch": 0.28, + "grad_norm": 0.478515625, + "kl": 0.0009994769934564829, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.0, + "reward": 0.3228471875190735, + "reward_std": 0.4422815591096878, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 0.4409794211387634, + "rewards/no_repetition_reward_func": -0.04000721871852875, + "rewards/verse_reward_func": -0.015625, + "step": 35 + }, + { + "completion_length": 423.015625, + "epoch": 0.288, + "grad_norm": 0.99609375, + "kl": 0.001061782764736563, + "learning_rate": 1.44e-05, + "loss": 0.0, + "reward": 0.4374115616083145, + "reward_std": 0.775359034538269, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 0.6044479310512543, + "rewards/no_repetition_reward_func": -0.04203632287681103, + "rewards/verse_reward_func": -0.03125, + "step": 36 + }, + { + "completion_length": 433.5, + "epoch": 0.296, + "grad_norm": 0.75, + "kl": 0.000980492535745725, + "learning_rate": 1.48e-05, + "loss": 0.0, + "reward": 0.42973655462265015, + "reward_std": 0.9926208555698395, + "rewards/check_divine_comedy_plagiarism": -0.1875, + "rewards/endecasillabo_reward_func": 0.7014500200748444, + "rewards/no_repetition_reward_func": -0.04515094868838787, + "rewards/verse_reward_func": -0.0390625, + "step": 37 + }, + { + "completion_length": 478.0, + "epoch": 0.304, + "grad_norm": 0.31640625, + "kl": 0.001019574177917093, + "learning_rate": 1.52e-05, + "loss": 0.0, + "reward": 0.3557324782013893, + "reward_std": 0.5584861487150192, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 0.5744373500347137, + "rewards/no_repetition_reward_func": -0.046829864382743835, + "rewards/verse_reward_func": 0.0, + "step": 38 + }, + { + "completion_length": 484.296875, + "epoch": 0.312, + "grad_norm": 0.671875, + "kl": 0.0010648351162672043, + "learning_rate": 1.56e-05, + "loss": 0.0, + "reward": 0.5536489188671112, + "reward_std": 0.9015166163444519, + "rewards/check_divine_comedy_plagiarism": -0.1875, + "rewards/endecasillabo_reward_func": 0.7964134216308594, + "rewards/no_repetition_reward_func": -0.039639439433813095, + "rewards/verse_reward_func": -0.015625, + "step": 39 + }, + { + "completion_length": 421.8125, + "epoch": 0.32, + "grad_norm": 1.0546875, + "kl": 0.0010699689737521112, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.0, + "reward": 0.2781845033168793, + "reward_std": 0.8514465391635895, + "rewards/check_divine_comedy_plagiarism": -0.1875, + "rewards/endecasillabo_reward_func": 0.5512334555387497, + "rewards/no_repetition_reward_func": -0.03867390751838684, + "rewards/verse_reward_func": -0.046875, + "step": 40 + }, + { + "completion_length": 458.515625, + "epoch": 0.328, + "grad_norm": 0.8828125, + "kl": 0.0010424097417853773, + "learning_rate": 1.6400000000000002e-05, + "loss": 0.0, + "reward": 0.10929479449987411, + "reward_std": 0.476631835103035, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 0.31693705916404724, + "rewards/no_repetition_reward_func": -0.04357975721359253, + "rewards/verse_reward_func": -0.0234375, + "step": 41 + }, + { + "completion_length": 454.046875, + "epoch": 0.336, + "grad_norm": 0.546875, + "kl": 0.001036638393998146, + "learning_rate": 1.6800000000000002e-05, + "loss": 0.0, + "reward": 0.3044891655445099, + "reward_std": 0.5535133183002472, + "rewards/check_divine_comedy_plagiarism": -0.1875, + "rewards/endecasillabo_reward_func": 0.5395083129405975, + "rewards/no_repetition_reward_func": -0.039706628769636154, + "rewards/verse_reward_func": -0.0078125, + "step": 42 + }, + { + "completion_length": 464.390625, + "epoch": 0.344, + "grad_norm": 1.84375, + "kl": 0.001063900301232934, + "learning_rate": 1.7199999999999998e-05, + "loss": 0.0, + "reward": 0.5829049497842789, + "reward_std": 0.9153306186199188, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 0.6906334757804871, + "rewards/no_repetition_reward_func": -0.03741606883704662, + "rewards/verse_reward_func": -0.0078125, + "step": 43 + }, + { + "completion_length": 459.890625, + "epoch": 0.352, + "grad_norm": 0.453125, + "kl": 0.0010322544258087873, + "learning_rate": 1.76e-05, + "loss": 0.0, + "reward": 0.3220553919672966, + "reward_std": 0.4588521271944046, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 0.5344728976488113, + "rewards/no_repetition_reward_func": -0.040542466565966606, + "rewards/verse_reward_func": 0.0, + "step": 44 + }, + { + "completion_length": 464.203125, + "epoch": 0.36, + "grad_norm": 0.39453125, + "kl": 0.001039309543557465, + "learning_rate": 1.8e-05, + "loss": 0.0, + "reward": 0.630920872092247, + "reward_std": 0.7587323784828186, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 0.7412005662918091, + "rewards/no_repetition_reward_func": -0.047779686748981476, + "rewards/verse_reward_func": 0.0, + "step": 45 + }, + { + "completion_length": 442.53125, + "epoch": 0.368, + "grad_norm": 0.359375, + "kl": 0.0010214670619461685, + "learning_rate": 1.84e-05, + "loss": 0.0, + "reward": 0.6546954810619354, + "reward_std": 0.8025196194648743, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 0.8016980290412903, + "rewards/no_repetition_reward_func": -0.03762756288051605, + "rewards/verse_reward_func": 0.0, + "step": 46 + }, + { + "completion_length": 478.890625, + "epoch": 0.376, + "grad_norm": 0.345703125, + "kl": 0.0011248913942836225, + "learning_rate": 1.88e-05, + "loss": 0.0, + "reward": 0.3511745184659958, + "reward_std": 0.4347453713417053, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 0.4835871309041977, + "rewards/no_repetition_reward_func": -0.038662564009428024, + "rewards/verse_reward_func": 0.0, + "step": 47 + }, + { + "completion_length": 451.453125, + "epoch": 0.384, + "grad_norm": 0.36328125, + "kl": 0.0010035860468633473, + "learning_rate": 1.9200000000000003e-05, + "loss": 0.0, + "reward": 0.28165166825056076, + "reward_std": 0.5988867878913879, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 0.4827439785003662, + "rewards/no_repetition_reward_func": -0.0448423158377409, + "rewards/verse_reward_func": -0.015625, + "step": 48 + }, + { + "completion_length": 451.75, + "epoch": 0.392, + "grad_norm": 0.404296875, + "kl": 0.0010795429116114974, + "learning_rate": 1.9600000000000002e-05, + "loss": 0.0, + "reward": 0.3654388040304184, + "reward_std": 0.5081634372472763, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 0.5157425552606583, + "rewards/no_repetition_reward_func": -0.04092877358198166, + "rewards/verse_reward_func": 0.0, + "step": 49 + }, + { + "completion_length": 463.234375, + "epoch": 0.4, + "grad_norm": 0.478515625, + "kl": 0.0010620596585795283, + "learning_rate": 2e-05, + "loss": 0.0, + "reward": 0.30326101928949356, + "reward_std": 0.7666293084621429, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 0.4668480157852173, + "rewards/no_repetition_reward_func": -0.03858700022101402, + "rewards/verse_reward_func": -0.03125, + "step": 50 + }, + { + "completion_length": 454.359375, + "epoch": 0.408, + "grad_norm": 2.25, + "kl": 0.001037747715599835, + "learning_rate": 2.04e-05, + "loss": 0.0, + "reward": 0.3108884021639824, + "reward_std": 0.623764306306839, + "rewards/check_divine_comedy_plagiarism": -0.21875, + "rewards/endecasillabo_reward_func": 0.5842487215995789, + "rewards/no_repetition_reward_func": -0.03898531757295132, + "rewards/verse_reward_func": -0.015625, + "step": 51 + }, + { + "completion_length": 449.234375, + "epoch": 0.416, + "grad_norm": 0.484375, + "kl": 0.0010128822177648544, + "learning_rate": 2.08e-05, + "loss": 0.0, + "reward": 0.3826422244310379, + "reward_std": 0.7567698657512665, + "rewards/check_divine_comedy_plagiarism": -0.3125, + "rewards/endecasillabo_reward_func": 0.7441763281822205, + "rewards/no_repetition_reward_func": -0.049034105613827705, + "rewards/verse_reward_func": 0.0, + "step": 52 + }, + { + "completion_length": 449.96875, + "epoch": 0.424, + "grad_norm": 0.365234375, + "kl": 0.00099859171314165, + "learning_rate": 2.12e-05, + "loss": 0.0, + "reward": 0.23517782986164093, + "reward_std": 0.3163221478462219, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 0.4160512685775757, + "rewards/no_repetition_reward_func": -0.04024842567741871, + "rewards/verse_reward_func": 0.0, + "step": 53 + }, + { + "completion_length": 454.515625, + "epoch": 0.432, + "grad_norm": 0.55859375, + "kl": 0.0010698023834265769, + "learning_rate": 2.16e-05, + "loss": 0.0, + "reward": 0.5303485542535782, + "reward_std": 0.8272091150283813, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 0.6717264354228973, + "rewards/no_repetition_reward_func": -0.03981535695493221, + "rewards/verse_reward_func": -0.0078125, + "step": 54 + }, + { + "completion_length": 468.6875, + "epoch": 0.44, + "grad_norm": 0.455078125, + "kl": 0.0011855590855702758, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.0, + "reward": 0.5519488900899887, + "reward_std": 0.6910285353660583, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 0.7171671986579895, + "rewards/no_repetition_reward_func": -0.0402182973921299, + "rewards/verse_reward_func": -0.015625, + "step": 55 + }, + { + "completion_length": 451.25, + "epoch": 0.448, + "grad_norm": 0.69140625, + "kl": 0.001168492017313838, + "learning_rate": 2.2400000000000002e-05, + "loss": 0.0, + "reward": 0.44948333501815796, + "reward_std": 0.9772664904594421, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 0.6366576254367828, + "rewards/no_repetition_reward_func": -0.03873679041862488, + "rewards/verse_reward_func": -0.0234375, + "step": 56 + }, + { + "completion_length": 434.5625, + "epoch": 0.456, + "grad_norm": 1.015625, + "kl": 0.0011411018203943968, + "learning_rate": 2.2800000000000002e-05, + "loss": 0.0, + "reward": 0.32538358867168427, + "reward_std": 0.7170661389827728, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 0.5499090552330017, + "rewards/no_repetition_reward_func": -0.037025460973381996, + "rewards/verse_reward_func": -0.03125, + "step": 57 + }, + { + "completion_length": 486.15625, + "epoch": 0.464, + "grad_norm": 0.330078125, + "kl": 0.001174404809717089, + "learning_rate": 2.32e-05, + "loss": 0.0, + "reward": 0.4274017661809921, + "reward_std": 0.5160686075687408, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 0.5328443646430969, + "rewards/no_repetition_reward_func": -0.04294262453913689, + "rewards/verse_reward_func": 0.0, + "step": 58 + }, + { + "completion_length": 443.5, + "epoch": 0.472, + "grad_norm": 0.39453125, + "kl": 0.0010773239773698151, + "learning_rate": 2.36e-05, + "loss": 0.0, + "reward": 0.3753148168325424, + "reward_std": 0.7595551162958145, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 0.5897012948989868, + "rewards/no_repetition_reward_func": -0.05032398737967014, + "rewards/verse_reward_func": -0.0078125, + "step": 59 + }, + { + "completion_length": 454.078125, + "epoch": 0.48, + "grad_norm": 0.349609375, + "kl": 0.0010995475458912551, + "learning_rate": 2.4e-05, + "loss": 0.0, + "reward": 0.5049470663070679, + "reward_std": 0.6428529024124146, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 0.7054009139537811, + "rewards/no_repetition_reward_func": -0.044203853234648705, + "rewards/verse_reward_func": 0.0, + "step": 60 + }, + { + "completion_length": 440.9375, + "epoch": 0.488, + "grad_norm": 0.66015625, + "kl": 0.001055776490829885, + "learning_rate": 2.44e-05, + "loss": 0.0, + "reward": 0.561566025018692, + "reward_std": 1.0304906368255615, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 0.7870776057243347, + "rewards/no_repetition_reward_func": -0.045824069529771805, + "rewards/verse_reward_func": -0.0078125, + "step": 61 + }, + { + "completion_length": 470.359375, + "epoch": 0.496, + "grad_norm": 0.337890625, + "kl": 0.0011617125128395855, + "learning_rate": 2.48e-05, + "loss": 0.0, + "reward": 0.5021537840366364, + "reward_std": 0.72962686419487, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 0.7172861993312836, + "rewards/no_repetition_reward_func": -0.04325737804174423, + "rewards/verse_reward_func": 0.0, + "step": 62 + }, + { + "completion_length": 473.171875, + "epoch": 0.504, + "grad_norm": 0.412109375, + "kl": 0.0011741581256501377, + "learning_rate": 2.5200000000000003e-05, + "loss": 0.0, + "reward": 0.38993144035339355, + "reward_std": 0.5557703971862793, + "rewards/check_divine_comedy_plagiarism": -0.234375, + "rewards/endecasillabo_reward_func": 0.6673791408538818, + "rewards/no_repetition_reward_func": -0.043072719126939774, + "rewards/verse_reward_func": 0.0, + "step": 63 + }, + { + "completion_length": 438.75, + "epoch": 0.512, + "grad_norm": 0.984375, + "kl": 0.0012093976838514209, + "learning_rate": 2.5600000000000002e-05, + "loss": 0.0, + "reward": 0.1896609589457512, + "reward_std": 0.4734877943992615, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 0.3750666230916977, + "rewards/no_repetition_reward_func": -0.0447806678712368, + "rewards/verse_reward_func": -0.015625, + "step": 64 + }, + { + "completion_length": 455.21875, + "epoch": 0.52, + "grad_norm": 0.5234375, + "kl": 0.001271282962989062, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.0001, + "reward": 0.4583015441894531, + "reward_std": 0.9345590770244598, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 0.6510221660137177, + "rewards/no_repetition_reward_func": -0.03647058829665184, + "rewards/verse_reward_func": 0.0, + "step": 65 + }, + { + "completion_length": 480.953125, + "epoch": 0.528, + "grad_norm": 0.61328125, + "kl": 0.001169272349216044, + "learning_rate": 2.64e-05, + "loss": 0.0, + "reward": 0.3920941650867462, + "reward_std": 0.7842485308647156, + "rewards/check_divine_comedy_plagiarism": -0.234375, + "rewards/endecasillabo_reward_func": 0.6819698214530945, + "rewards/no_repetition_reward_func": -0.047688135877251625, + "rewards/verse_reward_func": -0.0078125, + "step": 66 + }, + { + "completion_length": 462.484375, + "epoch": 0.536, + "grad_norm": 0.4609375, + "kl": 0.001222919614519924, + "learning_rate": 2.6800000000000004e-05, + "loss": 0.0, + "reward": 0.5391064584255219, + "reward_std": 0.8698362112045288, + "rewards/check_divine_comedy_plagiarism": -0.28125, + "rewards/endecasillabo_reward_func": 0.8622455298900604, + "rewards/no_repetition_reward_func": -0.04188907891511917, + "rewards/verse_reward_func": 0.0, + "step": 67 + }, + { + "completion_length": 477.890625, + "epoch": 0.544, + "grad_norm": 0.359375, + "kl": 0.001242329424712807, + "learning_rate": 2.7200000000000004e-05, + "loss": 0.0, + "reward": 0.44049011170864105, + "reward_std": 0.7336624264717102, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 0.6425206959247589, + "rewards/no_repetition_reward_func": -0.04578063264489174, + "rewards/verse_reward_func": 0.0, + "step": 68 + }, + { + "completion_length": 453.53125, + "epoch": 0.552, + "grad_norm": 0.625, + "kl": 0.0012650403659790754, + "learning_rate": 2.7600000000000003e-05, + "loss": 0.0001, + "reward": 0.4064372777938843, + "reward_std": 0.4910520166158676, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 0.501196950674057, + "rewards/no_repetition_reward_func": -0.04007217846810818, + "rewards/verse_reward_func": -0.0078125, + "step": 69 + }, + { + "completion_length": 434.375, + "epoch": 0.56, + "grad_norm": 0.546875, + "kl": 0.0013900414342060685, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.0001, + "reward": 0.5550951957702637, + "reward_std": 0.7382708489894867, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 0.750353068113327, + "rewards/no_repetition_reward_func": -0.03900782763957977, + "rewards/verse_reward_func": 0.0, + "step": 70 + }, + { + "completion_length": 409.90625, + "epoch": 0.568, + "grad_norm": 0.58203125, + "kl": 0.0012728217989206314, + "learning_rate": 2.84e-05, + "loss": 0.0001, + "reward": 0.40740008652210236, + "reward_std": 0.850542813539505, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 0.6336213052272797, + "rewards/no_repetition_reward_func": -0.04653371684253216, + "rewards/verse_reward_func": -0.0390625, + "step": 71 + }, + { + "completion_length": 471.6875, + "epoch": 0.576, + "grad_norm": 0.396484375, + "kl": 0.0013068553525954485, + "learning_rate": 2.88e-05, + "loss": 0.0001, + "reward": 0.3512554317712784, + "reward_std": 0.6798404455184937, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 0.52145816385746, + "rewards/no_repetition_reward_func": -0.045202720910310745, + "rewards/verse_reward_func": 0.0, + "step": 72 + }, + { + "completion_length": 441.546875, + "epoch": 0.584, + "grad_norm": 0.466796875, + "kl": 0.0013001265469938517, + "learning_rate": 2.9199999999999998e-05, + "loss": 0.0001, + "reward": 0.6552914381027222, + "reward_std": 0.872844934463501, + "rewards/check_divine_comedy_plagiarism": -0.21875, + "rewards/endecasillabo_reward_func": 0.9238253235816956, + "rewards/no_repetition_reward_func": -0.041971392929553986, + "rewards/verse_reward_func": -0.0078125, + "step": 73 + }, + { + "completion_length": 437.390625, + "epoch": 0.592, + "grad_norm": 0.455078125, + "kl": 0.0015052505768835545, + "learning_rate": 2.96e-05, + "loss": 0.0001, + "reward": 0.43935707211494446, + "reward_std": 0.5489522516727448, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 0.6715380549430847, + "rewards/no_repetition_reward_func": -0.052493490278720856, + "rewards/verse_reward_func": -0.0078125, + "step": 74 + }, + { + "completion_length": 452.265625, + "epoch": 0.6, + "grad_norm": 0.47265625, + "kl": 0.001475457742344588, + "learning_rate": 3e-05, + "loss": 0.0001, + "reward": 0.4264806807041168, + "reward_std": 0.4902953654527664, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 0.6091857254505157, + "rewards/no_repetition_reward_func": -0.04208003915846348, + "rewards/verse_reward_func": -0.015625, + "step": 75 + }, + { + "completion_length": 463.6875, + "epoch": 0.608, + "grad_norm": 0.59375, + "kl": 0.0015924317413009703, + "learning_rate": 3.04e-05, + "loss": 0.0001, + "reward": 0.6591902673244476, + "reward_std": 0.7578963935375214, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 0.879812628030777, + "rewards/no_repetition_reward_func": -0.04093487560749054, + "rewards/verse_reward_func": -0.0078125, + "step": 76 + }, + { + "completion_length": 442.546875, + "epoch": 0.616, + "grad_norm": 0.58203125, + "kl": 0.001539963181130588, + "learning_rate": 3.08e-05, + "loss": 0.0001, + "reward": 0.7385431826114655, + "reward_std": 0.8185318410396576, + "rewards/check_divine_comedy_plagiarism": -0.28125, + "rewards/endecasillabo_reward_func": 1.0681604146957397, + "rewards/no_repetition_reward_func": -0.048367176204919815, + "rewards/verse_reward_func": 0.0, + "step": 77 + }, + { + "completion_length": 443.96875, + "epoch": 0.624, + "grad_norm": 0.4296875, + "kl": 0.0018141276086680591, + "learning_rate": 3.12e-05, + "loss": 0.0001, + "reward": 0.6653846502304077, + "reward_std": 1.0444616377353668, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 0.8856775760650635, + "rewards/no_repetition_reward_func": -0.040605345740914345, + "rewards/verse_reward_func": -0.0234375, + "step": 78 + }, + { + "completion_length": 438.90625, + "epoch": 0.632, + "grad_norm": 0.474609375, + "kl": 0.0017487509758211672, + "learning_rate": 3.16e-05, + "loss": 0.0001, + "reward": 0.5087510347366333, + "reward_std": 0.8166515231132507, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 0.7211425006389618, + "rewards/no_repetition_reward_func": -0.04051646403968334, + "rewards/verse_reward_func": 0.0, + "step": 79 + }, + { + "completion_length": 466.65625, + "epoch": 0.64, + "grad_norm": 0.34765625, + "kl": 0.001835956412833184, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.0001, + "reward": 0.49886706471443176, + "reward_std": 0.7031179368495941, + "rewards/check_divine_comedy_plagiarism": -0.203125, + "rewards/endecasillabo_reward_func": 0.7478644847869873, + "rewards/no_repetition_reward_func": -0.04587242193520069, + "rewards/verse_reward_func": 0.0, + "step": 80 + }, + { + "completion_length": 457.78125, + "epoch": 0.648, + "grad_norm": 0.392578125, + "kl": 0.001807117136195302, + "learning_rate": 3.24e-05, + "loss": 0.0001, + "reward": 0.5122014582157135, + "reward_std": 0.6293427646160126, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 0.6492699086666107, + "rewards/no_repetition_reward_func": -0.04331846721470356, + "rewards/verse_reward_func": 0.0, + "step": 81 + }, + { + "completion_length": 434.578125, + "epoch": 0.656, + "grad_norm": 0.66796875, + "kl": 0.0019234501523897052, + "learning_rate": 3.2800000000000004e-05, + "loss": 0.0001, + "reward": 0.5489621609449387, + "reward_std": 0.9564897119998932, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 0.6791821718215942, + "rewards/no_repetition_reward_func": -0.04428247548639774, + "rewards/verse_reward_func": -0.0234375, + "step": 82 + }, + { + "completion_length": 454.96875, + "epoch": 0.664, + "grad_norm": 0.453125, + "kl": 0.001922730531077832, + "learning_rate": 3.32e-05, + "loss": 0.0001, + "reward": 0.5500216782093048, + "reward_std": 1.075914353132248, + "rewards/check_divine_comedy_plagiarism": -0.328125, + "rewards/endecasillabo_reward_func": 0.9353676438331604, + "rewards/no_repetition_reward_func": -0.049408430233597755, + "rewards/verse_reward_func": -0.0078125, + "step": 83 + }, + { + "completion_length": 474.875, + "epoch": 0.672, + "grad_norm": 0.326171875, + "kl": 0.0023362882202491164, + "learning_rate": 3.3600000000000004e-05, + "loss": 0.0001, + "reward": 0.4945976436138153, + "reward_std": 0.820168748497963, + "rewards/check_divine_comedy_plagiarism": -0.234375, + "rewards/endecasillabo_reward_func": 0.776361957192421, + "rewards/no_repetition_reward_func": -0.047389307990670204, + "rewards/verse_reward_func": 0.0, + "step": 84 + }, + { + "completion_length": 483.796875, + "epoch": 0.68, + "grad_norm": 0.3828125, + "kl": 0.0022403817856684327, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.0001, + "reward": 0.48556697368621826, + "reward_std": 0.6005744934082031, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 0.7062385976314545, + "rewards/no_repetition_reward_func": -0.040984077379107475, + "rewards/verse_reward_func": -0.0078125, + "step": 85 + }, + { + "completion_length": 458.734375, + "epoch": 0.688, + "grad_norm": 0.361328125, + "kl": 0.0023493263870477676, + "learning_rate": 3.4399999999999996e-05, + "loss": 0.0001, + "reward": 0.5546786934137344, + "reward_std": 0.651825338602066, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 0.676111251115799, + "rewards/no_repetition_reward_func": -0.04330756887793541, + "rewards/verse_reward_func": 0.0, + "step": 86 + }, + { + "completion_length": 477.96875, + "epoch": 0.696, + "grad_norm": 0.37890625, + "kl": 0.002819496556185186, + "learning_rate": 3.48e-05, + "loss": 0.0001, + "reward": 0.5138974189758301, + "reward_std": 0.4925805330276489, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 0.7126910090446472, + "rewards/no_repetition_reward_func": -0.042543599382042885, + "rewards/verse_reward_func": 0.0, + "step": 87 + }, + { + "completion_length": 445.484375, + "epoch": 0.704, + "grad_norm": 0.55859375, + "kl": 0.002539936453104019, + "learning_rate": 3.52e-05, + "loss": 0.0001, + "reward": 0.4791719615459442, + "reward_std": 0.6686698496341705, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 0.693199098110199, + "rewards/no_repetition_reward_func": -0.042152104899287224, + "rewards/verse_reward_func": 0.0, + "step": 88 + }, + { + "completion_length": 439.875, + "epoch": 0.712, + "grad_norm": 0.376953125, + "kl": 0.002765906974673271, + "learning_rate": 3.56e-05, + "loss": 0.0001, + "reward": 0.6942241489887238, + "reward_std": 0.9540721774101257, + "rewards/check_divine_comedy_plagiarism": -0.234375, + "rewards/endecasillabo_reward_func": 0.9816505908966064, + "rewards/no_repetition_reward_func": -0.045238932594656944, + "rewards/verse_reward_func": -0.0078125, + "step": 89 + }, + { + "completion_length": 473.609375, + "epoch": 0.72, + "grad_norm": 0.39453125, + "kl": 0.002947235479950905, + "learning_rate": 3.6e-05, + "loss": 0.0001, + "reward": 0.5723871886730194, + "reward_std": 0.5487515032291412, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 0.7002376914024353, + "rewards/no_repetition_reward_func": -0.04191298596560955, + "rewards/verse_reward_func": -0.0078125, + "step": 90 + }, + { + "completion_length": 468.546875, + "epoch": 0.728, + "grad_norm": 0.392578125, + "kl": 0.003494329168461263, + "learning_rate": 3.6400000000000004e-05, + "loss": 0.0001, + "reward": 0.6305270493030548, + "reward_std": 0.6275006532669067, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 0.7874877154827118, + "rewards/no_repetition_reward_func": -0.03977314755320549, + "rewards/verse_reward_func": -0.0078125, + "step": 91 + }, + { + "completion_length": 462.359375, + "epoch": 0.736, + "grad_norm": 0.4375, + "kl": 0.003277092124335468, + "learning_rate": 3.68e-05, + "loss": 0.0001, + "reward": 0.5596085488796234, + "reward_std": 0.6728924810886383, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 0.7358681261539459, + "rewards/no_repetition_reward_func": -0.04344706051051617, + "rewards/verse_reward_func": -0.0078125, + "step": 92 + }, + { + "completion_length": 452.40625, + "epoch": 0.744, + "grad_norm": 0.51953125, + "kl": 0.003290782100521028, + "learning_rate": 3.72e-05, + "loss": 0.0001, + "reward": 0.36139945685863495, + "reward_std": 0.5229135304689407, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 0.49371547996997833, + "rewards/no_repetition_reward_func": -0.03856601007282734, + "rewards/verse_reward_func": 0.0, + "step": 93 + }, + { + "completion_length": 451.125, + "epoch": 0.752, + "grad_norm": 0.66796875, + "kl": 0.003465011017397046, + "learning_rate": 3.76e-05, + "loss": 0.0001, + "reward": 0.707070380449295, + "reward_std": 0.9462063610553741, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 0.9313420057296753, + "rewards/no_repetition_reward_func": -0.044584210962057114, + "rewards/verse_reward_func": -0.0078125, + "step": 94 + }, + { + "completion_length": 420.65625, + "epoch": 0.76, + "grad_norm": 0.81640625, + "kl": 0.0036122059682384133, + "learning_rate": 3.8e-05, + "loss": 0.0001, + "reward": 0.4508479833602905, + "reward_std": 0.7178867161273956, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 0.6356421411037445, + "rewards/no_repetition_reward_func": -0.03635665401816368, + "rewards/verse_reward_func": -0.0390625, + "step": 95 + }, + { + "completion_length": 471.4375, + "epoch": 0.768, + "grad_norm": 0.3828125, + "kl": 0.004234255291521549, + "learning_rate": 3.8400000000000005e-05, + "loss": 0.0002, + "reward": 0.7394529432058334, + "reward_std": 0.6801340878009796, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 0.9212369322776794, + "rewards/no_repetition_reward_func": -0.04115900211036205, + "rewards/verse_reward_func": 0.0, + "step": 96 + }, + { + "completion_length": 476.34375, + "epoch": 0.776, + "grad_norm": 0.36328125, + "kl": 0.004443926038220525, + "learning_rate": 3.88e-05, + "loss": 0.0002, + "reward": 0.6130691170692444, + "reward_std": 0.5307350903749466, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 0.7337587773799896, + "rewards/no_repetition_reward_func": -0.042564632371068, + "rewards/verse_reward_func": 0.0, + "step": 97 + }, + { + "completion_length": 485.59375, + "epoch": 0.784, + "grad_norm": 0.328125, + "kl": 0.004306312650442123, + "learning_rate": 3.9200000000000004e-05, + "loss": 0.0002, + "reward": 0.5574144124984741, + "reward_std": 0.6155531406402588, + "rewards/check_divine_comedy_plagiarism": -0.21875, + "rewards/endecasillabo_reward_func": 0.8191530704498291, + "rewards/no_repetition_reward_func": -0.04298866167664528, + "rewards/verse_reward_func": 0.0, + "step": 98 + }, + { + "completion_length": 448.875, + "epoch": 0.792, + "grad_norm": 0.4140625, + "kl": 0.0050332932732999325, + "learning_rate": 3.960000000000001e-05, + "loss": 0.0002, + "reward": 0.46039316058158875, + "reward_std": 0.6048887670040131, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 0.6413399577140808, + "rewards/no_repetition_reward_func": -0.04032176919281483, + "rewards/verse_reward_func": 0.0, + "step": 99 + }, + { + "completion_length": 477.125, + "epoch": 0.8, + "grad_norm": 0.36328125, + "kl": 0.004812171449884772, + "learning_rate": 4e-05, + "loss": 0.0002, + "reward": 0.6085090935230255, + "reward_std": 0.5330921858549118, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 0.7476239204406738, + "rewards/no_repetition_reward_func": -0.03755236603319645, + "rewards/verse_reward_func": -0.0078125, + "step": 100 + }, + { + "completion_length": 470.625, + "epoch": 0.808, + "grad_norm": 0.50390625, + "kl": 0.005407911725342274, + "learning_rate": 4.0400000000000006e-05, + "loss": 0.0002, + "reward": 0.733242392539978, + "reward_std": 0.6894570589065552, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 0.9524353742599487, + "rewards/no_repetition_reward_func": -0.039505431428551674, + "rewards/verse_reward_func": -0.0078125, + "step": 101 + }, + { + "completion_length": 461.171875, + "epoch": 0.816, + "grad_norm": 0.515625, + "kl": 0.005672885803505778, + "learning_rate": 4.08e-05, + "loss": 0.0002, + "reward": 0.44740112125873566, + "reward_std": 0.5750540494918823, + "rewards/check_divine_comedy_plagiarism": -0.1875, + "rewards/endecasillabo_reward_func": 0.6811063885688782, + "rewards/no_repetition_reward_func": -0.03839276544749737, + "rewards/verse_reward_func": -0.0078125, + "step": 102 + }, + { + "completion_length": 466.484375, + "epoch": 0.824, + "grad_norm": 0.6171875, + "kl": 0.006099381018429995, + "learning_rate": 4.12e-05, + "loss": 0.0002, + "reward": 0.9195497334003448, + "reward_std": 0.9506909847259521, + "rewards/check_divine_comedy_plagiarism": -0.1875, + "rewards/endecasillabo_reward_func": 1.1526776552200317, + "rewards/no_repetition_reward_func": -0.04562799073755741, + "rewards/verse_reward_func": 0.0, + "step": 103 + }, + { + "completion_length": 459.03125, + "epoch": 0.832, + "grad_norm": 0.490234375, + "kl": 0.006588040618225932, + "learning_rate": 4.16e-05, + "loss": 0.0003, + "reward": 0.5874980688095093, + "reward_std": 0.7220757007598877, + "rewards/check_divine_comedy_plagiarism": -0.25, + "rewards/endecasillabo_reward_func": 0.8936180174350739, + "rewards/no_repetition_reward_func": -0.0404950100928545, + "rewards/verse_reward_func": -0.015625, + "step": 104 + }, + { + "completion_length": 473.046875, + "epoch": 0.84, + "grad_norm": 0.3828125, + "kl": 0.0066501060500741005, + "learning_rate": 4.2e-05, + "loss": 0.0003, + "reward": 0.6646659970283508, + "reward_std": 0.7094574272632599, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 0.8258110880851746, + "rewards/no_repetition_reward_func": -0.0361450519412756, + "rewards/verse_reward_func": 0.0, + "step": 105 + }, + { + "completion_length": 476.671875, + "epoch": 0.848, + "grad_norm": 0.359375, + "kl": 0.007067217491567135, + "learning_rate": 4.24e-05, + "loss": 0.0003, + "reward": 0.795961856842041, + "reward_std": 0.7297905087471008, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 0.9793335795402527, + "rewards/no_repetition_reward_func": -0.04274674318730831, + "rewards/verse_reward_func": 0.0, + "step": 106 + }, + { + "completion_length": 485.109375, + "epoch": 0.856, + "grad_norm": 0.3203125, + "kl": 0.006970577174797654, + "learning_rate": 4.2800000000000004e-05, + "loss": 0.0003, + "reward": 0.6358576118946075, + "reward_std": 0.6645030081272125, + "rewards/check_divine_comedy_plagiarism": -0.203125, + "rewards/endecasillabo_reward_func": 0.8805362582206726, + "rewards/no_repetition_reward_func": -0.04155364818871021, + "rewards/verse_reward_func": 0.0, + "step": 107 + }, + { + "completion_length": 478.6875, + "epoch": 0.864, + "grad_norm": 0.59765625, + "kl": 0.007131955120712519, + "learning_rate": 4.32e-05, + "loss": 0.0003, + "reward": 0.6863177418708801, + "reward_std": 0.9047320783138275, + "rewards/check_divine_comedy_plagiarism": -0.25, + "rewards/endecasillabo_reward_func": 0.9897831082344055, + "rewards/no_repetition_reward_func": -0.04565282724797726, + "rewards/verse_reward_func": -0.0078125, + "step": 108 + }, + { + "completion_length": 444.15625, + "epoch": 0.872, + "grad_norm": 0.7734375, + "kl": 0.008131057024002075, + "learning_rate": 4.36e-05, + "loss": 0.0003, + "reward": 0.851494163274765, + "reward_std": 0.979163646697998, + "rewards/check_divine_comedy_plagiarism": -0.1875, + "rewards/endecasillabo_reward_func": 1.0877526104450226, + "rewards/no_repetition_reward_func": -0.04094594903290272, + "rewards/verse_reward_func": -0.0078125, + "step": 109 + }, + { + "completion_length": 472.265625, + "epoch": 0.88, + "grad_norm": 0.4609375, + "kl": 0.008217557333409786, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.0003, + "reward": 0.9206395745277405, + "reward_std": 0.7483803927898407, + "rewards/check_divine_comedy_plagiarism": -0.25, + "rewards/endecasillabo_reward_func": 1.2120715975761414, + "rewards/no_repetition_reward_func": -0.04143206216394901, + "rewards/verse_reward_func": 0.0, + "step": 110 + }, + { + "completion_length": 495.140625, + "epoch": 0.888, + "grad_norm": 0.3203125, + "kl": 0.008913136087357998, + "learning_rate": 4.44e-05, + "loss": 0.0004, + "reward": 0.6849752068519592, + "reward_std": 0.5745950639247894, + "rewards/check_divine_comedy_plagiarism": -0.203125, + "rewards/endecasillabo_reward_func": 0.9286605715751648, + "rewards/no_repetition_reward_func": -0.040560441091656685, + "rewards/verse_reward_func": 0.0, + "step": 111 + }, + { + "completion_length": 464.046875, + "epoch": 0.896, + "grad_norm": 0.435546875, + "kl": 0.009137400425970554, + "learning_rate": 4.4800000000000005e-05, + "loss": 0.0004, + "reward": 0.8248386681079865, + "reward_std": 0.9044426679611206, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 1.0555282235145569, + "rewards/no_repetition_reward_func": -0.04318949021399021, + "rewards/verse_reward_func": -0.015625, + "step": 112 + }, + { + "completion_length": 473.4375, + "epoch": 0.904, + "grad_norm": 0.349609375, + "kl": 0.009209902491420507, + "learning_rate": 4.52e-05, + "loss": 0.0004, + "reward": 0.857606053352356, + "reward_std": 0.7786423563957214, + "rewards/check_divine_comedy_plagiarism": -0.21875, + "rewards/endecasillabo_reward_func": 1.1189203262329102, + "rewards/no_repetition_reward_func": -0.04256429709494114, + "rewards/verse_reward_func": 0.0, + "step": 113 + }, + { + "completion_length": 491.25, + "epoch": 0.912, + "grad_norm": 0.328125, + "kl": 0.008855299558490515, + "learning_rate": 4.5600000000000004e-05, + "loss": 0.0004, + "reward": 1.102285385131836, + "reward_std": 0.9765981137752533, + "rewards/check_divine_comedy_plagiarism": -0.25, + "rewards/endecasillabo_reward_func": 1.3921913504600525, + "rewards/no_repetition_reward_func": -0.039906105026602745, + "rewards/verse_reward_func": 0.0, + "step": 114 + }, + { + "completion_length": 484.765625, + "epoch": 0.92, + "grad_norm": 0.396484375, + "kl": 0.009566637221723795, + "learning_rate": 4.600000000000001e-05, + "loss": 0.0004, + "reward": 0.8023201823234558, + "reward_std": 0.7653357088565826, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 0.9083880186080933, + "rewards/no_repetition_reward_func": -0.04356779903173447, + "rewards/verse_reward_func": 0.0, + "step": 115 + }, + { + "completion_length": 496.109375, + "epoch": 0.928, + "grad_norm": 0.322265625, + "kl": 0.011151209939271212, + "learning_rate": 4.64e-05, + "loss": 0.0004, + "reward": 0.7127190828323364, + "reward_std": 0.5722455978393555, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 0.9199798703193665, + "rewards/no_repetition_reward_func": -0.043198222294449806, + "rewards/verse_reward_func": -0.0078125, + "step": 116 + }, + { + "completion_length": 456.859375, + "epoch": 0.936, + "grad_norm": 1.1171875, + "kl": 0.009951935149729252, + "learning_rate": 4.6800000000000006e-05, + "loss": 0.0004, + "reward": 0.9817891418933868, + "reward_std": 0.9654392004013062, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 1.2080682516098022, + "rewards/no_repetition_reward_func": -0.046591589227318764, + "rewards/verse_reward_func": -0.0078125, + "step": 117 + }, + { + "completion_length": 483.21875, + "epoch": 0.944, + "grad_norm": 0.3671875, + "kl": 0.009547273628413677, + "learning_rate": 4.72e-05, + "loss": 0.0004, + "reward": 0.6107205599546432, + "reward_std": 0.6476789712905884, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 0.7286277413368225, + "rewards/no_repetition_reward_func": -0.03978218324482441, + "rewards/verse_reward_func": 0.0, + "step": 118 + }, + { + "completion_length": 492.78125, + "epoch": 0.952, + "grad_norm": 0.37109375, + "kl": 0.011888631153851748, + "learning_rate": 4.76e-05, + "loss": 0.0005, + "reward": 0.8390526175498962, + "reward_std": 0.6329848170280457, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 1.0240559875965118, + "rewards/no_repetition_reward_func": -0.04437839426100254, + "rewards/verse_reward_func": 0.0, + "step": 119 + }, + { + "completion_length": 478.40625, + "epoch": 0.96, + "grad_norm": 0.4375, + "kl": 0.012119731400161982, + "learning_rate": 4.8e-05, + "loss": 0.0005, + "reward": 0.8530257046222687, + "reward_std": 0.9176691770553589, + "rewards/check_divine_comedy_plagiarism": -0.1875, + "rewards/endecasillabo_reward_func": 1.0964869260787964, + "rewards/no_repetition_reward_func": -0.040336210280656815, + "rewards/verse_reward_func": -0.015625, + "step": 120 + }, + { + "completion_length": 455.390625, + "epoch": 0.968, + "grad_norm": 0.54296875, + "kl": 0.011116213165223598, + "learning_rate": 4.8400000000000004e-05, + "loss": 0.0004, + "reward": 0.7823189496994019, + "reward_std": 0.6147425174713135, + "rewards/check_divine_comedy_plagiarism": -0.1875, + "rewards/endecasillabo_reward_func": 1.0120322704315186, + "rewards/no_repetition_reward_func": -0.04221332259476185, + "rewards/verse_reward_func": 0.0, + "step": 121 + }, + { + "completion_length": 486.953125, + "epoch": 0.976, + "grad_norm": 0.4296875, + "kl": 0.013799747452139854, + "learning_rate": 4.88e-05, + "loss": 0.0006, + "reward": 0.7792432308197021, + "reward_std": 0.7618407607078552, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 0.9792907238006592, + "rewards/no_repetition_reward_func": -0.0437975637614727, + "rewards/verse_reward_func": 0.0, + "step": 122 + }, + { + "completion_length": 473.6875, + "epoch": 0.984, + "grad_norm": 0.34375, + "kl": 0.012516144197434187, + "learning_rate": 4.92e-05, + "loss": 0.0005, + "reward": 0.936005175113678, + "reward_std": 0.6899940371513367, + "rewards/check_divine_comedy_plagiarism": -0.234375, + "rewards/endecasillabo_reward_func": 1.2143414616584778, + "rewards/no_repetition_reward_func": -0.043961264193058014, + "rewards/verse_reward_func": 0.0, + "step": 123 + }, + { + "completion_length": 465.640625, + "epoch": 0.992, + "grad_norm": 1.1171875, + "kl": 0.011856945231556892, + "learning_rate": 4.96e-05, + "loss": 0.0005, + "reward": 1.0431022942066193, + "reward_std": 1.0352391600608826, + "rewards/check_divine_comedy_plagiarism": -0.203125, + "rewards/endecasillabo_reward_func": 1.3153480887413025, + "rewards/no_repetition_reward_func": -0.05349575914442539, + "rewards/verse_reward_func": -0.015625, + "step": 124 + }, + { + "completion_length": 468.5625, + "epoch": 1.0, + "grad_norm": 0.30859375, + "kl": 0.01528834830969572, + "learning_rate": 5e-05, + "loss": 0.0006, + "reward": 0.8121639490127563, + "reward_std": 0.6622692346572876, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 1.029076486825943, + "rewards/no_repetition_reward_func": -0.04503757320344448, + "rewards/verse_reward_func": 0.0, + "step": 125 + }, + { + "completion_length": 479.84375, + "epoch": 1.008, + "grad_norm": 0.6953125, + "kl": 0.014597195666283369, + "learning_rate": 4.9999902522489015e-05, + "loss": 0.0006, + "reward": 1.0765548348426819, + "reward_std": 1.1424457430839539, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 1.2840838432312012, + "rewards/no_repetition_reward_func": -0.04346649721264839, + "rewards/verse_reward_func": -0.0078125, + "step": 126 + }, + { + "completion_length": 461.109375, + "epoch": 1.016, + "grad_norm": 0.69921875, + "kl": 0.013781292364001274, + "learning_rate": 4.999961009071621e-05, + "loss": 0.0006, + "reward": 0.8688473105430603, + "reward_std": 0.6538267731666565, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 0.9901761114597321, + "rewards/no_repetition_reward_func": -0.043203793466091156, + "rewards/verse_reward_func": -0.015625, + "step": 127 + }, + { + "completion_length": 479.5625, + "epoch": 1.024, + "grad_norm": 0.3203125, + "kl": 0.015115554444491863, + "learning_rate": 4.999912270696202e-05, + "loss": 0.0006, + "reward": 0.8847565650939941, + "reward_std": 0.812307596206665, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 1.0976204872131348, + "rewards/no_repetition_reward_func": -0.0409888681024313, + "rewards/verse_reward_func": 0.0, + "step": 128 + }, + { + "completion_length": 477.875, + "epoch": 1.032, + "grad_norm": 0.3828125, + "kl": 0.014818621333688498, + "learning_rate": 4.9998440375027166e-05, + "loss": 0.0006, + "reward": 1.0980111956596375, + "reward_std": 0.9823193550109863, + "rewards/check_divine_comedy_plagiarism": -0.234375, + "rewards/endecasillabo_reward_func": 1.3864076733589172, + "rewards/no_repetition_reward_func": -0.05402154475450516, + "rewards/verse_reward_func": 0.0, + "step": 129 + }, + { + "completion_length": 470.9375, + "epoch": 1.04, + "grad_norm": 0.47265625, + "kl": 0.015702282544225454, + "learning_rate": 4.999756310023261e-05, + "loss": 0.0006, + "reward": 1.1779870986938477, + "reward_std": 0.8420811593532562, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 1.321848750114441, + "rewards/no_repetition_reward_func": -0.042299093678593636, + "rewards/verse_reward_func": -0.0078125, + "step": 130 + }, + { + "completion_length": 482.328125, + "epoch": 1.048, + "grad_norm": 0.396484375, + "kl": 0.01675122231245041, + "learning_rate": 4.9996490889419514e-05, + "loss": 0.0007, + "reward": 1.1713331937789917, + "reward_std": 0.9093182682991028, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 1.384926676750183, + "rewards/no_repetition_reward_func": -0.04171852022409439, + "rewards/verse_reward_func": 0.0, + "step": 131 + }, + { + "completion_length": 469.546875, + "epoch": 1.056, + "grad_norm": 0.67578125, + "kl": 0.018543750047683716, + "learning_rate": 4.999522375094919e-05, + "loss": 0.0007, + "reward": 1.1633066534996033, + "reward_std": 1.2020546197891235, + "rewards/check_divine_comedy_plagiarism": -0.25, + "rewards/endecasillabo_reward_func": 1.4616034030914307, + "rewards/no_repetition_reward_func": -0.04048439301550388, + "rewards/verse_reward_func": -0.0078125, + "step": 132 + }, + { + "completion_length": 465.6875, + "epoch": 1.064, + "grad_norm": 0.408203125, + "kl": 0.01848715078085661, + "learning_rate": 4.999376169470306e-05, + "loss": 0.0007, + "reward": 1.2777025699615479, + "reward_std": 0.8314717710018158, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 1.480246663093567, + "rewards/no_repetition_reward_func": -0.04629413038492203, + "rewards/verse_reward_func": 0.0, + "step": 133 + }, + { + "completion_length": 496.4375, + "epoch": 1.072, + "grad_norm": 0.34765625, + "kl": 0.0208482276648283, + "learning_rate": 4.99921047320825e-05, + "loss": 0.0008, + "reward": 1.2080789804458618, + "reward_std": 0.8436665832996368, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 1.4035176038742065, + "rewards/no_repetition_reward_func": -0.03918857127428055, + "rewards/verse_reward_func": 0.0, + "step": 134 + }, + { + "completion_length": 479.953125, + "epoch": 1.08, + "grad_norm": 0.38671875, + "kl": 0.02021243143826723, + "learning_rate": 4.999025287600886e-05, + "loss": 0.0008, + "reward": 1.117868721485138, + "reward_std": 0.817039430141449, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 1.3430280685424805, + "rewards/no_repetition_reward_func": -0.04547187685966492, + "rewards/verse_reward_func": -0.0078125, + "step": 135 + }, + { + "completion_length": 467.0625, + "epoch": 1.088, + "grad_norm": 0.466796875, + "kl": 0.0192941315472126, + "learning_rate": 4.998820614092328e-05, + "loss": 0.0008, + "reward": 0.9254534840583801, + "reward_std": 0.8020221292972565, + "rewards/check_divine_comedy_plagiarism": -0.21875, + "rewards/endecasillabo_reward_func": 1.1873353123664856, + "rewards/no_repetition_reward_func": -0.0431317463517189, + "rewards/verse_reward_func": 0.0, + "step": 136 + }, + { + "completion_length": 489.609375, + "epoch": 1.096, + "grad_norm": 0.337890625, + "kl": 0.019163841381669044, + "learning_rate": 4.9985964542786614e-05, + "loss": 0.0008, + "reward": 1.4049009084701538, + "reward_std": 0.7209422588348389, + "rewards/check_divine_comedy_plagiarism": -0.234375, + "rewards/endecasillabo_reward_func": 1.6837031841278076, + "rewards/no_repetition_reward_func": -0.044427333399653435, + "rewards/verse_reward_func": 0.0, + "step": 137 + }, + { + "completion_length": 472.375, + "epoch": 1.104, + "grad_norm": 0.408203125, + "kl": 0.022796817123889923, + "learning_rate": 4.998352809907928e-05, + "loss": 0.0009, + "reward": 0.8673029541969299, + "reward_std": 0.7481123208999634, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 1.0824537873268127, + "rewards/no_repetition_reward_func": -0.04327577166259289, + "rewards/verse_reward_func": 0.0, + "step": 138 + }, + { + "completion_length": 465.46875, + "epoch": 1.112, + "grad_norm": 0.38671875, + "kl": 0.020916489884257317, + "learning_rate": 4.998089682880117e-05, + "loss": 0.0008, + "reward": 1.2010924220085144, + "reward_std": 0.8952957987785339, + "rewards/check_divine_comedy_plagiarism": -0.3125, + "rewards/endecasillabo_reward_func": 1.5584226250648499, + "rewards/no_repetition_reward_func": -0.04483017139136791, + "rewards/verse_reward_func": 0.0, + "step": 139 + }, + { + "completion_length": 503.8125, + "epoch": 1.12, + "grad_norm": 0.39453125, + "kl": 0.0209491653367877, + "learning_rate": 4.997807075247146e-05, + "loss": 0.0008, + "reward": 1.0968765914440155, + "reward_std": 0.7842168509960175, + "rewards/check_divine_comedy_plagiarism": -0.234375, + "rewards/endecasillabo_reward_func": 1.3767050504684448, + "rewards/no_repetition_reward_func": -0.04545341432094574, + "rewards/verse_reward_func": 0.0, + "step": 140 + }, + { + "completion_length": 504.953125, + "epoch": 1.1280000000000001, + "grad_norm": 0.4375, + "kl": 0.02285260334610939, + "learning_rate": 4.9975049892128455e-05, + "loss": 0.0009, + "reward": 1.435727596282959, + "reward_std": 0.8578230142593384, + "rewards/check_divine_comedy_plagiarism": -0.203125, + "rewards/endecasillabo_reward_func": 1.6804776191711426, + "rewards/no_repetition_reward_func": -0.04162488505244255, + "rewards/verse_reward_func": 0.0, + "step": 141 + }, + { + "completion_length": 477.515625, + "epoch": 1.1360000000000001, + "grad_norm": 0.349609375, + "kl": 0.02167558390647173, + "learning_rate": 4.997183427132943e-05, + "loss": 0.0009, + "reward": 1.0282384157180786, + "reward_std": 0.8157914876937866, + "rewards/check_divine_comedy_plagiarism": -0.234375, + "rewards/endecasillabo_reward_func": 1.307436227798462, + "rewards/no_repetition_reward_func": -0.044822804629802704, + "rewards/verse_reward_func": 0.0, + "step": 142 + }, + { + "completion_length": 487.234375, + "epoch": 1.144, + "grad_norm": 0.5234375, + "kl": 0.024569068104028702, + "learning_rate": 4.996842391515044e-05, + "loss": 0.001, + "reward": 1.0931780934333801, + "reward_std": 0.7948631942272186, + "rewards/check_divine_comedy_plagiarism": -0.21875, + "rewards/endecasillabo_reward_func": 1.3624120354652405, + "rewards/no_repetition_reward_func": -0.042671388015151024, + "rewards/verse_reward_func": -0.0078125, + "step": 143 + }, + { + "completion_length": 492.765625, + "epoch": 1.152, + "grad_norm": 0.33984375, + "kl": 0.024668540805578232, + "learning_rate": 4.9964818850186135e-05, + "loss": 0.001, + "reward": 1.261244297027588, + "reward_std": 0.9897023439407349, + "rewards/check_divine_comedy_plagiarism": -0.25, + "rewards/endecasillabo_reward_func": 1.558914840221405, + "rewards/no_repetition_reward_func": -0.04767055623233318, + "rewards/verse_reward_func": 0.0, + "step": 144 + }, + { + "completion_length": 465.21875, + "epoch": 1.16, + "grad_norm": 0.54296875, + "kl": 0.02321091666817665, + "learning_rate": 4.996101910454953e-05, + "loss": 0.0009, + "reward": 1.2484649121761322, + "reward_std": 1.0629516243934631, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 1.4465356469154358, + "rewards/no_repetition_reward_func": -0.041820771992206573, + "rewards/verse_reward_func": -0.015625, + "step": 145 + }, + { + "completion_length": 476.328125, + "epoch": 1.168, + "grad_norm": 0.5625, + "kl": 0.0270308880135417, + "learning_rate": 4.9957024707871806e-05, + "loss": 0.0011, + "reward": 1.4868295788764954, + "reward_std": 1.0868560671806335, + "rewards/check_divine_comedy_plagiarism": -0.203125, + "rewards/endecasillabo_reward_func": 1.746797263622284, + "rewards/no_repetition_reward_func": -0.04903018660843372, + "rewards/verse_reward_func": -0.0078125, + "step": 146 + }, + { + "completion_length": 497.3125, + "epoch": 1.176, + "grad_norm": 0.322265625, + "kl": 0.025261851027607918, + "learning_rate": 4.995283569130207e-05, + "loss": 0.001, + "reward": 1.238202691078186, + "reward_std": 0.7323828339576721, + "rewards/check_divine_comedy_plagiarism": -0.25, + "rewards/endecasillabo_reward_func": 1.533545732498169, + "rewards/no_repetition_reward_func": -0.0453429389744997, + "rewards/verse_reward_func": 0.0, + "step": 147 + }, + { + "completion_length": 504.390625, + "epoch": 1.184, + "grad_norm": 0.3515625, + "kl": 0.029752018861472607, + "learning_rate": 4.9948452087507116e-05, + "loss": 0.0012, + "reward": 1.296115756034851, + "reward_std": 0.8444748222827911, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 1.5093401670455933, + "rewards/no_repetition_reward_func": -0.04134938679635525, + "rewards/verse_reward_func": 0.0, + "step": 148 + }, + { + "completion_length": 492.0, + "epoch": 1.192, + "grad_norm": 0.625, + "kl": 0.030186453834176064, + "learning_rate": 4.994387393067117e-05, + "loss": 0.0012, + "reward": 1.3956579566001892, + "reward_std": 0.9162646532058716, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 1.6273554563522339, + "rewards/no_repetition_reward_func": -0.05201003514230251, + "rewards/verse_reward_func": -0.0078125, + "step": 149 + }, + { + "completion_length": 489.25, + "epoch": 1.2, + "grad_norm": 0.41015625, + "kl": 0.032290924340486526, + "learning_rate": 4.993910125649561e-05, + "loss": 0.0013, + "reward": 1.4135644435882568, + "reward_std": 0.7723727226257324, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 1.6108007431030273, + "rewards/no_repetition_reward_func": -0.04098622687160969, + "rewards/verse_reward_func": 0.0, + "step": 150 + }, + { + "completion_length": 499.859375, + "epoch": 1.208, + "grad_norm": 0.3515625, + "kl": 0.029701238498091698, + "learning_rate": 4.993413410219871e-05, + "loss": 0.0012, + "reward": 1.2983157634735107, + "reward_std": 0.7290353775024414, + "rewards/check_divine_comedy_plagiarism": -0.234375, + "rewards/endecasillabo_reward_func": 1.5830079913139343, + "rewards/no_repetition_reward_func": -0.05031728185713291, + "rewards/verse_reward_func": 0.0, + "step": 151 + }, + { + "completion_length": 486.9375, + "epoch": 1.216, + "grad_norm": 0.361328125, + "kl": 0.03047781065106392, + "learning_rate": 4.992897250651535e-05, + "loss": 0.0012, + "reward": 1.6017792224884033, + "reward_std": 0.7814383804798126, + "rewards/check_divine_comedy_plagiarism": -0.21875, + "rewards/endecasillabo_reward_func": 1.8685170412063599, + "rewards/no_repetition_reward_func": -0.04798780754208565, + "rewards/verse_reward_func": 0.0, + "step": 152 + }, + { + "completion_length": 494.5, + "epoch": 1.224, + "grad_norm": 0.33984375, + "kl": 0.030035555362701416, + "learning_rate": 4.9923616509696683e-05, + "loss": 0.0012, + "reward": 1.5098856687545776, + "reward_std": 1.0164735019207, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 1.7092449069023132, + "rewards/no_repetition_reward_func": -0.043109240010380745, + "rewards/verse_reward_func": 0.0, + "step": 153 + }, + { + "completion_length": 488.921875, + "epoch": 1.232, + "grad_norm": 0.5546875, + "kl": 0.03215971402823925, + "learning_rate": 4.9918066153509834e-05, + "loss": 0.0013, + "reward": 1.5010871291160583, + "reward_std": 1.0179056525230408, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 1.6921757459640503, + "rewards/no_repetition_reward_func": -0.050463516265153885, + "rewards/verse_reward_func": 0.0, + "step": 154 + }, + { + "completion_length": 501.359375, + "epoch": 1.24, + "grad_norm": 0.3046875, + "kl": 0.033996814861893654, + "learning_rate": 4.991232148123761e-05, + "loss": 0.0014, + "reward": 1.6083585619926453, + "reward_std": 0.8226166665554047, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 1.7796440124511719, + "rewards/no_repetition_reward_func": -0.04628537967801094, + "rewards/verse_reward_func": 0.0, + "step": 155 + }, + { + "completion_length": 483.140625, + "epoch": 1.248, + "grad_norm": 0.36328125, + "kl": 0.03517594002187252, + "learning_rate": 4.990638253767812e-05, + "loss": 0.0014, + "reward": 1.4982413053512573, + "reward_std": 0.9330625236034393, + "rewards/check_divine_comedy_plagiarism": -0.3125, + "rewards/endecasillabo_reward_func": 1.8592085242271423, + "rewards/no_repetition_reward_func": -0.04846729524433613, + "rewards/verse_reward_func": 0.0, + "step": 156 + }, + { + "completion_length": 483.78125, + "epoch": 1.256, + "grad_norm": 0.40234375, + "kl": 0.03153814375400543, + "learning_rate": 4.9900249369144434e-05, + "loss": 0.0013, + "reward": 1.7124320268630981, + "reward_std": 0.8352412283420563, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 1.8840967416763306, + "rewards/no_repetition_reward_func": -0.046664800494909286, + "rewards/verse_reward_func": 0.0, + "step": 157 + }, + { + "completion_length": 506.421875, + "epoch": 1.264, + "grad_norm": 0.341796875, + "kl": 0.034675946459174156, + "learning_rate": 4.9893922023464236e-05, + "loss": 0.0014, + "reward": 1.7153830528259277, + "reward_std": 0.9767770171165466, + "rewards/check_divine_comedy_plagiarism": -0.203125, + "rewards/endecasillabo_reward_func": 1.9652009010314941, + "rewards/no_repetition_reward_func": -0.04669289477169514, + "rewards/verse_reward_func": 0.0, + "step": 158 + }, + { + "completion_length": 482.34375, + "epoch": 1.272, + "grad_norm": 0.412109375, + "kl": 0.03509502485394478, + "learning_rate": 4.988740054997943e-05, + "loss": 0.0014, + "reward": 1.4190146327018738, + "reward_std": 1.175166666507721, + "rewards/check_divine_comedy_plagiarism": -0.265625, + "rewards/endecasillabo_reward_func": 1.744959533214569, + "rewards/no_repetition_reward_func": -0.04469497315585613, + "rewards/verse_reward_func": -0.015625, + "step": 159 + }, + { + "completion_length": 488.53125, + "epoch": 1.28, + "grad_norm": 0.765625, + "kl": 0.03635891526937485, + "learning_rate": 4.988068499954578e-05, + "loss": 0.0015, + "reward": 1.3922036290168762, + "reward_std": 0.9128883481025696, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 1.5677708983421326, + "rewards/no_repetition_reward_func": -0.04275474511086941, + "rewards/verse_reward_func": -0.0078125, + "step": 160 + }, + { + "completion_length": 509.3125, + "epoch": 1.288, + "grad_norm": 0.33984375, + "kl": 0.037595415487885475, + "learning_rate": 4.987377542453251e-05, + "loss": 0.0015, + "reward": 1.7273921370506287, + "reward_std": 0.7672889232635498, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 1.896098256111145, + "rewards/no_repetition_reward_func": -0.04370614141225815, + "rewards/verse_reward_func": 0.0, + "step": 161 + }, + { + "completion_length": 501.25, + "epoch": 1.296, + "grad_norm": 0.36328125, + "kl": 0.03937136195600033, + "learning_rate": 4.986667187882186e-05, + "loss": 0.0016, + "reward": 1.7635703682899475, + "reward_std": 0.9336355030536652, + "rewards/check_divine_comedy_plagiarism": -0.265625, + "rewards/endecasillabo_reward_func": 2.0765941739082336, + "rewards/no_repetition_reward_func": -0.047398870810866356, + "rewards/verse_reward_func": 0.0, + "step": 162 + }, + { + "completion_length": 486.4375, + "epoch": 1.304, + "grad_norm": 0.349609375, + "kl": 0.03927781619131565, + "learning_rate": 4.98593744178087e-05, + "loss": 0.0016, + "reward": 1.7251930832862854, + "reward_std": 0.715046614408493, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 1.9129748940467834, + "rewards/no_repetition_reward_func": -0.04715685173869133, + "rewards/verse_reward_func": 0.0, + "step": 163 + }, + { + "completion_length": 485.25, + "epoch": 1.312, + "grad_norm": 0.365234375, + "kl": 0.04016737826168537, + "learning_rate": 4.985188309840012e-05, + "loss": 0.0016, + "reward": 1.5819841623306274, + "reward_std": 0.874216765165329, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 1.8094885349273682, + "rewards/no_repetition_reward_func": -0.04781687632203102, + "rewards/verse_reward_func": -0.0078125, + "step": 164 + }, + { + "completion_length": 499.53125, + "epoch": 1.32, + "grad_norm": 0.33984375, + "kl": 0.04220748133957386, + "learning_rate": 4.984419797901491e-05, + "loss": 0.0017, + "reward": 1.9484219551086426, + "reward_std": 1.1173385977745056, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 2.151341497898102, + "rewards/no_repetition_reward_func": -0.04666942358016968, + "rewards/verse_reward_func": 0.0, + "step": 165 + }, + { + "completion_length": 511.5625, + "epoch": 1.328, + "grad_norm": 0.51953125, + "kl": 0.05515013448894024, + "learning_rate": 4.983631911958319e-05, + "loss": 0.0022, + "reward": 2.1161813139915466, + "reward_std": 0.8764962255954742, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 2.239416718482971, + "rewards/no_repetition_reward_func": -0.045110369101166725, + "rewards/verse_reward_func": 0.0, + "step": 166 + }, + { + "completion_length": 488.25, + "epoch": 1.336, + "grad_norm": 0.345703125, + "kl": 0.03920900821685791, + "learning_rate": 4.982824658154589e-05, + "loss": 0.0016, + "reward": 1.6657472848892212, + "reward_std": 1.0773403942584991, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 1.8885748386383057, + "rewards/no_repetition_reward_func": -0.05095253698527813, + "rewards/verse_reward_func": 0.0, + "step": 167 + }, + { + "completion_length": 502.078125, + "epoch": 1.3439999999999999, + "grad_norm": 0.333984375, + "kl": 0.04175206460058689, + "learning_rate": 4.981998042785427e-05, + "loss": 0.0017, + "reward": 2.017165780067444, + "reward_std": 1.0157120823860168, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 2.226186156272888, + "rewards/no_repetition_reward_func": -0.052770448848605156, + "rewards/verse_reward_func": 0.0, + "step": 168 + }, + { + "completion_length": 495.75, + "epoch": 1.3519999999999999, + "grad_norm": 0.5078125, + "kl": 0.041013073176145554, + "learning_rate": 4.9811520722969465e-05, + "loss": 0.0016, + "reward": 1.90992271900177, + "reward_std": 0.9325758814811707, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 2.061087131500244, + "rewards/no_repetition_reward_func": -0.04960194416344166, + "rewards/verse_reward_func": -0.0078125, + "step": 169 + }, + { + "completion_length": 479.65625, + "epoch": 1.3599999999999999, + "grad_norm": 0.443359375, + "kl": 0.04386119917035103, + "learning_rate": 4.980286753286195e-05, + "loss": 0.0018, + "reward": 1.5909650325775146, + "reward_std": 1.2001194953918457, + "rewards/check_divine_comedy_plagiarism": -0.1875, + "rewards/endecasillabo_reward_func": 1.832824170589447, + "rewards/no_repetition_reward_func": -0.04654666781425476, + "rewards/verse_reward_func": -0.0078125, + "step": 170 + }, + { + "completion_length": 500.578125, + "epoch": 1.3679999999999999, + "grad_norm": 0.34765625, + "kl": 0.04243324138224125, + "learning_rate": 4.9794020925011044e-05, + "loss": 0.0017, + "reward": 1.9046602845191956, + "reward_std": 0.952025443315506, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 2.07928466796875, + "rewards/no_repetition_reward_func": -0.04962450824677944, + "rewards/verse_reward_func": 0.0, + "step": 171 + }, + { + "completion_length": 505.5625, + "epoch": 1.376, + "grad_norm": 0.322265625, + "kl": 0.04043492302298546, + "learning_rate": 4.978498096840436e-05, + "loss": 0.0016, + "reward": 1.5453163981437683, + "reward_std": 0.785085916519165, + "rewards/check_divine_comedy_plagiarism": -0.21875, + "rewards/endecasillabo_reward_func": 1.8121280074119568, + "rewards/no_repetition_reward_func": -0.04806157201528549, + "rewards/verse_reward_func": 0.0, + "step": 172 + }, + { + "completion_length": 503.984375, + "epoch": 1.384, + "grad_norm": 0.361328125, + "kl": 0.046845294535160065, + "learning_rate": 4.977574773353732e-05, + "loss": 0.0019, + "reward": 1.9868143796920776, + "reward_std": 1.016902208328247, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 2.1253031492233276, + "rewards/no_repetition_reward_func": -0.044738685712218285, + "rewards/verse_reward_func": 0.0, + "step": 173 + }, + { + "completion_length": 496.703125, + "epoch": 1.392, + "grad_norm": 0.337890625, + "kl": 0.047794874757528305, + "learning_rate": 4.976632129241252e-05, + "loss": 0.0019, + "reward": 2.057507276535034, + "reward_std": 1.0268380641937256, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 2.215571165084839, + "rewards/no_repetition_reward_func": -0.04868905432522297, + "rewards/verse_reward_func": 0.0, + "step": 174 + }, + { + "completion_length": 495.3125, + "epoch": 1.4, + "grad_norm": 1.8671875, + "kl": 0.04859863966703415, + "learning_rate": 4.975670171853926e-05, + "loss": 0.0019, + "reward": 2.3248183727264404, + "reward_std": 1.017104059457779, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 2.5184853076934814, + "rewards/no_repetition_reward_func": -0.053041817620396614, + "rewards/verse_reward_func": 0.0, + "step": 175 + }, + { + "completion_length": 492.625, + "epoch": 1.408, + "grad_norm": 0.3828125, + "kl": 0.048793213441967964, + "learning_rate": 4.9746889086932895e-05, + "loss": 0.002, + "reward": 1.9652841091156006, + "reward_std": 0.9530762434005737, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 2.100122809410095, + "rewards/no_repetition_reward_func": -0.04890110343694687, + "rewards/verse_reward_func": -0.0078125, + "step": 176 + }, + { + "completion_length": 505.71875, + "epoch": 1.416, + "grad_norm": 0.330078125, + "kl": 0.052117861807346344, + "learning_rate": 4.973688347411431e-05, + "loss": 0.0021, + "reward": 2.27458393573761, + "reward_std": 1.0376586616039276, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 2.4665403366088867, + "rewards/no_repetition_reward_func": -0.05133134685456753, + "rewards/verse_reward_func": 0.0, + "step": 177 + }, + { + "completion_length": 496.671875, + "epoch": 1.424, + "grad_norm": 0.38671875, + "kl": 0.05739965848624706, + "learning_rate": 4.9726684958109266e-05, + "loss": 0.0023, + "reward": 2.431235671043396, + "reward_std": 1.0570597350597382, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 2.6314516067504883, + "rewards/no_repetition_reward_func": -0.043965984135866165, + "rewards/verse_reward_func": 0.0, + "step": 178 + }, + { + "completion_length": 476.140625, + "epoch": 1.432, + "grad_norm": 0.353515625, + "kl": 0.049789853394031525, + "learning_rate": 4.971629361844785e-05, + "loss": 0.002, + "reward": 2.0167656540870667, + "reward_std": 1.120947539806366, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 2.2094757556915283, + "rewards/no_repetition_reward_func": -0.05208521708846092, + "rewards/verse_reward_func": 0.0, + "step": 179 + }, + { + "completion_length": 498.96875, + "epoch": 1.44, + "grad_norm": 0.6171875, + "kl": 0.059595197439193726, + "learning_rate": 4.9705709536163824e-05, + "loss": 0.0024, + "reward": 2.348409056663513, + "reward_std": 1.0287013053894043, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 2.487688899040222, + "rewards/no_repetition_reward_func": -0.04552990384399891, + "rewards/verse_reward_func": 0.0, + "step": 180 + }, + { + "completion_length": 504.109375, + "epoch": 1.448, + "grad_norm": 0.365234375, + "kl": 0.05591634660959244, + "learning_rate": 4.969493279379398e-05, + "loss": 0.0022, + "reward": 2.160691499710083, + "reward_std": 1.0874946117401123, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 2.368074893951416, + "rewards/no_repetition_reward_func": -0.0511332843452692, + "rewards/verse_reward_func": 0.0, + "step": 181 + }, + { + "completion_length": 485.125, + "epoch": 1.456, + "grad_norm": 0.44140625, + "kl": 0.05642455257475376, + "learning_rate": 4.968396347537751e-05, + "loss": 0.0023, + "reward": 2.50939404964447, + "reward_std": 1.1524232625961304, + "rewards/check_divine_comedy_plagiarism": -0.203125, + "rewards/endecasillabo_reward_func": 2.7775166034698486, + "rewards/no_repetition_reward_func": -0.05718499608337879, + "rewards/verse_reward_func": -0.0078125, + "step": 182 + }, + { + "completion_length": 513.125, + "epoch": 1.464, + "grad_norm": 0.35546875, + "kl": 0.06478038430213928, + "learning_rate": 4.967280166645538e-05, + "loss": 0.0026, + "reward": 2.852421760559082, + "reward_std": 1.119005262851715, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 2.9872894287109375, + "rewards/no_repetition_reward_func": -0.0489300899207592, + "rewards/verse_reward_func": -0.0078125, + "step": 183 + }, + { + "completion_length": 503.09375, + "epoch": 1.472, + "grad_norm": 0.333984375, + "kl": 0.056777598336339, + "learning_rate": 4.966144745406961e-05, + "loss": 0.0023, + "reward": 2.2432305812835693, + "reward_std": 1.0427147150039673, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 2.4254778623580933, + "rewards/no_repetition_reward_func": -0.05724722146987915, + "rewards/verse_reward_func": 0.0, + "step": 184 + }, + { + "completion_length": 502.078125, + "epoch": 1.48, + "grad_norm": 0.341796875, + "kl": 0.05473160557448864, + "learning_rate": 4.964990092676263e-05, + "loss": 0.0022, + "reward": 2.1923807859420776, + "reward_std": 1.0567432045936584, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 2.337090492248535, + "rewards/no_repetition_reward_func": -0.05095980130136013, + "rewards/verse_reward_func": 0.0, + "step": 185 + }, + { + "completion_length": 510.28125, + "epoch": 1.488, + "grad_norm": 0.3359375, + "kl": 0.055219683796167374, + "learning_rate": 4.963816217457657e-05, + "loss": 0.0022, + "reward": 2.1019275188446045, + "reward_std": 1.1001242995262146, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 2.2371888160705566, + "rewards/no_repetition_reward_func": -0.0571361668407917, + "rewards/verse_reward_func": 0.0, + "step": 186 + }, + { + "completion_length": 499.359375, + "epoch": 1.496, + "grad_norm": 0.37109375, + "kl": 0.059148844331502914, + "learning_rate": 4.9626231289052596e-05, + "loss": 0.0024, + "reward": 2.393579840660095, + "reward_std": 1.0116261839866638, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 2.5770163536071777, + "rewards/no_repetition_reward_func": -0.05843658186495304, + "rewards/verse_reward_func": 0.0, + "step": 187 + }, + { + "completion_length": 500.15625, + "epoch": 1.504, + "grad_norm": 0.53125, + "kl": 0.057259781286120415, + "learning_rate": 4.9614108363230135e-05, + "loss": 0.0023, + "reward": 2.1232675313949585, + "reward_std": 1.1966499090194702, + "rewards/check_divine_comedy_plagiarism": -0.1875, + "rewards/endecasillabo_reward_func": 2.3699220418930054, + "rewards/no_repetition_reward_func": -0.05134197138249874, + "rewards/verse_reward_func": -0.0078125, + "step": 188 + }, + { + "completion_length": 476.6875, + "epoch": 1.512, + "grad_norm": 1.4453125, + "kl": 0.055534353479743004, + "learning_rate": 4.960179349164621e-05, + "loss": 0.0022, + "reward": 1.9770573377609253, + "reward_std": 0.9365957975387573, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 2.1771883964538574, + "rewards/no_repetition_reward_func": -0.059506142511963844, + "rewards/verse_reward_func": -0.015625, + "step": 189 + }, + { + "completion_length": 506.421875, + "epoch": 1.52, + "grad_norm": 0.31640625, + "kl": 0.05753283575177193, + "learning_rate": 4.9589286770334654e-05, + "loss": 0.0023, + "reward": 2.3927392959594727, + "reward_std": 1.0350596010684967, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 2.55940318107605, + "rewards/no_repetition_reward_func": -0.05728885717689991, + "rewards/verse_reward_func": 0.0, + "step": 190 + }, + { + "completion_length": 513.484375, + "epoch": 1.528, + "grad_norm": 0.32421875, + "kl": 0.05903477966785431, + "learning_rate": 4.9576588296825386e-05, + "loss": 0.0024, + "reward": 2.5249528884887695, + "reward_std": 1.2699385285377502, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 2.6938159465789795, + "rewards/no_repetition_reward_func": -0.05948808416724205, + "rewards/verse_reward_func": 0.0, + "step": 191 + }, + { + "completion_length": 490.515625, + "epoch": 1.536, + "grad_norm": 0.6484375, + "kl": 0.06209894269704819, + "learning_rate": 4.9563698170143666e-05, + "loss": 0.0025, + "reward": 2.2243632078170776, + "reward_std": 1.0745408236980438, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 2.338887870311737, + "rewards/no_repetition_reward_func": -0.059837039560079575, + "rewards/verse_reward_func": -0.0078125, + "step": 192 + }, + { + "completion_length": 499.71875, + "epoch": 1.544, + "grad_norm": 0.3359375, + "kl": 0.06187794730067253, + "learning_rate": 4.95506164908093e-05, + "loss": 0.0025, + "reward": 2.3777061700820923, + "reward_std": 1.0805965065956116, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 2.4948073625564575, + "rewards/no_repetition_reward_func": -0.05460111051797867, + "rewards/verse_reward_func": 0.0, + "step": 193 + }, + { + "completion_length": 505.375, + "epoch": 1.552, + "grad_norm": 0.443359375, + "kl": 0.06536386534571648, + "learning_rate": 4.953734336083583e-05, + "loss": 0.0026, + "reward": 2.721463680267334, + "reward_std": 1.117157757282257, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 2.8564270734786987, + "rewards/no_repetition_reward_func": -0.05683847330510616, + "rewards/verse_reward_func": 0.0, + "step": 194 + }, + { + "completion_length": 503.015625, + "epoch": 1.56, + "grad_norm": 0.388671875, + "kl": 0.06220167875289917, + "learning_rate": 4.952387888372979e-05, + "loss": 0.0025, + "reward": 2.6069748401641846, + "reward_std": 1.291869878768921, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 2.815003275871277, + "rewards/no_repetition_reward_func": -0.06740351766347885, + "rewards/verse_reward_func": 0.0, + "step": 195 + }, + { + "completion_length": 491.953125, + "epoch": 1.568, + "grad_norm": 0.345703125, + "kl": 0.05213422700762749, + "learning_rate": 4.95102231644899e-05, + "loss": 0.0021, + "reward": 2.0487062335014343, + "reward_std": 0.9666328430175781, + "rewards/check_divine_comedy_plagiarism": -0.25, + "rewards/endecasillabo_reward_func": 2.3606979846954346, + "rewards/no_repetition_reward_func": -0.0619916133582592, + "rewards/verse_reward_func": 0.0, + "step": 196 + }, + { + "completion_length": 504.03125, + "epoch": 1.576, + "grad_norm": 0.392578125, + "kl": 0.059214821085333824, + "learning_rate": 4.949637630960617e-05, + "loss": 0.0024, + "reward": 2.5467708110809326, + "reward_std": 1.095446616411209, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 2.7130298614501953, + "rewards/no_repetition_reward_func": -0.0725090503692627, + "rewards/verse_reward_func": 0.0, + "step": 197 + }, + { + "completion_length": 506.09375, + "epoch": 1.584, + "grad_norm": 0.287109375, + "kl": 0.060350148007273674, + "learning_rate": 4.948233842705919e-05, + "loss": 0.0024, + "reward": 2.5752665996551514, + "reward_std": 1.4399675726890564, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 2.774072051048279, + "rewards/no_repetition_reward_func": -0.07380523532629013, + "rewards/verse_reward_func": 0.0, + "step": 198 + }, + { + "completion_length": 505.0, + "epoch": 1.592, + "grad_norm": 0.365234375, + "kl": 0.05758612044155598, + "learning_rate": 4.946810962631916e-05, + "loss": 0.0023, + "reward": 2.364269733428955, + "reward_std": 1.4654395580291748, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 2.5511739253997803, + "rewards/no_repetition_reward_func": -0.0697169192135334, + "rewards/verse_reward_func": -0.0078125, + "step": 199 + }, + { + "completion_length": 498.65625, + "epoch": 1.6, + "grad_norm": 0.39453125, + "kl": 0.06153145059943199, + "learning_rate": 4.9453690018345144e-05, + "loss": 0.0025, + "reward": 2.1242388486862183, + "reward_std": 1.2155637741088867, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 2.261116862297058, + "rewards/no_repetition_reward_func": -0.058753009885549545, + "rewards/verse_reward_func": 0.0, + "step": 200 + }, + { + "completion_length": 494.71875, + "epoch": 1.608, + "grad_norm": 0.3125, + "kl": 0.058732934296131134, + "learning_rate": 4.9439079715584135e-05, + "loss": 0.0023, + "reward": 2.3503177165985107, + "reward_std": 1.0894514918327332, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 2.598857879638672, + "rewards/no_repetition_reward_func": -0.07666521146893501, + "rewards/verse_reward_func": 0.0, + "step": 201 + }, + { + "completion_length": 510.359375, + "epoch": 1.616, + "grad_norm": 0.322265625, + "kl": 0.06430498510599136, + "learning_rate": 4.942427883197021e-05, + "loss": 0.0026, + "reward": 2.6571733951568604, + "reward_std": 1.015516221523285, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 2.8161243200302124, + "rewards/no_repetition_reward_func": -0.06520115211606026, + "rewards/verse_reward_func": 0.0, + "step": 202 + }, + { + "completion_length": 505.734375, + "epoch": 1.624, + "grad_norm": 0.306640625, + "kl": 0.06140182912349701, + "learning_rate": 4.940928748292363e-05, + "loss": 0.0025, + "reward": 2.7970908880233765, + "reward_std": 1.4815118908882141, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 2.963927149772644, + "rewards/no_repetition_reward_func": -0.07308632135391235, + "rewards/verse_reward_func": 0.0, + "step": 203 + }, + { + "completion_length": 505.03125, + "epoch": 1.6320000000000001, + "grad_norm": 0.310546875, + "kl": 0.059984346851706505, + "learning_rate": 4.9394105785349944e-05, + "loss": 0.0024, + "reward": 2.593742847442627, + "reward_std": 1.2685362100601196, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 2.8400269746780396, + "rewards/no_repetition_reward_func": -0.0744091160595417, + "rewards/verse_reward_func": 0.0, + "step": 204 + }, + { + "completion_length": 494.828125, + "epoch": 1.6400000000000001, + "grad_norm": 0.3046875, + "kl": 0.06854120641946793, + "learning_rate": 4.937873385763908e-05, + "loss": 0.0027, + "reward": 3.199925422668457, + "reward_std": 1.5619657635688782, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 3.3613587617874146, + "rewards/no_repetition_reward_func": -0.08330819010734558, + "rewards/verse_reward_func": 0.0, + "step": 205 + }, + { + "completion_length": 509.15625, + "epoch": 1.6480000000000001, + "grad_norm": 0.306640625, + "kl": 0.06849325075745583, + "learning_rate": 4.9363171819664434e-05, + "loss": 0.0027, + "reward": 3.045534372329712, + "reward_std": 1.4717455506324768, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 3.2179375886917114, + "rewards/no_repetition_reward_func": -0.07865316048264503, + "rewards/verse_reward_func": 0.0, + "step": 206 + }, + { + "completion_length": 509.828125, + "epoch": 1.6560000000000001, + "grad_norm": 0.328125, + "kl": 0.06695650145411491, + "learning_rate": 4.9347419792781876e-05, + "loss": 0.0027, + "reward": 2.713486909866333, + "reward_std": 1.081823080778122, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 2.803895592689514, + "rewards/no_repetition_reward_func": -0.05915876477956772, + "rewards/verse_reward_func": 0.0, + "step": 207 + }, + { + "completion_length": 499.65625, + "epoch": 1.6640000000000001, + "grad_norm": 0.353515625, + "kl": 0.06637488305568695, + "learning_rate": 4.93314778998289e-05, + "loss": 0.0027, + "reward": 2.6594350337982178, + "reward_std": 1.3726709485054016, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 2.8672618865966797, + "rewards/no_repetition_reward_func": -0.07501417398452759, + "rewards/verse_reward_func": -0.0078125, + "step": 208 + }, + { + "completion_length": 481.109375, + "epoch": 1.6720000000000002, + "grad_norm": 0.3515625, + "kl": 0.05885790474712849, + "learning_rate": 4.9315346265123594e-05, + "loss": 0.0024, + "reward": 2.1628894805908203, + "reward_std": 1.1130412817001343, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 2.294868588447571, + "rewards/no_repetition_reward_func": -0.0772915743291378, + "rewards/verse_reward_func": -0.0078125, + "step": 209 + }, + { + "completion_length": 501.15625, + "epoch": 1.6800000000000002, + "grad_norm": 0.326171875, + "kl": 0.06681947410106659, + "learning_rate": 4.929902501446366e-05, + "loss": 0.0027, + "reward": 2.7717407941818237, + "reward_std": 1.598202884197235, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 2.972840666770935, + "rewards/no_repetition_reward_func": -0.07610012590885162, + "rewards/verse_reward_func": 0.0, + "step": 210 + }, + { + "completion_length": 491.015625, + "epoch": 1.688, + "grad_norm": 0.3046875, + "kl": 0.0636095255613327, + "learning_rate": 4.92825142751255e-05, + "loss": 0.0025, + "reward": 3.0454728603363037, + "reward_std": 1.5663429498672485, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 3.2448137998580933, + "rewards/no_repetition_reward_func": -0.08996595069766045, + "rewards/verse_reward_func": 0.0, + "step": 211 + }, + { + "completion_length": 505.765625, + "epoch": 1.696, + "grad_norm": 0.326171875, + "kl": 0.06654490530490875, + "learning_rate": 4.9265814175863186e-05, + "loss": 0.0027, + "reward": 2.7433502674102783, + "reward_std": 1.353922426700592, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 2.8858747482299805, + "rewards/no_repetition_reward_func": -0.08002437278628349, + "rewards/verse_reward_func": 0.0, + "step": 212 + }, + { + "completion_length": 500.578125, + "epoch": 1.704, + "grad_norm": 0.306640625, + "kl": 0.07159844413399696, + "learning_rate": 4.924892484690743e-05, + "loss": 0.0029, + "reward": 2.9242680072784424, + "reward_std": 1.3001119494438171, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 3.1409326791763306, + "rewards/no_repetition_reward_func": -0.0760396160185337, + "rewards/verse_reward_func": 0.0, + "step": 213 + }, + { + "completion_length": 491.578125, + "epoch": 1.712, + "grad_norm": 0.310546875, + "kl": 0.06415756233036518, + "learning_rate": 4.923184641996463e-05, + "loss": 0.0026, + "reward": 3.0631299018859863, + "reward_std": 1.6679434180259705, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 3.268932580947876, + "rewards/no_repetition_reward_func": -0.09642736241221428, + "rewards/verse_reward_func": 0.0, + "step": 214 + }, + { + "completion_length": 497.765625, + "epoch": 1.72, + "grad_norm": 0.37109375, + "kl": 0.05811973102390766, + "learning_rate": 4.9214579028215776e-05, + "loss": 0.0023, + "reward": 2.569594621658325, + "reward_std": 1.6089882254600525, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 2.7898136377334595, + "rewards/no_repetition_reward_func": -0.09521926939487457, + "rewards/verse_reward_func": 0.0, + "step": 215 + }, + { + "completion_length": 502.96875, + "epoch": 1.728, + "grad_norm": 0.296875, + "kl": 0.06474460661411285, + "learning_rate": 4.919712280631547e-05, + "loss": 0.0026, + "reward": 2.8761953115463257, + "reward_std": 1.3299922347068787, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 3.0243782997131348, + "rewards/no_repetition_reward_func": -0.0856829322874546, + "rewards/verse_reward_func": 0.0, + "step": 216 + }, + { + "completion_length": 498.90625, + "epoch": 1.736, + "grad_norm": 0.310546875, + "kl": 0.06863456964492798, + "learning_rate": 4.9179477890390825e-05, + "loss": 0.0027, + "reward": 3.329008102416992, + "reward_std": 1.615666389465332, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 3.5736000537872314, + "rewards/no_repetition_reward_func": -0.10396680608391762, + "rewards/verse_reward_func": 0.0, + "step": 217 + }, + { + "completion_length": 508.75, + "epoch": 1.744, + "grad_norm": 0.322265625, + "kl": 0.0714041143655777, + "learning_rate": 4.916164441804044e-05, + "loss": 0.0029, + "reward": 3.2173657417297363, + "reward_std": 1.6365570425987244, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 3.419893741607666, + "rewards/no_repetition_reward_func": -0.10877805948257446, + "rewards/verse_reward_func": 0.0, + "step": 218 + }, + { + "completion_length": 505.8125, + "epoch": 1.752, + "grad_norm": 0.298828125, + "kl": 0.0687246136367321, + "learning_rate": 4.914362252833332e-05, + "loss": 0.0027, + "reward": 3.1534866094589233, + "reward_std": 1.4445326328277588, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 3.3648873567581177, + "rewards/no_repetition_reward_func": -0.10202562063932419, + "rewards/verse_reward_func": 0.0, + "step": 219 + }, + { + "completion_length": 499.21875, + "epoch": 1.76, + "grad_norm": 0.35546875, + "kl": 0.07193133607506752, + "learning_rate": 4.912541236180779e-05, + "loss": 0.0029, + "reward": 2.664386749267578, + "reward_std": 1.2780566215515137, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 2.783419609069824, + "rewards/no_repetition_reward_func": -0.08778273686766624, + "rewards/verse_reward_func": 0.0, + "step": 220 + }, + { + "completion_length": 503.4375, + "epoch": 1.768, + "grad_norm": 0.57421875, + "kl": 0.06494078785181046, + "learning_rate": 4.910701406047037e-05, + "loss": 0.0026, + "reward": 2.78882098197937, + "reward_std": 1.4788238406181335, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 2.9799296855926514, + "rewards/no_repetition_reward_func": -0.10517111420631409, + "rewards/verse_reward_func": -0.0078125, + "step": 221 + }, + { + "completion_length": 500.65625, + "epoch": 1.776, + "grad_norm": 0.326171875, + "kl": 0.07160357758402824, + "learning_rate": 4.908842776779472e-05, + "loss": 0.0029, + "reward": 2.793657660484314, + "reward_std": 1.5747646689414978, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 2.98151433467865, + "rewards/no_repetition_reward_func": -0.07848173752427101, + "rewards/verse_reward_func": 0.0, + "step": 222 + }, + { + "completion_length": 500.546875, + "epoch": 1.784, + "grad_norm": 0.302734375, + "kl": 0.06824653595685959, + "learning_rate": 4.906965362872047e-05, + "loss": 0.0027, + "reward": 3.1853344440460205, + "reward_std": 1.592510163784027, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 3.3494197130203247, + "rewards/no_repetition_reward_func": -0.101585254073143, + "rewards/verse_reward_func": 0.0, + "step": 223 + }, + { + "completion_length": 514.40625, + "epoch": 1.792, + "grad_norm": 0.302734375, + "kl": 0.07127313688397408, + "learning_rate": 4.905069178965215e-05, + "loss": 0.0029, + "reward": 3.0298691987991333, + "reward_std": 1.370770514011383, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 3.2418570518493652, + "rewards/no_repetition_reward_func": -0.10261284187436104, + "rewards/verse_reward_func": 0.0, + "step": 224 + }, + { + "completion_length": 508.140625, + "epoch": 1.8, + "grad_norm": 0.3046875, + "kl": 0.06873781606554985, + "learning_rate": 4.9031542398457974e-05, + "loss": 0.0027, + "reward": 2.8017191886901855, + "reward_std": 1.5255757570266724, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 2.925915241241455, + "rewards/no_repetition_reward_func": -0.09294591471552849, + "rewards/verse_reward_func": 0.0, + "step": 225 + }, + { + "completion_length": 513.734375, + "epoch": 1.808, + "grad_norm": 0.3046875, + "kl": 0.07772918045520782, + "learning_rate": 4.9012205604468744e-05, + "loss": 0.0031, + "reward": 3.338989734649658, + "reward_std": 1.4803727269172668, + "rewards/check_divine_comedy_plagiarism": -0.1875, + "rewards/endecasillabo_reward_func": 3.638012647628784, + "rewards/no_repetition_reward_func": -0.11152287572622299, + "rewards/verse_reward_func": 0.0, + "step": 226 + }, + { + "completion_length": 507.546875, + "epoch": 1.8159999999999998, + "grad_norm": 0.28125, + "kl": 0.07632273808121681, + "learning_rate": 4.899268155847667e-05, + "loss": 0.0031, + "reward": 3.102638363838196, + "reward_std": 1.193020761013031, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 3.268816828727722, + "rewards/no_repetition_reward_func": -0.08805351704359055, + "rewards/verse_reward_func": 0.0, + "step": 227 + }, + { + "completion_length": 498.875, + "epoch": 1.8239999999999998, + "grad_norm": 0.50390625, + "kl": 0.07189759239554405, + "learning_rate": 4.8972970412734176e-05, + "loss": 0.0029, + "reward": 3.357118844985962, + "reward_std": 1.6401885747909546, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 3.5579077005386353, + "rewards/no_repetition_reward_func": -0.12266393005847931, + "rewards/verse_reward_func": -0.015625, + "step": 228 + }, + { + "completion_length": 504.359375, + "epoch": 1.8319999999999999, + "grad_norm": 0.330078125, + "kl": 0.07323037087917328, + "learning_rate": 4.8953072320952745e-05, + "loss": 0.0029, + "reward": 2.947006344795227, + "reward_std": 1.3394423723220825, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 3.1556328535079956, + "rewards/no_repetition_reward_func": -0.08362649008631706, + "rewards/verse_reward_func": 0.0, + "step": 229 + }, + { + "completion_length": 500.703125, + "epoch": 1.8399999999999999, + "grad_norm": 0.3046875, + "kl": 0.0726243145763874, + "learning_rate": 4.893298743830168e-05, + "loss": 0.0029, + "reward": 2.6961551904678345, + "reward_std": 1.7145134806632996, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 2.9025362730026245, + "rewards/no_repetition_reward_func": -0.10481851547956467, + "rewards/verse_reward_func": -0.0078125, + "step": 230 + }, + { + "completion_length": 508.046875, + "epoch": 1.8479999999999999, + "grad_norm": 0.296875, + "kl": 0.07663927972316742, + "learning_rate": 4.891271592140695e-05, + "loss": 0.0031, + "reward": 3.0300631523132324, + "reward_std": 1.2490823864936829, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 3.262691617012024, + "rewards/no_repetition_reward_func": -0.09200334176421165, + "rewards/verse_reward_func": 0.0, + "step": 231 + }, + { + "completion_length": 488.671875, + "epoch": 1.8559999999999999, + "grad_norm": 0.3984375, + "kl": 0.06773153692483902, + "learning_rate": 4.889225792834991e-05, + "loss": 0.0027, + "reward": 3.047479033470154, + "reward_std": 1.3070793151855469, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 3.3006049394607544, + "rewards/no_repetition_reward_func": -0.09687581285834312, + "rewards/verse_reward_func": 0.0, + "step": 232 + }, + { + "completion_length": 490.359375, + "epoch": 1.8639999999999999, + "grad_norm": 0.416015625, + "kl": 0.06605518609285355, + "learning_rate": 4.887161361866608e-05, + "loss": 0.0026, + "reward": 2.8387598991394043, + "reward_std": 1.5805920958518982, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 3.068055748939514, + "rewards/no_repetition_reward_func": -0.09648354724049568, + "rewards/verse_reward_func": -0.0078125, + "step": 233 + }, + { + "completion_length": 508.84375, + "epoch": 1.8719999999999999, + "grad_norm": 0.3046875, + "kl": 0.07300540804862976, + "learning_rate": 4.885078315334395e-05, + "loss": 0.0029, + "reward": 3.049012780189514, + "reward_std": 1.4188016057014465, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 3.216846227645874, + "rewards/no_repetition_reward_func": -0.08970846235752106, + "rewards/verse_reward_func": 0.0, + "step": 234 + }, + { + "completion_length": 499.203125, + "epoch": 1.88, + "grad_norm": 0.59765625, + "kl": 0.07140311598777771, + "learning_rate": 4.882976669482367e-05, + "loss": 0.0029, + "reward": 2.9766345024108887, + "reward_std": 1.340936005115509, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 3.183377504348755, + "rewards/no_repetition_reward_func": -0.09736775979399681, + "rewards/verse_reward_func": 0.0, + "step": 235 + }, + { + "completion_length": 506.3125, + "epoch": 1.888, + "grad_norm": 0.330078125, + "kl": 0.06853189319372177, + "learning_rate": 4.880856440699582e-05, + "loss": 0.0027, + "reward": 3.1858813762664795, + "reward_std": 1.5762462615966797, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 3.433692455291748, + "rewards/no_repetition_reward_func": -0.09156115725636482, + "rewards/verse_reward_func": 0.0, + "step": 236 + }, + { + "completion_length": 510.46875, + "epoch": 1.896, + "grad_norm": 0.310546875, + "kl": 0.06697448343038559, + "learning_rate": 4.878717645520008e-05, + "loss": 0.0027, + "reward": 2.258975625038147, + "reward_std": 1.4997749328613281, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 2.440456986427307, + "rewards/no_repetition_reward_func": -0.09554393962025642, + "rewards/verse_reward_func": -0.0078125, + "step": 237 + }, + { + "completion_length": 489.1875, + "epoch": 1.904, + "grad_norm": 0.361328125, + "kl": 0.07771424576640129, + "learning_rate": 4.8765603006224006e-05, + "loss": 0.0031, + "reward": 2.9600815773010254, + "reward_std": 1.64453125, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 3.133408784866333, + "rewards/no_repetition_reward_func": -0.09520234167575836, + "rewards/verse_reward_func": 0.0, + "step": 238 + }, + { + "completion_length": 495.140625, + "epoch": 1.912, + "grad_norm": 0.4296875, + "kl": 0.06277117878198624, + "learning_rate": 4.874384422830167e-05, + "loss": 0.0025, + "reward": 2.6670076847076416, + "reward_std": 1.5096913576126099, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 2.8843451738357544, + "rewards/no_repetition_reward_func": -0.0923374854028225, + "rewards/verse_reward_func": 0.0, + "step": 239 + }, + { + "completion_length": 501.875, + "epoch": 1.92, + "grad_norm": 0.296875, + "kl": 0.06972404569387436, + "learning_rate": 4.8721900291112415e-05, + "loss": 0.0028, + "reward": 3.041973829269409, + "reward_std": 1.4522449970245361, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 3.2099310159683228, + "rewards/no_repetition_reward_func": -0.08983222767710686, + "rewards/verse_reward_func": 0.0, + "step": 240 + }, + { + "completion_length": 513.609375, + "epoch": 1.928, + "grad_norm": 0.29296875, + "kl": 0.06830225139856339, + "learning_rate": 4.8699771365779453e-05, + "loss": 0.0027, + "reward": 2.957208037376404, + "reward_std": 1.4727458953857422, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 3.143127679824829, + "rewards/no_repetition_reward_func": -0.0921698585152626, + "rewards/verse_reward_func": 0.0, + "step": 241 + }, + { + "completion_length": 496.953125, + "epoch": 1.936, + "grad_norm": 0.474609375, + "kl": 0.0700872428715229, + "learning_rate": 4.867745762486861e-05, + "loss": 0.0028, + "reward": 2.6644710302352905, + "reward_std": 1.5864287614822388, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 2.8457623720169067, + "rewards/no_repetition_reward_func": -0.09535382688045502, + "rewards/verse_reward_func": -0.0078125, + "step": 242 + }, + { + "completion_length": 486.0625, + "epoch": 1.944, + "grad_norm": 0.86328125, + "kl": 0.07269894331693649, + "learning_rate": 4.8654959242386896e-05, + "loss": 0.0029, + "reward": 3.050985336303711, + "reward_std": 1.7207348346710205, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 3.2463687658309937, + "rewards/no_repetition_reward_func": -0.09382085129618645, + "rewards/verse_reward_func": -0.0234375, + "step": 243 + }, + { + "completion_length": 507.8125, + "epoch": 1.952, + "grad_norm": 0.318359375, + "kl": 0.07244386896491051, + "learning_rate": 4.863227639378124e-05, + "loss": 0.0029, + "reward": 3.0806673765182495, + "reward_std": 1.4346663355827332, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 3.216737389564514, + "rewards/no_repetition_reward_func": -0.10482005774974823, + "rewards/verse_reward_func": 0.0, + "step": 244 + }, + { + "completion_length": 489.484375, + "epoch": 1.96, + "grad_norm": 0.443359375, + "kl": 0.06466012820601463, + "learning_rate": 4.860940925593703e-05, + "loss": 0.0026, + "reward": 2.3501996994018555, + "reward_std": 1.3326597213745117, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 2.4916345477104187, + "rewards/no_repetition_reward_func": -0.07112225517630577, + "rewards/verse_reward_func": -0.0078125, + "step": 245 + }, + { + "completion_length": 497.0625, + "epoch": 1.968, + "grad_norm": 0.390625, + "kl": 0.06556032411754131, + "learning_rate": 4.858635800717681e-05, + "loss": 0.0026, + "reward": 3.0849809646606445, + "reward_std": 1.3696974515914917, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 3.3093611001968384, + "rewards/no_repetition_reward_func": -0.08375527337193489, + "rewards/verse_reward_func": 0.0, + "step": 246 + }, + { + "completion_length": 502.40625, + "epoch": 1.976, + "grad_norm": 0.306640625, + "kl": 0.06797042489051819, + "learning_rate": 4.856312282725886e-05, + "loss": 0.0027, + "reward": 2.746731162071228, + "reward_std": 1.519513487815857, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 2.8806822299957275, + "rewards/no_repetition_reward_func": -0.10270101949572563, + "rewards/verse_reward_func": -0.015625, + "step": 247 + }, + { + "completion_length": 508.203125, + "epoch": 1.984, + "grad_norm": 0.3125, + "kl": 0.07060381397604942, + "learning_rate": 4.8539703897375755e-05, + "loss": 0.0028, + "reward": 2.874662756919861, + "reward_std": 1.2588258981704712, + "rewards/check_divine_comedy_plagiarism": -0.1875, + "rewards/endecasillabo_reward_func": 3.155393123626709, + "rewards/no_repetition_reward_func": -0.0932304635643959, + "rewards/verse_reward_func": 0.0, + "step": 248 + }, + { + "completion_length": 513.203125, + "epoch": 1.992, + "grad_norm": 0.3046875, + "kl": 0.07223336398601532, + "learning_rate": 4.851610140015304e-05, + "loss": 0.0029, + "reward": 3.1444543600082397, + "reward_std": 1.4597138166427612, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 3.392817735671997, + "rewards/no_repetition_reward_func": -0.09211357310414314, + "rewards/verse_reward_func": 0.0, + "step": 249 + }, + { + "completion_length": 508.1875, + "epoch": 2.0, + "grad_norm": 0.3203125, + "kl": 0.07433345541357994, + "learning_rate": 4.849231551964771e-05, + "loss": 0.003, + "reward": 3.304791212081909, + "reward_std": 1.4314507246017456, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 3.5422061681747437, + "rewards/no_repetition_reward_func": -0.0967898890376091, + "rewards/verse_reward_func": 0.0, + "step": 250 + }, + { + "completion_length": 483.703125, + "epoch": 2.008, + "grad_norm": 0.390625, + "kl": 0.06221909821033478, + "learning_rate": 4.846834644134686e-05, + "loss": 0.0025, + "reward": 2.8896068334579468, + "reward_std": 1.7400249242782593, + "rewards/check_divine_comedy_plagiarism": -0.234375, + "rewards/endecasillabo_reward_func": 3.250904083251953, + "rewards/no_repetition_reward_func": -0.10348471999168396, + "rewards/verse_reward_func": -0.0234375, + "step": 251 + }, + { + "completion_length": 512.875, + "epoch": 2.016, + "grad_norm": 0.271484375, + "kl": 0.07741362974047661, + "learning_rate": 4.844419435216615e-05, + "loss": 0.0031, + "reward": 3.7476611137390137, + "reward_std": 1.5707623362541199, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 3.916447162628174, + "rewards/no_repetition_reward_func": -0.10628580302000046, + "rewards/verse_reward_func": 0.0, + "step": 252 + }, + { + "completion_length": 502.6875, + "epoch": 2.024, + "grad_norm": 0.30859375, + "kl": 0.07832404226064682, + "learning_rate": 4.841985944044845e-05, + "loss": 0.0031, + "reward": 3.113116145133972, + "reward_std": 1.247343897819519, + "rewards/check_divine_comedy_plagiarism": -0.1875, + "rewards/endecasillabo_reward_func": 3.387788772583008, + "rewards/no_repetition_reward_func": -0.08717276155948639, + "rewards/verse_reward_func": 0.0, + "step": 253 + }, + { + "completion_length": 488.6875, + "epoch": 2.032, + "grad_norm": 0.7421875, + "kl": 0.08463284745812416, + "learning_rate": 4.839534189596228e-05, + "loss": 0.0034, + "reward": 3.2809360027313232, + "reward_std": 1.702888548374176, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 3.5504496097564697, + "rewards/no_repetition_reward_func": -0.08982605114579201, + "rewards/verse_reward_func": -0.0078125, + "step": 254 + }, + { + "completion_length": 500.890625, + "epoch": 2.04, + "grad_norm": 0.302734375, + "kl": 0.06946692056953907, + "learning_rate": 4.837064190990036e-05, + "loss": 0.0028, + "reward": 3.0790518522262573, + "reward_std": 1.3131604194641113, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 3.2106006145477295, + "rewards/no_repetition_reward_func": -0.10029885172843933, + "rewards/verse_reward_func": 0.0, + "step": 255 + }, + { + "completion_length": 513.734375, + "epoch": 2.048, + "grad_norm": 0.26953125, + "kl": 0.07956181094050407, + "learning_rate": 4.834575967487817e-05, + "loss": 0.0032, + "reward": 3.1968002319335938, + "reward_std": 1.35796058177948, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 3.3373371362686157, + "rewards/no_repetition_reward_func": -0.1092870756983757, + "rewards/verse_reward_func": 0.0, + "step": 256 + }, + { + "completion_length": 508.140625, + "epoch": 2.056, + "grad_norm": 0.30078125, + "kl": 0.07099571451544762, + "learning_rate": 4.832069538493237e-05, + "loss": 0.0028, + "reward": 3.573302745819092, + "reward_std": 1.702924132347107, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 3.769816517829895, + "rewards/no_repetition_reward_func": -0.1183888167142868, + "rewards/verse_reward_func": 0.0, + "step": 257 + }, + { + "completion_length": 501.4375, + "epoch": 2.064, + "grad_norm": 0.302734375, + "kl": 0.07141702994704247, + "learning_rate": 4.829544923551931e-05, + "loss": 0.0029, + "reward": 3.166470766067505, + "reward_std": 1.432704210281372, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 3.3196059465408325, + "rewards/no_repetition_reward_func": -0.09063518419861794, + "rewards/verse_reward_func": 0.0, + "step": 258 + }, + { + "completion_length": 505.140625, + "epoch": 2.072, + "grad_norm": 0.3359375, + "kl": 0.0749228447675705, + "learning_rate": 4.8270021423513554e-05, + "loss": 0.003, + "reward": 3.0830200910568237, + "reward_std": 1.6206685304641724, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 3.3199602365493774, + "rewards/no_repetition_reward_func": -0.1119401752948761, + "rewards/verse_reward_func": 0.0, + "step": 259 + }, + { + "completion_length": 511.640625, + "epoch": 2.08, + "grad_norm": 0.296875, + "kl": 0.07271172851324081, + "learning_rate": 4.8244412147206284e-05, + "loss": 0.0029, + "reward": 2.9411349296569824, + "reward_std": 1.6098909378051758, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 3.129357099533081, + "rewards/no_repetition_reward_func": -0.09447216242551804, + "rewards/verse_reward_func": 0.0, + "step": 260 + }, + { + "completion_length": 514.171875, + "epoch": 2.088, + "grad_norm": 0.2890625, + "kl": 0.0661465972661972, + "learning_rate": 4.821862160630378e-05, + "loss": 0.0026, + "reward": 3.0424567461013794, + "reward_std": 1.509213387966156, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 3.258923292160034, + "rewards/no_repetition_reward_func": -0.12271657213568687, + "rewards/verse_reward_func": 0.0, + "step": 261 + }, + { + "completion_length": 508.84375, + "epoch": 2.096, + "grad_norm": 0.28515625, + "kl": 0.07240790873765945, + "learning_rate": 4.8192650001925855e-05, + "loss": 0.0029, + "reward": 3.2937673330307007, + "reward_std": 1.5072815418243408, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 3.5493109226226807, + "rewards/no_repetition_reward_func": -0.10710598528385162, + "rewards/verse_reward_func": -0.0078125, + "step": 262 + }, + { + "completion_length": 507.078125, + "epoch": 2.104, + "grad_norm": 0.61328125, + "kl": 0.0797758400440216, + "learning_rate": 4.81664975366043e-05, + "loss": 0.0032, + "reward": 3.2865813970565796, + "reward_std": 1.549062967300415, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 3.440986156463623, + "rewards/no_repetition_reward_func": -0.0840921625494957, + "rewards/verse_reward_func": -0.0078125, + "step": 263 + }, + { + "completion_length": 504.8125, + "epoch": 2.112, + "grad_norm": 0.30859375, + "kl": 0.06889849156141281, + "learning_rate": 4.8140164414281306e-05, + "loss": 0.0028, + "reward": 3.0999197959899902, + "reward_std": 1.4922637939453125, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 3.255833625793457, + "rewards/no_repetition_reward_func": -0.0934138223528862, + "rewards/verse_reward_func": 0.0, + "step": 264 + }, + { + "completion_length": 508.125, + "epoch": 2.12, + "grad_norm": 0.34375, + "kl": 0.07055166736245155, + "learning_rate": 4.8113650840307834e-05, + "loss": 0.0028, + "reward": 2.3729827404022217, + "reward_std": 1.5330259203910828, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 2.555717945098877, + "rewards/no_repetition_reward_func": -0.088985126465559, + "rewards/verse_reward_func": 0.0, + "step": 265 + }, + { + "completion_length": 505.453125, + "epoch": 2.128, + "grad_norm": 0.58203125, + "kl": 0.06583471968770027, + "learning_rate": 4.808695702144206e-05, + "loss": 0.0026, + "reward": 2.8258155584335327, + "reward_std": 1.6740102767944336, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 3.045043706893921, + "rewards/no_repetition_reward_func": -0.08641548827290535, + "rewards/verse_reward_func": -0.0078125, + "step": 266 + }, + { + "completion_length": 514.21875, + "epoch": 2.136, + "grad_norm": 0.30078125, + "kl": 0.06372017040848732, + "learning_rate": 4.8060083165847754e-05, + "loss": 0.0025, + "reward": 2.503085136413574, + "reward_std": 1.5131137371063232, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 2.7022485733032227, + "rewards/no_repetition_reward_func": -0.08978844434022903, + "rewards/verse_reward_func": -0.015625, + "step": 267 + }, + { + "completion_length": 488.59375, + "epoch": 2.144, + "grad_norm": 0.33203125, + "kl": 0.06943970546126366, + "learning_rate": 4.803302948309264e-05, + "loss": 0.0028, + "reward": 3.3013628721237183, + "reward_std": 1.4646414518356323, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 3.575153946876526, + "rewards/no_repetition_reward_func": -0.10191601514816284, + "rewards/verse_reward_func": 0.0, + "step": 268 + }, + { + "completion_length": 509.3125, + "epoch": 2.152, + "grad_norm": 0.296875, + "kl": 0.06533129513263702, + "learning_rate": 4.800579618414676e-05, + "loss": 0.0026, + "reward": 2.817284941673279, + "reward_std": 1.51528662443161, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 2.9585076570510864, + "rewards/no_repetition_reward_func": -0.09434769675135612, + "rewards/verse_reward_func": 0.0, + "step": 269 + }, + { + "completion_length": 515.21875, + "epoch": 2.16, + "grad_norm": 0.291015625, + "kl": 0.06973907724022865, + "learning_rate": 4.797838348138086e-05, + "loss": 0.0028, + "reward": 3.4826382398605347, + "reward_std": 1.3262174725532532, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 3.678097724914551, + "rewards/no_repetition_reward_func": -0.10170967876911163, + "rewards/verse_reward_func": 0.0, + "step": 270 + }, + { + "completion_length": 510.0625, + "epoch": 2.168, + "grad_norm": 0.27734375, + "kl": 0.07255783304572105, + "learning_rate": 4.79507915885647e-05, + "loss": 0.0029, + "reward": 3.3016916513442993, + "reward_std": 1.6018950939178467, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 3.5301390886306763, + "rewards/no_repetition_reward_func": -0.10344741493463516, + "rewards/verse_reward_func": 0.0, + "step": 271 + }, + { + "completion_length": 511.3125, + "epoch": 2.176, + "grad_norm": 0.2890625, + "kl": 0.07038086652755737, + "learning_rate": 4.7923020720865414e-05, + "loss": 0.0028, + "reward": 3.0651341676712036, + "reward_std": 1.531345248222351, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 3.248605966567993, + "rewards/no_repetition_reward_func": -0.08972195163369179, + "rewards/verse_reward_func": 0.0, + "step": 272 + }, + { + "completion_length": 490.5625, + "epoch": 2.184, + "grad_norm": 0.435546875, + "kl": 0.06709034740924835, + "learning_rate": 4.789507109484579e-05, + "loss": 0.0027, + "reward": 2.3052377700805664, + "reward_std": 1.0921958088874817, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 2.4315980672836304, + "rewards/no_repetition_reward_func": -0.06386028416454792, + "rewards/verse_reward_func": 0.0, + "step": 273 + }, + { + "completion_length": 503.046875, + "epoch": 2.192, + "grad_norm": 0.35546875, + "kl": 0.0665770173072815, + "learning_rate": 4.7866942928462625e-05, + "loss": 0.0027, + "reward": 2.720050573348999, + "reward_std": 1.6716498136520386, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 2.8677737712860107, + "rewards/no_repetition_reward_func": -0.08522327244281769, + "rewards/verse_reward_func": 0.0, + "step": 274 + }, + { + "completion_length": 508.171875, + "epoch": 2.2, + "grad_norm": 0.322265625, + "kl": 0.06027090735733509, + "learning_rate": 4.783863644106502e-05, + "loss": 0.0024, + "reward": 2.5845494270324707, + "reward_std": 1.1862663626670837, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 2.7357324361801147, + "rewards/no_repetition_reward_func": -0.08087033778429031, + "rewards/verse_reward_func": -0.0078125, + "step": 275 + }, + { + "completion_length": 488.75, + "epoch": 2.208, + "grad_norm": 0.359375, + "kl": 0.0663112010806799, + "learning_rate": 4.781015185339266e-05, + "loss": 0.0027, + "reward": 2.6463863849639893, + "reward_std": 1.698828101158142, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 2.7765138149261475, + "rewards/no_repetition_reward_func": -0.08325253054499626, + "rewards/verse_reward_func": 0.0, + "step": 276 + }, + { + "completion_length": 501.25, + "epoch": 2.216, + "grad_norm": 0.55859375, + "kl": 0.07847867533564568, + "learning_rate": 4.778148938757406e-05, + "loss": 0.0031, + "reward": 3.0669597387313843, + "reward_std": 1.574509859085083, + "rewards/check_divine_comedy_plagiarism": -0.171875, + "rewards/endecasillabo_reward_func": 3.3199340105056763, + "rewards/no_repetition_reward_func": -0.08109928667545319, + "rewards/verse_reward_func": 0.0, + "step": 277 + }, + { + "completion_length": 502.109375, + "epoch": 2.224, + "grad_norm": 0.322265625, + "kl": 0.06526695936918259, + "learning_rate": 4.775264926712489e-05, + "loss": 0.0026, + "reward": 3.378905773162842, + "reward_std": 1.6468844413757324, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 3.610251307487488, + "rewards/no_repetition_reward_func": -0.10634538531303406, + "rewards/verse_reward_func": 0.0, + "step": 278 + }, + { + "completion_length": 500.953125, + "epoch": 2.232, + "grad_norm": 0.30078125, + "kl": 0.06745465844869614, + "learning_rate": 4.772363171694622e-05, + "loss": 0.0027, + "reward": 2.7158459424972534, + "reward_std": 1.7546974420547485, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 2.915878176689148, + "rewards/no_repetition_reward_func": -0.09065758809447289, + "rewards/verse_reward_func": 0.0, + "step": 279 + }, + { + "completion_length": 504.84375, + "epoch": 2.24, + "grad_norm": 0.35546875, + "kl": 0.06638182699680328, + "learning_rate": 4.769443696332272e-05, + "loss": 0.0027, + "reward": 3.0297917127609253, + "reward_std": 1.7113708853721619, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 3.176946997642517, + "rewards/no_repetition_reward_func": -0.08465522900223732, + "rewards/verse_reward_func": 0.0, + "step": 280 + }, + { + "completion_length": 501.953125, + "epoch": 2.248, + "grad_norm": 0.29296875, + "kl": 0.06997008621692657, + "learning_rate": 4.7665065233920945e-05, + "loss": 0.0028, + "reward": 3.2803046703338623, + "reward_std": 1.7106173038482666, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 3.497170925140381, + "rewards/no_repetition_reward_func": -0.10749094933271408, + "rewards/verse_reward_func": 0.0, + "step": 281 + }, + { + "completion_length": 505.34375, + "epoch": 2.2560000000000002, + "grad_norm": 0.384765625, + "kl": 0.0646367110311985, + "learning_rate": 4.763551675778755e-05, + "loss": 0.0026, + "reward": 2.9635597467422485, + "reward_std": 1.5218685269355774, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 3.139837861061096, + "rewards/no_repetition_reward_func": -0.09034047648310661, + "rewards/verse_reward_func": -0.0078125, + "step": 282 + }, + { + "completion_length": 508.59375, + "epoch": 2.2640000000000002, + "grad_norm": 0.32421875, + "kl": 0.07036764174699783, + "learning_rate": 4.760579176534747e-05, + "loss": 0.0028, + "reward": 3.3416887521743774, + "reward_std": 1.7129950523376465, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 3.5184959173202515, + "rewards/no_repetition_reward_func": -0.09868232533335686, + "rewards/verse_reward_func": 0.0, + "step": 283 + }, + { + "completion_length": 509.953125, + "epoch": 2.2720000000000002, + "grad_norm": 0.30078125, + "kl": 0.0678275041282177, + "learning_rate": 4.7575890488402185e-05, + "loss": 0.0027, + "reward": 3.2072685956954956, + "reward_std": 1.4313910007476807, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 3.3867697715759277, + "rewards/no_repetition_reward_func": -0.08575112372636795, + "rewards/verse_reward_func": 0.0, + "step": 284 + }, + { + "completion_length": 506.546875, + "epoch": 2.2800000000000002, + "grad_norm": 0.5546875, + "kl": 0.07302361354231834, + "learning_rate": 4.754581316012785e-05, + "loss": 0.0029, + "reward": 3.34305739402771, + "reward_std": 1.863444983959198, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 3.531742215156555, + "rewards/no_repetition_reward_func": -0.09493469074368477, + "rewards/verse_reward_func": -0.015625, + "step": 285 + }, + { + "completion_length": 497.921875, + "epoch": 2.288, + "grad_norm": 1.046875, + "kl": 0.07187492027878761, + "learning_rate": 4.7515560015073514e-05, + "loss": 0.0029, + "reward": 3.0450669527053833, + "reward_std": 1.4660121202468872, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 3.244352102279663, + "rewards/no_repetition_reward_func": -0.09772272035479546, + "rewards/verse_reward_func": -0.0078125, + "step": 286 + }, + { + "completion_length": 506.859375, + "epoch": 2.296, + "grad_norm": 0.302734375, + "kl": 0.06990887597203255, + "learning_rate": 4.7485131289159276e-05, + "loss": 0.0028, + "reward": 3.0129621028900146, + "reward_std": 1.6986202001571655, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 3.165220618247986, + "rewards/no_repetition_reward_func": -0.10538357868790627, + "rewards/verse_reward_func": 0.0, + "step": 287 + }, + { + "completion_length": 504.5, + "epoch": 2.304, + "grad_norm": 0.443359375, + "kl": 0.07503120228648186, + "learning_rate": 4.745452721967446e-05, + "loss": 0.003, + "reward": 3.148566961288452, + "reward_std": 1.7028566002845764, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 3.2956703901290894, + "rewards/no_repetition_reward_func": -0.09241596609354019, + "rewards/verse_reward_func": -0.0078125, + "step": 288 + }, + { + "completion_length": 512.578125, + "epoch": 2.312, + "grad_norm": 0.3125, + "kl": 0.0700761154294014, + "learning_rate": 4.742374804527575e-05, + "loss": 0.0028, + "reward": 3.1349668502807617, + "reward_std": 1.5545377135276794, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 3.360965609550476, + "rewards/no_repetition_reward_func": -0.10099885985255241, + "rewards/verse_reward_func": 0.0, + "step": 289 + }, + { + "completion_length": 510.125, + "epoch": 2.32, + "grad_norm": 0.2890625, + "kl": 0.0722375214099884, + "learning_rate": 4.7392794005985326e-05, + "loss": 0.0029, + "reward": 3.335442543029785, + "reward_std": 1.527139961719513, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 3.5558252334594727, + "rewards/no_repetition_reward_func": -0.11100802943110466, + "rewards/verse_reward_func": 0.0, + "step": 290 + }, + { + "completion_length": 500.046875, + "epoch": 2.328, + "grad_norm": 0.3359375, + "kl": 0.07503898441791534, + "learning_rate": 4.7361665343189e-05, + "loss": 0.003, + "reward": 3.261005997657776, + "reward_std": 1.844939112663269, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 3.417694330215454, + "rewards/no_repetition_reward_func": -0.10200077295303345, + "rewards/verse_reward_func": -0.0078125, + "step": 291 + }, + { + "completion_length": 511.0625, + "epoch": 2.336, + "grad_norm": 0.28125, + "kl": 0.07717472687363625, + "learning_rate": 4.733036229963435e-05, + "loss": 0.0031, + "reward": 3.105303168296814, + "reward_std": 1.7155184149742126, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 3.270214080810547, + "rewards/no_repetition_reward_func": -0.1024109274148941, + "rewards/verse_reward_func": 0.0, + "step": 292 + }, + { + "completion_length": 497.8125, + "epoch": 2.344, + "grad_norm": 0.3046875, + "kl": 0.0709347277879715, + "learning_rate": 4.7298885119428773e-05, + "loss": 0.0028, + "reward": 3.0362720489501953, + "reward_std": 1.4715232849121094, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 3.1813852787017822, + "rewards/no_repetition_reward_func": -0.11386314034461975, + "rewards/verse_reward_func": 0.0, + "step": 293 + }, + { + "completion_length": 514.421875, + "epoch": 2.352, + "grad_norm": 0.28125, + "kl": 0.06916424445807934, + "learning_rate": 4.7267234048037664e-05, + "loss": 0.0028, + "reward": 2.821957588195801, + "reward_std": 1.4813061952590942, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 2.975194573402405, + "rewards/no_repetition_reward_func": -0.09073694795370102, + "rewards/verse_reward_func": 0.0, + "step": 294 + }, + { + "completion_length": 513.5, + "epoch": 2.36, + "grad_norm": 0.263671875, + "kl": 0.06962788850069046, + "learning_rate": 4.723540933228244e-05, + "loss": 0.0028, + "reward": 2.9061992168426514, + "reward_std": 1.7336826920509338, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 3.058964967727661, + "rewards/no_repetition_reward_func": -0.10589079931378365, + "rewards/verse_reward_func": 0.0, + "step": 295 + }, + { + "completion_length": 501.609375, + "epoch": 2.368, + "grad_norm": 0.30078125, + "kl": 0.07347158342599869, + "learning_rate": 4.720341122033862e-05, + "loss": 0.0029, + "reward": 3.133572816848755, + "reward_std": 1.719075322151184, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 3.3519644737243652, + "rewards/no_repetition_reward_func": -0.09339166060090065, + "rewards/verse_reward_func": 0.0, + "step": 296 + }, + { + "completion_length": 513.34375, + "epoch": 2.376, + "grad_norm": 0.310546875, + "kl": 0.06967844814062119, + "learning_rate": 4.71712399617339e-05, + "loss": 0.0028, + "reward": 3.4598069190979004, + "reward_std": 2.057759165763855, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 3.6794203519821167, + "rewards/no_repetition_reward_func": -0.14148825407028198, + "rewards/verse_reward_func": 0.0, + "step": 297 + }, + { + "completion_length": 497.359375, + "epoch": 2.384, + "grad_norm": 0.28125, + "kl": 0.078913364559412, + "learning_rate": 4.713889580734623e-05, + "loss": 0.0032, + "reward": 3.2082329988479614, + "reward_std": 1.7593758702278137, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 3.3988951444625854, + "rewards/no_repetition_reward_func": -0.11253709346055984, + "rewards/verse_reward_func": 0.0, + "step": 298 + }, + { + "completion_length": 514.046875, + "epoch": 2.392, + "grad_norm": 0.26953125, + "kl": 0.07097313925623894, + "learning_rate": 4.710637900940181e-05, + "loss": 0.0028, + "reward": 3.2228420972824097, + "reward_std": 1.5825055837631226, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 3.390450358390808, + "rewards/no_repetition_reward_func": -0.1207333356142044, + "rewards/verse_reward_func": 0.0, + "step": 299 + }, + { + "completion_length": 506.4375, + "epoch": 2.4, + "grad_norm": 0.28515625, + "kl": 0.07148850336670876, + "learning_rate": 4.707368982147318e-05, + "loss": 0.0029, + "reward": 3.136491537094116, + "reward_std": 1.5527461767196655, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 3.318567991256714, + "rewards/no_repetition_reward_func": -0.13520141690969467, + "rewards/verse_reward_func": 0.0, + "step": 300 + }, + { + "completion_length": 481.53125, + "epoch": 2.408, + "grad_norm": 0.58984375, + "kl": 0.09273532032966614, + "learning_rate": 4.704082849847718e-05, + "loss": 0.0037, + "reward": 2.540133476257324, + "reward_std": 1.6269648671150208, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 2.690992593765259, + "rewards/no_repetition_reward_func": -0.08835902437567711, + "rewards/verse_reward_func": -0.03125, + "step": 301 + }, + { + "completion_length": 495.921875, + "epoch": 2.416, + "grad_norm": 0.38671875, + "kl": 0.07708022743463516, + "learning_rate": 4.7007795296673006e-05, + "loss": 0.0031, + "reward": 3.36426043510437, + "reward_std": 1.945961594581604, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 3.6420836448669434, + "rewards/no_repetition_reward_func": -0.1215733140707016, + "rewards/verse_reward_func": -0.015625, + "step": 302 + }, + { + "completion_length": 511.9375, + "epoch": 2.424, + "grad_norm": 0.283203125, + "kl": 0.06745997816324234, + "learning_rate": 4.6974590473660216e-05, + "loss": 0.0027, + "reward": 3.257583260536194, + "reward_std": 1.670568823814392, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 3.474300980567932, + "rewards/no_repetition_reward_func": -0.13859259709715843, + "rewards/verse_reward_func": 0.0, + "step": 303 + }, + { + "completion_length": 508.171875, + "epoch": 2.432, + "grad_norm": 0.310546875, + "kl": 0.07339571416378021, + "learning_rate": 4.694121428837668e-05, + "loss": 0.0029, + "reward": 3.3377233743667603, + "reward_std": 1.8575774431228638, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 3.485177755355835, + "rewards/no_repetition_reward_func": -0.10057934373617172, + "rewards/verse_reward_func": 0.0, + "step": 304 + }, + { + "completion_length": 503.75, + "epoch": 2.44, + "grad_norm": 0.322265625, + "kl": 0.06545048952102661, + "learning_rate": 4.690766700109659e-05, + "loss": 0.0026, + "reward": 3.6106656789779663, + "reward_std": 2.087935447692871, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 3.7787448167800903, + "rewards/no_repetition_reward_func": -0.1368292197585106, + "rewards/verse_reward_func": 0.0, + "step": 305 + }, + { + "completion_length": 515.515625, + "epoch": 2.448, + "grad_norm": 0.30078125, + "kl": 0.07043803110718727, + "learning_rate": 4.687394887342845e-05, + "loss": 0.0028, + "reward": 2.7173802852630615, + "reward_std": 1.3183306455612183, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 2.8942841291427612, + "rewards/no_repetition_reward_func": -0.11440388858318329, + "rewards/verse_reward_func": 0.0, + "step": 306 + }, + { + "completion_length": 508.03125, + "epoch": 2.456, + "grad_norm": 0.306640625, + "kl": 0.0740664005279541, + "learning_rate": 4.684006016831297e-05, + "loss": 0.003, + "reward": 3.12207293510437, + "reward_std": 1.578538179397583, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 3.3996095657348633, + "rewards/no_repetition_reward_func": -0.1212867721915245, + "rewards/verse_reward_func": 0.0, + "step": 307 + }, + { + "completion_length": 504.25, + "epoch": 2.464, + "grad_norm": 0.30078125, + "kl": 0.07328982651233673, + "learning_rate": 4.68060011500211e-05, + "loss": 0.0029, + "reward": 2.6460139751434326, + "reward_std": 1.4866119027137756, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 2.7962961196899414, + "rewards/no_repetition_reward_func": -0.10340718552470207, + "rewards/verse_reward_func": 0.0, + "step": 308 + }, + { + "completion_length": 500.0, + "epoch": 2.472, + "grad_norm": 0.25390625, + "kl": 0.0697004497051239, + "learning_rate": 4.6771772084151885e-05, + "loss": 0.0028, + "reward": 3.3726465702056885, + "reward_std": 1.973508894443512, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 3.568192958831787, + "rewards/no_repetition_reward_func": -0.13304628431797028, + "rewards/verse_reward_func": 0.0, + "step": 309 + }, + { + "completion_length": 496.046875, + "epoch": 2.48, + "grad_norm": 0.474609375, + "kl": 0.06395493634045124, + "learning_rate": 4.6737373237630476e-05, + "loss": 0.0026, + "reward": 3.1311886310577393, + "reward_std": 1.9479712843894958, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 3.3875889778137207, + "rewards/no_repetition_reward_func": -0.1392129883170128, + "rewards/verse_reward_func": -0.0234375, + "step": 310 + }, + { + "completion_length": 504.796875, + "epoch": 2.488, + "grad_norm": 0.56640625, + "kl": 0.06765878945589066, + "learning_rate": 4.670280487870598e-05, + "loss": 0.0027, + "reward": 2.1348655223846436, + "reward_std": 1.502507209777832, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 2.3871118426322937, + "rewards/no_repetition_reward_func": -0.09599621221423149, + "rewards/verse_reward_func": -0.015625, + "step": 311 + }, + { + "completion_length": 497.640625, + "epoch": 2.496, + "grad_norm": 0.322265625, + "kl": 0.07813980802893639, + "learning_rate": 4.6668067276949414e-05, + "loss": 0.0031, + "reward": 2.6111741065979004, + "reward_std": 1.7092649936676025, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 2.8095227479934692, + "rewards/no_repetition_reward_func": -0.10459873452782631, + "rewards/verse_reward_func": 0.0, + "step": 312 + }, + { + "completion_length": 505.828125, + "epoch": 2.504, + "grad_norm": 0.28515625, + "kl": 0.07776602357625961, + "learning_rate": 4.6633160703251554e-05, + "loss": 0.0031, + "reward": 3.1250683069229126, + "reward_std": 1.8037731051445007, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 3.2875659465789795, + "rewards/no_repetition_reward_func": -0.0999976396560669, + "rewards/verse_reward_func": 0.0, + "step": 313 + }, + { + "completion_length": 508.625, + "epoch": 2.512, + "grad_norm": 0.296875, + "kl": 0.07612806558609009, + "learning_rate": 4.659808542982088e-05, + "loss": 0.003, + "reward": 3.2272887229919434, + "reward_std": 1.6824378371238708, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 3.415904998779297, + "rewards/no_repetition_reward_func": -0.12611641362309456, + "rewards/verse_reward_func": 0.0, + "step": 314 + }, + { + "completion_length": 495.796875, + "epoch": 2.52, + "grad_norm": 0.29296875, + "kl": 0.07150795310735703, + "learning_rate": 4.656284173018144e-05, + "loss": 0.0029, + "reward": 3.2090909481048584, + "reward_std": 1.7938783168792725, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 3.433637261390686, + "rewards/no_repetition_reward_func": -0.11517109721899033, + "rewards/verse_reward_func": 0.0, + "step": 315 + }, + { + "completion_length": 496.90625, + "epoch": 2.528, + "grad_norm": 0.26171875, + "kl": 0.07954888790845871, + "learning_rate": 4.652742987917066e-05, + "loss": 0.0032, + "reward": 3.6636433601379395, + "reward_std": 1.7127233743667603, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 3.818321466445923, + "rewards/no_repetition_reward_func": -0.10780292004346848, + "rewards/verse_reward_func": 0.0, + "step": 316 + }, + { + "completion_length": 510.28125, + "epoch": 2.536, + "grad_norm": 0.3046875, + "kl": 0.07655661180615425, + "learning_rate": 4.649185015293728e-05, + "loss": 0.0031, + "reward": 3.2104947566986084, + "reward_std": 1.5801751613616943, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 3.446211576461792, + "rewards/no_repetition_reward_func": -0.09509172290563583, + "rewards/verse_reward_func": 0.0, + "step": 317 + }, + { + "completion_length": 510.859375, + "epoch": 2.544, + "grad_norm": 0.26171875, + "kl": 0.07212075591087341, + "learning_rate": 4.645610282893915e-05, + "loss": 0.0029, + "reward": 3.259039878845215, + "reward_std": 2.0055137276649475, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 3.440922975540161, + "rewards/no_repetition_reward_func": -0.13500789925456047, + "rewards/verse_reward_func": 0.0, + "step": 318 + }, + { + "completion_length": 498.125, + "epoch": 2.552, + "grad_norm": 0.7109375, + "kl": 0.074952382594347, + "learning_rate": 4.642018818594107e-05, + "loss": 0.003, + "reward": 3.1337172985076904, + "reward_std": 1.8900681734085083, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 3.347471833229065, + "rewards/no_repetition_reward_func": -0.10437954589724541, + "rewards/verse_reward_func": -0.015625, + "step": 319 + }, + { + "completion_length": 513.28125, + "epoch": 2.56, + "grad_norm": 0.294921875, + "kl": 0.0727897435426712, + "learning_rate": 4.638410650401267e-05, + "loss": 0.0029, + "reward": 2.800894260406494, + "reward_std": 1.584051251411438, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 3.009430170059204, + "rewards/no_repetition_reward_func": -0.11478592455387115, + "rewards/verse_reward_func": 0.0, + "step": 320 + }, + { + "completion_length": 510.03125, + "epoch": 2.568, + "grad_norm": 0.310546875, + "kl": 0.0793125219643116, + "learning_rate": 4.6347858064526125e-05, + "loss": 0.0032, + "reward": 3.102364182472229, + "reward_std": 1.450266420841217, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 3.2971229553222656, + "rewards/no_repetition_reward_func": -0.08538374304771423, + "rewards/verse_reward_func": 0.0, + "step": 321 + }, + { + "completion_length": 508.953125, + "epoch": 2.576, + "grad_norm": 0.287109375, + "kl": 0.06919117085635662, + "learning_rate": 4.631144315015407e-05, + "loss": 0.0028, + "reward": 2.532883644104004, + "reward_std": 1.7970365285873413, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 2.689835548400879, + "rewards/no_repetition_reward_func": -0.09445185586810112, + "rewards/verse_reward_func": 0.0, + "step": 322 + }, + { + "completion_length": 513.78125, + "epoch": 2.584, + "grad_norm": 0.3203125, + "kl": 0.0671267919242382, + "learning_rate": 4.6274862044867304e-05, + "loss": 0.0027, + "reward": 2.44406259059906, + "reward_std": 1.6146090626716614, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 2.6371697187423706, + "rewards/no_repetition_reward_func": -0.09935720264911652, + "rewards/verse_reward_func": 0.0, + "step": 323 + }, + { + "completion_length": 507.125, + "epoch": 2.592, + "grad_norm": 0.294921875, + "kl": 0.07809794694185257, + "learning_rate": 4.6238115033932636e-05, + "loss": 0.0031, + "reward": 3.1308138370513916, + "reward_std": 2.139958918094635, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 3.3291839361190796, + "rewards/no_repetition_reward_func": -0.1202448196709156, + "rewards/verse_reward_func": 0.0, + "step": 324 + }, + { + "completion_length": 494.609375, + "epoch": 2.6, + "grad_norm": 0.330078125, + "kl": 0.07215837389230728, + "learning_rate": 4.620120240391065e-05, + "loss": 0.0029, + "reward": 2.8539516925811768, + "reward_std": 1.6233501434326172, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 3.0176103115081787, + "rewards/no_repetition_reward_func": -0.10115836560726166, + "rewards/verse_reward_func": -0.015625, + "step": 325 + }, + { + "completion_length": 506.640625, + "epoch": 2.608, + "grad_norm": 0.28515625, + "kl": 0.07418030500411987, + "learning_rate": 4.616412444265345e-05, + "loss": 0.003, + "reward": 2.9015748500823975, + "reward_std": 1.8753230571746826, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 3.113346576690674, + "rewards/no_repetition_reward_func": -0.11802182346582413, + "rewards/verse_reward_func": 0.0, + "step": 326 + }, + { + "completion_length": 510.796875, + "epoch": 2.616, + "grad_norm": 0.359375, + "kl": 0.07631281018257141, + "learning_rate": 4.612688143930242e-05, + "loss": 0.0031, + "reward": 3.325826406478882, + "reward_std": 1.9779987335205078, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 3.5272114276885986, + "rewards/no_repetition_reward_func": -0.1388852298259735, + "rewards/verse_reward_func": 0.0, + "step": 327 + }, + { + "completion_length": 510.09375, + "epoch": 2.624, + "grad_norm": 0.26953125, + "kl": 0.07021552324295044, + "learning_rate": 4.6089473684285974e-05, + "loss": 0.0028, + "reward": 3.431481957435608, + "reward_std": 2.006627321243286, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 3.641479969024658, + "rewards/no_repetition_reward_func": -0.13187310099601746, + "rewards/verse_reward_func": 0.0, + "step": 328 + }, + { + "completion_length": 513.9375, + "epoch": 2.632, + "grad_norm": 0.30078125, + "kl": 0.07575985416769981, + "learning_rate": 4.605190146931731e-05, + "loss": 0.003, + "reward": 2.770979404449463, + "reward_std": 1.638094186782837, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 2.953513026237488, + "rewards/no_repetition_reward_func": -0.10440856218338013, + "rewards/verse_reward_func": 0.0, + "step": 329 + }, + { + "completion_length": 514.0, + "epoch": 2.64, + "grad_norm": 0.279296875, + "kl": 0.06511147692799568, + "learning_rate": 4.601416508739211e-05, + "loss": 0.0026, + "reward": 2.8098108768463135, + "reward_std": 1.5987440943717957, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 3.0167447328567505, + "rewards/no_repetition_reward_func": -0.09755871817469597, + "rewards/verse_reward_func": 0.0, + "step": 330 + }, + { + "completion_length": 504.421875, + "epoch": 2.648, + "grad_norm": 0.26953125, + "kl": 0.0791972354054451, + "learning_rate": 4.597626483278625e-05, + "loss": 0.0032, + "reward": 3.4486122131347656, + "reward_std": 1.98677396774292, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 3.6018072366714478, + "rewards/no_repetition_reward_func": -0.12194481492042542, + "rewards/verse_reward_func": 0.0, + "step": 331 + }, + { + "completion_length": 501.375, + "epoch": 2.656, + "grad_norm": 0.333984375, + "kl": 0.07010731846094131, + "learning_rate": 4.593820100105355e-05, + "loss": 0.0028, + "reward": 3.297447919845581, + "reward_std": 1.6448124051094055, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 3.4953991174697876, + "rewards/no_repetition_reward_func": -0.11201392114162445, + "rewards/verse_reward_func": -0.0078125, + "step": 332 + }, + { + "completion_length": 504.109375, + "epoch": 2.664, + "grad_norm": 0.52734375, + "kl": 0.07532177120447159, + "learning_rate": 4.589997388902338e-05, + "loss": 0.003, + "reward": 3.2759357690811157, + "reward_std": 1.816375732421875, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 3.4455432891845703, + "rewards/no_repetition_reward_func": -0.11492021754384041, + "rewards/verse_reward_func": -0.0078125, + "step": 333 + }, + { + "completion_length": 508.703125, + "epoch": 2.672, + "grad_norm": 0.296875, + "kl": 0.08575493842363358, + "learning_rate": 4.586158379479848e-05, + "loss": 0.0034, + "reward": 3.2375138998031616, + "reward_std": 1.7461047768592834, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 3.405545473098755, + "rewards/no_repetition_reward_func": -0.08990637958049774, + "rewards/verse_reward_func": 0.0, + "step": 334 + }, + { + "completion_length": 516.0, + "epoch": 2.68, + "grad_norm": 0.287109375, + "kl": 0.08123177289962769, + "learning_rate": 4.5823031017752485e-05, + "loss": 0.0032, + "reward": 3.3141754865646362, + "reward_std": 1.5318008661270142, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 3.486935019493103, + "rewards/no_repetition_reward_func": -0.09463479742407799, + "rewards/verse_reward_func": 0.0, + "step": 335 + }, + { + "completion_length": 504.984375, + "epoch": 2.6879999999999997, + "grad_norm": 0.48046875, + "kl": 0.07857323810458183, + "learning_rate": 4.5784315858527715e-05, + "loss": 0.0031, + "reward": 2.693997859954834, + "reward_std": 1.4404518008232117, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 2.858163595199585, + "rewards/no_repetition_reward_func": -0.07822811976075172, + "rewards/verse_reward_func": -0.0078125, + "step": 336 + }, + { + "completion_length": 505.3125, + "epoch": 2.6959999999999997, + "grad_norm": 0.294921875, + "kl": 0.0779704824090004, + "learning_rate": 4.574543861903274e-05, + "loss": 0.0031, + "reward": 2.717993974685669, + "reward_std": 1.3718830943107605, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 2.8717318773269653, + "rewards/no_repetition_reward_func": -0.1068628765642643, + "rewards/verse_reward_func": 0.0, + "step": 337 + }, + { + "completion_length": 508.5625, + "epoch": 2.7039999999999997, + "grad_norm": 0.298828125, + "kl": 0.0806598961353302, + "learning_rate": 4.5706399602440106e-05, + "loss": 0.0032, + "reward": 3.2430031299591064, + "reward_std": 1.8967853784561157, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 3.381757378578186, + "rewards/no_repetition_reward_func": -0.10750437527894974, + "rewards/verse_reward_func": 0.0, + "step": 338 + }, + { + "completion_length": 509.421875, + "epoch": 2.7119999999999997, + "grad_norm": 0.279296875, + "kl": 0.08057577162981033, + "learning_rate": 4.566719911318389e-05, + "loss": 0.0032, + "reward": 3.279172420501709, + "reward_std": 1.7849737405776978, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 3.4664946794509888, + "rewards/no_repetition_reward_func": -0.12482251971960068, + "rewards/verse_reward_func": 0.0, + "step": 339 + }, + { + "completion_length": 502.375, + "epoch": 2.7199999999999998, + "grad_norm": 0.29296875, + "kl": 0.07772545889019966, + "learning_rate": 4.562783745695738e-05, + "loss": 0.0031, + "reward": 3.309648275375366, + "reward_std": 1.682247817516327, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 3.486277222633362, + "rewards/no_repetition_reward_func": -0.09850378707051277, + "rewards/verse_reward_func": 0.0, + "step": 340 + }, + { + "completion_length": 489.484375, + "epoch": 2.7279999999999998, + "grad_norm": 0.3828125, + "kl": 0.07098709791898727, + "learning_rate": 4.558831494071069e-05, + "loss": 0.0028, + "reward": 2.3683422803878784, + "reward_std": 1.85801362991333, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 2.535942554473877, + "rewards/no_repetition_reward_func": -0.0972878709435463, + "rewards/verse_reward_func": -0.0078125, + "step": 341 + }, + { + "completion_length": 499.203125, + "epoch": 2.7359999999999998, + "grad_norm": 0.53515625, + "kl": 0.07683470845222473, + "learning_rate": 4.5548631872648326e-05, + "loss": 0.0031, + "reward": 2.377056300640106, + "reward_std": 1.6657148599624634, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 2.5483158230781555, + "rewards/no_repetition_reward_func": -0.08532220497727394, + "rewards/verse_reward_func": -0.0078125, + "step": 342 + }, + { + "completion_length": 504.859375, + "epoch": 2.7439999999999998, + "grad_norm": 0.3515625, + "kl": 0.07454614341259003, + "learning_rate": 4.550878856222685e-05, + "loss": 0.003, + "reward": 3.1114176511764526, + "reward_std": 1.8299107551574707, + "rewards/check_divine_comedy_plagiarism": -0.15625, + "rewards/endecasillabo_reward_func": 3.3795931339263916, + "rewards/no_repetition_reward_func": -0.11192573979496956, + "rewards/verse_reward_func": 0.0, + "step": 343 + }, + { + "completion_length": 508.84375, + "epoch": 2.752, + "grad_norm": 0.326171875, + "kl": 0.07307468354701996, + "learning_rate": 4.5468785320152365e-05, + "loss": 0.0029, + "reward": 2.6008999347686768, + "reward_std": 1.6515042781829834, + "rewards/check_divine_comedy_plagiarism": -0.140625, + "rewards/endecasillabo_reward_func": 2.8341875076293945, + "rewards/no_repetition_reward_func": -0.09266239777207375, + "rewards/verse_reward_func": 0.0, + "step": 344 + }, + { + "completion_length": 500.203125, + "epoch": 2.76, + "grad_norm": 0.57421875, + "kl": 0.0740683265030384, + "learning_rate": 4.542862245837821e-05, + "loss": 0.003, + "reward": 1.9722895622253418, + "reward_std": 1.6432594060897827, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 2.1615516543388367, + "rewards/no_repetition_reward_func": -0.08769946545362473, + "rewards/verse_reward_func": -0.0078125, + "step": 345 + }, + { + "completion_length": 491.484375, + "epoch": 2.768, + "grad_norm": 0.34765625, + "kl": 0.07674963027238846, + "learning_rate": 4.5388300290102456e-05, + "loss": 0.0031, + "reward": 2.9414669275283813, + "reward_std": 1.9813048243522644, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 3.151545286178589, + "rewards/no_repetition_reward_func": -0.10070311650633812, + "rewards/verse_reward_func": 0.0, + "step": 346 + }, + { + "completion_length": 508.96875, + "epoch": 2.776, + "grad_norm": 0.341796875, + "kl": 0.0730576142668724, + "learning_rate": 4.534781912976546e-05, + "loss": 0.0029, + "reward": 2.4778465032577515, + "reward_std": 1.6462915539741516, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 2.5727109909057617, + "rewards/no_repetition_reward_func": -0.09486469626426697, + "rewards/verse_reward_func": 0.0, + "step": 347 + }, + { + "completion_length": 505.40625, + "epoch": 2.784, + "grad_norm": 0.318359375, + "kl": 0.07403989881277084, + "learning_rate": 4.530717929304743e-05, + "loss": 0.003, + "reward": 2.2229321002960205, + "reward_std": 1.5802661180496216, + "rewards/check_divine_comedy_plagiarism": -0.125, + "rewards/endecasillabo_reward_func": 2.420830249786377, + "rewards/no_repetition_reward_func": -0.07289816811680794, + "rewards/verse_reward_func": 0.0, + "step": 348 + }, + { + "completion_length": 508.96875, + "epoch": 2.792, + "grad_norm": 0.3203125, + "kl": 0.08156667277216911, + "learning_rate": 4.5266381096866e-05, + "loss": 0.0033, + "reward": 2.7173726558685303, + "reward_std": 1.7503728866577148, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 2.8982162475585938, + "rewards/no_repetition_reward_func": -0.10271859541535378, + "rewards/verse_reward_func": 0.0, + "step": 349 + }, + { + "completion_length": 511.078125, + "epoch": 2.8, + "grad_norm": 0.30078125, + "kl": 0.07661834359169006, + "learning_rate": 4.522542485937369e-05, + "loss": 0.0031, + "reward": 3.030595541000366, + "reward_std": 1.8016299605369568, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 3.1770427227020264, + "rewards/no_repetition_reward_func": -0.09957227110862732, + "rewards/verse_reward_func": 0.0, + "step": 350 + }, + { + "completion_length": 509.953125, + "epoch": 2.808, + "grad_norm": 0.28515625, + "kl": 0.08019143715500832, + "learning_rate": 4.5184310899955465e-05, + "loss": 0.0032, + "reward": 2.9095336198806763, + "reward_std": 1.6646459698677063, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 3.0490472316741943, + "rewards/no_repetition_reward_func": -0.09263867139816284, + "rewards/verse_reward_func": 0.0, + "step": 351 + }, + { + "completion_length": 509.375, + "epoch": 2.816, + "grad_norm": 0.29296875, + "kl": 0.07593303173780441, + "learning_rate": 4.5143039539226234e-05, + "loss": 0.003, + "reward": 2.6091028451919556, + "reward_std": 1.8233085870742798, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 2.738195776939392, + "rewards/no_repetition_reward_func": -0.09784290567040443, + "rewards/verse_reward_func": 0.0, + "step": 352 + }, + { + "completion_length": 511.546875, + "epoch": 2.824, + "grad_norm": 0.302734375, + "kl": 0.07943109422922134, + "learning_rate": 4.510161109902837e-05, + "loss": 0.0032, + "reward": 3.223742961883545, + "reward_std": 1.7403135895729065, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 3.350186824798584, + "rewards/no_repetition_reward_func": -0.11081875115633011, + "rewards/verse_reward_func": 0.0, + "step": 353 + }, + { + "completion_length": 506.921875, + "epoch": 2.832, + "grad_norm": 0.322265625, + "kl": 0.07846018671989441, + "learning_rate": 4.5060025902429174e-05, + "loss": 0.0031, + "reward": 3.29129695892334, + "reward_std": 1.4399842619895935, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 3.494906783103943, + "rewards/no_repetition_reward_func": -0.10986022651195526, + "rewards/verse_reward_func": 0.0, + "step": 354 + }, + { + "completion_length": 490.171875, + "epoch": 2.84, + "grad_norm": 0.65625, + "kl": 0.0785321518778801, + "learning_rate": 4.5018284273718336e-05, + "loss": 0.0031, + "reward": 3.3225284814834595, + "reward_std": 1.730772852897644, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 3.552213191986084, + "rewards/no_repetition_reward_func": -0.12030969932675362, + "rewards/verse_reward_func": -0.015625, + "step": 355 + }, + { + "completion_length": 513.671875, + "epoch": 2.848, + "grad_norm": 0.296875, + "kl": 0.08289977163076401, + "learning_rate": 4.4976386538405495e-05, + "loss": 0.0033, + "reward": 2.5051289796829224, + "reward_std": 1.9811246395111084, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 2.6793651580810547, + "rewards/no_repetition_reward_func": -0.10392363741993904, + "rewards/verse_reward_func": -0.0078125, + "step": 356 + }, + { + "completion_length": 503.234375, + "epoch": 2.856, + "grad_norm": 0.296875, + "kl": 0.07945031672716141, + "learning_rate": 4.493433302321759e-05, + "loss": 0.0032, + "reward": 3.140336513519287, + "reward_std": 1.9608882665634155, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 3.3023455142974854, + "rewards/no_repetition_reward_func": -0.11513377726078033, + "rewards/verse_reward_func": 0.0, + "step": 357 + }, + { + "completion_length": 506.546875, + "epoch": 2.864, + "grad_norm": 0.38671875, + "kl": 0.08129360899329185, + "learning_rate": 4.4892124056096386e-05, + "loss": 0.0033, + "reward": 1.0734683275222778, + "reward_std": 1.145058035850525, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 1.1880543529987335, + "rewards/no_repetition_reward_func": -0.06771104969084263, + "rewards/verse_reward_func": -0.015625, + "step": 358 + }, + { + "completion_length": 496.015625, + "epoch": 2.872, + "grad_norm": 0.30859375, + "kl": 0.0791650302708149, + "learning_rate": 4.484975996619589e-05, + "loss": 0.0032, + "reward": 2.947505831718445, + "reward_std": 1.8184013962745667, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 3.090460181236267, + "rewards/no_repetition_reward_func": -0.11170443147420883, + "rewards/verse_reward_func": 0.0, + "step": 359 + }, + { + "completion_length": 510.765625, + "epoch": 2.88, + "grad_norm": 0.283203125, + "kl": 0.07467341423034668, + "learning_rate": 4.480724108387977e-05, + "loss": 0.003, + "reward": 2.1855517625808716, + "reward_std": 1.5964102745056152, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 2.361377239227295, + "rewards/no_repetition_reward_func": -0.12113794311881065, + "rewards/verse_reward_func": -0.0078125, + "step": 360 + }, + { + "completion_length": 504.390625, + "epoch": 2.888, + "grad_norm": 0.3046875, + "kl": 0.0794813297688961, + "learning_rate": 4.4764567740718825e-05, + "loss": 0.0032, + "reward": 2.409980893135071, + "reward_std": 1.7127461433410645, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 2.5352184772491455, + "rewards/no_repetition_reward_func": -0.09398764371871948, + "rewards/verse_reward_func": 0.0, + "step": 361 + }, + { + "completion_length": 504.46875, + "epoch": 2.896, + "grad_norm": 0.3125, + "kl": 0.0833638608455658, + "learning_rate": 4.4721740269488355e-05, + "loss": 0.0033, + "reward": 2.9569716453552246, + "reward_std": 2.1555756330490112, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 3.0904860496520996, + "rewards/no_repetition_reward_func": -0.10226451605558395, + "rewards/verse_reward_func": 0.0, + "step": 362 + }, + { + "completion_length": 504.6875, + "epoch": 2.904, + "grad_norm": 0.30859375, + "kl": 0.07423374801874161, + "learning_rate": 4.4678759004165584e-05, + "loss": 0.003, + "reward": 2.69820499420166, + "reward_std": 2.2836015224456787, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 2.8553237915039062, + "rewards/no_repetition_reward_func": -0.13368131592869759, + "rewards/verse_reward_func": -0.0078125, + "step": 363 + }, + { + "completion_length": 505.609375, + "epoch": 2.912, + "grad_norm": 0.3125, + "kl": 0.08233223482966423, + "learning_rate": 4.4635624279927044e-05, + "loss": 0.0033, + "reward": 2.9073365926742554, + "reward_std": 2.151949405670166, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 3.0435147285461426, + "rewards/no_repetition_reward_func": -0.10492793098092079, + "rewards/verse_reward_func": 0.0, + "step": 364 + }, + { + "completion_length": 506.328125, + "epoch": 2.92, + "grad_norm": 0.2890625, + "kl": 0.09081687405705452, + "learning_rate": 4.4592336433146e-05, + "loss": 0.0036, + "reward": 2.696870446205139, + "reward_std": 2.107183516025543, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 2.8592997789382935, + "rewards/no_repetition_reward_func": -0.13117936998605728, + "rewards/verse_reward_func": 0.0, + "step": 365 + }, + { + "completion_length": 494.28125, + "epoch": 2.928, + "grad_norm": 0.3125, + "kl": 0.0823359303176403, + "learning_rate": 4.454889580138975e-05, + "loss": 0.0033, + "reward": 2.3096872568130493, + "reward_std": 1.971324384212494, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 2.4551390409469604, + "rewards/no_repetition_reward_func": -0.10638919472694397, + "rewards/verse_reward_func": -0.0234375, + "step": 366 + }, + { + "completion_length": 496.640625, + "epoch": 2.936, + "grad_norm": 0.353515625, + "kl": 0.08790365234017372, + "learning_rate": 4.450530272341709e-05, + "loss": 0.0035, + "reward": 2.1643463373184204, + "reward_std": 1.9374281764030457, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 2.2692928314208984, + "rewards/no_repetition_reward_func": -0.08932144194841385, + "rewards/verse_reward_func": 0.0, + "step": 367 + }, + { + "completion_length": 502.546875, + "epoch": 2.944, + "grad_norm": 0.271484375, + "kl": 0.08607224375009537, + "learning_rate": 4.4461557539175594e-05, + "loss": 0.0034, + "reward": 2.3362812995910645, + "reward_std": 1.782151699066162, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 2.5236377716064453, + "rewards/no_repetition_reward_func": -0.10141897201538086, + "rewards/verse_reward_func": -0.0078125, + "step": 368 + }, + { + "completion_length": 500.53125, + "epoch": 2.952, + "grad_norm": 0.302734375, + "kl": 0.08719504252076149, + "learning_rate": 4.441766058979898e-05, + "loss": 0.0035, + "reward": 2.3396250009536743, + "reward_std": 1.969063401222229, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 2.450891137123108, + "rewards/no_repetition_reward_func": -0.09564093872904778, + "rewards/verse_reward_func": 0.0, + "step": 369 + }, + { + "completion_length": 501.234375, + "epoch": 2.96, + "grad_norm": 0.3359375, + "kl": 0.09293963387608528, + "learning_rate": 4.4373612217604496e-05, + "loss": 0.0037, + "reward": 1.8408806920051575, + "reward_std": 1.6127179861068726, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 1.9763121604919434, + "rewards/no_repetition_reward_func": -0.08855630829930305, + "rewards/verse_reward_func": 0.0, + "step": 370 + }, + { + "completion_length": 512.265625, + "epoch": 2.968, + "grad_norm": 0.326171875, + "kl": 0.09362768009305, + "learning_rate": 4.432941276609018e-05, + "loss": 0.0037, + "reward": 2.147946834564209, + "reward_std": 1.85464209318161, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 2.3234997987747192, + "rewards/no_repetition_reward_func": -0.09742774814367294, + "rewards/verse_reward_func": 0.0, + "step": 371 + }, + { + "completion_length": 507.703125, + "epoch": 2.976, + "grad_norm": 0.275390625, + "kl": 0.08821310102939606, + "learning_rate": 4.428506257993226e-05, + "loss": 0.0035, + "reward": 2.054785430431366, + "reward_std": 1.9274823665618896, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 2.2238965034484863, + "rewards/no_repetition_reward_func": -0.09879856184124947, + "rewards/verse_reward_func": -0.0078125, + "step": 372 + }, + { + "completion_length": 509.296875, + "epoch": 2.984, + "grad_norm": 0.314453125, + "kl": 0.09096482396125793, + "learning_rate": 4.4240562004982364e-05, + "loss": 0.0036, + "reward": 2.536385416984558, + "reward_std": 2.125114858150482, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 2.703101396560669, + "rewards/no_repetition_reward_func": -0.11984112113714218, + "rewards/verse_reward_func": 0.0, + "step": 373 + }, + { + "completion_length": 506.65625, + "epoch": 2.992, + "grad_norm": 0.2890625, + "kl": 0.08306482806801796, + "learning_rate": 4.4195911388264946e-05, + "loss": 0.0033, + "reward": 3.070643186569214, + "reward_std": 2.549742817878723, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 3.2660434246063232, + "rewards/no_repetition_reward_func": -0.14852508530020714, + "rewards/verse_reward_func": 0.0, + "step": 374 + }, + { + "completion_length": 516.0, + "epoch": 3.0, + "grad_norm": 0.29296875, + "kl": 0.08598653972148895, + "learning_rate": 4.415111107797445e-05, + "loss": 0.0034, + "reward": 3.407633066177368, + "reward_std": 2.4881142377853394, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 3.6524672508239746, + "rewards/no_repetition_reward_func": -0.15108438581228256, + "rewards/verse_reward_func": 0.0, + "step": 375 + }, + { + "completion_length": 503.90625, + "epoch": 3.008, + "grad_norm": 0.310546875, + "kl": 0.0911925882101059, + "learning_rate": 4.410616142347273e-05, + "loss": 0.0036, + "reward": 2.8319064378738403, + "reward_std": 2.0524205565452576, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 2.985151767730713, + "rewards/no_repetition_reward_func": -0.12199538946151733, + "rewards/verse_reward_func": 0.0, + "step": 376 + }, + { + "completion_length": 496.890625, + "epoch": 3.016, + "grad_norm": 0.265625, + "kl": 0.08954320847988129, + "learning_rate": 4.40610627752862e-05, + "loss": 0.0036, + "reward": 2.7986027002334595, + "reward_std": 2.0063053965568542, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 2.9766725301742554, + "rewards/no_repetition_reward_func": -0.1390073597431183, + "rewards/verse_reward_func": -0.0078125, + "step": 377 + }, + { + "completion_length": 511.671875, + "epoch": 3.024, + "grad_norm": 0.25390625, + "kl": 0.08867848664522171, + "learning_rate": 4.401581548510318e-05, + "loss": 0.0035, + "reward": 2.313808798789978, + "reward_std": 2.238036334514618, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 2.5276044607162476, + "rewards/no_repetition_reward_func": -0.15129563957452774, + "rewards/verse_reward_func": 0.0, + "step": 378 + }, + { + "completion_length": 491.1875, + "epoch": 3.032, + "grad_norm": 0.328125, + "kl": 0.10196596384048462, + "learning_rate": 4.3970419905771145e-05, + "loss": 0.0041, + "reward": 2.4618290662765503, + "reward_std": 2.0097826719284058, + "rewards/check_divine_comedy_plagiarism": -0.109375, + "rewards/endecasillabo_reward_func": 2.70884907245636, + "rewards/no_repetition_reward_func": -0.1298324093222618, + "rewards/verse_reward_func": -0.0078125, + "step": 379 + }, + { + "completion_length": 505.359375, + "epoch": 3.04, + "grad_norm": 0.296875, + "kl": 0.12007220834493637, + "learning_rate": 4.3924876391293915e-05, + "loss": 0.0048, + "reward": 2.079891800880432, + "reward_std": 1.9552977681159973, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 2.21696937084198, + "rewards/no_repetition_reward_func": -0.09801513701677322, + "rewards/verse_reward_func": -0.0078125, + "step": 380 + }, + { + "completion_length": 496.046875, + "epoch": 3.048, + "grad_norm": 0.357421875, + "kl": 0.09339821338653564, + "learning_rate": 4.387918529682898e-05, + "loss": 0.0037, + "reward": 3.4741328954696655, + "reward_std": 2.361031413078308, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.636458158493042, + "rewards/no_repetition_reward_func": -0.1545126587152481, + "rewards/verse_reward_func": -0.0078125, + "step": 381 + }, + { + "completion_length": 512.6875, + "epoch": 3.056, + "grad_norm": 0.30078125, + "kl": 0.10695686936378479, + "learning_rate": 4.3833346978684675e-05, + "loss": 0.0043, + "reward": 2.3594932556152344, + "reward_std": 2.1355048418045044, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 2.532397747039795, + "rewards/no_repetition_reward_func": -0.11821706220507622, + "rewards/verse_reward_func": -0.0078125, + "step": 382 + }, + { + "completion_length": 495.15625, + "epoch": 3.064, + "grad_norm": 0.40234375, + "kl": 0.09381513297557831, + "learning_rate": 4.3787361794317405e-05, + "loss": 0.0038, + "reward": 3.5152982473373413, + "reward_std": 2.718501091003418, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 3.7205398082733154, + "rewards/no_repetition_reward_func": -0.15055381506681442, + "rewards/verse_reward_func": -0.0078125, + "step": 383 + }, + { + "completion_length": 501.21875, + "epoch": 3.072, + "grad_norm": 0.6640625, + "kl": 0.10930196195840836, + "learning_rate": 4.374123010232888e-05, + "loss": 0.0044, + "reward": 1.694966346025467, + "reward_std": 1.9119724035263062, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 1.885745644569397, + "rewards/no_repetition_reward_func": -0.12046677619218826, + "rewards/verse_reward_func": -0.0078125, + "step": 384 + }, + { + "completion_length": 513.578125, + "epoch": 3.08, + "grad_norm": 0.318359375, + "kl": 0.10504106432199478, + "learning_rate": 4.36949522624633e-05, + "loss": 0.0042, + "reward": 1.7644249200820923, + "reward_std": 2.0065885186195374, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 1.9339220523834229, + "rewards/no_repetition_reward_func": -0.1460595689713955, + "rewards/verse_reward_func": -0.0078125, + "step": 385 + }, + { + "completion_length": 488.1875, + "epoch": 3.088, + "grad_norm": 0.35546875, + "kl": 0.11348450928926468, + "learning_rate": 4.3648528635604556e-05, + "loss": 0.0045, + "reward": 2.270286500453949, + "reward_std": 1.969484806060791, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 2.4428231716156006, + "rewards/no_repetition_reward_func": -0.12566189467906952, + "rewards/verse_reward_func": -0.015625, + "step": 386 + }, + { + "completion_length": 508.15625, + "epoch": 3.096, + "grad_norm": 0.31640625, + "kl": 0.1091524213552475, + "learning_rate": 4.3601959583773415e-05, + "loss": 0.0044, + "reward": 2.753540515899658, + "reward_std": 2.482025384902954, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 2.933242678642273, + "rewards/no_repetition_reward_func": -0.12501465156674385, + "rewards/verse_reward_func": -0.0078125, + "step": 387 + }, + { + "completion_length": 501.0625, + "epoch": 3.104, + "grad_norm": 0.26171875, + "kl": 0.09920186549425125, + "learning_rate": 4.355524547012471e-05, + "loss": 0.004, + "reward": 3.3010605573654175, + "reward_std": 2.4878424406051636, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 3.49591600894928, + "rewards/no_repetition_reward_func": -0.1714179664850235, + "rewards/verse_reward_func": -0.0078125, + "step": 388 + }, + { + "completion_length": 500.21875, + "epoch": 3.112, + "grad_norm": 0.4296875, + "kl": 0.11637993901968002, + "learning_rate": 4.350838665894446e-05, + "loss": 0.0047, + "reward": 1.9576940536499023, + "reward_std": 2.396308660507202, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 2.0917818546295166, + "rewards/no_repetition_reward_func": -0.12627528235316277, + "rewards/verse_reward_func": -0.0078125, + "step": 389 + }, + { + "completion_length": 515.40625, + "epoch": 3.12, + "grad_norm": 0.275390625, + "kl": 0.11347230896353722, + "learning_rate": 4.3461383515647106e-05, + "loss": 0.0045, + "reward": 2.4671833515167236, + "reward_std": 2.0614771842956543, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 2.6480787992477417, + "rewards/no_repetition_reward_func": -0.1340203806757927, + "rewards/verse_reward_func": 0.0, + "step": 390 + }, + { + "completion_length": 510.59375, + "epoch": 3.128, + "grad_norm": 0.255859375, + "kl": 0.10588905215263367, + "learning_rate": 4.3414236406772584e-05, + "loss": 0.0042, + "reward": 2.7870692014694214, + "reward_std": 2.130202054977417, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 2.9620786905288696, + "rewards/no_repetition_reward_func": -0.15938443690538406, + "rewards/verse_reward_func": 0.0, + "step": 391 + }, + { + "completion_length": 494.59375, + "epoch": 3.136, + "grad_norm": 0.4375, + "kl": 0.12427257001399994, + "learning_rate": 4.336694569998354e-05, + "loss": 0.005, + "reward": 1.4034064412117004, + "reward_std": 1.5337103009223938, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 1.547293245792389, + "rewards/no_repetition_reward_func": -0.1126367598772049, + "rewards/verse_reward_func": -0.015625, + "step": 392 + }, + { + "completion_length": 495.359375, + "epoch": 3.144, + "grad_norm": 0.314453125, + "kl": 0.11520429328083992, + "learning_rate": 4.331951176406239e-05, + "loss": 0.0046, + "reward": 1.9562144577503204, + "reward_std": 1.758342981338501, + "rewards/check_divine_comedy_plagiarism": -0.09375, + "rewards/endecasillabo_reward_func": 2.1947571635246277, + "rewards/no_repetition_reward_func": -0.13698016107082367, + "rewards/verse_reward_func": -0.0078125, + "step": 393 + }, + { + "completion_length": 504.46875, + "epoch": 3.152, + "grad_norm": 0.35546875, + "kl": 0.10974125564098358, + "learning_rate": 4.3271934968908514e-05, + "loss": 0.0044, + "reward": 3.798835515975952, + "reward_std": 2.617736339569092, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 4.009515166282654, + "rewards/no_repetition_reward_func": -0.1794297993183136, + "rewards/verse_reward_func": 0.0, + "step": 394 + }, + { + "completion_length": 508.234375, + "epoch": 3.16, + "grad_norm": 0.29296875, + "kl": 0.11817478388547897, + "learning_rate": 4.3224215685535294e-05, + "loss": 0.0047, + "reward": 2.817207098007202, + "reward_std": 2.395043134689331, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 2.9876492023468018, + "rewards/no_repetition_reward_func": -0.15481726080179214, + "rewards/verse_reward_func": 0.0, + "step": 395 + }, + { + "completion_length": 494.5625, + "epoch": 3.168, + "grad_norm": 0.345703125, + "kl": 0.12087716907262802, + "learning_rate": 4.31763542860673e-05, + "loss": 0.0048, + "reward": 2.910922646522522, + "reward_std": 2.534151554107666, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 3.099183201789856, + "rewards/no_repetition_reward_func": -0.13357309997081757, + "rewards/verse_reward_func": -0.0078125, + "step": 396 + }, + { + "completion_length": 510.671875, + "epoch": 3.176, + "grad_norm": 0.2734375, + "kl": 0.11646657437086105, + "learning_rate": 4.3128351143737335e-05, + "loss": 0.0047, + "reward": 3.247679829597473, + "reward_std": 2.626707911491394, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.4056390523910522, + "rewards/no_repetition_reward_func": -0.15795912593603134, + "rewards/verse_reward_func": 0.0, + "step": 397 + }, + { + "completion_length": 511.796875, + "epoch": 3.184, + "grad_norm": 0.2451171875, + "kl": 0.10919316485524178, + "learning_rate": 4.3080206632883554e-05, + "loss": 0.0044, + "reward": 3.1105839014053345, + "reward_std": 2.5240284204483032, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 3.322108745574951, + "rewards/no_repetition_reward_func": -0.1802750676870346, + "rewards/verse_reward_func": 0.0, + "step": 398 + }, + { + "completion_length": 506.90625, + "epoch": 3.192, + "grad_norm": 0.3203125, + "kl": 0.11702189594507217, + "learning_rate": 4.303192112894652e-05, + "loss": 0.0047, + "reward": 2.5559383630752563, + "reward_std": 2.6969780921936035, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 2.7421352863311768, + "rewards/no_repetition_reward_func": -0.15494690090417862, + "rewards/verse_reward_func": -0.015625, + "step": 399 + }, + { + "completion_length": 512.84375, + "epoch": 3.2, + "grad_norm": 0.27734375, + "kl": 0.11996134743094444, + "learning_rate": 4.2983495008466276e-05, + "loss": 0.0048, + "reward": 3.209777355194092, + "reward_std": 2.9023302793502808, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 3.410954236984253, + "rewards/no_repetition_reward_func": -0.1621144488453865, + "rewards/verse_reward_func": -0.0078125, + "step": 400 + }, + { + "completion_length": 511.375, + "epoch": 3.208, + "grad_norm": 0.2734375, + "kl": 0.11048588529229164, + "learning_rate": 4.293492864907947e-05, + "loss": 0.0044, + "reward": 2.8161327838897705, + "reward_std": 2.761341094970703, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 3.004380464553833, + "rewards/no_repetition_reward_func": -0.1726226657629013, + "rewards/verse_reward_func": 0.0, + "step": 401 + }, + { + "completion_length": 507.8125, + "epoch": 3.216, + "grad_norm": 0.24609375, + "kl": 0.10434413701295853, + "learning_rate": 4.2886222429516296e-05, + "loss": 0.0042, + "reward": 3.458737373352051, + "reward_std": 2.450830340385437, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 3.69482684135437, + "rewards/no_repetition_reward_func": -0.20483969897031784, + "rewards/verse_reward_func": 0.0, + "step": 402 + }, + { + "completion_length": 504.421875, + "epoch": 3.224, + "grad_norm": 0.248046875, + "kl": 0.10975316539406776, + "learning_rate": 4.283737672959766e-05, + "loss": 0.0044, + "reward": 3.8317277431488037, + "reward_std": 2.7614123821258545, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 4.041813850402832, + "rewards/no_repetition_reward_func": -0.19446122646331787, + "rewards/verse_reward_func": 0.0, + "step": 403 + }, + { + "completion_length": 511.375, + "epoch": 3.232, + "grad_norm": 0.2470703125, + "kl": 0.11660506576299667, + "learning_rate": 4.278839193023214e-05, + "loss": 0.0047, + "reward": 2.5691113471984863, + "reward_std": 1.8011227250099182, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 2.733238101005554, + "rewards/no_repetition_reward_func": -0.16412660479545593, + "rewards/verse_reward_func": 0.0, + "step": 404 + }, + { + "completion_length": 515.734375, + "epoch": 3.24, + "grad_norm": 0.275390625, + "kl": 0.12126060575246811, + "learning_rate": 4.273926841341302e-05, + "loss": 0.0049, + "reward": 2.8409249782562256, + "reward_std": 2.440640449523926, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.009791851043701, + "rewards/no_repetition_reward_func": -0.16886688768863678, + "rewards/verse_reward_func": 0.0, + "step": 405 + }, + { + "completion_length": 512.96875, + "epoch": 3.248, + "grad_norm": 0.251953125, + "kl": 0.12648889422416687, + "learning_rate": 4.2690006562215384e-05, + "loss": 0.0051, + "reward": 3.132773518562317, + "reward_std": 2.673558831214905, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 3.3459843397140503, + "rewards/no_repetition_reward_func": -0.17414817214012146, + "rewards/verse_reward_func": -0.0234375, + "step": 406 + }, + { + "completion_length": 504.0625, + "epoch": 3.2560000000000002, + "grad_norm": 0.333984375, + "kl": 0.1512024626135826, + "learning_rate": 4.264060676079302e-05, + "loss": 0.006, + "reward": 2.127965211868286, + "reward_std": 2.119272291660309, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 2.2899091243743896, + "rewards/no_repetition_reward_func": -0.12288130074739456, + "rewards/verse_reward_func": -0.0078125, + "step": 407 + }, + { + "completion_length": 506.46875, + "epoch": 3.2640000000000002, + "grad_norm": 0.34765625, + "kl": 0.11792661249637604, + "learning_rate": 4.259106939437551e-05, + "loss": 0.0047, + "reward": 3.789017915725708, + "reward_std": 3.002945899963379, + "rewards/check_divine_comedy_plagiarism": -0.078125, + "rewards/endecasillabo_reward_func": 4.057240605354309, + "rewards/no_repetition_reward_func": -0.19009757041931152, + "rewards/verse_reward_func": 0.0, + "step": 408 + }, + { + "completion_length": 510.546875, + "epoch": 3.2720000000000002, + "grad_norm": 0.25390625, + "kl": 0.11058615520596504, + "learning_rate": 4.254139484926519e-05, + "loss": 0.0044, + "reward": 3.2477234601974487, + "reward_std": 2.4257978200912476, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.4715274572372437, + "rewards/no_repetition_reward_func": -0.18474137037992477, + "rewards/verse_reward_func": -0.0390625, + "step": 409 + }, + { + "completion_length": 510.109375, + "epoch": 3.2800000000000002, + "grad_norm": 0.2421875, + "kl": 0.13551191240549088, + "learning_rate": 4.249158351283414e-05, + "loss": 0.0054, + "reward": 2.745664596557617, + "reward_std": 2.4549667835235596, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 2.9488738775253296, + "rewards/no_repetition_reward_func": -0.17195943742990494, + "rewards/verse_reward_func": -0.015625, + "step": 410 + }, + { + "completion_length": 506.078125, + "epoch": 3.288, + "grad_norm": 0.283203125, + "kl": 0.16104421019554138, + "learning_rate": 4.244163577352116e-05, + "loss": 0.0064, + "reward": 2.689857602119446, + "reward_std": 2.6156352758407593, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 2.8315701484680176, + "rewards/no_repetition_reward_func": -0.13390029221773148, + "rewards/verse_reward_func": -0.0078125, + "step": 411 + }, + { + "completion_length": 514.765625, + "epoch": 3.296, + "grad_norm": 0.2353515625, + "kl": 0.11327937617897987, + "learning_rate": 4.2391552020828775e-05, + "loss": 0.0045, + "reward": 2.7442198991775513, + "reward_std": 2.58572256565094, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 2.945777654647827, + "rewards/no_repetition_reward_func": -0.1937449797987938, + "rewards/verse_reward_func": -0.0078125, + "step": 412 + }, + { + "completion_length": 514.671875, + "epoch": 3.304, + "grad_norm": 0.2578125, + "kl": 0.1518239751458168, + "learning_rate": 4.234133264532012e-05, + "loss": 0.0061, + "reward": 2.8443405628204346, + "reward_std": 2.850118637084961, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 2.9905670881271362, + "rewards/no_repetition_reward_func": -0.13841407001018524, + "rewards/verse_reward_func": -0.0078125, + "step": 413 + }, + { + "completion_length": 502.171875, + "epoch": 3.312, + "grad_norm": 0.291015625, + "kl": 0.14399583637714386, + "learning_rate": 4.2290978038616e-05, + "loss": 0.0058, + "reward": 2.4478920102119446, + "reward_std": 2.0861271023750305, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 2.5947184562683105, + "rewards/no_repetition_reward_func": -0.13120137155056, + "rewards/verse_reward_func": -0.015625, + "step": 414 + }, + { + "completion_length": 507.21875, + "epoch": 3.32, + "grad_norm": 0.21875, + "kl": 0.12847361713647842, + "learning_rate": 4.224048859339175e-05, + "loss": 0.0051, + "reward": 3.6349053382873535, + "reward_std": 2.8520772457122803, + "rewards/check_divine_comedy_plagiarism": -0.0625, + "rewards/endecasillabo_reward_func": 3.9046359062194824, + "rewards/no_repetition_reward_func": -0.1916055902838707, + "rewards/verse_reward_func": -0.015625, + "step": 415 + }, + { + "completion_length": 516.0, + "epoch": 3.328, + "grad_norm": 0.2275390625, + "kl": 0.09135512635111809, + "learning_rate": 4.218986470337419e-05, + "loss": 0.0037, + "reward": 4.117633819580078, + "reward_std": 3.1633849143981934, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.3612682819366455, + "rewards/no_repetition_reward_func": -0.23582186549901962, + "rewards/verse_reward_func": -0.0078125, + "step": 416 + }, + { + "completion_length": 506.390625, + "epoch": 3.336, + "grad_norm": 0.275390625, + "kl": 0.17307034134864807, + "learning_rate": 4.213910676333859e-05, + "loss": 0.0069, + "reward": 2.407034158706665, + "reward_std": 2.3376948833465576, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 2.589349329471588, + "rewards/no_repetition_reward_func": -0.13544024527072906, + "rewards/verse_reward_func": -0.015625, + "step": 417 + }, + { + "completion_length": 515.03125, + "epoch": 3.344, + "grad_norm": 0.263671875, + "kl": 0.15060757100582123, + "learning_rate": 4.208821516910557e-05, + "loss": 0.006, + "reward": 3.0883065462112427, + "reward_std": 2.5728085041046143, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.2736164331436157, + "rewards/no_repetition_reward_func": -0.16968472301959991, + "rewards/verse_reward_func": -0.015625, + "step": 418 + }, + { + "completion_length": 494.984375, + "epoch": 3.352, + "grad_norm": 0.3359375, + "kl": 0.1880241334438324, + "learning_rate": 4.2037190317538e-05, + "loss": 0.0075, + "reward": 2.0592082738876343, + "reward_std": 2.0997730493545532, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 2.205148935317993, + "rewards/no_repetition_reward_func": -0.12250316515564919, + "rewards/verse_reward_func": -0.0078125, + "step": 419 + }, + { + "completion_length": 494.390625, + "epoch": 3.36, + "grad_norm": 0.298828125, + "kl": 0.1564255654811859, + "learning_rate": 4.198603260653792e-05, + "loss": 0.0063, + "reward": 2.75505268573761, + "reward_std": 2.354513645172119, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 2.9267383813858032, + "rewards/no_repetition_reward_func": -0.1482483297586441, + "rewards/verse_reward_func": -0.0078125, + "step": 420 + }, + { + "completion_length": 514.03125, + "epoch": 3.368, + "grad_norm": 0.26953125, + "kl": 0.14629875868558884, + "learning_rate": 4.193474243504343e-05, + "loss": 0.0059, + "reward": 3.295858860015869, + "reward_std": 2.7084752321243286, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 3.526587963104248, + "rewards/no_repetition_reward_func": -0.1838541328907013, + "rewards/verse_reward_func": 0.0, + "step": 421 + }, + { + "completion_length": 508.96875, + "epoch": 3.376, + "grad_norm": 0.2890625, + "kl": 0.13834019750356674, + "learning_rate": 4.188332020302561e-05, + "loss": 0.0055, + "reward": 3.59060800075531, + "reward_std": 2.721167802810669, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.791170120239258, + "rewards/no_repetition_reward_func": -0.2005620077252388, + "rewards/verse_reward_func": 0.0, + "step": 422 + }, + { + "completion_length": 512.5, + "epoch": 3.384, + "grad_norm": 0.2412109375, + "kl": 0.16679373383522034, + "learning_rate": 4.183176631148534e-05, + "loss": 0.0067, + "reward": 3.043287992477417, + "reward_std": 2.7805556058883667, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 3.2572696208953857, + "rewards/no_repetition_reward_func": -0.1671067699790001, + "rewards/verse_reward_func": 0.0, + "step": 423 + }, + { + "completion_length": 503.5, + "epoch": 3.392, + "grad_norm": 0.310546875, + "kl": 0.14389514923095703, + "learning_rate": 4.178008116245024e-05, + "loss": 0.0058, + "reward": 3.2104222774505615, + "reward_std": 2.914844274520874, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 3.415592074394226, + "rewards/no_repetition_reward_func": -0.18173237144947052, + "rewards/verse_reward_func": -0.0078125, + "step": 424 + }, + { + "completion_length": 506.3125, + "epoch": 3.4, + "grad_norm": 0.255859375, + "kl": 0.1810968518257141, + "learning_rate": 4.172826515897146e-05, + "loss": 0.0072, + "reward": 2.3340342044830322, + "reward_std": 2.5345683097839355, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 2.513601541519165, + "rewards/no_repetition_reward_func": -0.16394253447651863, + "rewards/verse_reward_func": -0.015625, + "step": 425 + }, + { + "completion_length": 515.46875, + "epoch": 3.408, + "grad_norm": 0.2451171875, + "kl": 0.1547551155090332, + "learning_rate": 4.1676318705120616e-05, + "loss": 0.0062, + "reward": 2.971490263938904, + "reward_std": 2.7377501726150513, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.169802188873291, + "rewards/no_repetition_reward_func": -0.19831183552742004, + "rewards/verse_reward_func": 0.0, + "step": 426 + }, + { + "completion_length": 506.234375, + "epoch": 3.416, + "grad_norm": 0.28125, + "kl": 0.12054744735360146, + "learning_rate": 4.162424220598658e-05, + "loss": 0.0048, + "reward": 3.945735216140747, + "reward_std": 3.2063586711883545, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 4.218912839889526, + "rewards/no_repetition_reward_func": -0.24192775785923004, + "rewards/verse_reward_func": 0.0, + "step": 427 + }, + { + "completion_length": 512.984375, + "epoch": 3.424, + "grad_norm": 0.263671875, + "kl": 0.11428412050008774, + "learning_rate": 4.157203606767238e-05, + "loss": 0.0046, + "reward": 4.398524284362793, + "reward_std": 3.2104114294052124, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.640822887420654, + "rewards/no_repetition_reward_func": -0.23448620736598969, + "rewards/verse_reward_func": -0.0078125, + "step": 428 + }, + { + "completion_length": 514.6875, + "epoch": 3.432, + "grad_norm": 0.25390625, + "kl": 0.14808723330497742, + "learning_rate": 4.1519700697291944e-05, + "loss": 0.0059, + "reward": 4.012346506118774, + "reward_std": 2.8824875354766846, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.229502558708191, + "rewards/no_repetition_reward_func": -0.2171558290719986, + "rewards/verse_reward_func": 0.0, + "step": 429 + }, + { + "completion_length": 513.09375, + "epoch": 3.44, + "grad_norm": 0.2041015625, + "kl": 0.1369466930627823, + "learning_rate": 4.146723650296701e-05, + "loss": 0.0055, + "reward": 3.526071786880493, + "reward_std": 3.058579683303833, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 3.7593990564346313, + "rewards/no_repetition_reward_func": -0.21770219504833221, + "rewards/verse_reward_func": 0.0, + "step": 430 + }, + { + "completion_length": 509.734375, + "epoch": 3.448, + "grad_norm": 0.265625, + "kl": 0.18149402737617493, + "learning_rate": 4.1414643893823914e-05, + "loss": 0.0073, + "reward": 2.796503186225891, + "reward_std": 2.818273186683655, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 2.9950339794158936, + "rewards/no_repetition_reward_func": -0.17509325593709946, + "rewards/verse_reward_func": -0.0078125, + "step": 431 + }, + { + "completion_length": 514.25, + "epoch": 3.456, + "grad_norm": 1.5546875, + "kl": 0.18319929391145706, + "learning_rate": 4.136192327999037e-05, + "loss": 0.0073, + "reward": 2.8051342964172363, + "reward_std": 2.6849365234375, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 3.023751199245453, + "rewards/no_repetition_reward_func": -0.18736690282821655, + "rewards/verse_reward_func": -0.015625, + "step": 432 + }, + { + "completion_length": 511.65625, + "epoch": 3.464, + "grad_norm": 0.287109375, + "kl": 0.17975015938282013, + "learning_rate": 4.130907507259233e-05, + "loss": 0.0072, + "reward": 3.996192455291748, + "reward_std": 3.0177977085113525, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 4.205715656280518, + "rewards/no_repetition_reward_func": -0.17046086490154266, + "rewards/verse_reward_func": -0.0078125, + "step": 433 + }, + { + "completion_length": 512.265625, + "epoch": 3.472, + "grad_norm": 0.244140625, + "kl": 0.21731548756361008, + "learning_rate": 4.125609968375072e-05, + "loss": 0.0087, + "reward": 1.950236201286316, + "reward_std": 2.5976409912109375, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 2.0975663661956787, + "rewards/no_repetition_reward_func": -0.11608023568987846, + "rewards/verse_reward_func": -0.03125, + "step": 434 + }, + { + "completion_length": 497.75, + "epoch": 3.48, + "grad_norm": 0.2578125, + "kl": 0.19993934035301208, + "learning_rate": 4.1202997526578276e-05, + "loss": 0.008, + "reward": 3.0186195373535156, + "reward_std": 2.3946720957756042, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 3.1919888257980347, + "rewards/no_repetition_reward_func": -0.15774434059858322, + "rewards/verse_reward_func": 0.0, + "step": 435 + }, + { + "completion_length": 513.546875, + "epoch": 3.488, + "grad_norm": 0.216796875, + "kl": 0.15761160105466843, + "learning_rate": 4.1149769015176275e-05, + "loss": 0.0063, + "reward": 2.905766248703003, + "reward_std": 2.4093360900878906, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 3.1262102127075195, + "rewards/no_repetition_reward_func": -0.2048187404870987, + "rewards/verse_reward_func": 0.0, + "step": 436 + }, + { + "completion_length": 511.859375, + "epoch": 3.496, + "grad_norm": 0.2265625, + "kl": 0.1997891068458557, + "learning_rate": 4.109641456463135e-05, + "loss": 0.008, + "reward": 2.2230801582336426, + "reward_std": 2.788390874862671, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 2.3949772119522095, + "rewards/no_repetition_reward_func": -0.17189717292785645, + "rewards/verse_reward_func": 0.0, + "step": 437 + }, + { + "completion_length": 510.34375, + "epoch": 3.504, + "grad_norm": 0.2236328125, + "kl": 0.20020601898431778, + "learning_rate": 4.104293459101222e-05, + "loss": 0.008, + "reward": 3.4283978939056396, + "reward_std": 3.281007170677185, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 3.6194968223571777, + "rewards/no_repetition_reward_func": -0.1754738986492157, + "rewards/verse_reward_func": 0.0, + "step": 438 + }, + { + "completion_length": 515.296875, + "epoch": 3.512, + "grad_norm": 0.21875, + "kl": 0.21494118869304657, + "learning_rate": 4.098932951136645e-05, + "loss": 0.0086, + "reward": 2.6579113006591797, + "reward_std": 2.673240542411804, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 2.8559789657592773, + "rewards/no_repetition_reward_func": -0.15119270235300064, + "rewards/verse_reward_func": -0.03125, + "step": 439 + }, + { + "completion_length": 514.84375, + "epoch": 3.52, + "grad_norm": 0.267578125, + "kl": 0.14929042756557465, + "learning_rate": 4.093559974371725e-05, + "loss": 0.006, + "reward": 4.074631214141846, + "reward_std": 2.9376754760742188, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.302459478378296, + "rewards/no_repetition_reward_func": -0.22782832384109497, + "rewards/verse_reward_func": 0.0, + "step": 440 + }, + { + "completion_length": 516.0, + "epoch": 3.528, + "grad_norm": 0.23046875, + "kl": 0.21595358103513718, + "learning_rate": 4.088174570706011e-05, + "loss": 0.0086, + "reward": 2.1889911890029907, + "reward_std": 3.0301918983459473, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 2.367292642593384, + "rewards/no_repetition_reward_func": -0.1470513716340065, + "rewards/verse_reward_func": -0.015625, + "step": 441 + }, + { + "completion_length": 507.703125, + "epoch": 3.536, + "grad_norm": 0.2373046875, + "kl": 0.22236041724681854, + "learning_rate": 4.082776782135964e-05, + "loss": 0.0089, + "reward": 2.0300171971321106, + "reward_std": 2.102987051010132, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 2.1914154291152954, + "rewards/no_repetition_reward_func": -0.12233571708202362, + "rewards/verse_reward_func": -0.0078125, + "step": 442 + }, + { + "completion_length": 503.015625, + "epoch": 3.544, + "grad_norm": 0.2412109375, + "kl": 0.15490446984767914, + "learning_rate": 4.077366650754624e-05, + "loss": 0.0062, + "reward": 2.7166714668273926, + "reward_std": 2.5419137477874756, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 2.941044569015503, + "rewards/no_repetition_reward_func": -0.2009354531764984, + "rewards/verse_reward_func": -0.0078125, + "step": 443 + }, + { + "completion_length": 514.53125, + "epoch": 3.552, + "grad_norm": 0.32421875, + "kl": 0.1430477425456047, + "learning_rate": 4.071944218751282e-05, + "loss": 0.0057, + "reward": 3.7163684368133545, + "reward_std": 2.8839529752731323, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 3.99422550201416, + "rewards/no_repetition_reward_func": -0.23879505693912506, + "rewards/verse_reward_func": -0.0078125, + "step": 444 + }, + { + "completion_length": 515.328125, + "epoch": 3.56, + "grad_norm": 0.8515625, + "kl": 0.22414397448301315, + "learning_rate": 4.066509528411152e-05, + "loss": 0.009, + "reward": 4.198101043701172, + "reward_std": 3.4066922664642334, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 4.4438769817352295, + "rewards/no_repetition_reward_func": -0.23015085607767105, + "rewards/verse_reward_func": 0.0, + "step": 445 + }, + { + "completion_length": 511.53125, + "epoch": 3.568, + "grad_norm": 0.2216796875, + "kl": 0.18752896040678024, + "learning_rate": 4.0610626221150394e-05, + "loss": 0.0075, + "reward": 3.5320030450820923, + "reward_std": 3.2751731872558594, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 3.7407288551330566, + "rewards/no_repetition_reward_func": -0.19310110807418823, + "rewards/verse_reward_func": 0.0, + "step": 446 + }, + { + "completion_length": 512.59375, + "epoch": 3.576, + "grad_norm": 0.244140625, + "kl": 0.16677276045084, + "learning_rate": 4.055603542339016e-05, + "loss": 0.0067, + "reward": 3.8673243522644043, + "reward_std": 3.3228381872177124, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.078340768814087, + "rewards/no_repetition_reward_func": -0.2110166996717453, + "rewards/verse_reward_func": 0.0, + "step": 447 + }, + { + "completion_length": 514.9375, + "epoch": 3.584, + "grad_norm": 0.234375, + "kl": 0.19201727956533432, + "learning_rate": 4.050132331654082e-05, + "loss": 0.0077, + "reward": 3.5196954011917114, + "reward_std": 2.5613479018211365, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.7096508741378784, + "rewards/no_repetition_reward_func": -0.1899554580450058, + "rewards/verse_reward_func": 0.0, + "step": 448 + }, + { + "completion_length": 512.4375, + "epoch": 3.592, + "grad_norm": 0.1962890625, + "kl": 0.19043003022670746, + "learning_rate": 4.044649032725836e-05, + "loss": 0.0076, + "reward": 3.5038254261016846, + "reward_std": 2.7718945741653442, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.7137285470962524, + "rewards/no_repetition_reward_func": -0.2099030539393425, + "rewards/verse_reward_func": 0.0, + "step": 449 + }, + { + "completion_length": 508.40625, + "epoch": 3.6, + "grad_norm": 0.265625, + "kl": 0.15774796158075333, + "learning_rate": 4.039153688314145e-05, + "loss": 0.0063, + "reward": 3.825740098953247, + "reward_std": 3.0137126445770264, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 4.049816608428955, + "rewards/no_repetition_reward_func": -0.20845142006874084, + "rewards/verse_reward_func": 0.0, + "step": 450 + }, + { + "completion_length": 511.890625, + "epoch": 3.608, + "grad_norm": 0.2294921875, + "kl": 0.12934113293886185, + "learning_rate": 4.033646341272811e-05, + "loss": 0.0052, + "reward": 4.6898884773254395, + "reward_std": 3.304353713989258, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.95593523979187, + "rewards/no_repetition_reward_func": -0.2660471498966217, + "rewards/verse_reward_func": 0.0, + "step": 451 + }, + { + "completion_length": 516.0, + "epoch": 3.616, + "grad_norm": 0.2421875, + "kl": 0.1671893447637558, + "learning_rate": 4.028127034549229e-05, + "loss": 0.0067, + "reward": 3.6175343990325928, + "reward_std": 2.850470542907715, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 3.8519634008407593, + "rewards/no_repetition_reward_func": -0.21880415081977844, + "rewards/verse_reward_func": 0.0, + "step": 452 + }, + { + "completion_length": 514.28125, + "epoch": 3.624, + "grad_norm": 0.2255859375, + "kl": 0.20030619949102402, + "learning_rate": 4.022595811184064e-05, + "loss": 0.008, + "reward": 2.8536930084228516, + "reward_std": 2.914884090423584, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 3.101408362388611, + "rewards/no_repetition_reward_func": -0.18521516770124435, + "rewards/verse_reward_func": -0.046875, + "step": 453 + }, + { + "completion_length": 514.984375, + "epoch": 3.632, + "grad_norm": 0.1923828125, + "kl": 0.13944398239254951, + "learning_rate": 4.017052714310906e-05, + "loss": 0.0056, + "reward": 4.75087034702301, + "reward_std": 2.9399373531341553, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.009894967079163, + "rewards/no_repetition_reward_func": -0.25902480632066727, + "rewards/verse_reward_func": 0.0, + "step": 454 + }, + { + "completion_length": 514.8125, + "epoch": 3.64, + "grad_norm": 0.1865234375, + "kl": 0.15428784489631653, + "learning_rate": 4.011497787155938e-05, + "loss": 0.0062, + "reward": 4.469793081283569, + "reward_std": 3.545780301094055, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 4.7508862018585205, + "rewards/no_repetition_reward_func": -0.26546820998191833, + "rewards/verse_reward_func": 0.0, + "step": 455 + }, + { + "completion_length": 509.0, + "epoch": 3.648, + "grad_norm": 0.193359375, + "kl": 0.17394553124904633, + "learning_rate": 4.005931073037596e-05, + "loss": 0.007, + "reward": 3.4749141931533813, + "reward_std": 2.884767532348633, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 3.7560935020446777, + "rewards/no_repetition_reward_func": -0.22649192065000534, + "rewards/verse_reward_func": -0.0390625, + "step": 456 + }, + { + "completion_length": 511.3125, + "epoch": 3.656, + "grad_norm": 0.19140625, + "kl": 0.11640754342079163, + "learning_rate": 4.000352615366239e-05, + "loss": 0.0047, + "reward": 4.572721004486084, + "reward_std": 3.5922268629074097, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.890354871749878, + "rewards/no_repetition_reward_func": -0.30200865864753723, + "rewards/verse_reward_func": -0.015625, + "step": 457 + }, + { + "completion_length": 515.53125, + "epoch": 3.664, + "grad_norm": 0.1875, + "kl": 0.14622241258621216, + "learning_rate": 3.9947624576437975e-05, + "loss": 0.0058, + "reward": 4.165874719619751, + "reward_std": 2.908482074737549, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.432632207870483, + "rewards/no_repetition_reward_func": -0.2589448392391205, + "rewards/verse_reward_func": -0.0078125, + "step": 458 + }, + { + "completion_length": 509.53125, + "epoch": 3.672, + "grad_norm": 0.232421875, + "kl": 0.14606256037950516, + "learning_rate": 3.989160643463445e-05, + "loss": 0.0058, + "reward": 3.6520293951034546, + "reward_std": 3.0917210578918457, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.9079716205596924, + "rewards/no_repetition_reward_func": -0.24031748622655869, + "rewards/verse_reward_func": -0.015625, + "step": 459 + }, + { + "completion_length": 514.75, + "epoch": 3.68, + "grad_norm": 0.185546875, + "kl": 0.19900204241275787, + "learning_rate": 3.983547216509254e-05, + "loss": 0.008, + "reward": 3.074185609817505, + "reward_std": 3.560298204421997, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.2693833112716675, + "rewards/no_repetition_reward_func": -0.19519764930009842, + "rewards/verse_reward_func": 0.0, + "step": 460 + }, + { + "completion_length": 507.53125, + "epoch": 3.6879999999999997, + "grad_norm": 0.212890625, + "kl": 0.1963191255927086, + "learning_rate": 3.977922220555855e-05, + "loss": 0.0079, + "reward": 3.675302743911743, + "reward_std": 2.9777519702911377, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.890020728111267, + "rewards/no_repetition_reward_func": -0.21471787989139557, + "rewards/verse_reward_func": 0.0, + "step": 461 + }, + { + "completion_length": 515.59375, + "epoch": 3.6959999999999997, + "grad_norm": 0.181640625, + "kl": 0.16736619919538498, + "learning_rate": 3.9722856994680966e-05, + "loss": 0.0067, + "reward": 4.121479272842407, + "reward_std": 2.9138602018356323, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.374983549118042, + "rewards/no_repetition_reward_func": -0.25350430607795715, + "rewards/verse_reward_func": 0.0, + "step": 462 + }, + { + "completion_length": 496.859375, + "epoch": 3.7039999999999997, + "grad_norm": 0.42578125, + "kl": 0.18568754941225052, + "learning_rate": 3.966637697200703e-05, + "loss": 0.0074, + "reward": 4.498464465141296, + "reward_std": 3.1409988403320312, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.774617314338684, + "rewards/no_repetition_reward_func": -0.2683406174182892, + "rewards/verse_reward_func": -0.0078125, + "step": 463 + }, + { + "completion_length": 506.921875, + "epoch": 3.7119999999999997, + "grad_norm": 0.369140625, + "kl": 0.15412476658821106, + "learning_rate": 3.960978257797931e-05, + "loss": 0.0062, + "reward": 4.354745030403137, + "reward_std": 3.3551993370056152, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 4.62610650062561, + "rewards/no_repetition_reward_func": -0.2401115521788597, + "rewards/verse_reward_func": -0.015625, + "step": 464 + }, + { + "completion_length": 516.0, + "epoch": 3.7199999999999998, + "grad_norm": 0.2099609375, + "kl": 0.15957008302211761, + "learning_rate": 3.955307425393224e-05, + "loss": 0.0064, + "reward": 4.5120017528533936, + "reward_std": 3.1755751371383667, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.752218961715698, + "rewards/no_repetition_reward_func": -0.23240458965301514, + "rewards/verse_reward_func": -0.0078125, + "step": 465 + }, + { + "completion_length": 515.078125, + "epoch": 3.7279999999999998, + "grad_norm": 0.2021484375, + "kl": 0.17485202848911285, + "learning_rate": 3.9496252442088733e-05, + "loss": 0.007, + "reward": 4.464818716049194, + "reward_std": 3.570101261138916, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.717419624328613, + "rewards/no_repetition_reward_func": -0.23697632551193237, + "rewards/verse_reward_func": -0.015625, + "step": 466 + }, + { + "completion_length": 498.59375, + "epoch": 3.7359999999999998, + "grad_norm": 0.2138671875, + "kl": 0.17735601216554642, + "learning_rate": 3.943931758555669e-05, + "loss": 0.0071, + "reward": 4.290479898452759, + "reward_std": 2.32756644487381, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.4979881048202515, + "rewards/no_repetition_reward_func": -0.20750821381807327, + "rewards/verse_reward_func": 0.0, + "step": 467 + }, + { + "completion_length": 516.0, + "epoch": 3.7439999999999998, + "grad_norm": 0.19140625, + "kl": 0.21024346351623535, + "learning_rate": 3.938227012832557e-05, + "loss": 0.0084, + "reward": 4.146731972694397, + "reward_std": 2.5268921852111816, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.379987955093384, + "rewards/no_repetition_reward_func": -0.21763069927692413, + "rewards/verse_reward_func": -0.015625, + "step": 468 + }, + { + "completion_length": 511.0, + "epoch": 3.752, + "grad_norm": 0.2255859375, + "kl": 0.1623486950993538, + "learning_rate": 3.932511051526289e-05, + "loss": 0.0065, + "reward": 3.8976480960845947, + "reward_std": 3.126002788543701, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.144995212554932, + "rewards/no_repetition_reward_func": -0.23172180354595184, + "rewards/verse_reward_func": -0.015625, + "step": 469 + }, + { + "completion_length": 514.96875, + "epoch": 3.76, + "grad_norm": 0.201171875, + "kl": 0.13839838281273842, + "learning_rate": 3.92678391921108e-05, + "loss": 0.0055, + "reward": 5.005223751068115, + "reward_std": 3.0770896673202515, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.296253442764282, + "rewards/no_repetition_reward_func": -0.275405153632164, + "rewards/verse_reward_func": -0.015625, + "step": 470 + }, + { + "completion_length": 504.578125, + "epoch": 3.768, + "grad_norm": 0.302734375, + "kl": 0.18781789392232895, + "learning_rate": 3.9210456605482576e-05, + "loss": 0.0075, + "reward": 4.448628187179565, + "reward_std": 3.2973560094833374, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.689706444740295, + "rewards/no_repetition_reward_func": -0.23326591402292252, + "rewards/verse_reward_func": -0.0078125, + "step": 471 + }, + { + "completion_length": 510.125, + "epoch": 3.776, + "grad_norm": 0.2216796875, + "kl": 0.17455513030290604, + "learning_rate": 3.915296320285917e-05, + "loss": 0.007, + "reward": 4.679287791252136, + "reward_std": 2.8502267599105835, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 4.952520370483398, + "rewards/no_repetition_reward_func": -0.24979472905397415, + "rewards/verse_reward_func": -0.0078125, + "step": 472 + }, + { + "completion_length": 516.0, + "epoch": 3.784, + "grad_norm": 0.2177734375, + "kl": 0.16542016714811325, + "learning_rate": 3.909535943258567e-05, + "loss": 0.0066, + "reward": 4.813771963119507, + "reward_std": 3.186178207397461, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.0696399211883545, + "rewards/no_repetition_reward_func": -0.2558676451444626, + "rewards/verse_reward_func": 0.0, + "step": 473 + }, + { + "completion_length": 503.40625, + "epoch": 3.792, + "grad_norm": 0.2431640625, + "kl": 0.18362727761268616, + "learning_rate": 3.903764574386786e-05, + "loss": 0.0073, + "reward": 4.246111631393433, + "reward_std": 3.300834536552429, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.477446794509888, + "rewards/no_repetition_reward_func": -0.23133472353219986, + "rewards/verse_reward_func": 0.0, + "step": 474 + }, + { + "completion_length": 515.03125, + "epoch": 3.8, + "grad_norm": 0.1806640625, + "kl": 0.13111485540866852, + "learning_rate": 3.897982258676867e-05, + "loss": 0.0052, + "reward": 4.730367183685303, + "reward_std": 3.4605612754821777, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.012945890426636, + "rewards/no_repetition_reward_func": -0.2825789302587509, + "rewards/verse_reward_func": 0.0, + "step": 475 + }, + { + "completion_length": 514.390625, + "epoch": 3.808, + "grad_norm": 0.20703125, + "kl": 0.18289577960968018, + "learning_rate": 3.8921890412204705e-05, + "loss": 0.0073, + "reward": 3.9644694328308105, + "reward_std": 3.2665653228759766, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 4.2124305963516235, + "rewards/no_repetition_reward_func": -0.23233625292778015, + "rewards/verse_reward_func": 0.0, + "step": 476 + }, + { + "completion_length": 516.0, + "epoch": 3.816, + "grad_norm": 0.201171875, + "kl": 0.13228074461221695, + "learning_rate": 3.8863849671942685e-05, + "loss": 0.0053, + "reward": 5.0202600955963135, + "reward_std": 2.999442458152771, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.30688214302063, + "rewards/no_repetition_reward_func": -0.28662198036909103, + "rewards/verse_reward_func": 0.0, + "step": 477 + }, + { + "completion_length": 514.03125, + "epoch": 3.824, + "grad_norm": 0.18359375, + "kl": 0.12542841583490372, + "learning_rate": 3.880570081859597e-05, + "loss": 0.005, + "reward": 5.413423538208008, + "reward_std": 3.1352925300598145, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.71832013130188, + "rewards/no_repetition_reward_func": -0.3048967272043228, + "rewards/verse_reward_func": 0.0, + "step": 478 + }, + { + "completion_length": 516.0, + "epoch": 3.832, + "grad_norm": 0.26953125, + "kl": 0.208566352725029, + "learning_rate": 3.8747444305621e-05, + "loss": 0.0083, + "reward": 3.5483458042144775, + "reward_std": 2.507484197616577, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.8046435117721558, + "rewards/no_repetition_reward_func": -0.2094227820634842, + "rewards/verse_reward_func": -0.046875, + "step": 479 + }, + { + "completion_length": 511.453125, + "epoch": 3.84, + "grad_norm": 0.2109375, + "kl": 0.2369367927312851, + "learning_rate": 3.868908058731376e-05, + "loss": 0.0095, + "reward": 4.015443325042725, + "reward_std": 2.8727692365646362, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.220126390457153, + "rewards/no_repetition_reward_func": -0.19687048345804214, + "rewards/verse_reward_func": -0.0078125, + "step": 480 + }, + { + "completion_length": 500.84375, + "epoch": 3.848, + "grad_norm": 0.2373046875, + "kl": 0.16295814514160156, + "learning_rate": 3.8630610118806254e-05, + "loss": 0.0065, + "reward": 4.887747287750244, + "reward_std": 2.9053876399993896, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.157720565795898, + "rewards/no_repetition_reward_func": -0.26997287571430206, + "rewards/verse_reward_func": 0.0, + "step": 481 + }, + { + "completion_length": 516.0, + "epoch": 3.856, + "grad_norm": 0.388671875, + "kl": 0.18515048921108246, + "learning_rate": 3.8572033356062943e-05, + "loss": 0.0074, + "reward": 4.967130184173584, + "reward_std": 3.262230396270752, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.237434387207031, + "rewards/no_repetition_reward_func": -0.25467927753925323, + "rewards/verse_reward_func": -0.015625, + "step": 482 + }, + { + "completion_length": 515.671875, + "epoch": 3.864, + "grad_norm": 0.205078125, + "kl": 0.13084059953689575, + "learning_rate": 3.851335075587718e-05, + "loss": 0.0052, + "reward": 5.34868597984314, + "reward_std": 3.1101691722869873, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.638932466506958, + "rewards/no_repetition_reward_func": -0.29024672508239746, + "rewards/verse_reward_func": 0.0, + "step": 483 + }, + { + "completion_length": 514.5, + "epoch": 3.872, + "grad_norm": 0.224609375, + "kl": 0.16262559592723846, + "learning_rate": 3.8454562775867684e-05, + "loss": 0.0065, + "reward": 5.187208652496338, + "reward_std": 3.31445574760437, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.456263065338135, + "rewards/no_repetition_reward_func": -0.2612417936325073, + "rewards/verse_reward_func": -0.0078125, + "step": 484 + }, + { + "completion_length": 507.859375, + "epoch": 3.88, + "grad_norm": 0.2314453125, + "kl": 0.1840328574180603, + "learning_rate": 3.8395669874474915e-05, + "loss": 0.0074, + "reward": 4.267908573150635, + "reward_std": 3.2971633672714233, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.504893779754639, + "rewards/no_repetition_reward_func": -0.23698516935110092, + "rewards/verse_reward_func": 0.0, + "step": 485 + }, + { + "completion_length": 516.0, + "epoch": 3.888, + "grad_norm": 0.236328125, + "kl": 0.11927275732159615, + "learning_rate": 3.8336672510957574e-05, + "loss": 0.0048, + "reward": 5.649379253387451, + "reward_std": 3.0360267162323, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.9452972412109375, + "rewards/no_repetition_reward_func": -0.29591839015483856, + "rewards/verse_reward_func": 0.0, + "step": 486 + }, + { + "completion_length": 502.890625, + "epoch": 3.896, + "grad_norm": 0.2333984375, + "kl": 0.2021339312195778, + "learning_rate": 3.827757114538892e-05, + "loss": 0.0081, + "reward": 3.4562724828720093, + "reward_std": 3.039677858352661, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.738091766834259, + "rewards/no_repetition_reward_func": -0.22713156044483185, + "rewards/verse_reward_func": -0.0546875, + "step": 487 + }, + { + "completion_length": 515.5625, + "epoch": 3.904, + "grad_norm": 0.2275390625, + "kl": 0.20770788192749023, + "learning_rate": 3.821836623865329e-05, + "loss": 0.0083, + "reward": 3.6433846950531006, + "reward_std": 3.184501528739929, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 3.878601551055908, + "rewards/no_repetition_reward_func": -0.20396703481674194, + "rewards/verse_reward_func": -0.015625, + "step": 488 + }, + { + "completion_length": 515.359375, + "epoch": 3.912, + "grad_norm": 0.2119140625, + "kl": 0.1590171828866005, + "learning_rate": 3.8159058252442446e-05, + "loss": 0.0064, + "reward": 5.022158861160278, + "reward_std": 3.5084046125411987, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.320943355560303, + "rewards/no_repetition_reward_func": -0.29097166657447815, + "rewards/verse_reward_func": -0.0078125, + "step": 489 + }, + { + "completion_length": 508.1875, + "epoch": 3.92, + "grad_norm": 0.203125, + "kl": 0.19003015756607056, + "learning_rate": 3.8099647649251986e-05, + "loss": 0.0076, + "reward": 4.432275295257568, + "reward_std": 3.0002970695495605, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.672489404678345, + "rewards/no_repetition_reward_func": -0.21677665412425995, + "rewards/verse_reward_func": -0.0234375, + "step": 490 + }, + { + "completion_length": 508.265625, + "epoch": 3.928, + "grad_norm": 0.2177734375, + "kl": 0.27863533049821854, + "learning_rate": 3.80401348923777e-05, + "loss": 0.0111, + "reward": 3.613530158996582, + "reward_std": 2.9570446014404297, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.7854321002960205, + "rewards/no_repetition_reward_func": -0.17190209031105042, + "rewards/verse_reward_func": 0.0, + "step": 491 + }, + { + "completion_length": 511.5625, + "epoch": 3.936, + "grad_norm": 0.1923828125, + "kl": 0.20870302617549896, + "learning_rate": 3.798052044591204e-05, + "loss": 0.0083, + "reward": 4.053704738616943, + "reward_std": 3.1799628734588623, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 4.296497464179993, + "rewards/no_repetition_reward_func": -0.219355046749115, + "rewards/verse_reward_func": -0.0078125, + "step": 492 + }, + { + "completion_length": 516.0, + "epoch": 3.944, + "grad_norm": 0.1796875, + "kl": 0.1964292898774147, + "learning_rate": 3.792080477474043e-05, + "loss": 0.0079, + "reward": 4.665344715118408, + "reward_std": 2.855156421661377, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 4.92292046546936, + "rewards/no_repetition_reward_func": -0.2419506013393402, + "rewards/verse_reward_func": 0.0, + "step": 493 + }, + { + "completion_length": 510.75, + "epoch": 3.952, + "grad_norm": 0.255859375, + "kl": 0.25930921733379364, + "learning_rate": 3.786098834453766e-05, + "loss": 0.0104, + "reward": 3.5263750553131104, + "reward_std": 2.19377064704895, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.810723304748535, + "rewards/no_repetition_reward_func": -0.1984107419848442, + "rewards/verse_reward_func": -0.0859375, + "step": 494 + }, + { + "completion_length": 505.625, + "epoch": 3.96, + "grad_norm": 0.2138671875, + "kl": 0.2025780975818634, + "learning_rate": 3.780107162176429e-05, + "loss": 0.0081, + "reward": 4.347127437591553, + "reward_std": 3.0032341480255127, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 4.586549639701843, + "rewards/no_repetition_reward_func": -0.223797008395195, + "rewards/verse_reward_func": 0.0, + "step": 495 + }, + { + "completion_length": 516.0, + "epoch": 3.968, + "grad_norm": 0.2294921875, + "kl": 0.2154790610074997, + "learning_rate": 3.7741055073662946e-05, + "loss": 0.0086, + "reward": 3.565968632698059, + "reward_std": 2.553309202194214, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 3.7757381200790405, + "rewards/no_repetition_reward_func": -0.19414447247982025, + "rewards/verse_reward_func": 0.0, + "step": 496 + }, + { + "completion_length": 508.078125, + "epoch": 3.976, + "grad_norm": 0.232421875, + "kl": 0.17013194411993027, + "learning_rate": 3.7680939168254733e-05, + "loss": 0.0068, + "reward": 4.495337963104248, + "reward_std": 2.8076168298721313, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.7717894315719604, + "rewards/no_repetition_reward_func": -0.24520136415958405, + "rewards/verse_reward_func": -0.03125, + "step": 497 + }, + { + "completion_length": 515.484375, + "epoch": 3.984, + "grad_norm": 0.1806640625, + "kl": 0.16821721196174622, + "learning_rate": 3.762072437433555e-05, + "loss": 0.0067, + "reward": 5.354635953903198, + "reward_std": 3.256480097770691, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.631888389587402, + "rewards/no_repetition_reward_func": -0.2616274952888489, + "rewards/verse_reward_func": 0.0, + "step": 498 + }, + { + "completion_length": 507.03125, + "epoch": 3.992, + "grad_norm": 0.1953125, + "kl": 0.2341996505856514, + "learning_rate": 3.7560411161472456e-05, + "loss": 0.0094, + "reward": 3.812806010246277, + "reward_std": 3.523435115814209, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.03594183921814, + "rewards/no_repetition_reward_func": -0.21532317996025085, + "rewards/verse_reward_func": -0.0078125, + "step": 499 + }, + { + "completion_length": 516.0, + "epoch": 4.0, + "grad_norm": 0.2197265625, + "kl": 0.15551456809043884, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.0062, + "reward": 4.94283652305603, + "reward_std": 3.035518765449524, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 5.282710552215576, + "rewards/no_repetition_reward_func": -0.27737413346767426, + "rewards/verse_reward_func": -0.015625, + "step": 500 + }, + { + "completion_length": 505.546875, + "epoch": 4.008, + "grad_norm": 0.2451171875, + "kl": 0.21039459109306335, + "learning_rate": 3.7439491361016564e-05, + "loss": 0.0084, + "reward": 4.835317611694336, + "reward_std": 2.8177051544189453, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.0725202560424805, + "rewards/no_repetition_reward_func": -0.23720252513885498, + "rewards/verse_reward_func": 0.0, + "step": 501 + }, + { + "completion_length": 516.0, + "epoch": 4.016, + "grad_norm": 0.1943359375, + "kl": 0.18561717867851257, + "learning_rate": 3.7378885716380664e-05, + "loss": 0.0074, + "reward": 4.469706296920776, + "reward_std": 3.2858978509902954, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.723701477050781, + "rewards/no_repetition_reward_func": -0.2461826577782631, + "rewards/verse_reward_func": -0.0078125, + "step": 502 + }, + { + "completion_length": 515.4375, + "epoch": 4.024, + "grad_norm": 0.201171875, + "kl": 0.15121734887361526, + "learning_rate": 3.731818353870729e-05, + "loss": 0.006, + "reward": 5.248231410980225, + "reward_std": 3.5150378942489624, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.5350022315979, + "rewards/no_repetition_reward_func": -0.2789582908153534, + "rewards/verse_reward_func": -0.0078125, + "step": 503 + }, + { + "completion_length": 514.53125, + "epoch": 4.032, + "grad_norm": 0.1953125, + "kl": 0.19492697715759277, + "learning_rate": 3.725738530136422e-05, + "loss": 0.0078, + "reward": 4.083330512046814, + "reward_std": 3.164367437362671, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.335059881210327, + "rewards/no_repetition_reward_func": -0.23610448092222214, + "rewards/verse_reward_func": -0.015625, + "step": 504 + }, + { + "completion_length": 506.671875, + "epoch": 4.04, + "grad_norm": 0.1826171875, + "kl": 0.21670028567314148, + "learning_rate": 3.719649147846832e-05, + "loss": 0.0087, + "reward": 4.316012620925903, + "reward_std": 3.3290915489196777, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.560983896255493, + "rewards/no_repetition_reward_func": -0.23715892434120178, + "rewards/verse_reward_func": -0.0078125, + "step": 505 + }, + { + "completion_length": 510.71875, + "epoch": 4.048, + "grad_norm": 0.20703125, + "kl": 0.14617466181516647, + "learning_rate": 3.713550254488185e-05, + "loss": 0.0058, + "reward": 4.9805824756622314, + "reward_std": 3.2167608737945557, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.266499042510986, + "rewards/no_repetition_reward_func": -0.28591662645339966, + "rewards/verse_reward_func": 0.0, + "step": 506 + }, + { + "completion_length": 511.5, + "epoch": 4.056, + "grad_norm": 0.2001953125, + "kl": 0.1717342883348465, + "learning_rate": 3.7074418976208766e-05, + "loss": 0.0069, + "reward": 5.033324956893921, + "reward_std": 3.511086344718933, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.303963899612427, + "rewards/no_repetition_reward_func": -0.2706386595964432, + "rewards/verse_reward_func": 0.0, + "step": 507 + }, + { + "completion_length": 499.84375, + "epoch": 4.064, + "grad_norm": 0.2421875, + "kl": 0.2014167606830597, + "learning_rate": 3.701324124879102e-05, + "loss": 0.0081, + "reward": 4.572264075279236, + "reward_std": 2.9593745470046997, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 4.82683801651001, + "rewards/no_repetition_reward_func": -0.23113616555929184, + "rewards/verse_reward_func": -0.0078125, + "step": 508 + }, + { + "completion_length": 507.34375, + "epoch": 4.072, + "grad_norm": 0.1982421875, + "kl": 0.1843513399362564, + "learning_rate": 3.695196983970481e-05, + "loss": 0.0074, + "reward": 5.146818161010742, + "reward_std": 3.4034054279327393, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.435227870941162, + "rewards/no_repetition_reward_func": -0.2649723067879677, + "rewards/verse_reward_func": -0.0078125, + "step": 509 + }, + { + "completion_length": 509.125, + "epoch": 4.08, + "grad_norm": 0.2470703125, + "kl": 0.16989582777023315, + "learning_rate": 3.689060522675689e-05, + "loss": 0.0068, + "reward": 5.585479736328125, + "reward_std": 3.165469527244568, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.866635799407959, + "rewards/no_repetition_reward_func": -0.2811562716960907, + "rewards/verse_reward_func": 0.0, + "step": 510 + }, + { + "completion_length": 510.5, + "epoch": 4.088, + "grad_norm": 0.19921875, + "kl": 0.14236313104629517, + "learning_rate": 3.682914788848083e-05, + "loss": 0.0057, + "reward": 6.010302782058716, + "reward_std": 2.735579013824463, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.313896179199219, + "rewards/no_repetition_reward_func": -0.2957811653614044, + "rewards/verse_reward_func": -0.0078125, + "step": 511 + }, + { + "completion_length": 507.828125, + "epoch": 4.096, + "grad_norm": 0.1767578125, + "kl": 0.16453421115875244, + "learning_rate": 3.6767598304133324e-05, + "loss": 0.0066, + "reward": 4.859346389770508, + "reward_std": 3.555054783821106, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.167161703109741, + "rewards/no_repetition_reward_func": -0.28437766432762146, + "rewards/verse_reward_func": -0.0234375, + "step": 512 + }, + { + "completion_length": 515.65625, + "epoch": 4.104, + "grad_norm": 0.212890625, + "kl": 0.14401807636022568, + "learning_rate": 3.6705956953690364e-05, + "loss": 0.0058, + "reward": 5.983894348144531, + "reward_std": 3.056508779525757, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.277858018875122, + "rewards/no_repetition_reward_func": -0.2939641624689102, + "rewards/verse_reward_func": 0.0, + "step": 513 + }, + { + "completion_length": 516.0, + "epoch": 4.112, + "grad_norm": 0.2177734375, + "kl": 0.11588628217577934, + "learning_rate": 3.664422431784361e-05, + "loss": 0.0046, + "reward": 6.134968042373657, + "reward_std": 3.028866171836853, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.465428829193115, + "rewards/no_repetition_reward_func": -0.33046093583106995, + "rewards/verse_reward_func": 0.0, + "step": 514 + }, + { + "completion_length": 510.828125, + "epoch": 4.12, + "grad_norm": 0.208984375, + "kl": 0.2616724669933319, + "learning_rate": 3.6582400877996546e-05, + "loss": 0.0105, + "reward": 3.938784599304199, + "reward_std": 3.470210075378418, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.156214475631714, + "rewards/no_repetition_reward_func": -0.20961735397577286, + "rewards/verse_reward_func": -0.0078125, + "step": 515 + }, + { + "completion_length": 509.125, + "epoch": 4.128, + "grad_norm": 0.193359375, + "kl": 0.2230229377746582, + "learning_rate": 3.6520487116260776e-05, + "loss": 0.0089, + "reward": 4.534682989120483, + "reward_std": 3.270304322242737, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.8083086013793945, + "rewards/no_repetition_reward_func": -0.2501879334449768, + "rewards/verse_reward_func": -0.0234375, + "step": 516 + }, + { + "completion_length": 513.5, + "epoch": 4.136, + "grad_norm": 0.1962890625, + "kl": 0.18645650148391724, + "learning_rate": 3.645848351545225e-05, + "loss": 0.0075, + "reward": 5.726111888885498, + "reward_std": 3.0909130573272705, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.989810466766357, + "rewards/no_repetition_reward_func": -0.2636983022093773, + "rewards/verse_reward_func": 0.0, + "step": 517 + }, + { + "completion_length": 516.0, + "epoch": 4.144, + "grad_norm": 0.1953125, + "kl": 0.2631427049636841, + "learning_rate": 3.639639055908751e-05, + "loss": 0.0105, + "reward": 4.260030746459961, + "reward_std": 2.777969479560852, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.484937429428101, + "rewards/no_repetition_reward_func": -0.2092815786600113, + "rewards/verse_reward_func": -0.015625, + "step": 518 + }, + { + "completion_length": 512.6875, + "epoch": 4.152, + "grad_norm": 0.1982421875, + "kl": 0.17641134560108185, + "learning_rate": 3.633420873137988e-05, + "loss": 0.0071, + "reward": 4.4241766929626465, + "reward_std": 3.3319244384765625, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.715724229812622, + "rewards/no_repetition_reward_func": -0.26029759645462036, + "rewards/verse_reward_func": -0.03125, + "step": 519 + }, + { + "completion_length": 499.53125, + "epoch": 4.16, + "grad_norm": 0.37109375, + "kl": 0.23208275437355042, + "learning_rate": 3.627193851723577e-05, + "loss": 0.0093, + "reward": 4.010048985481262, + "reward_std": 3.288179039955139, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 4.253429412841797, + "rewards/no_repetition_reward_func": -0.2199428677558899, + "rewards/verse_reward_func": -0.0078125, + "step": 520 + }, + { + "completion_length": 512.421875, + "epoch": 4.168, + "grad_norm": 0.1865234375, + "kl": 0.2655394896864891, + "learning_rate": 3.6209580402250815e-05, + "loss": 0.0106, + "reward": 3.3427741527557373, + "reward_std": 2.7971984148025513, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.566936492919922, + "rewards/no_repetition_reward_func": -0.1929125040769577, + "rewards/verse_reward_func": -0.03125, + "step": 521 + }, + { + "completion_length": 504.234375, + "epoch": 4.176, + "grad_norm": 0.2890625, + "kl": 0.17957287281751633, + "learning_rate": 3.614713487270611e-05, + "loss": 0.0072, + "reward": 5.901611804962158, + "reward_std": 3.1597976684570312, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.1876397132873535, + "rewards/no_repetition_reward_func": -0.2704026699066162, + "rewards/verse_reward_func": 0.0, + "step": 522 + }, + { + "completion_length": 514.8125, + "epoch": 4.184, + "grad_norm": 0.1982421875, + "kl": 0.22282682359218597, + "learning_rate": 3.608460241556443e-05, + "loss": 0.0089, + "reward": 4.842794895172119, + "reward_std": 2.627842903137207, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.077395677566528, + "rewards/no_repetition_reward_func": -0.23460112512111664, + "rewards/verse_reward_func": 0.0, + "step": 523 + }, + { + "completion_length": 513.703125, + "epoch": 4.192, + "grad_norm": 0.1689453125, + "kl": 0.2108691781759262, + "learning_rate": 3.602198351846647e-05, + "loss": 0.0084, + "reward": 4.531515121459961, + "reward_std": 3.402707815170288, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.7899510860443115, + "rewards/no_repetition_reward_func": -0.2506234794855118, + "rewards/verse_reward_func": -0.0078125, + "step": 524 + }, + { + "completion_length": 514.046875, + "epoch": 4.2, + "grad_norm": 0.1962890625, + "kl": 0.20827755331993103, + "learning_rate": 3.5959278669726935e-05, + "loss": 0.0083, + "reward": 4.453569173812866, + "reward_std": 3.799462676048279, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.702730894088745, + "rewards/no_repetition_reward_func": -0.24134930968284607, + "rewards/verse_reward_func": -0.0078125, + "step": 525 + }, + { + "completion_length": 512.109375, + "epoch": 4.208, + "grad_norm": 0.2109375, + "kl": 0.19707359373569489, + "learning_rate": 3.5896488358330856e-05, + "loss": 0.0079, + "reward": 5.3081231117248535, + "reward_std": 3.4814952611923218, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.580712080001831, + "rewards/no_repetition_reward_func": -0.26477664709091187, + "rewards/verse_reward_func": -0.0078125, + "step": 526 + }, + { + "completion_length": 510.9375, + "epoch": 4.216, + "grad_norm": 0.181640625, + "kl": 0.18273241072893143, + "learning_rate": 3.5833613073929684e-05, + "loss": 0.0073, + "reward": 5.048484563827515, + "reward_std": 3.065635323524475, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.315688610076904, + "rewards/no_repetition_reward_func": -0.26720406115055084, + "rewards/verse_reward_func": 0.0, + "step": 527 + }, + { + "completion_length": 516.0, + "epoch": 4.224, + "grad_norm": 0.1748046875, + "kl": 0.2418869435787201, + "learning_rate": 3.577065330683751e-05, + "loss": 0.0097, + "reward": 4.022174119949341, + "reward_std": 2.8857948780059814, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.265843391418457, + "rewards/no_repetition_reward_func": -0.23585684597492218, + "rewards/verse_reward_func": -0.0078125, + "step": 528 + }, + { + "completion_length": 516.0, + "epoch": 4.232, + "grad_norm": 0.2490234375, + "kl": 0.1962745413184166, + "learning_rate": 3.570760954802726e-05, + "loss": 0.0079, + "reward": 5.953819274902344, + "reward_std": 3.254130482673645, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.2503697872161865, + "rewards/no_repetition_reward_func": -0.27311310172080994, + "rewards/verse_reward_func": -0.0078125, + "step": 529 + }, + { + "completion_length": 510.359375, + "epoch": 4.24, + "grad_norm": 0.189453125, + "kl": 0.21667743474245071, + "learning_rate": 3.564448228912682e-05, + "loss": 0.0087, + "reward": 5.158280372619629, + "reward_std": 3.142836570739746, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.432362079620361, + "rewards/no_repetition_reward_func": -0.25845685601234436, + "rewards/verse_reward_func": -0.015625, + "step": 530 + }, + { + "completion_length": 510.921875, + "epoch": 4.248, + "grad_norm": 0.2041015625, + "kl": 0.21212803572416306, + "learning_rate": 3.5581272022415244e-05, + "loss": 0.0085, + "reward": 5.330845355987549, + "reward_std": 2.4041647911071777, + "rewards/check_divine_comedy_plagiarism": -0.046875, + "rewards/endecasillabo_reward_func": 5.690070152282715, + "rewards/no_repetition_reward_func": -0.2811000198125839, + "rewards/verse_reward_func": -0.03125, + "step": 531 + }, + { + "completion_length": 514.765625, + "epoch": 4.256, + "grad_norm": 0.2080078125, + "kl": 0.16418281942605972, + "learning_rate": 3.551797924081887e-05, + "loss": 0.0066, + "reward": 6.537445545196533, + "reward_std": 2.4950056076049805, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.8314502239227295, + "rewards/no_repetition_reward_func": -0.2940051257610321, + "rewards/verse_reward_func": 0.0, + "step": 532 + }, + { + "completion_length": 496.796875, + "epoch": 4.264, + "grad_norm": 0.345703125, + "kl": 0.24862173944711685, + "learning_rate": 3.545460443790753e-05, + "loss": 0.0099, + "reward": 4.604752063751221, + "reward_std": 2.8228639364242554, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 4.8736419677734375, + "rewards/no_repetition_reward_func": -0.22982747852802277, + "rewards/verse_reward_func": -0.0078125, + "step": 533 + }, + { + "completion_length": 508.375, + "epoch": 4.272, + "grad_norm": 0.2080078125, + "kl": 0.28755058348178864, + "learning_rate": 3.53911481078907e-05, + "loss": 0.0115, + "reward": 4.182548999786377, + "reward_std": 2.7739893198013306, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.391356587409973, + "rewards/no_repetition_reward_func": -0.20099499821662903, + "rewards/verse_reward_func": -0.0078125, + "step": 534 + }, + { + "completion_length": 507.375, + "epoch": 4.28, + "grad_norm": 0.62109375, + "kl": 0.19298004359006882, + "learning_rate": 3.532761074561355e-05, + "loss": 0.0077, + "reward": 5.843852519989014, + "reward_std": 2.7446974515914917, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.132093667984009, + "rewards/no_repetition_reward_func": -0.2648035138845444, + "rewards/verse_reward_func": -0.0078125, + "step": 535 + }, + { + "completion_length": 509.9375, + "epoch": 4.288, + "grad_norm": 0.2314453125, + "kl": 0.27341826260089874, + "learning_rate": 3.52639928465532e-05, + "loss": 0.0109, + "reward": 4.375577688217163, + "reward_std": 3.0629862546920776, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.590151309967041, + "rewards/no_repetition_reward_func": -0.1911366954445839, + "rewards/verse_reward_func": -0.0234375, + "step": 536 + }, + { + "completion_length": 516.0, + "epoch": 4.296, + "grad_norm": 0.2099609375, + "kl": 0.2686085104942322, + "learning_rate": 3.5200294906814824e-05, + "loss": 0.0107, + "reward": 3.824458360671997, + "reward_std": 3.0582739114761353, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.033471345901489, + "rewards/no_repetition_reward_func": -0.20120076090097427, + "rewards/verse_reward_func": -0.0078125, + "step": 537 + }, + { + "completion_length": 515.53125, + "epoch": 4.304, + "grad_norm": 0.1748046875, + "kl": 0.20293083041906357, + "learning_rate": 3.513651742312774e-05, + "loss": 0.0081, + "reward": 5.361874103546143, + "reward_std": 2.8834108114242554, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.623844861984253, + "rewards/no_repetition_reward_func": -0.2619709074497223, + "rewards/verse_reward_func": 0.0, + "step": 538 + }, + { + "completion_length": 510.1875, + "epoch": 4.312, + "grad_norm": 0.205078125, + "kl": 0.14090153574943542, + "learning_rate": 3.507266089284157e-05, + "loss": 0.0056, + "reward": 6.219050168991089, + "reward_std": 2.8846421241760254, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 6.564122676849365, + "rewards/no_repetition_reward_func": -0.31382226943969727, + "rewards/verse_reward_func": 0.0, + "step": 539 + }, + { + "completion_length": 514.65625, + "epoch": 4.32, + "grad_norm": 0.1953125, + "kl": 0.19098809361457825, + "learning_rate": 3.5008725813922386e-05, + "loss": 0.0076, + "reward": 4.925287246704102, + "reward_std": 3.076250195503235, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.193477392196655, + "rewards/no_repetition_reward_func": -0.2525651305913925, + "rewards/verse_reward_func": 0.0, + "step": 540 + }, + { + "completion_length": 496.359375, + "epoch": 4.328, + "grad_norm": 0.25390625, + "kl": 0.24945861101150513, + "learning_rate": 3.494471268494875e-05, + "loss": 0.01, + "reward": 3.7489601373672485, + "reward_std": 3.169994354248047, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.976336717605591, + "rewards/no_repetition_reward_func": -0.20393887162208557, + "rewards/verse_reward_func": -0.0234375, + "step": 541 + }, + { + "completion_length": 515.875, + "epoch": 4.336, + "grad_norm": 0.1923828125, + "kl": 0.1884372979402542, + "learning_rate": 3.488062200510791e-05, + "loss": 0.0075, + "reward": 5.233696222305298, + "reward_std": 3.2234585285186768, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.528154134750366, + "rewards/no_repetition_reward_func": -0.27102065086364746, + "rewards/verse_reward_func": -0.0078125, + "step": 542 + }, + { + "completion_length": 503.65625, + "epoch": 4.344, + "grad_norm": 0.18359375, + "kl": 0.2615293115377426, + "learning_rate": 3.481645427419188e-05, + "loss": 0.0105, + "reward": 3.8400638103485107, + "reward_std": 3.426116943359375, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.087488770484924, + "rewards/no_repetition_reward_func": -0.22398744523525238, + "rewards/verse_reward_func": -0.0234375, + "step": 543 + }, + { + "completion_length": 512.9375, + "epoch": 4.352, + "grad_norm": 0.1748046875, + "kl": 0.26356877386569977, + "learning_rate": 3.475220999259349e-05, + "loss": 0.0105, + "reward": 3.9069961309432983, + "reward_std": 2.753282070159912, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.165846824645996, + "rewards/no_repetition_reward_func": -0.21978838741779327, + "rewards/verse_reward_func": -0.0390625, + "step": 544 + }, + { + "completion_length": 516.0, + "epoch": 4.36, + "grad_norm": 0.2060546875, + "kl": 0.2234622687101364, + "learning_rate": 3.4687889661302576e-05, + "loss": 0.0089, + "reward": 4.491146564483643, + "reward_std": 3.5497554540634155, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.738298654556274, + "rewards/no_repetition_reward_func": -0.24715209752321243, + "rewards/verse_reward_func": 0.0, + "step": 545 + }, + { + "completion_length": 503.703125, + "epoch": 4.368, + "grad_norm": 0.2099609375, + "kl": 0.2157966047525406, + "learning_rate": 3.462349378190199e-05, + "loss": 0.0086, + "reward": 5.26504111289978, + "reward_std": 3.0456260442733765, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.5541462898254395, + "rewards/no_repetition_reward_func": -0.2656678259372711, + "rewards/verse_reward_func": -0.0234375, + "step": 546 + }, + { + "completion_length": 510.03125, + "epoch": 4.376, + "grad_norm": 1.9296875, + "kl": 0.20755886286497116, + "learning_rate": 3.455902285656373e-05, + "loss": 0.0083, + "reward": 6.1137073040008545, + "reward_std": 2.584822654724121, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.421464443206787, + "rewards/no_repetition_reward_func": -0.29213210940361023, + "rewards/verse_reward_func": -0.015625, + "step": 547 + }, + { + "completion_length": 495.265625, + "epoch": 4.384, + "grad_norm": 79.0, + "kl": 1.9659850969910622, + "learning_rate": 3.4494477388045035e-05, + "loss": 0.0786, + "reward": 5.596468210220337, + "reward_std": 2.803427815437317, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.861933469772339, + "rewards/no_repetition_reward_func": -0.26546528935432434, + "rewards/verse_reward_func": 0.0, + "step": 548 + }, + { + "completion_length": 513.46875, + "epoch": 4.392, + "grad_norm": 0.1767578125, + "kl": 0.19592411071062088, + "learning_rate": 3.442985787968442e-05, + "loss": 0.0078, + "reward": 5.476759433746338, + "reward_std": 2.7368521690368652, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.761664628982544, + "rewards/no_repetition_reward_func": -0.28490543365478516, + "rewards/verse_reward_func": 0.0, + "step": 549 + }, + { + "completion_length": 514.578125, + "epoch": 4.4, + "grad_norm": 0.2138671875, + "kl": 0.20074068009853363, + "learning_rate": 3.436516483539781e-05, + "loss": 0.008, + "reward": 4.975238800048828, + "reward_std": 3.109027862548828, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.231516361236572, + "rewards/no_repetition_reward_func": -0.25627756118774414, + "rewards/verse_reward_func": 0.0, + "step": 550 + }, + { + "completion_length": 510.1875, + "epoch": 4.408, + "grad_norm": 0.18359375, + "kl": 0.24647016823291779, + "learning_rate": 3.430039875967454e-05, + "loss": 0.0099, + "reward": 4.5355528593063354, + "reward_std": 2.7886996269226074, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.810683727264404, + "rewards/no_repetition_reward_func": -0.2282554656267166, + "rewards/verse_reward_func": -0.046875, + "step": 551 + }, + { + "completion_length": 516.0, + "epoch": 4.416, + "grad_norm": 0.205078125, + "kl": 0.18293453752994537, + "learning_rate": 3.423556015757349e-05, + "loss": 0.0073, + "reward": 5.367977619171143, + "reward_std": 2.9948999881744385, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.640580415725708, + "rewards/no_repetition_reward_func": -0.27260229736566544, + "rewards/verse_reward_func": 0.0, + "step": 552 + }, + { + "completion_length": 511.421875, + "epoch": 4.424, + "grad_norm": 0.1669921875, + "kl": 0.23238176107406616, + "learning_rate": 3.417064953471911e-05, + "loss": 0.0093, + "reward": 5.254671812057495, + "reward_std": 2.9631690979003906, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 5.559026479721069, + "rewards/no_repetition_reward_func": -0.25747985392808914, + "rewards/verse_reward_func": -0.015625, + "step": 553 + }, + { + "completion_length": 516.0, + "epoch": 4.432, + "grad_norm": 0.193359375, + "kl": 0.176967553794384, + "learning_rate": 3.410566739729746e-05, + "loss": 0.0071, + "reward": 5.7460198402404785, + "reward_std": 3.7330437898635864, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.045864105224609, + "rewards/no_repetition_reward_func": -0.2920314222574234, + "rewards/verse_reward_func": -0.0078125, + "step": 554 + }, + { + "completion_length": 516.0, + "epoch": 4.44, + "grad_norm": 0.2099609375, + "kl": 0.27003388851881027, + "learning_rate": 3.4040614252052305e-05, + "loss": 0.0108, + "reward": 4.215547800064087, + "reward_std": 2.5815398693084717, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.425406217575073, + "rewards/no_repetition_reward_func": -0.20204610377550125, + "rewards/verse_reward_func": -0.0078125, + "step": 555 + }, + { + "completion_length": 515.4375, + "epoch": 4.448, + "grad_norm": 0.2158203125, + "kl": 0.23902509361505508, + "learning_rate": 3.397549060628116e-05, + "loss": 0.0096, + "reward": 4.388993501663208, + "reward_std": 2.60500431060791, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 4.649957895278931, + "rewards/no_repetition_reward_func": -0.23752688616514206, + "rewards/verse_reward_func": -0.0078125, + "step": 556 + }, + { + "completion_length": 515.28125, + "epoch": 4.456, + "grad_norm": 0.2001953125, + "kl": 0.22313234955072403, + "learning_rate": 3.3910296967831266e-05, + "loss": 0.0089, + "reward": 4.3028950691223145, + "reward_std": 3.826229453086853, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.565396547317505, + "rewards/no_repetition_reward_func": -0.2546886131167412, + "rewards/verse_reward_func": -0.0078125, + "step": 557 + }, + { + "completion_length": 516.0, + "epoch": 4.464, + "grad_norm": 0.19140625, + "kl": 0.18375176936388016, + "learning_rate": 3.384503384509574e-05, + "loss": 0.0074, + "reward": 5.182091951370239, + "reward_std": 3.46955668926239, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.451651334762573, + "rewards/no_repetition_reward_func": -0.2695596069097519, + "rewards/verse_reward_func": 0.0, + "step": 558 + }, + { + "completion_length": 515.640625, + "epoch": 4.4719999999999995, + "grad_norm": 0.1943359375, + "kl": 0.17546287178993225, + "learning_rate": 3.3779701747009504e-05, + "loss": 0.007, + "reward": 5.963536024093628, + "reward_std": 3.4496405124664307, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.2451300621032715, + "rewards/no_repetition_reward_func": -0.28159406781196594, + "rewards/verse_reward_func": 0.0, + "step": 559 + }, + { + "completion_length": 516.0, + "epoch": 4.48, + "grad_norm": 0.1806640625, + "kl": 0.21535509079694748, + "learning_rate": 3.3714301183045385e-05, + "loss": 0.0086, + "reward": 4.514761447906494, + "reward_std": 3.0301601886749268, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 4.823708415031433, + "rewards/no_repetition_reward_func": -0.2542591765522957, + "rewards/verse_reward_func": -0.0390625, + "step": 560 + }, + { + "completion_length": 513.265625, + "epoch": 4.4879999999999995, + "grad_norm": 0.1923828125, + "kl": 0.27059417963027954, + "learning_rate": 3.3648832663210124e-05, + "loss": 0.0108, + "reward": 4.384165525436401, + "reward_std": 2.977095603942871, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.645999431610107, + "rewards/no_repetition_reward_func": -0.22277120500802994, + "rewards/verse_reward_func": -0.0390625, + "step": 561 + }, + { + "completion_length": 515.296875, + "epoch": 4.496, + "grad_norm": 0.18359375, + "kl": 0.22050277143716812, + "learning_rate": 3.3583296698040384e-05, + "loss": 0.0088, + "reward": 5.314608097076416, + "reward_std": 3.060106873512268, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.584301948547363, + "rewards/no_repetition_reward_func": -0.2696940675377846, + "rewards/verse_reward_func": 0.0, + "step": 562 + }, + { + "completion_length": 513.34375, + "epoch": 4.504, + "grad_norm": 0.173828125, + "kl": 0.2677934244275093, + "learning_rate": 3.35176937985988e-05, + "loss": 0.0107, + "reward": 3.556623101234436, + "reward_std": 3.1804572343826294, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.777412533760071, + "rewards/no_repetition_reward_func": -0.2051641196012497, + "rewards/verse_reward_func": -0.015625, + "step": 563 + }, + { + "completion_length": 510.921875, + "epoch": 4.5120000000000005, + "grad_norm": 0.193359375, + "kl": 0.23112258315086365, + "learning_rate": 3.3452024476469934e-05, + "loss": 0.0092, + "reward": 4.365535736083984, + "reward_std": 2.8397226333618164, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.625347018241882, + "rewards/no_repetition_reward_func": -0.23637376725673676, + "rewards/verse_reward_func": -0.0234375, + "step": 564 + }, + { + "completion_length": 502.484375, + "epoch": 4.52, + "grad_norm": 0.22265625, + "kl": 0.32302069664001465, + "learning_rate": 3.338628924375638e-05, + "loss": 0.0129, + "reward": 4.069131851196289, + "reward_std": 2.841530680656433, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.2911272048950195, + "rewards/no_repetition_reward_func": -0.19074541330337524, + "rewards/verse_reward_func": -0.03125, + "step": 565 + }, + { + "completion_length": 500.421875, + "epoch": 4.5280000000000005, + "grad_norm": 0.83984375, + "kl": 0.2808300852775574, + "learning_rate": 3.332048861307467e-05, + "loss": 0.0112, + "reward": 4.000061273574829, + "reward_std": 2.887525796890259, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.241569399833679, + "rewards/no_repetition_reward_func": -0.21807054430246353, + "rewards/verse_reward_func": -0.0234375, + "step": 566 + }, + { + "completion_length": 506.4375, + "epoch": 4.536, + "grad_norm": 0.19921875, + "kl": 0.20886264741420746, + "learning_rate": 3.325462309755134e-05, + "loss": 0.0084, + "reward": 4.600939989089966, + "reward_std": 3.4389188289642334, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 4.871828079223633, + "rewards/no_repetition_reward_func": -0.24745045602321625, + "rewards/verse_reward_func": -0.0078125, + "step": 567 + }, + { + "completion_length": 515.890625, + "epoch": 4.5440000000000005, + "grad_norm": 0.181640625, + "kl": 0.21459467709064484, + "learning_rate": 3.318869321081892e-05, + "loss": 0.0086, + "reward": 5.484324216842651, + "reward_std": 2.904571771621704, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.766284704208374, + "rewards/no_repetition_reward_func": -0.2663355693221092, + "rewards/verse_reward_func": -0.015625, + "step": 568 + }, + { + "completion_length": 510.28125, + "epoch": 4.552, + "grad_norm": 0.2041015625, + "kl": 0.18731652200222015, + "learning_rate": 3.312269946701191e-05, + "loss": 0.0075, + "reward": 5.789571523666382, + "reward_std": 3.2184120416641235, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.109745502471924, + "rewards/no_repetition_reward_func": -0.2967362254858017, + "rewards/verse_reward_func": -0.0234375, + "step": 569 + }, + { + "completion_length": 509.65625, + "epoch": 4.5600000000000005, + "grad_norm": 0.203125, + "kl": 0.1569320149719715, + "learning_rate": 3.305664238076278e-05, + "loss": 0.0063, + "reward": 6.146397113800049, + "reward_std": 2.8155157566070557, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.452664613723755, + "rewards/no_repetition_reward_func": -0.2984551787376404, + "rewards/verse_reward_func": -0.0078125, + "step": 570 + }, + { + "completion_length": 516.0, + "epoch": 4.568, + "grad_norm": 0.1826171875, + "kl": 0.17912110686302185, + "learning_rate": 3.299052246719795e-05, + "loss": 0.0072, + "reward": 5.80401086807251, + "reward_std": 3.1359071731567383, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 6.1298439502716064, + "rewards/no_repetition_reward_func": -0.29458291828632355, + "rewards/verse_reward_func": 0.0, + "step": 571 + }, + { + "completion_length": 502.203125, + "epoch": 4.576, + "grad_norm": 0.1982421875, + "kl": 0.28157514333724976, + "learning_rate": 3.29243402419338e-05, + "loss": 0.0113, + "reward": 4.405358910560608, + "reward_std": 1.9437495470046997, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.644304990768433, + "rewards/no_repetition_reward_func": -0.22332128882408142, + "rewards/verse_reward_func": -0.015625, + "step": 572 + }, + { + "completion_length": 508.5, + "epoch": 4.584, + "grad_norm": 0.275390625, + "kl": 0.24010606110095978, + "learning_rate": 3.28580962210726e-05, + "loss": 0.0096, + "reward": 4.553777456283569, + "reward_std": 2.6702784299850464, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.807963848114014, + "rewards/no_repetition_reward_func": -0.2385615035891533, + "rewards/verse_reward_func": -0.015625, + "step": 573 + }, + { + "completion_length": 516.0, + "epoch": 4.592, + "grad_norm": 0.2109375, + "kl": 0.21203209459781647, + "learning_rate": 3.279179092119855e-05, + "loss": 0.0085, + "reward": 5.331226587295532, + "reward_std": 3.5693247318267822, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.606134653091431, + "rewards/no_repetition_reward_func": -0.26709507405757904, + "rewards/verse_reward_func": -0.0078125, + "step": 574 + }, + { + "completion_length": 508.796875, + "epoch": 4.6, + "grad_norm": 0.2001953125, + "kl": 0.24711208045482635, + "learning_rate": 3.272542485937369e-05, + "loss": 0.0099, + "reward": 4.078946113586426, + "reward_std": 3.058727502822876, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 4.357556104660034, + "rewards/no_repetition_reward_func": -0.23173526674509048, + "rewards/verse_reward_func": -0.03125, + "step": 575 + }, + { + "completion_length": 515.5, + "epoch": 4.608, + "grad_norm": 0.212890625, + "kl": 0.25184937566518784, + "learning_rate": 3.2658998553133895e-05, + "loss": 0.0101, + "reward": 5.022035360336304, + "reward_std": 2.5013380646705627, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.279407024383545, + "rewards/no_repetition_reward_func": -0.2417466714978218, + "rewards/verse_reward_func": -0.015625, + "step": 576 + }, + { + "completion_length": 507.015625, + "epoch": 4.616, + "grad_norm": 0.181640625, + "kl": 0.24450106173753738, + "learning_rate": 3.2592512520484856e-05, + "loss": 0.0098, + "reward": 4.141484498977661, + "reward_std": 3.7475030422210693, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.420349597930908, + "rewards/no_repetition_reward_func": -0.24761539697647095, + "rewards/verse_reward_func": -0.03125, + "step": 577 + }, + { + "completion_length": 504.75, + "epoch": 4.624, + "grad_norm": 0.2197265625, + "kl": 0.1553335338830948, + "learning_rate": 3.2525967279898015e-05, + "loss": 0.0062, + "reward": 5.801112651824951, + "reward_std": 3.515202283859253, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.133133888244629, + "rewards/no_repetition_reward_func": -0.31639617681503296, + "rewards/verse_reward_func": 0.0, + "step": 578 + }, + { + "completion_length": 509.46875, + "epoch": 4.632, + "grad_norm": 0.2001953125, + "kl": 0.22476254403591156, + "learning_rate": 3.245936335030651e-05, + "loss": 0.009, + "reward": 4.40500283241272, + "reward_std": 3.1954180002212524, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 4.678601026535034, + "rewards/no_repetition_reward_func": -0.2501606345176697, + "rewards/verse_reward_func": -0.0078125, + "step": 579 + }, + { + "completion_length": 515.1875, + "epoch": 4.64, + "grad_norm": 0.1953125, + "kl": 0.24848103523254395, + "learning_rate": 3.239270125110117e-05, + "loss": 0.0099, + "reward": 4.960968017578125, + "reward_std": 2.733099579811096, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.226645469665527, + "rewards/no_repetition_reward_func": -0.2500525787472725, + "rewards/verse_reward_func": -0.015625, + "step": 580 + }, + { + "completion_length": 516.0, + "epoch": 4.648, + "grad_norm": 0.1865234375, + "kl": 0.1894937977194786, + "learning_rate": 3.2325981502126433e-05, + "loss": 0.0076, + "reward": 5.687351226806641, + "reward_std": 2.9971230030059814, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.0232648849487305, + "rewards/no_repetition_reward_func": -0.29685093462467194, + "rewards/verse_reward_func": -0.0234375, + "step": 581 + }, + { + "completion_length": 503.140625, + "epoch": 4.656, + "grad_norm": 0.439453125, + "kl": 0.2511332705616951, + "learning_rate": 3.225920462367632e-05, + "loss": 0.01, + "reward": 4.765644431114197, + "reward_std": 3.162333369255066, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.006086111068726, + "rewards/no_repetition_reward_func": -0.2326294183731079, + "rewards/verse_reward_func": -0.0078125, + "step": 582 + }, + { + "completion_length": 512.53125, + "epoch": 4.664, + "grad_norm": 0.189453125, + "kl": 0.24780171364545822, + "learning_rate": 3.219237113649032e-05, + "loss": 0.0099, + "reward": 4.742552757263184, + "reward_std": 3.5421230792999268, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.019674062728882, + "rewards/no_repetition_reward_func": -0.24587150663137436, + "rewards/verse_reward_func": -0.015625, + "step": 583 + }, + { + "completion_length": 516.0, + "epoch": 4.672, + "grad_norm": 0.18359375, + "kl": 0.18039565533399582, + "learning_rate": 3.21254815617494e-05, + "loss": 0.0072, + "reward": 5.8177876472473145, + "reward_std": 3.3211348056793213, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.12570595741272, + "rewards/no_repetition_reward_func": -0.3079179525375366, + "rewards/verse_reward_func": 0.0, + "step": 584 + }, + { + "completion_length": 516.0, + "epoch": 4.68, + "grad_norm": 0.2109375, + "kl": 0.20110145211219788, + "learning_rate": 3.205853642107192e-05, + "loss": 0.008, + "reward": 4.924796104431152, + "reward_std": 3.117445707321167, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.209899425506592, + "rewards/no_repetition_reward_func": -0.2694786339998245, + "rewards/verse_reward_func": -0.015625, + "step": 585 + }, + { + "completion_length": 514.609375, + "epoch": 4.688, + "grad_norm": 0.1748046875, + "kl": 0.19881455600261688, + "learning_rate": 3.19915362365095e-05, + "loss": 0.008, + "reward": 5.027056932449341, + "reward_std": 3.3335931301116943, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.330817937850952, + "rewards/no_repetition_reward_func": -0.2803232967853546, + "rewards/verse_reward_func": -0.0078125, + "step": 586 + }, + { + "completion_length": 506.796875, + "epoch": 4.696, + "grad_norm": 0.2216796875, + "kl": 0.19528641551733017, + "learning_rate": 3.192448153054306e-05, + "loss": 0.0078, + "reward": 5.730622053146362, + "reward_std": 2.2985953092575073, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.036288738250732, + "rewards/no_repetition_reward_func": -0.2978540360927582, + "rewards/verse_reward_func": -0.0078125, + "step": 587 + }, + { + "completion_length": 514.265625, + "epoch": 4.704, + "grad_norm": 0.205078125, + "kl": 0.2187308669090271, + "learning_rate": 3.185737282607867e-05, + "loss": 0.0087, + "reward": 5.173870325088501, + "reward_std": 3.3795015811920166, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.4399285316467285, + "rewards/no_repetition_reward_func": -0.2660585567355156, + "rewards/verse_reward_func": 0.0, + "step": 588 + }, + { + "completion_length": 516.0, + "epoch": 4.712, + "grad_norm": 0.18359375, + "kl": 0.1370883285999298, + "learning_rate": 3.179021064644347e-05, + "loss": 0.0055, + "reward": 6.247809886932373, + "reward_std": 2.8254271745681763, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.578667402267456, + "rewards/no_repetition_reward_func": -0.3308577537536621, + "rewards/verse_reward_func": 0.0, + "step": 589 + }, + { + "completion_length": 508.921875, + "epoch": 4.72, + "grad_norm": 0.3359375, + "kl": 0.15569307655096054, + "learning_rate": 3.172299551538164e-05, + "loss": 0.0062, + "reward": 5.813715934753418, + "reward_std": 3.013609766960144, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.1305811405181885, + "rewards/no_repetition_reward_func": -0.30905257165431976, + "rewards/verse_reward_func": -0.0078125, + "step": 590 + }, + { + "completion_length": 514.40625, + "epoch": 4.728, + "grad_norm": 0.208984375, + "kl": 0.23975630104541779, + "learning_rate": 3.1655727957050285e-05, + "loss": 0.0096, + "reward": 3.588590383529663, + "reward_std": 3.2029274702072144, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.8555498123168945, + "rewards/no_repetition_reward_func": -0.2278967648744583, + "rewards/verse_reward_func": -0.0390625, + "step": 591 + }, + { + "completion_length": 515.75, + "epoch": 4.736, + "grad_norm": 0.2001953125, + "kl": 0.20447112619876862, + "learning_rate": 3.158840849601532e-05, + "loss": 0.0082, + "reward": 5.500310659408569, + "reward_std": 3.5160282850265503, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.782203674316406, + "rewards/no_repetition_reward_func": -0.28189267963171005, + "rewards/verse_reward_func": 0.0, + "step": 592 + }, + { + "completion_length": 516.0, + "epoch": 4.744, + "grad_norm": 0.1748046875, + "kl": 0.12763425707817078, + "learning_rate": 3.152103765724743e-05, + "loss": 0.0051, + "reward": 6.68639874458313, + "reward_std": 2.576283574104309, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.024906635284424, + "rewards/no_repetition_reward_func": -0.3385075181722641, + "rewards/verse_reward_func": 0.0, + "step": 593 + }, + { + "completion_length": 506.5625, + "epoch": 4.752, + "grad_norm": 0.404296875, + "kl": 0.21976373344659805, + "learning_rate": 3.145361596611795e-05, + "loss": 0.0088, + "reward": 5.36754846572876, + "reward_std": 3.229415774345398, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.6472625732421875, + "rewards/no_repetition_reward_func": -0.2719014883041382, + "rewards/verse_reward_func": -0.0078125, + "step": 594 + }, + { + "completion_length": 495.28125, + "epoch": 4.76, + "grad_norm": 0.267578125, + "kl": 0.2704378515481949, + "learning_rate": 3.138614394839476e-05, + "loss": 0.0108, + "reward": 4.6498401165008545, + "reward_std": 3.670860767364502, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 4.906737923622131, + "rewards/no_repetition_reward_func": -0.22564759850502014, + "rewards/verse_reward_func": -0.015625, + "step": 595 + }, + { + "completion_length": 516.0, + "epoch": 4.768, + "grad_norm": 0.185546875, + "kl": 0.17064060270786285, + "learning_rate": 3.1318622130238236e-05, + "loss": 0.0068, + "reward": 6.372533082962036, + "reward_std": 2.9471863508224487, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.686938762664795, + "rewards/no_repetition_reward_func": -0.31440530717372894, + "rewards/verse_reward_func": 0.0, + "step": 596 + }, + { + "completion_length": 514.125, + "epoch": 4.776, + "grad_norm": 0.1796875, + "kl": 0.1633630320429802, + "learning_rate": 3.1251051038197055e-05, + "loss": 0.0065, + "reward": 6.001970052719116, + "reward_std": 2.365971565246582, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.321408748626709, + "rewards/no_repetition_reward_func": -0.3116268366575241, + "rewards/verse_reward_func": -0.0078125, + "step": 597 + }, + { + "completion_length": 516.0, + "epoch": 4.784, + "grad_norm": 0.181640625, + "kl": 0.2213788703083992, + "learning_rate": 3.118343119920418e-05, + "loss": 0.0089, + "reward": 5.0343005657196045, + "reward_std": 3.6028980016708374, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.322154402732849, + "rewards/no_repetition_reward_func": -0.27222882956266403, + "rewards/verse_reward_func": 0.0, + "step": 598 + }, + { + "completion_length": 515.1875, + "epoch": 4.792, + "grad_norm": 0.2099609375, + "kl": 0.21631046384572983, + "learning_rate": 3.111576314057268e-05, + "loss": 0.0087, + "reward": 4.925496339797974, + "reward_std": 3.479201316833496, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.199922800064087, + "rewards/no_repetition_reward_func": -0.2588016390800476, + "rewards/verse_reward_func": -0.015625, + "step": 599 + }, + { + "completion_length": 515.6875, + "epoch": 4.8, + "grad_norm": 0.19140625, + "kl": 0.2380562722682953, + "learning_rate": 3.104804738999169e-05, + "loss": 0.0095, + "reward": 5.331606149673462, + "reward_std": 3.5722521543502808, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.609500885009766, + "rewards/no_repetition_reward_func": -0.26226988434791565, + "rewards/verse_reward_func": -0.015625, + "step": 600 + }, + { + "completion_length": 508.984375, + "epoch": 4.808, + "grad_norm": 0.2080078125, + "kl": 0.1760089173913002, + "learning_rate": 3.098028447552224e-05, + "loss": 0.007, + "reward": 6.140415191650391, + "reward_std": 3.2219629287719727, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.457921981811523, + "rewards/no_repetition_reward_func": -0.30188198387622833, + "rewards/verse_reward_func": 0.0, + "step": 601 + }, + { + "completion_length": 516.0, + "epoch": 4.816, + "grad_norm": 0.1708984375, + "kl": 0.24913793802261353, + "learning_rate": 3.091247492559312e-05, + "loss": 0.01, + "reward": 4.799865961074829, + "reward_std": 3.5923553705215454, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.063502550125122, + "rewards/no_repetition_reward_func": -0.2636365592479706, + "rewards/verse_reward_func": 0.0, + "step": 602 + }, + { + "completion_length": 508.0625, + "epoch": 4.824, + "grad_norm": 0.28125, + "kl": 0.18750837445259094, + "learning_rate": 3.0844619268996845e-05, + "loss": 0.0075, + "reward": 5.714895963668823, + "reward_std": 3.2878360748291016, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.004732370376587, + "rewards/no_repetition_reward_func": -0.2898365259170532, + "rewards/verse_reward_func": 0.0, + "step": 603 + }, + { + "completion_length": 514.515625, + "epoch": 4.832, + "grad_norm": 0.2255859375, + "kl": 0.1873496100306511, + "learning_rate": 3.0776718034885454e-05, + "loss": 0.0075, + "reward": 6.542996644973755, + "reward_std": 2.5697269439697266, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.858989000320435, + "rewards/no_repetition_reward_func": -0.3003672808408737, + "rewards/verse_reward_func": 0.0, + "step": 604 + }, + { + "completion_length": 515.1875, + "epoch": 4.84, + "grad_norm": 0.189453125, + "kl": 0.1887839511036873, + "learning_rate": 3.0708771752766394e-05, + "loss": 0.0076, + "reward": 5.926172494888306, + "reward_std": 3.1925785541534424, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.234638929367065, + "rewards/no_repetition_reward_func": -0.2928412929177284, + "rewards/verse_reward_func": 0.0, + "step": 605 + }, + { + "completion_length": 510.875, + "epoch": 4.848, + "grad_norm": 0.26171875, + "kl": 0.12645846977829933, + "learning_rate": 3.064078095249844e-05, + "loss": 0.0051, + "reward": 6.991218328475952, + "reward_std": 2.5180020332336426, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 7.349045038223267, + "rewards/no_repetition_reward_func": -0.3422015756368637, + "rewards/verse_reward_func": 0.0, + "step": 606 + }, + { + "completion_length": 505.21875, + "epoch": 4.856, + "grad_norm": 0.21875, + "kl": 0.2457709163427353, + "learning_rate": 3.0572746164287514e-05, + "loss": 0.0098, + "reward": 5.2091851234436035, + "reward_std": 3.721817970275879, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.459376096725464, + "rewards/no_repetition_reward_func": -0.25019127130508423, + "rewards/verse_reward_func": 0.0, + "step": 607 + }, + { + "completion_length": 510.625, + "epoch": 4.864, + "grad_norm": 0.193359375, + "kl": 0.2114773616194725, + "learning_rate": 3.050466791868254e-05, + "loss": 0.0085, + "reward": 5.533069372177124, + "reward_std": 3.423860192298889, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.807988405227661, + "rewards/no_repetition_reward_func": -0.2749189957976341, + "rewards/verse_reward_func": 0.0, + "step": 608 + }, + { + "completion_length": 513.34375, + "epoch": 4.872, + "grad_norm": 0.2041015625, + "kl": 0.2933470755815506, + "learning_rate": 3.0436546746571372e-05, + "loss": 0.0117, + "reward": 3.077777862548828, + "reward_std": 3.3922618627548218, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.303995966911316, + "rewards/no_repetition_reward_func": -0.18715576082468033, + "rewards/verse_reward_func": -0.0390625, + "step": 609 + }, + { + "completion_length": 513.234375, + "epoch": 4.88, + "grad_norm": 0.1953125, + "kl": 0.1959448680281639, + "learning_rate": 3.0368383179176585e-05, + "loss": 0.0078, + "reward": 6.138750076293945, + "reward_std": 3.4504711627960205, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.45858907699585, + "rewards/no_repetition_reward_func": -0.2964017391204834, + "rewards/verse_reward_func": -0.0078125, + "step": 610 + }, + { + "completion_length": 516.0, + "epoch": 4.888, + "grad_norm": 0.1630859375, + "kl": 0.1348143070936203, + "learning_rate": 3.0300177748051373e-05, + "loss": 0.0054, + "reward": 6.3840250968933105, + "reward_std": 3.4283504486083984, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.729896783828735, + "rewards/no_repetition_reward_func": -0.3458718955516815, + "rewards/verse_reward_func": 0.0, + "step": 611 + }, + { + "completion_length": 515.109375, + "epoch": 4.896, + "grad_norm": 0.203125, + "kl": 0.1324361264705658, + "learning_rate": 3.023193098507538e-05, + "loss": 0.0053, + "reward": 6.58003568649292, + "reward_std": 2.837936043739319, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.92301344871521, + "rewards/no_repetition_reward_func": -0.3351650834083557, + "rewards/verse_reward_func": -0.0078125, + "step": 612 + }, + { + "completion_length": 514.953125, + "epoch": 4.904, + "grad_norm": 0.1982421875, + "kl": 0.18375316262245178, + "learning_rate": 3.016364342245059e-05, + "loss": 0.0074, + "reward": 5.758064270019531, + "reward_std": 3.112334370613098, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.106787443161011, + "rewards/no_repetition_reward_func": -0.3096608370542526, + "rewards/verse_reward_func": -0.0390625, + "step": 613 + }, + { + "completion_length": 516.0, + "epoch": 4.912, + "grad_norm": 0.16796875, + "kl": 0.2547123357653618, + "learning_rate": 3.0095315592697126e-05, + "loss": 0.0102, + "reward": 4.80853545665741, + "reward_std": 2.5227972269058228, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.05780827999115, + "rewards/no_repetition_reward_func": -0.24927300959825516, + "rewards/verse_reward_func": 0.0, + "step": 614 + }, + { + "completion_length": 514.546875, + "epoch": 4.92, + "grad_norm": 0.18359375, + "kl": 0.2569102346897125, + "learning_rate": 3.002694802864912e-05, + "loss": 0.0103, + "reward": 5.003098964691162, + "reward_std": 2.870938777923584, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.262174606323242, + "rewards/no_repetition_reward_func": -0.24345087260007858, + "rewards/verse_reward_func": 0.0, + "step": 615 + }, + { + "completion_length": 511.03125, + "epoch": 4.928, + "grad_norm": 0.201171875, + "kl": 0.2061474695801735, + "learning_rate": 2.9958541263450584e-05, + "loss": 0.0082, + "reward": 5.489931344985962, + "reward_std": 2.9883373975753784, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.793691635131836, + "rewards/no_repetition_reward_func": -0.28032275289297104, + "rewards/verse_reward_func": -0.0234375, + "step": 616 + }, + { + "completion_length": 516.0, + "epoch": 4.936, + "grad_norm": 0.1669921875, + "kl": 0.20505118370056152, + "learning_rate": 2.9890095830551207e-05, + "loss": 0.0082, + "reward": 5.455273151397705, + "reward_std": 2.5982197523117065, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.7504096031188965, + "rewards/no_repetition_reward_func": -0.29513655602931976, + "rewards/verse_reward_func": 0.0, + "step": 617 + }, + { + "completion_length": 510.578125, + "epoch": 4.944, + "grad_norm": 0.1552734375, + "kl": 0.12448585778474808, + "learning_rate": 2.9821612263702226e-05, + "loss": 0.005, + "reward": 6.307903051376343, + "reward_std": 3.349727749824524, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.670935869216919, + "rewards/no_repetition_reward_func": -0.3474079668521881, + "rewards/verse_reward_func": -0.015625, + "step": 618 + }, + { + "completion_length": 509.15625, + "epoch": 4.952, + "grad_norm": 0.19921875, + "kl": 0.1755780577659607, + "learning_rate": 2.9753091096952255e-05, + "loss": 0.007, + "reward": 5.120368957519531, + "reward_std": 3.4990864992141724, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.422945022583008, + "rewards/no_repetition_reward_func": -0.3025761544704437, + "rewards/verse_reward_func": 0.0, + "step": 619 + }, + { + "completion_length": 516.0, + "epoch": 4.96, + "grad_norm": 0.2001953125, + "kl": 0.18755263090133667, + "learning_rate": 2.9684532864643122e-05, + "loss": 0.0075, + "reward": 5.644726753234863, + "reward_std": 3.2994948625564575, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.953995704650879, + "rewards/no_repetition_reward_func": -0.29364389181137085, + "rewards/verse_reward_func": 0.0, + "step": 620 + }, + { + "completion_length": 503.96875, + "epoch": 4.968, + "grad_norm": 0.2392578125, + "kl": 0.1960517168045044, + "learning_rate": 2.9615938101405676e-05, + "loss": 0.0078, + "reward": 6.5996949672698975, + "reward_std": 1.8525903820991516, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.9021077156066895, + "rewards/no_repetition_reward_func": -0.2946000024676323, + "rewards/verse_reward_func": -0.0078125, + "step": 621 + }, + { + "completion_length": 506.515625, + "epoch": 4.976, + "grad_norm": 0.1826171875, + "kl": 0.20246191322803497, + "learning_rate": 2.9547307342155673e-05, + "loss": 0.0081, + "reward": 6.0406389236450195, + "reward_std": 2.117041051387787, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.346596002578735, + "rewards/no_repetition_reward_func": -0.2825196832418442, + "rewards/verse_reward_func": -0.0234375, + "step": 622 + }, + { + "completion_length": 516.0, + "epoch": 4.984, + "grad_norm": 0.1767578125, + "kl": 0.20153185725212097, + "learning_rate": 2.9478641122089562e-05, + "loss": 0.0081, + "reward": 5.7961554527282715, + "reward_std": 3.0231058597564697, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.083817720413208, + "rewards/no_repetition_reward_func": -0.2876621335744858, + "rewards/verse_reward_func": 0.0, + "step": 623 + }, + { + "completion_length": 515.234375, + "epoch": 4.992, + "grad_norm": 0.2314453125, + "kl": 0.13752461224794388, + "learning_rate": 2.9409939976680313e-05, + "loss": 0.0055, + "reward": 6.890280246734619, + "reward_std": 2.632196068763733, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.241381406784058, + "rewards/no_repetition_reward_func": -0.3432881087064743, + "rewards/verse_reward_func": -0.0078125, + "step": 624 + }, + { + "completion_length": 516.0, + "epoch": 5.0, + "grad_norm": 0.1796875, + "kl": 0.1949106827378273, + "learning_rate": 2.9341204441673266e-05, + "loss": 0.0078, + "reward": 5.797587156295776, + "reward_std": 3.2299904823303223, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.1107141971588135, + "rewards/no_repetition_reward_func": -0.3053145855665207, + "rewards/verse_reward_func": -0.0078125, + "step": 625 + }, + { + "completion_length": 500.96875, + "epoch": 5.008, + "grad_norm": 0.458984375, + "kl": 0.14213557913899422, + "learning_rate": 2.9272435053081922e-05, + "loss": 0.0057, + "reward": 6.198402404785156, + "reward_std": 3.389514923095703, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.52408242225647, + "rewards/no_repetition_reward_func": -0.32568007707595825, + "rewards/verse_reward_func": 0.0, + "step": 626 + }, + { + "completion_length": 495.09375, + "epoch": 5.016, + "grad_norm": 0.625, + "kl": 0.2185637727379799, + "learning_rate": 2.920363234718379e-05, + "loss": 0.0087, + "reward": 5.403896331787109, + "reward_std": 3.1148195266723633, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.723889112472534, + "rewards/no_repetition_reward_func": -0.2731180414557457, + "rewards/verse_reward_func": -0.046875, + "step": 627 + }, + { + "completion_length": 516.0, + "epoch": 5.024, + "grad_norm": 0.173828125, + "kl": 0.19484518468379974, + "learning_rate": 2.9134796860516194e-05, + "loss": 0.0078, + "reward": 5.888418436050415, + "reward_std": 2.489211320877075, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.203460216522217, + "rewards/no_repetition_reward_func": -0.2994163781404495, + "rewards/verse_reward_func": -0.015625, + "step": 628 + }, + { + "completion_length": 512.578125, + "epoch": 5.032, + "grad_norm": 0.1767578125, + "kl": 0.2090793326497078, + "learning_rate": 2.9065929129872094e-05, + "loss": 0.0084, + "reward": 4.7783123254776, + "reward_std": 3.2869350910186768, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.073755741119385, + "rewards/no_repetition_reward_func": -0.2876308858394623, + "rewards/verse_reward_func": -0.0078125, + "step": 629 + }, + { + "completion_length": 515.5, + "epoch": 5.04, + "grad_norm": 0.20703125, + "kl": 0.2264072149991989, + "learning_rate": 2.8997029692295874e-05, + "loss": 0.0091, + "reward": 6.056830644607544, + "reward_std": 3.2361398935317993, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.316887855529785, + "rewards/no_repetition_reward_func": -0.260057657957077, + "rewards/verse_reward_func": 0.0, + "step": 630 + }, + { + "completion_length": 510.0, + "epoch": 5.048, + "grad_norm": 0.1962890625, + "kl": 0.22923777997493744, + "learning_rate": 2.8928099085079197e-05, + "loss": 0.0092, + "reward": 5.449827671051025, + "reward_std": 3.508596420288086, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.729449033737183, + "rewards/no_repetition_reward_func": -0.2718086838722229, + "rewards/verse_reward_func": -0.0078125, + "step": 631 + }, + { + "completion_length": 516.0, + "epoch": 5.056, + "grad_norm": 0.171875, + "kl": 0.12794525921344757, + "learning_rate": 2.8859137845756784e-05, + "loss": 0.0051, + "reward": 6.447595119476318, + "reward_std": 3.4378491640090942, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.811089992523193, + "rewards/no_repetition_reward_func": -0.3556826561689377, + "rewards/verse_reward_func": -0.0078125, + "step": 632 + }, + { + "completion_length": 516.0, + "epoch": 5.064, + "grad_norm": 0.1650390625, + "kl": 0.1608082912862301, + "learning_rate": 2.879014651210223e-05, + "loss": 0.0064, + "reward": 6.727198600769043, + "reward_std": 2.507846415042877, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.057853937149048, + "rewards/no_repetition_reward_func": -0.32284215092658997, + "rewards/verse_reward_func": -0.0078125, + "step": 633 + }, + { + "completion_length": 513.5625, + "epoch": 5.072, + "grad_norm": 0.1630859375, + "kl": 0.15352094173431396, + "learning_rate": 2.8721125622123806e-05, + "loss": 0.0061, + "reward": 5.725430488586426, + "reward_std": 3.428788900375366, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.0502729415893555, + "rewards/no_repetition_reward_func": -0.3248424828052521, + "rewards/verse_reward_func": 0.0, + "step": 634 + }, + { + "completion_length": 509.90625, + "epoch": 5.08, + "grad_norm": 0.1953125, + "kl": 0.20308247953653336, + "learning_rate": 2.8652075714060295e-05, + "loss": 0.0081, + "reward": 5.635324001312256, + "reward_std": 2.353144407272339, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 5.999676942825317, + "rewards/no_repetition_reward_func": -0.2940404713153839, + "rewards/verse_reward_func": -0.0390625, + "step": 635 + }, + { + "completion_length": 515.90625, + "epoch": 5.088, + "grad_norm": 0.1748046875, + "kl": 0.14186183735728264, + "learning_rate": 2.858299732637674e-05, + "loss": 0.0057, + "reward": 6.344038009643555, + "reward_std": 3.4011298418045044, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.688887357711792, + "rewards/no_repetition_reward_func": -0.34484899044036865, + "rewards/verse_reward_func": 0.0, + "step": 636 + }, + { + "completion_length": 512.328125, + "epoch": 5.096, + "grad_norm": 0.162109375, + "kl": 0.16218054667115211, + "learning_rate": 2.8513890997760272e-05, + "loss": 0.0065, + "reward": 6.275375604629517, + "reward_std": 3.059145450592041, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.5913310050964355, + "rewards/no_repetition_reward_func": -0.3159550577402115, + "rewards/verse_reward_func": 0.0, + "step": 637 + }, + { + "completion_length": 516.0, + "epoch": 5.104, + "grad_norm": 0.189453125, + "kl": 0.12401092052459717, + "learning_rate": 2.844475726711595e-05, + "loss": 0.005, + "reward": 6.900593280792236, + "reward_std": 2.5964099764823914, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.237835645675659, + "rewards/no_repetition_reward_func": -0.3372427076101303, + "rewards/verse_reward_func": 0.0, + "step": 638 + }, + { + "completion_length": 513.109375, + "epoch": 5.112, + "grad_norm": 0.2470703125, + "kl": 0.17664818465709686, + "learning_rate": 2.8375596673562482e-05, + "loss": 0.0071, + "reward": 6.8994786739349365, + "reward_std": 2.628869414329529, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 7.243361473083496, + "rewards/no_repetition_reward_func": -0.312632754445076, + "rewards/verse_reward_func": 0.0, + "step": 639 + }, + { + "completion_length": 505.703125, + "epoch": 5.12, + "grad_norm": 0.201171875, + "kl": 0.22689010947942734, + "learning_rate": 2.8306409756428064e-05, + "loss": 0.0091, + "reward": 5.82088041305542, + "reward_std": 1.780029296875, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.146688222885132, + "rewards/no_repetition_reward_func": -0.27893267571926117, + "rewards/verse_reward_func": -0.046875, + "step": 640 + }, + { + "completion_length": 509.34375, + "epoch": 5.128, + "grad_norm": 0.2353515625, + "kl": 0.26753829419612885, + "learning_rate": 2.8237197055246172e-05, + "loss": 0.0107, + "reward": 4.327687382698059, + "reward_std": 3.750219941139221, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.589497923851013, + "rewards/no_repetition_reward_func": -0.2383730262517929, + "rewards/verse_reward_func": -0.0234375, + "step": 641 + }, + { + "completion_length": 516.0, + "epoch": 5.136, + "grad_norm": 0.18359375, + "kl": 0.15873728692531586, + "learning_rate": 2.816795910975137e-05, + "loss": 0.0063, + "reward": 6.847696304321289, + "reward_std": 2.93625807762146, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.189026594161987, + "rewards/no_repetition_reward_func": -0.3335185796022415, + "rewards/verse_reward_func": -0.0078125, + "step": 642 + }, + { + "completion_length": 516.0, + "epoch": 5.144, + "grad_norm": 0.1689453125, + "kl": 0.12347468733787537, + "learning_rate": 2.8098696459875046e-05, + "loss": 0.0049, + "reward": 7.065732002258301, + "reward_std": 2.9837812185287476, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.433406829833984, + "rewards/no_repetition_reward_func": -0.3676748275756836, + "rewards/verse_reward_func": 0.0, + "step": 643 + }, + { + "completion_length": 516.0, + "epoch": 5.152, + "grad_norm": 0.19140625, + "kl": 0.1354011744260788, + "learning_rate": 2.8029409645741267e-05, + "loss": 0.0054, + "reward": 6.671532869338989, + "reward_std": 2.7982765436172485, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 7.025006294250488, + "rewards/no_repetition_reward_func": -0.33784861862659454, + "rewards/verse_reward_func": 0.0, + "step": 644 + }, + { + "completion_length": 515.734375, + "epoch": 5.16, + "grad_norm": 0.1767578125, + "kl": 0.25340035930275917, + "learning_rate": 2.7960099207662532e-05, + "loss": 0.0101, + "reward": 4.860542893409729, + "reward_std": 2.421943187713623, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.165318489074707, + "rewards/no_repetition_reward_func": -0.25790026038885117, + "rewards/verse_reward_func": -0.046875, + "step": 645 + }, + { + "completion_length": 509.0625, + "epoch": 5.168, + "grad_norm": 0.298828125, + "kl": 0.2184632644057274, + "learning_rate": 2.7890765686135544e-05, + "loss": 0.0087, + "reward": 5.974319219589233, + "reward_std": 3.394289493560791, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.256080865859985, + "rewards/no_repetition_reward_func": -0.28176169097423553, + "rewards/verse_reward_func": 0.0, + "step": 646 + }, + { + "completion_length": 516.0, + "epoch": 5.176, + "grad_norm": 0.177734375, + "kl": 0.23372486978769302, + "learning_rate": 2.782140962183704e-05, + "loss": 0.0093, + "reward": 5.693535804748535, + "reward_std": 2.765926480293274, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.981910467147827, + "rewards/no_repetition_reward_func": -0.27274955809116364, + "rewards/verse_reward_func": -0.015625, + "step": 647 + }, + { + "completion_length": 516.0, + "epoch": 5.184, + "grad_norm": 0.1748046875, + "kl": 0.2129667028784752, + "learning_rate": 2.7752031555619555e-05, + "loss": 0.0085, + "reward": 5.509127140045166, + "reward_std": 3.0827438831329346, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.827958583831787, + "rewards/no_repetition_reward_func": -0.28758107125759125, + "rewards/verse_reward_func": -0.03125, + "step": 648 + }, + { + "completion_length": 513.25, + "epoch": 5.192, + "grad_norm": 0.17578125, + "kl": 0.18902723491191864, + "learning_rate": 2.7682632028507167e-05, + "loss": 0.0076, + "reward": 5.600545406341553, + "reward_std": 3.541477680206299, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.904689311981201, + "rewards/no_repetition_reward_func": -0.3041435182094574, + "rewards/verse_reward_func": 0.0, + "step": 649 + }, + { + "completion_length": 516.0, + "epoch": 5.2, + "grad_norm": 0.15625, + "kl": 0.16207893937826157, + "learning_rate": 2.761321158169134e-05, + "loss": 0.0065, + "reward": 6.103037118911743, + "reward_std": 2.958517551422119, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.423388957977295, + "rewards/no_repetition_reward_func": -0.3125394582748413, + "rewards/verse_reward_func": -0.0078125, + "step": 650 + }, + { + "completion_length": 516.0, + "epoch": 5.208, + "grad_norm": 0.1708984375, + "kl": 0.2622169405221939, + "learning_rate": 2.754377075652666e-05, + "loss": 0.0105, + "reward": 4.839907884597778, + "reward_std": 3.249625563621521, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.114637136459351, + "rewards/no_repetition_reward_func": -0.2512917220592499, + "rewards/verse_reward_func": -0.0234375, + "step": 651 + }, + { + "completion_length": 510.90625, + "epoch": 5.216, + "grad_norm": 0.1904296875, + "kl": 0.24180752038955688, + "learning_rate": 2.747431009452663e-05, + "loss": 0.0097, + "reward": 5.487088203430176, + "reward_std": 2.601705551147461, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.770691394805908, + "rewards/no_repetition_reward_func": -0.2679784744977951, + "rewards/verse_reward_func": -0.015625, + "step": 652 + }, + { + "completion_length": 516.0, + "epoch": 5.224, + "grad_norm": 0.17578125, + "kl": 0.11701197922229767, + "learning_rate": 2.7404830137359444e-05, + "loss": 0.0047, + "reward": 7.471851825714111, + "reward_std": 2.143284857273102, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.847654819488525, + "rewards/no_repetition_reward_func": -0.3679901957511902, + "rewards/verse_reward_func": -0.0078125, + "step": 653 + }, + { + "completion_length": 512.0, + "epoch": 5.232, + "grad_norm": 0.1982421875, + "kl": 0.2164444848895073, + "learning_rate": 2.733533142684377e-05, + "loss": 0.0087, + "reward": 5.826837062835693, + "reward_std": 2.6904834508895874, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.125123500823975, + "rewards/no_repetition_reward_func": -0.28266138583421707, + "rewards/verse_reward_func": -0.015625, + "step": 654 + }, + { + "completion_length": 515.78125, + "epoch": 5.24, + "grad_norm": 0.201171875, + "kl": 0.20679500699043274, + "learning_rate": 2.726581450494451e-05, + "loss": 0.0083, + "reward": 5.660398483276367, + "reward_std": 3.6214418411254883, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.961563587188721, + "rewards/no_repetition_reward_func": -0.2855398505926132, + "rewards/verse_reward_func": 0.0, + "step": 655 + }, + { + "completion_length": 509.3125, + "epoch": 5.248, + "grad_norm": 0.267578125, + "kl": 0.35962244868278503, + "learning_rate": 2.7196279913768584e-05, + "loss": 0.0144, + "reward": 3.9806346893310547, + "reward_std": 3.039091467857361, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.211110830307007, + "rewards/no_repetition_reward_func": -0.19922614842653275, + "rewards/verse_reward_func": -0.03125, + "step": 656 + }, + { + "completion_length": 516.0, + "epoch": 5.256, + "grad_norm": 0.1865234375, + "kl": 0.23406126350164413, + "learning_rate": 2.7126728195560702e-05, + "loss": 0.0094, + "reward": 5.163386344909668, + "reward_std": 3.004625678062439, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.483241558074951, + "rewards/no_repetition_reward_func": -0.26516780257225037, + "rewards/verse_reward_func": -0.0546875, + "step": 657 + }, + { + "completion_length": 516.0, + "epoch": 5.264, + "grad_norm": 0.1845703125, + "kl": 0.209986612200737, + "learning_rate": 2.705715989269914e-05, + "loss": 0.0084, + "reward": 5.983837366104126, + "reward_std": 3.1269973516464233, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.272311687469482, + "rewards/no_repetition_reward_func": -0.2884744256734848, + "rewards/verse_reward_func": 0.0, + "step": 658 + }, + { + "completion_length": 516.0, + "epoch": 5.272, + "grad_norm": 0.1904296875, + "kl": 0.2576320469379425, + "learning_rate": 2.6987575547691497e-05, + "loss": 0.0103, + "reward": 4.522177696228027, + "reward_std": 3.5544450283050537, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.811301112174988, + "rewards/no_repetition_reward_func": -0.2500614672899246, + "rewards/verse_reward_func": -0.0390625, + "step": 659 + }, + { + "completion_length": 510.40625, + "epoch": 5.28, + "grad_norm": 0.185546875, + "kl": 0.1914951652288437, + "learning_rate": 2.6917975703170466e-05, + "loss": 0.0077, + "reward": 6.016743421554565, + "reward_std": 2.899466037750244, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.326798439025879, + "rewards/no_repetition_reward_func": -0.29443006217479706, + "rewards/verse_reward_func": 0.0, + "step": 660 + }, + { + "completion_length": 515.78125, + "epoch": 5.288, + "grad_norm": 0.1943359375, + "kl": 0.2985571101307869, + "learning_rate": 2.684836090188963e-05, + "loss": 0.0119, + "reward": 5.009984493255615, + "reward_std": 3.3050589561462402, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.266479849815369, + "rewards/no_repetition_reward_func": -0.24868275970220566, + "rewards/verse_reward_func": -0.0078125, + "step": 661 + }, + { + "completion_length": 505.8125, + "epoch": 5.296, + "grad_norm": 0.1943359375, + "kl": 0.30690453946590424, + "learning_rate": 2.6778731686719178e-05, + "loss": 0.0123, + "reward": 4.621723413467407, + "reward_std": 2.7921931743621826, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.865250110626221, + "rewards/no_repetition_reward_func": -0.2200894132256508, + "rewards/verse_reward_func": -0.0234375, + "step": 662 + }, + { + "completion_length": 514.515625, + "epoch": 5.304, + "grad_norm": 0.1767578125, + "kl": 0.15294113010168076, + "learning_rate": 2.6709088600641717e-05, + "loss": 0.0061, + "reward": 6.122514724731445, + "reward_std": 3.073371410369873, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.463867664337158, + "rewards/no_repetition_reward_func": -0.3413529545068741, + "rewards/verse_reward_func": 0.0, + "step": 663 + }, + { + "completion_length": 514.171875, + "epoch": 5.312, + "grad_norm": 0.1904296875, + "kl": 0.2323378324508667, + "learning_rate": 2.6639432186748043e-05, + "loss": 0.0093, + "reward": 5.373915910720825, + "reward_std": 3.240971565246582, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.648488998413086, + "rewards/no_repetition_reward_func": -0.2745730131864548, + "rewards/verse_reward_func": 0.0, + "step": 664 + }, + { + "completion_length": 509.453125, + "epoch": 5.32, + "grad_norm": 0.203125, + "kl": 0.1906152442097664, + "learning_rate": 2.656976298823284e-05, + "loss": 0.0076, + "reward": 5.839623689651489, + "reward_std": 3.4591480493545532, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.189352512359619, + "rewards/no_repetition_reward_func": -0.3184790164232254, + "rewards/verse_reward_func": -0.03125, + "step": 665 + }, + { + "completion_length": 513.953125, + "epoch": 5.328, + "grad_norm": 0.2041015625, + "kl": 0.24031688272953033, + "learning_rate": 2.650008154839052e-05, + "loss": 0.0096, + "reward": 5.651813983917236, + "reward_std": 3.3952611684799194, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.943041086196899, + "rewards/no_repetition_reward_func": -0.27560243010520935, + "rewards/verse_reward_func": -0.015625, + "step": 666 + }, + { + "completion_length": 513.8125, + "epoch": 5.336, + "grad_norm": 0.17578125, + "kl": 0.22338654845952988, + "learning_rate": 2.6430388410610955e-05, + "loss": 0.0089, + "reward": 5.388790607452393, + "reward_std": 3.225326418876648, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.673320531845093, + "rewards/no_repetition_reward_func": -0.27671732008457184, + "rewards/verse_reward_func": -0.0078125, + "step": 667 + }, + { + "completion_length": 515.828125, + "epoch": 5.344, + "grad_norm": 0.201171875, + "kl": 0.23840393126010895, + "learning_rate": 2.636068411837523e-05, + "loss": 0.0095, + "reward": 5.239973306655884, + "reward_std": 3.1339597702026367, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.5067713260650635, + "rewards/no_repetition_reward_func": -0.26679830253124237, + "rewards/verse_reward_func": 0.0, + "step": 668 + }, + { + "completion_length": 510.453125, + "epoch": 5.352, + "grad_norm": 0.1865234375, + "kl": 0.19500833749771118, + "learning_rate": 2.6290969215251416e-05, + "loss": 0.0078, + "reward": 5.97485089302063, + "reward_std": 2.665772020816803, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.279499053955078, + "rewards/no_repetition_reward_func": -0.2968357503414154, + "rewards/verse_reward_func": -0.0078125, + "step": 669 + }, + { + "completion_length": 516.0, + "epoch": 5.36, + "grad_norm": 0.1826171875, + "kl": 0.2586742118000984, + "learning_rate": 2.6221244244890336e-05, + "loss": 0.0103, + "reward": 5.078291654586792, + "reward_std": 3.333042025566101, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.341426134109497, + "rewards/no_repetition_reward_func": -0.2631344348192215, + "rewards/verse_reward_func": 0.0, + "step": 670 + }, + { + "completion_length": 510.640625, + "epoch": 5.368, + "grad_norm": 0.2275390625, + "kl": 0.22702453285455704, + "learning_rate": 2.615150975102131e-05, + "loss": 0.0091, + "reward": 5.8652472496032715, + "reward_std": 2.6521828174591064, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.161533832550049, + "rewards/no_repetition_reward_func": -0.28066176176071167, + "rewards/verse_reward_func": -0.015625, + "step": 671 + }, + { + "completion_length": 513.296875, + "epoch": 5.376, + "grad_norm": 0.197265625, + "kl": 0.2730438634753227, + "learning_rate": 2.6081766277447927e-05, + "loss": 0.0109, + "reward": 4.462066888809204, + "reward_std": 2.8929654359817505, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.755192518234253, + "rewards/no_repetition_reward_func": -0.24625037610530853, + "rewards/verse_reward_func": -0.046875, + "step": 672 + }, + { + "completion_length": 514.578125, + "epoch": 5.384, + "grad_norm": 0.1748046875, + "kl": 0.17482279241085052, + "learning_rate": 2.6012014368043814e-05, + "loss": 0.007, + "reward": 5.96382737159729, + "reward_std": 3.0522634983062744, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.277121305465698, + "rewards/no_repetition_reward_func": -0.3132939040660858, + "rewards/verse_reward_func": 0.0, + "step": 673 + }, + { + "completion_length": 514.546875, + "epoch": 5.392, + "grad_norm": 0.1708984375, + "kl": 0.21739815175533295, + "learning_rate": 2.594225456674837e-05, + "loss": 0.0087, + "reward": 5.72670841217041, + "reward_std": 2.8930301666259766, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.030989646911621, + "rewards/no_repetition_reward_func": -0.28084365278482437, + "rewards/verse_reward_func": -0.0234375, + "step": 674 + }, + { + "completion_length": 516.0, + "epoch": 5.4, + "grad_norm": 0.228515625, + "kl": 0.2642039954662323, + "learning_rate": 2.587248741756253e-05, + "loss": 0.0106, + "reward": 5.041175842285156, + "reward_std": 2.445306897163391, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.3098602294921875, + "rewards/no_repetition_reward_func": -0.2608712166547775, + "rewards/verse_reward_func": -0.0078125, + "step": 675 + }, + { + "completion_length": 515.75, + "epoch": 5.408, + "grad_norm": 0.1787109375, + "kl": 0.21472570300102234, + "learning_rate": 2.5802713464544542e-05, + "loss": 0.0086, + "reward": 6.234508991241455, + "reward_std": 2.4853627681732178, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.544671297073364, + "rewards/no_repetition_reward_func": -0.2945369780063629, + "rewards/verse_reward_func": 0.0, + "step": 676 + }, + { + "completion_length": 511.09375, + "epoch": 5.416, + "grad_norm": 0.203125, + "kl": 0.1550954505801201, + "learning_rate": 2.5732933251805713e-05, + "loss": 0.0062, + "reward": 6.4582555294036865, + "reward_std": 2.5962283611297607, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.796572685241699, + "rewards/no_repetition_reward_func": -0.33831728994846344, + "rewards/verse_reward_func": 0.0, + "step": 677 + }, + { + "completion_length": 508.140625, + "epoch": 5.424, + "grad_norm": 0.306640625, + "kl": 0.2668718546628952, + "learning_rate": 2.566314732350615e-05, + "loss": 0.0107, + "reward": 4.955024242401123, + "reward_std": 3.1688945293426514, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.2199718952178955, + "rewards/no_repetition_reward_func": -0.24932251870632172, + "rewards/verse_reward_func": -0.015625, + "step": 678 + }, + { + "completion_length": 516.0, + "epoch": 5.432, + "grad_norm": 0.2021484375, + "kl": 0.22789643704891205, + "learning_rate": 2.559335622385055e-05, + "loss": 0.0091, + "reward": 5.372319459915161, + "reward_std": 3.5704230070114136, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.670783519744873, + "rewards/no_repetition_reward_func": -0.27502669394016266, + "rewards/verse_reward_func": -0.0078125, + "step": 679 + }, + { + "completion_length": 516.0, + "epoch": 5.44, + "grad_norm": 0.19140625, + "kl": 0.26405711472034454, + "learning_rate": 2.5523560497083926e-05, + "loss": 0.0106, + "reward": 5.192977428436279, + "reward_std": 2.8473607301712036, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.461740493774414, + "rewards/no_repetition_reward_func": -0.2609505206346512, + "rewards/verse_reward_func": -0.0078125, + "step": 680 + }, + { + "completion_length": 516.0, + "epoch": 5.448, + "grad_norm": 0.1953125, + "kl": 0.2503761649131775, + "learning_rate": 2.545376068748737e-05, + "loss": 0.01, + "reward": 5.265399217605591, + "reward_std": 3.6802051067352295, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.520808935165405, + "rewards/no_repetition_reward_func": -0.25540996342897415, + "rewards/verse_reward_func": 0.0, + "step": 681 + }, + { + "completion_length": 501.96875, + "epoch": 5.456, + "grad_norm": 0.32421875, + "kl": 0.16176078096032143, + "learning_rate": 2.5383957339373825e-05, + "loss": 0.0065, + "reward": 6.579750299453735, + "reward_std": 2.746151566505432, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.926898717880249, + "rewards/no_repetition_reward_func": -0.3237111568450928, + "rewards/verse_reward_func": -0.0234375, + "step": 682 + }, + { + "completion_length": 516.0, + "epoch": 5.464, + "grad_norm": 0.18359375, + "kl": 0.15263376384973526, + "learning_rate": 2.531415099708382e-05, + "loss": 0.0061, + "reward": 6.607088565826416, + "reward_std": 2.62143075466156, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.954285383224487, + "rewards/no_repetition_reward_func": -0.34719739854335785, + "rewards/verse_reward_func": 0.0, + "step": 683 + }, + { + "completion_length": 509.46875, + "epoch": 5.4719999999999995, + "grad_norm": 0.1875, + "kl": 0.173417866230011, + "learning_rate": 2.524434220498123e-05, + "loss": 0.0069, + "reward": 5.664780378341675, + "reward_std": 3.6255499124526978, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.9787609577178955, + "rewards/no_repetition_reward_func": -0.31398046016693115, + "rewards/verse_reward_func": 0.0, + "step": 684 + }, + { + "completion_length": 511.328125, + "epoch": 5.48, + "grad_norm": 0.1953125, + "kl": 0.3373987227678299, + "learning_rate": 2.517453150744904e-05, + "loss": 0.0135, + "reward": 3.5758272409439087, + "reward_std": 2.8234939575195312, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.817586302757263, + "rewards/no_repetition_reward_func": -0.2026965729892254, + "rewards/verse_reward_func": -0.0390625, + "step": 685 + }, + { + "completion_length": 516.0, + "epoch": 5.4879999999999995, + "grad_norm": 0.1728515625, + "kl": 0.184630885720253, + "learning_rate": 2.51047194488851e-05, + "loss": 0.0074, + "reward": 5.937535524368286, + "reward_std": 3.181824803352356, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.249551296234131, + "rewards/no_repetition_reward_func": -0.31201550364494324, + "rewards/verse_reward_func": 0.0, + "step": 686 + }, + { + "completion_length": 514.453125, + "epoch": 5.496, + "grad_norm": 0.201171875, + "kl": 0.1751096546649933, + "learning_rate": 2.5034906573697864e-05, + "loss": 0.007, + "reward": 6.388206243515015, + "reward_std": 3.042833685874939, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.707280397415161, + "rewards/no_repetition_reward_func": -0.31907421350479126, + "rewards/verse_reward_func": 0.0, + "step": 687 + }, + { + "completion_length": 516.0, + "epoch": 5.504, + "grad_norm": 0.17578125, + "kl": 0.14149917662143707, + "learning_rate": 2.496509342630214e-05, + "loss": 0.0057, + "reward": 6.66121768951416, + "reward_std": 3.0561769008636475, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.0223236083984375, + "rewards/no_repetition_reward_func": -0.3454810678958893, + "rewards/verse_reward_func": -0.015625, + "step": 688 + }, + { + "completion_length": 516.0, + "epoch": 5.5120000000000005, + "grad_norm": 0.1630859375, + "kl": 0.188470758497715, + "learning_rate": 2.4895280551114907e-05, + "loss": 0.0075, + "reward": 6.380093097686768, + "reward_std": 2.8208121061325073, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.6962058544158936, + "rewards/no_repetition_reward_func": -0.31611302495002747, + "rewards/verse_reward_func": 0.0, + "step": 689 + }, + { + "completion_length": 516.0, + "epoch": 5.52, + "grad_norm": 0.1904296875, + "kl": 0.24167504161596298, + "learning_rate": 2.4825468492550964e-05, + "loss": 0.0097, + "reward": 5.567473888397217, + "reward_std": 2.788156032562256, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.860048294067383, + "rewards/no_repetition_reward_func": -0.27694928646087646, + "rewards/verse_reward_func": -0.015625, + "step": 690 + }, + { + "completion_length": 515.203125, + "epoch": 5.5280000000000005, + "grad_norm": 0.2041015625, + "kl": 0.154065303504467, + "learning_rate": 2.475565779501878e-05, + "loss": 0.0062, + "reward": 6.578710556030273, + "reward_std": 2.520150899887085, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.905343532562256, + "rewards/no_repetition_reward_func": -0.3266321122646332, + "rewards/verse_reward_func": 0.0, + "step": 691 + }, + { + "completion_length": 508.578125, + "epoch": 5.536, + "grad_norm": 0.4375, + "kl": 0.1892726719379425, + "learning_rate": 2.4685849002916183e-05, + "loss": 0.0076, + "reward": 5.825688362121582, + "reward_std": 2.9695732593536377, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.124634742736816, + "rewards/no_repetition_reward_func": -0.2989467978477478, + "rewards/verse_reward_func": 0.0, + "step": 692 + }, + { + "completion_length": 516.0, + "epoch": 5.5440000000000005, + "grad_norm": 0.1875, + "kl": 0.20913514494895935, + "learning_rate": 2.4616042660626177e-05, + "loss": 0.0084, + "reward": 5.059658765792847, + "reward_std": 3.467974543571472, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.353377819061279, + "rewards/no_repetition_reward_func": -0.2859063446521759, + "rewards/verse_reward_func": -0.0078125, + "step": 693 + }, + { + "completion_length": 513.34375, + "epoch": 5.552, + "grad_norm": 0.2333984375, + "kl": 0.18050530552864075, + "learning_rate": 2.4546239312512635e-05, + "loss": 0.0072, + "reward": 6.664430856704712, + "reward_std": 1.9690083861351013, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.995410442352295, + "rewards/no_repetition_reward_func": -0.33097973465919495, + "rewards/verse_reward_func": 0.0, + "step": 694 + }, + { + "completion_length": 513.640625, + "epoch": 5.5600000000000005, + "grad_norm": 0.166015625, + "kl": 0.22391174733638763, + "learning_rate": 2.447643950291608e-05, + "loss": 0.009, + "reward": 5.81162691116333, + "reward_std": 3.1062376499176025, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.107891082763672, + "rewards/no_repetition_reward_func": -0.28845153748989105, + "rewards/verse_reward_func": -0.0078125, + "step": 695 + }, + { + "completion_length": 512.546875, + "epoch": 5.568, + "grad_norm": 0.1982421875, + "kl": 0.21921157836914062, + "learning_rate": 2.4406643776149458e-05, + "loss": 0.0088, + "reward": 6.209155082702637, + "reward_std": 2.943284749984741, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.517666816711426, + "rewards/no_repetition_reward_func": -0.2928866297006607, + "rewards/verse_reward_func": 0.0, + "step": 696 + }, + { + "completion_length": 516.0, + "epoch": 5.576, + "grad_norm": 0.1943359375, + "kl": 0.17793772369623184, + "learning_rate": 2.4336852676493847e-05, + "loss": 0.0071, + "reward": 6.3866589069366455, + "reward_std": 3.374434471130371, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.713340997695923, + "rewards/no_repetition_reward_func": -0.32668234407901764, + "rewards/verse_reward_func": 0.0, + "step": 697 + }, + { + "completion_length": 514.046875, + "epoch": 5.584, + "grad_norm": 0.1787109375, + "kl": 0.18776960670948029, + "learning_rate": 2.4267066748194296e-05, + "loss": 0.0075, + "reward": 6.228465557098389, + "reward_std": 2.694515883922577, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.550173759460449, + "rewards/no_repetition_reward_func": -0.3217076063156128, + "rewards/verse_reward_func": 0.0, + "step": 698 + }, + { + "completion_length": 516.0, + "epoch": 5.592, + "grad_norm": 0.1953125, + "kl": 0.17215180397033691, + "learning_rate": 2.4197286535455464e-05, + "loss": 0.0069, + "reward": 6.5055832862854, + "reward_std": 2.970685362815857, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.824329376220703, + "rewards/no_repetition_reward_func": -0.3187464475631714, + "rewards/verse_reward_func": 0.0, + "step": 699 + }, + { + "completion_length": 511.234375, + "epoch": 5.6, + "grad_norm": 0.1884765625, + "kl": 0.15369392931461334, + "learning_rate": 2.4127512582437485e-05, + "loss": 0.0061, + "reward": 7.346147537231445, + "reward_std": 2.118213653564453, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.681209087371826, + "rewards/no_repetition_reward_func": -0.33506132662296295, + "rewards/verse_reward_func": 0.0, + "step": 700 + }, + { + "completion_length": 505.90625, + "epoch": 5.608, + "grad_norm": 0.1845703125, + "kl": 0.19633637368679047, + "learning_rate": 2.4057745433251635e-05, + "loss": 0.0079, + "reward": 5.7891845703125, + "reward_std": 2.5196900963783264, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.095479249954224, + "rewards/no_repetition_reward_func": -0.2906699627637863, + "rewards/verse_reward_func": -0.015625, + "step": 701 + }, + { + "completion_length": 507.828125, + "epoch": 5.616, + "grad_norm": 0.2236328125, + "kl": 0.22191153466701508, + "learning_rate": 2.398798563195619e-05, + "loss": 0.0089, + "reward": 4.79936671257019, + "reward_std": 2.568552017211914, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.092932462692261, + "rewards/no_repetition_reward_func": -0.27012838423252106, + "rewards/verse_reward_func": -0.0234375, + "step": 702 + }, + { + "completion_length": 512.328125, + "epoch": 5.624, + "grad_norm": 0.19140625, + "kl": 0.1998957097530365, + "learning_rate": 2.391823372255208e-05, + "loss": 0.008, + "reward": 6.298149108886719, + "reward_std": 2.091644287109375, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.601037263870239, + "rewards/no_repetition_reward_func": -0.2950759679079056, + "rewards/verse_reward_func": -0.0078125, + "step": 703 + }, + { + "completion_length": 516.0, + "epoch": 5.632, + "grad_norm": 0.19140625, + "kl": 0.2645105719566345, + "learning_rate": 2.384849024897869e-05, + "loss": 0.0106, + "reward": 4.993340015411377, + "reward_std": 2.9146488904953003, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.280778408050537, + "rewards/no_repetition_reward_func": -0.2561881020665169, + "rewards/verse_reward_func": -0.03125, + "step": 704 + }, + { + "completion_length": 516.0, + "epoch": 5.64, + "grad_norm": 0.15234375, + "kl": 0.15134360641241074, + "learning_rate": 2.377875575510967e-05, + "loss": 0.0061, + "reward": 6.060352802276611, + "reward_std": 3.3283865451812744, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.417653560638428, + "rewards/no_repetition_reward_func": -0.3416752964258194, + "rewards/verse_reward_func": -0.015625, + "step": 705 + }, + { + "completion_length": 516.0, + "epoch": 5.648, + "grad_norm": 0.1748046875, + "kl": 0.1758585199713707, + "learning_rate": 2.3709030784748587e-05, + "loss": 0.007, + "reward": 6.332947254180908, + "reward_std": 3.3395482301712036, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.6508469581604, + "rewards/no_repetition_reward_func": -0.3179000914096832, + "rewards/verse_reward_func": 0.0, + "step": 706 + }, + { + "completion_length": 514.421875, + "epoch": 5.656, + "grad_norm": 0.171875, + "kl": 0.23633674532175064, + "learning_rate": 2.3639315881624777e-05, + "loss": 0.0095, + "reward": 5.307573556900024, + "reward_std": 3.2242904901504517, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.599431753158569, + "rewards/no_repetition_reward_func": -0.28404583036899567, + "rewards/verse_reward_func": -0.0078125, + "step": 707 + }, + { + "completion_length": 515.0, + "epoch": 5.664, + "grad_norm": 0.1806640625, + "kl": 0.21851187944412231, + "learning_rate": 2.3569611589389047e-05, + "loss": 0.0087, + "reward": 6.053457260131836, + "reward_std": 3.358890175819397, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.348464488983154, + "rewards/no_repetition_reward_func": -0.2871946096420288, + "rewards/verse_reward_func": -0.0078125, + "step": 708 + }, + { + "completion_length": 516.0, + "epoch": 5.672, + "grad_norm": 0.185546875, + "kl": 0.24413596093654633, + "learning_rate": 2.349991845160949e-05, + "loss": 0.0098, + "reward": 5.195557594299316, + "reward_std": 3.5528547763824463, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.444633483886719, + "rewards/no_repetition_reward_func": -0.24907545745372772, + "rewards/verse_reward_func": 0.0, + "step": 709 + }, + { + "completion_length": 507.78125, + "epoch": 5.68, + "grad_norm": 0.1923828125, + "kl": 0.16206902265548706, + "learning_rate": 2.3430237011767167e-05, + "loss": 0.0065, + "reward": 6.77687668800354, + "reward_std": 2.679603099822998, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 7.130105257034302, + "rewards/no_repetition_reward_func": -0.33760349452495575, + "rewards/verse_reward_func": 0.0, + "step": 710 + }, + { + "completion_length": 510.296875, + "epoch": 5.688, + "grad_norm": 0.203125, + "kl": 0.20391343533992767, + "learning_rate": 2.336056781325197e-05, + "loss": 0.0082, + "reward": 6.355374097824097, + "reward_std": 1.5630100071430206, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 6.710090160369873, + "rewards/no_repetition_reward_func": -0.30002858489751816, + "rewards/verse_reward_func": -0.0234375, + "step": 711 + }, + { + "completion_length": 509.921875, + "epoch": 5.696, + "grad_norm": 0.19140625, + "kl": 0.17729371786117554, + "learning_rate": 2.3290911399358285e-05, + "loss": 0.0071, + "reward": 5.891215562820435, + "reward_std": 3.064835548400879, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.226222515106201, + "rewards/no_repetition_reward_func": -0.31938230991363525, + "rewards/verse_reward_func": -0.015625, + "step": 712 + }, + { + "completion_length": 511.25, + "epoch": 5.704, + "grad_norm": 0.1884765625, + "kl": 0.1643700897693634, + "learning_rate": 2.3221268313280838e-05, + "loss": 0.0066, + "reward": 6.522888898849487, + "reward_std": 2.735443353652954, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.848114728927612, + "rewards/no_repetition_reward_func": -0.32522575557231903, + "rewards/verse_reward_func": 0.0, + "step": 713 + }, + { + "completion_length": 516.0, + "epoch": 5.712, + "grad_norm": 0.193359375, + "kl": 0.18962538987398148, + "learning_rate": 2.3151639098110377e-05, + "loss": 0.0076, + "reward": 6.375661373138428, + "reward_std": 1.9941034317016602, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.728941202163696, + "rewards/no_repetition_reward_func": -0.3220299184322357, + "rewards/verse_reward_func": -0.03125, + "step": 714 + }, + { + "completion_length": 516.0, + "epoch": 5.72, + "grad_norm": 0.1953125, + "kl": 0.27200255542993546, + "learning_rate": 2.3082024296829536e-05, + "loss": 0.0109, + "reward": 5.447978734970093, + "reward_std": 1.6380634307861328, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.7222795486450195, + "rewards/no_repetition_reward_func": -0.2508634477853775, + "rewards/verse_reward_func": -0.0234375, + "step": 715 + }, + { + "completion_length": 513.609375, + "epoch": 5.728, + "grad_norm": 0.1669921875, + "kl": 0.31218069791793823, + "learning_rate": 2.301242445230851e-05, + "loss": 0.0125, + "reward": 3.3930611610412598, + "reward_std": 3.378011465072632, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.627830147743225, + "rewards/no_repetition_reward_func": -0.21133162826299667, + "rewards/verse_reward_func": -0.0234375, + "step": 716 + }, + { + "completion_length": 513.453125, + "epoch": 5.736, + "grad_norm": 0.1904296875, + "kl": 0.23356442153453827, + "learning_rate": 2.294284010730086e-05, + "loss": 0.0093, + "reward": 5.7910990715026855, + "reward_std": 2.3911900520324707, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.090015172958374, + "rewards/no_repetition_reward_func": -0.2832910865545273, + "rewards/verse_reward_func": -0.015625, + "step": 717 + }, + { + "completion_length": 504.8125, + "epoch": 5.744, + "grad_norm": 0.224609375, + "kl": 0.2540774494409561, + "learning_rate": 2.28732718044393e-05, + "loss": 0.0102, + "reward": 5.179577827453613, + "reward_std": 3.257111072540283, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.471523284912109, + "rewards/no_repetition_reward_func": -0.2606954723596573, + "rewards/verse_reward_func": -0.015625, + "step": 718 + }, + { + "completion_length": 499.65625, + "epoch": 5.752, + "grad_norm": 0.609375, + "kl": 0.24407707154750824, + "learning_rate": 2.280372008623142e-05, + "loss": 0.0098, + "reward": 4.502797603607178, + "reward_std": 3.3580939769744873, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.782882452011108, + "rewards/no_repetition_reward_func": -0.24883447587490082, + "rewards/verse_reward_func": -0.03125, + "step": 719 + }, + { + "completion_length": 510.65625, + "epoch": 5.76, + "grad_norm": 0.1708984375, + "kl": 0.2301592156291008, + "learning_rate": 2.2734185495055503e-05, + "loss": 0.0092, + "reward": 5.159515738487244, + "reward_std": 3.765817165374756, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.4373459815979, + "rewards/no_repetition_reward_func": -0.27783019095659256, + "rewards/verse_reward_func": 0.0, + "step": 720 + }, + { + "completion_length": 505.796875, + "epoch": 5.768, + "grad_norm": 0.357421875, + "kl": 0.20448963344097137, + "learning_rate": 2.266466857315624e-05, + "loss": 0.0082, + "reward": 6.11919093132019, + "reward_std": 3.4051852226257324, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.470093488693237, + "rewards/no_repetition_reward_func": -0.3118402510881424, + "rewards/verse_reward_func": -0.0390625, + "step": 721 + }, + { + "completion_length": 502.328125, + "epoch": 5.776, + "grad_norm": 0.30078125, + "kl": 0.22968008369207382, + "learning_rate": 2.2595169862640568e-05, + "loss": 0.0092, + "reward": 5.111587285995483, + "reward_std": 3.7305744886398315, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.3969526290893555, + "rewards/no_repetition_reward_func": -0.2697402536869049, + "rewards/verse_reward_func": -0.015625, + "step": 722 + }, + { + "completion_length": 503.171875, + "epoch": 5.784, + "grad_norm": 0.26171875, + "kl": 0.36373433470726013, + "learning_rate": 2.2525689905473376e-05, + "loss": 0.0145, + "reward": 3.446166753768921, + "reward_std": 3.3266544342041016, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 3.6642398834228516, + "rewards/no_repetition_reward_func": -0.19463561475276947, + "rewards/verse_reward_func": -0.0078125, + "step": 723 + }, + { + "completion_length": 516.0, + "epoch": 5.792, + "grad_norm": 0.205078125, + "kl": 0.23208089172840118, + "learning_rate": 2.2456229243473345e-05, + "loss": 0.0093, + "reward": 6.218995571136475, + "reward_std": 2.988460898399353, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.517859220504761, + "rewards/no_repetition_reward_func": -0.298863485455513, + "rewards/verse_reward_func": 0.0, + "step": 724 + }, + { + "completion_length": 508.328125, + "epoch": 5.8, + "grad_norm": 0.19921875, + "kl": 0.2677079513669014, + "learning_rate": 2.238678841830867e-05, + "loss": 0.0107, + "reward": 4.829641103744507, + "reward_std": 3.293970823287964, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.087658405303955, + "rewards/no_repetition_reward_func": -0.25020459294319153, + "rewards/verse_reward_func": -0.0078125, + "step": 725 + }, + { + "completion_length": 516.0, + "epoch": 5.808, + "grad_norm": 0.162109375, + "kl": 0.19857291877269745, + "learning_rate": 2.2317367971492835e-05, + "loss": 0.0079, + "reward": 5.43665337562561, + "reward_std": 3.294142961502075, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.765371084213257, + "rewards/no_repetition_reward_func": -0.3052804172039032, + "rewards/verse_reward_func": -0.0078125, + "step": 726 + }, + { + "completion_length": 516.0, + "epoch": 5.816, + "grad_norm": 0.1728515625, + "kl": 0.17542316764593124, + "learning_rate": 2.224796844438045e-05, + "loss": 0.007, + "reward": 6.179279804229736, + "reward_std": 2.985247254371643, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.50320839881897, + "rewards/no_repetition_reward_func": -0.31611672043800354, + "rewards/verse_reward_func": -0.0078125, + "step": 727 + }, + { + "completion_length": 509.265625, + "epoch": 5.824, + "grad_norm": 0.208984375, + "kl": 0.11834131553769112, + "learning_rate": 2.217859037816296e-05, + "loss": 0.0047, + "reward": 7.068414688110352, + "reward_std": 2.987117648124695, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 7.443948268890381, + "rewards/no_repetition_reward_func": -0.35990865528583527, + "rewards/verse_reward_func": 0.0, + "step": 728 + }, + { + "completion_length": 516.0, + "epoch": 5.832, + "grad_norm": 0.2080078125, + "kl": 0.22758492827415466, + "learning_rate": 2.2109234313864465e-05, + "loss": 0.0091, + "reward": 5.933314085006714, + "reward_std": 3.0294418334960938, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.242379903793335, + "rewards/no_repetition_reward_func": -0.2778160348534584, + "rewards/verse_reward_func": -0.015625, + "step": 729 + }, + { + "completion_length": 506.15625, + "epoch": 5.84, + "grad_norm": 0.181640625, + "kl": 0.2005663514137268, + "learning_rate": 2.2039900792337474e-05, + "loss": 0.008, + "reward": 5.918019771575928, + "reward_std": 3.4873164892196655, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.238011598587036, + "rewards/no_repetition_reward_func": -0.3043665587902069, + "rewards/verse_reward_func": 0.0, + "step": 730 + }, + { + "completion_length": 511.984375, + "epoch": 5.848, + "grad_norm": 0.19921875, + "kl": 0.27172476053237915, + "learning_rate": 2.1970590354258745e-05, + "loss": 0.0109, + "reward": 4.8352437019348145, + "reward_std": 2.6442973613739014, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.096220254898071, + "rewards/no_repetition_reward_func": -0.2453519031405449, + "rewards/verse_reward_func": 0.0, + "step": 731 + }, + { + "completion_length": 514.9375, + "epoch": 5.856, + "grad_norm": 0.169921875, + "kl": 0.12077179551124573, + "learning_rate": 2.1901303540124956e-05, + "loss": 0.0048, + "reward": 7.051027536392212, + "reward_std": 2.9944779872894287, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.421402215957642, + "rewards/no_repetition_reward_func": -0.3625619262456894, + "rewards/verse_reward_func": -0.0078125, + "step": 732 + }, + { + "completion_length": 512.65625, + "epoch": 5.864, + "grad_norm": 0.1708984375, + "kl": 0.21908294409513474, + "learning_rate": 2.183204089024864e-05, + "loss": 0.0088, + "reward": 5.492286205291748, + "reward_std": 3.1466927528381348, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.791689157485962, + "rewards/no_repetition_reward_func": -0.2994031459093094, + "rewards/verse_reward_func": 0.0, + "step": 733 + }, + { + "completion_length": 508.796875, + "epoch": 5.872, + "grad_norm": 0.173828125, + "kl": 0.1057685986161232, + "learning_rate": 2.176280294475383e-05, + "loss": 0.0042, + "reward": 7.331297397613525, + "reward_std": 2.466745615005493, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 7.71379280090332, + "rewards/no_repetition_reward_func": -0.36687037348747253, + "rewards/verse_reward_func": 0.0, + "step": 734 + }, + { + "completion_length": 505.59375, + "epoch": 5.88, + "grad_norm": 0.306640625, + "kl": 0.19485847651958466, + "learning_rate": 2.1693590243571938e-05, + "loss": 0.0078, + "reward": 5.963371276855469, + "reward_std": 2.5147998929023743, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.292361736297607, + "rewards/no_repetition_reward_func": -0.29774023592472076, + "rewards/verse_reward_func": -0.03125, + "step": 735 + }, + { + "completion_length": 510.953125, + "epoch": 5.888, + "grad_norm": 0.1787109375, + "kl": 0.2649780660867691, + "learning_rate": 2.1624403326437523e-05, + "loss": 0.0106, + "reward": 5.01844048500061, + "reward_std": 2.787447690963745, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.331363677978516, + "rewards/no_repetition_reward_func": -0.26604826748371124, + "rewards/verse_reward_func": -0.046875, + "step": 736 + }, + { + "completion_length": 505.4375, + "epoch": 5.896, + "grad_norm": 0.1875, + "kl": 0.22047294676303864, + "learning_rate": 2.155524273288405e-05, + "loss": 0.0088, + "reward": 4.972723722457886, + "reward_std": 3.4665560722351074, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.292182445526123, + "rewards/no_repetition_reward_func": -0.28039635717868805, + "rewards/verse_reward_func": -0.0234375, + "step": 737 + }, + { + "completion_length": 508.34375, + "epoch": 5.904, + "grad_norm": 0.5078125, + "kl": 0.19797049462795258, + "learning_rate": 2.148610900223973e-05, + "loss": 0.0079, + "reward": 6.495041608810425, + "reward_std": 3.147133231163025, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.827469348907471, + "rewards/no_repetition_reward_func": -0.3168022930622101, + "rewards/verse_reward_func": 0.0, + "step": 738 + }, + { + "completion_length": 516.0, + "epoch": 5.912, + "grad_norm": 0.2080078125, + "kl": 0.19102302938699722, + "learning_rate": 2.1417002673623264e-05, + "loss": 0.0076, + "reward": 6.273883581161499, + "reward_std": 3.3121002912521362, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.583108425140381, + "rewards/no_repetition_reward_func": -0.3014119565486908, + "rewards/verse_reward_func": -0.0078125, + "step": 739 + }, + { + "completion_length": 507.25, + "epoch": 5.92, + "grad_norm": 0.201171875, + "kl": 0.2055477276444435, + "learning_rate": 2.1347924285939714e-05, + "loss": 0.0082, + "reward": 5.843545436859131, + "reward_std": 3.620043992996216, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.138182878494263, + "rewards/no_repetition_reward_func": -0.2946377545595169, + "rewards/verse_reward_func": 0.0, + "step": 740 + }, + { + "completion_length": 515.109375, + "epoch": 5.928, + "grad_norm": 0.16796875, + "kl": 0.12595270574092865, + "learning_rate": 2.1278874377876197e-05, + "loss": 0.005, + "reward": 7.131193161010742, + "reward_std": 2.7459890842437744, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.488378047943115, + "rewards/no_repetition_reward_func": -0.357184961438179, + "rewards/verse_reward_func": 0.0, + "step": 741 + }, + { + "completion_length": 510.34375, + "epoch": 5.936, + "grad_norm": 0.220703125, + "kl": 0.20764509588479996, + "learning_rate": 2.1209853487897784e-05, + "loss": 0.0083, + "reward": 6.334040880203247, + "reward_std": 3.2992751598358154, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.639424085617065, + "rewards/no_repetition_reward_func": -0.305383637547493, + "rewards/verse_reward_func": 0.0, + "step": 742 + }, + { + "completion_length": 511.125, + "epoch": 5.944, + "grad_norm": 0.2001953125, + "kl": 0.1552182212471962, + "learning_rate": 2.114086215424322e-05, + "loss": 0.0062, + "reward": 6.780239105224609, + "reward_std": 2.4964685440063477, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 7.148821115493774, + "rewards/no_repetition_reward_func": -0.33733227849006653, + "rewards/verse_reward_func": 0.0, + "step": 743 + }, + { + "completion_length": 504.9375, + "epoch": 5.952, + "grad_norm": 0.1630859375, + "kl": 0.1605360060930252, + "learning_rate": 2.1071900914920816e-05, + "loss": 0.0064, + "reward": 6.303893327713013, + "reward_std": 2.499689221382141, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.637839317321777, + "rewards/no_repetition_reward_func": -0.32613368332386017, + "rewards/verse_reward_func": -0.0078125, + "step": 744 + }, + { + "completion_length": 515.375, + "epoch": 5.96, + "grad_norm": 0.185546875, + "kl": 0.2529315948486328, + "learning_rate": 2.1002970307704132e-05, + "loss": 0.0101, + "reward": 5.590619325637817, + "reward_std": 3.0038349628448486, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.870557069778442, + "rewards/no_repetition_reward_func": -0.26431263983249664, + "rewards/verse_reward_func": -0.015625, + "step": 745 + }, + { + "completion_length": 516.0, + "epoch": 5.968, + "grad_norm": 0.185546875, + "kl": 0.24745021760463715, + "learning_rate": 2.0934070870127912e-05, + "loss": 0.0099, + "reward": 5.919786691665649, + "reward_std": 2.4950015544891357, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.199976444244385, + "rewards/no_repetition_reward_func": -0.27237727493047714, + "rewards/verse_reward_func": -0.0078125, + "step": 746 + }, + { + "completion_length": 516.0, + "epoch": 5.976, + "grad_norm": 0.189453125, + "kl": 0.18178033083677292, + "learning_rate": 2.0865203139483812e-05, + "loss": 0.0073, + "reward": 6.312311172485352, + "reward_std": 3.058098554611206, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.648967266082764, + "rewards/no_repetition_reward_func": -0.32103148102760315, + "rewards/verse_reward_func": -0.015625, + "step": 747 + }, + { + "completion_length": 507.71875, + "epoch": 5.984, + "grad_norm": 0.193359375, + "kl": 0.23826581239700317, + "learning_rate": 2.0796367652816213e-05, + "loss": 0.0095, + "reward": 5.666552782058716, + "reward_std": 3.411422610282898, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.9561755657196045, + "rewards/no_repetition_reward_func": -0.2661854773759842, + "rewards/verse_reward_func": -0.0078125, + "step": 748 + }, + { + "completion_length": 515.09375, + "epoch": 5.992, + "grad_norm": 0.162109375, + "kl": 0.15963705629110336, + "learning_rate": 2.0727564946918087e-05, + "loss": 0.0064, + "reward": 6.945555686950684, + "reward_std": 2.5608036518096924, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.300014495849609, + "rewards/no_repetition_reward_func": -0.3466462939977646, + "rewards/verse_reward_func": -0.0078125, + "step": 749 + }, + { + "completion_length": 516.0, + "epoch": 6.0, + "grad_norm": 0.1572265625, + "kl": 0.12228991836309433, + "learning_rate": 2.0658795558326743e-05, + "loss": 0.0049, + "reward": 7.387929916381836, + "reward_std": 2.5184473991394043, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.7601237297058105, + "rewards/no_repetition_reward_func": -0.3721940815448761, + "rewards/verse_reward_func": 0.0, + "step": 750 + }, + { + "completion_length": 515.859375, + "epoch": 6.008, + "grad_norm": 0.1640625, + "kl": 0.14067648723721504, + "learning_rate": 2.0590060023319696e-05, + "loss": 0.0056, + "reward": 7.571289539337158, + "reward_std": 2.225834012031555, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.934950113296509, + "rewards/no_repetition_reward_func": -0.3636605590581894, + "rewards/verse_reward_func": 0.0, + "step": 751 + }, + { + "completion_length": 516.0, + "epoch": 6.016, + "grad_norm": 0.177734375, + "kl": 0.22538131475448608, + "learning_rate": 2.0521358877910444e-05, + "loss": 0.009, + "reward": 5.512373924255371, + "reward_std": 3.5157346725463867, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.825729131698608, + "rewards/no_repetition_reward_func": -0.28991789370775223, + "rewards/verse_reward_func": -0.0234375, + "step": 752 + }, + { + "completion_length": 505.015625, + "epoch": 6.024, + "grad_norm": 0.2255859375, + "kl": 0.16015058755874634, + "learning_rate": 2.0452692657844333e-05, + "loss": 0.0064, + "reward": 7.114716291427612, + "reward_std": 2.1534090638160706, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 7.4591779708862305, + "rewards/no_repetition_reward_func": -0.3288367837667465, + "rewards/verse_reward_func": 0.0, + "step": 753 + }, + { + "completion_length": 516.0, + "epoch": 6.032, + "grad_norm": 0.18359375, + "kl": 0.1081378310918808, + "learning_rate": 2.038406189859433e-05, + "loss": 0.0043, + "reward": 7.926457643508911, + "reward_std": 1.5506442785263062, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 8.30586051940918, + "rewards/no_repetition_reward_func": -0.3794022798538208, + "rewards/verse_reward_func": 0.0, + "step": 754 + }, + { + "completion_length": 514.5625, + "epoch": 6.04, + "grad_norm": 0.1708984375, + "kl": 0.19652553647756577, + "learning_rate": 2.031546713535688e-05, + "loss": 0.0079, + "reward": 5.759355545043945, + "reward_std": 3.4472659826278687, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.073369979858398, + "rewards/no_repetition_reward_func": -0.3062018007040024, + "rewards/verse_reward_func": -0.0078125, + "step": 755 + }, + { + "completion_length": 516.0, + "epoch": 6.048, + "grad_norm": 0.1826171875, + "kl": 0.19257626682519913, + "learning_rate": 2.024690890304775e-05, + "loss": 0.0077, + "reward": 6.162311792373657, + "reward_std": 3.2436476945877075, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.5146825313568115, + "rewards/no_repetition_reward_func": -0.3289329409599304, + "rewards/verse_reward_func": -0.0234375, + "step": 756 + }, + { + "completion_length": 505.671875, + "epoch": 6.056, + "grad_norm": 0.4453125, + "kl": 0.15739984810352325, + "learning_rate": 2.0178387736297773e-05, + "loss": 0.0063, + "reward": 6.242648363113403, + "reward_std": 2.9558610916137695, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.6085045337677, + "rewards/no_repetition_reward_func": -0.33460649847984314, + "rewards/verse_reward_func": -0.015625, + "step": 757 + }, + { + "completion_length": 513.125, + "epoch": 6.064, + "grad_norm": 0.19921875, + "kl": 0.20578179508447647, + "learning_rate": 2.01099041694488e-05, + "loss": 0.0082, + "reward": 6.274184226989746, + "reward_std": 3.3320603370666504, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.590709686279297, + "rewards/no_repetition_reward_func": -0.3087131232023239, + "rewards/verse_reward_func": -0.0078125, + "step": 758 + }, + { + "completion_length": 515.890625, + "epoch": 6.072, + "grad_norm": 0.189453125, + "kl": 0.3218530863523483, + "learning_rate": 2.004145873654942e-05, + "loss": 0.0129, + "reward": 4.6027233600616455, + "reward_std": 2.7109880447387695, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.859450578689575, + "rewards/no_repetition_reward_func": -0.2254771962761879, + "rewards/verse_reward_func": -0.03125, + "step": 759 + }, + { + "completion_length": 515.390625, + "epoch": 6.08, + "grad_norm": 0.1826171875, + "kl": 0.1914285644888878, + "learning_rate": 1.9973051971350888e-05, + "loss": 0.0077, + "reward": 6.5157692432403564, + "reward_std": 3.0779054164886475, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.83673620223999, + "rewards/no_repetition_reward_func": -0.3209667205810547, + "rewards/verse_reward_func": 0.0, + "step": 760 + }, + { + "completion_length": 511.453125, + "epoch": 6.088, + "grad_norm": 0.1875, + "kl": 0.27273763716220856, + "learning_rate": 1.9904684407302883e-05, + "loss": 0.0109, + "reward": 5.342665672302246, + "reward_std": 2.3653894662857056, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.628942966461182, + "rewards/no_repetition_reward_func": -0.2550269216299057, + "rewards/verse_reward_func": -0.03125, + "step": 761 + }, + { + "completion_length": 512.328125, + "epoch": 6.096, + "grad_norm": 0.1923828125, + "kl": 0.1891346275806427, + "learning_rate": 1.983635657754942e-05, + "loss": 0.0076, + "reward": 6.235241174697876, + "reward_std": 2.7509584426879883, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.553521633148193, + "rewards/no_repetition_reward_func": -0.3182802200317383, + "rewards/verse_reward_func": 0.0, + "step": 762 + }, + { + "completion_length": 516.0, + "epoch": 6.104, + "grad_norm": 0.2080078125, + "kl": 0.16999120265245438, + "learning_rate": 1.9768069014924622e-05, + "loss": 0.0068, + "reward": 6.960066318511963, + "reward_std": 2.803093671798706, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.280762434005737, + "rewards/no_repetition_reward_func": -0.3206964284181595, + "rewards/verse_reward_func": 0.0, + "step": 763 + }, + { + "completion_length": 515.09375, + "epoch": 6.112, + "grad_norm": 0.1953125, + "kl": 0.22121810913085938, + "learning_rate": 1.969982225194864e-05, + "loss": 0.0088, + "reward": 5.95420503616333, + "reward_std": 3.228724479675293, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.2472312450408936, + "rewards/no_repetition_reward_func": -0.29302653670310974, + "rewards/verse_reward_func": 0.0, + "step": 764 + }, + { + "completion_length": 516.0, + "epoch": 6.12, + "grad_norm": 0.189453125, + "kl": 0.21867690980434418, + "learning_rate": 1.963161682082342e-05, + "loss": 0.0087, + "reward": 6.080960988998413, + "reward_std": 2.5647419691085815, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.377989768981934, + "rewards/no_repetition_reward_func": -0.2970290184020996, + "rewards/verse_reward_func": 0.0, + "step": 765 + }, + { + "completion_length": 515.34375, + "epoch": 6.128, + "grad_norm": 0.171875, + "kl": 0.2865344062447548, + "learning_rate": 1.956345325342863e-05, + "loss": 0.0115, + "reward": 4.733314752578735, + "reward_std": 2.892749786376953, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.016860485076904, + "rewards/no_repetition_reward_func": -0.2366705909371376, + "rewards/verse_reward_func": -0.046875, + "step": 766 + }, + { + "completion_length": 500.578125, + "epoch": 6.136, + "grad_norm": 0.2421875, + "kl": 0.1224500983953476, + "learning_rate": 1.9495332081317464e-05, + "loss": 0.0049, + "reward": 7.639893054962158, + "reward_std": 2.235634684562683, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 8.017152309417725, + "rewards/no_repetition_reward_func": -0.36163465678691864, + "rewards/verse_reward_func": 0.0, + "step": 767 + }, + { + "completion_length": 510.6875, + "epoch": 6.144, + "grad_norm": 0.25, + "kl": 0.30420735478401184, + "learning_rate": 1.942725383571249e-05, + "loss": 0.0122, + "reward": 4.82927942276001, + "reward_std": 2.9327924251556396, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.078862905502319, + "rewards/no_repetition_reward_func": -0.22614601254463196, + "rewards/verse_reward_func": -0.0234375, + "step": 768 + }, + { + "completion_length": 512.5, + "epoch": 6.152, + "grad_norm": 0.185546875, + "kl": 0.2021980881690979, + "learning_rate": 1.9359219047501565e-05, + "loss": 0.0081, + "reward": 5.464231729507446, + "reward_std": 2.9778809547424316, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.802417278289795, + "rewards/no_repetition_reward_func": -0.31474851071834564, + "rewards/verse_reward_func": -0.0234375, + "step": 769 + }, + { + "completion_length": 509.234375, + "epoch": 6.16, + "grad_norm": 0.1748046875, + "kl": 0.1853901445865631, + "learning_rate": 1.9291228247233605e-05, + "loss": 0.0074, + "reward": 6.452356576919556, + "reward_std": 2.7951886653900146, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.783344507217407, + "rewards/no_repetition_reward_func": -0.3153630942106247, + "rewards/verse_reward_func": 0.0, + "step": 770 + }, + { + "completion_length": 515.09375, + "epoch": 6.168, + "grad_norm": 0.1630859375, + "kl": 0.16003714501857758, + "learning_rate": 1.922328196511456e-05, + "loss": 0.0064, + "reward": 7.002309322357178, + "reward_std": 3.016436815261841, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.351240873336792, + "rewards/no_repetition_reward_func": -0.3489317148923874, + "rewards/verse_reward_func": 0.0, + "step": 771 + }, + { + "completion_length": 512.546875, + "epoch": 6.176, + "grad_norm": 0.208984375, + "kl": 0.21766002476215363, + "learning_rate": 1.915538073100316e-05, + "loss": 0.0087, + "reward": 5.940852403640747, + "reward_std": 3.636554002761841, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.242175102233887, + "rewards/no_repetition_reward_func": -0.2935101240873337, + "rewards/verse_reward_func": -0.0078125, + "step": 772 + }, + { + "completion_length": 515.953125, + "epoch": 6.184, + "grad_norm": 0.2109375, + "kl": 0.2552897036075592, + "learning_rate": 1.908752507440689e-05, + "loss": 0.0102, + "reward": 5.636173248291016, + "reward_std": 2.766258120536804, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.9262813329696655, + "rewards/no_repetition_reward_func": -0.274483323097229, + "rewards/verse_reward_func": -0.015625, + "step": 773 + }, + { + "completion_length": 516.0, + "epoch": 6.192, + "grad_norm": 0.173828125, + "kl": 0.16195235028862953, + "learning_rate": 1.9019715524477767e-05, + "loss": 0.0065, + "reward": 7.040695905685425, + "reward_std": 2.8603646755218506, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.408877372741699, + "rewards/no_repetition_reward_func": -0.3525571674108505, + "rewards/verse_reward_func": -0.015625, + "step": 774 + }, + { + "completion_length": 515.0625, + "epoch": 6.2, + "grad_norm": 0.1669921875, + "kl": 0.22971779853105545, + "learning_rate": 1.895195261000831e-05, + "loss": 0.0092, + "reward": 5.337550640106201, + "reward_std": 3.382264733314514, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.637848377227783, + "rewards/no_repetition_reward_func": -0.27685999870300293, + "rewards/verse_reward_func": -0.0234375, + "step": 775 + }, + { + "completion_length": 516.0, + "epoch": 6.208, + "grad_norm": 0.2197265625, + "kl": 0.3182336241006851, + "learning_rate": 1.888423685942732e-05, + "loss": 0.0127, + "reward": 4.810047388076782, + "reward_std": 2.884355068206787, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.07569432258606, + "rewards/no_repetition_reward_func": -0.24220948666334152, + "rewards/verse_reward_func": -0.0234375, + "step": 776 + }, + { + "completion_length": 514.859375, + "epoch": 6.216, + "grad_norm": 0.1728515625, + "kl": 0.14084116369485855, + "learning_rate": 1.8816568800795822e-05, + "loss": 0.0056, + "reward": 6.988978624343872, + "reward_std": 2.6574301719665527, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 7.356966972351074, + "rewards/no_repetition_reward_func": -0.34455054998397827, + "rewards/verse_reward_func": -0.0078125, + "step": 777 + }, + { + "completion_length": 509.578125, + "epoch": 6.224, + "grad_norm": 0.1865234375, + "kl": 0.25385400652885437, + "learning_rate": 1.8748948961802948e-05, + "loss": 0.0102, + "reward": 5.279417514801025, + "reward_std": 2.7072641849517822, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.554509401321411, + "rewards/no_repetition_reward_func": -0.25946735590696335, + "rewards/verse_reward_func": -0.015625, + "step": 778 + }, + { + "completion_length": 500.359375, + "epoch": 6.232, + "grad_norm": 0.294921875, + "kl": 0.19404038786888123, + "learning_rate": 1.868137786976177e-05, + "loss": 0.0078, + "reward": 6.0413498878479, + "reward_std": 2.4336193799972534, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.354204893112183, + "rewards/no_repetition_reward_func": -0.30504210293293, + "rewards/verse_reward_func": -0.0078125, + "step": 779 + }, + { + "completion_length": 511.890625, + "epoch": 6.24, + "grad_norm": 0.2001953125, + "kl": 0.17297524213790894, + "learning_rate": 1.8613856051605243e-05, + "loss": 0.0069, + "reward": 6.633041143417358, + "reward_std": 2.3864641189575195, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.977561712265015, + "rewards/no_repetition_reward_func": -0.321083202958107, + "rewards/verse_reward_func": -0.0234375, + "step": 780 + }, + { + "completion_length": 516.0, + "epoch": 6.248, + "grad_norm": 0.169921875, + "kl": 0.163069486618042, + "learning_rate": 1.8546384033882062e-05, + "loss": 0.0065, + "reward": 6.629687309265137, + "reward_std": 3.2675715684890747, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.980237722396851, + "rewards/no_repetition_reward_func": -0.3427376002073288, + "rewards/verse_reward_func": -0.0078125, + "step": 781 + }, + { + "completion_length": 516.0, + "epoch": 6.256, + "grad_norm": 0.177734375, + "kl": 0.19932983815670013, + "learning_rate": 1.8478962342752583e-05, + "loss": 0.008, + "reward": 6.33183217048645, + "reward_std": 3.175233483314514, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.648416757583618, + "rewards/no_repetition_reward_func": -0.3087720572948456, + "rewards/verse_reward_func": -0.0078125, + "step": 782 + }, + { + "completion_length": 511.546875, + "epoch": 6.264, + "grad_norm": 0.1669921875, + "kl": 0.2553647756576538, + "learning_rate": 1.841159150398469e-05, + "loss": 0.0102, + "reward": 5.365797281265259, + "reward_std": 3.111661911010742, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.7042176723480225, + "rewards/no_repetition_reward_func": -0.27592095732688904, + "rewards/verse_reward_func": -0.0625, + "step": 783 + }, + { + "completion_length": 515.578125, + "epoch": 6.272, + "grad_norm": 0.185546875, + "kl": 0.180130735039711, + "learning_rate": 1.8344272042949724e-05, + "loss": 0.0072, + "reward": 6.697963714599609, + "reward_std": 3.0550743341445923, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.023851633071899, + "rewards/no_repetition_reward_func": -0.3180757164955139, + "rewards/verse_reward_func": -0.0078125, + "step": 784 + }, + { + "completion_length": 516.0, + "epoch": 6.28, + "grad_norm": 0.1953125, + "kl": 0.1084451712667942, + "learning_rate": 1.827700448461836e-05, + "loss": 0.0043, + "reward": 7.77692437171936, + "reward_std": 2.2164286375045776, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 8.157846450805664, + "rewards/no_repetition_reward_func": -0.3809223920106888, + "rewards/verse_reward_func": 0.0, + "step": 785 + }, + { + "completion_length": 515.203125, + "epoch": 6.288, + "grad_norm": 0.1669921875, + "kl": 0.15791600942611694, + "learning_rate": 1.820978935355653e-05, + "loss": 0.0063, + "reward": 6.616531610488892, + "reward_std": 3.288264513015747, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.955045700073242, + "rewards/no_repetition_reward_func": -0.3385142832994461, + "rewards/verse_reward_func": 0.0, + "step": 786 + }, + { + "completion_length": 512.96875, + "epoch": 6.296, + "grad_norm": 0.1591796875, + "kl": 0.23218808323144913, + "learning_rate": 1.8142627173921338e-05, + "loss": 0.0093, + "reward": 5.467164754867554, + "reward_std": 3.464704155921936, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.751298427581787, + "rewards/no_repetition_reward_func": -0.2763212099671364, + "rewards/verse_reward_func": -0.0078125, + "step": 787 + }, + { + "completion_length": 511.15625, + "epoch": 6.304, + "grad_norm": 0.1884765625, + "kl": 0.19545840471982956, + "learning_rate": 1.807551846945694e-05, + "loss": 0.0078, + "reward": 6.447376489639282, + "reward_std": 3.1716195344924927, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.769300222396851, + "rewards/no_repetition_reward_func": -0.32192331552505493, + "rewards/verse_reward_func": 0.0, + "step": 788 + }, + { + "completion_length": 509.5625, + "epoch": 6.312, + "grad_norm": 0.1826171875, + "kl": 0.23764902353286743, + "learning_rate": 1.800846376349051e-05, + "loss": 0.0095, + "reward": 6.210136413574219, + "reward_std": 2.9500712156295776, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.509424448013306, + "rewards/no_repetition_reward_func": -0.2992878258228302, + "rewards/verse_reward_func": 0.0, + "step": 789 + }, + { + "completion_length": 511.125, + "epoch": 6.32, + "grad_norm": 0.203125, + "kl": 0.1548669971525669, + "learning_rate": 1.7941463578928086e-05, + "loss": 0.0062, + "reward": 6.807904243469238, + "reward_std": 2.5404438972473145, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.165772438049316, + "rewards/no_repetition_reward_func": -0.3500558137893677, + "rewards/verse_reward_func": -0.0078125, + "step": 790 + }, + { + "completion_length": 514.28125, + "epoch": 6.328, + "grad_norm": 0.208984375, + "kl": 0.20471110939979553, + "learning_rate": 1.7874518438250597e-05, + "loss": 0.0082, + "reward": 6.494274139404297, + "reward_std": 2.834963083267212, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.809087753295898, + "rewards/no_repetition_reward_func": -0.31481409072875977, + "rewards/verse_reward_func": 0.0, + "step": 791 + }, + { + "completion_length": 516.0, + "epoch": 6.336, + "grad_norm": 0.1806640625, + "kl": 0.18793028593063354, + "learning_rate": 1.7807628863509685e-05, + "loss": 0.0075, + "reward": 6.931738376617432, + "reward_std": 2.75593239068985, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.274221420288086, + "rewards/no_repetition_reward_func": -0.33467014133930206, + "rewards/verse_reward_func": -0.0078125, + "step": 792 + }, + { + "completion_length": 505.125, + "epoch": 6.344, + "grad_norm": 0.423828125, + "kl": 0.21492183208465576, + "learning_rate": 1.7740795376323692e-05, + "loss": 0.0086, + "reward": 5.733757734298706, + "reward_std": 2.4410014152526855, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.03771710395813, + "rewards/no_repetition_reward_func": -0.2961469292640686, + "rewards/verse_reward_func": -0.0078125, + "step": 793 + }, + { + "completion_length": 516.0, + "epoch": 6.352, + "grad_norm": 0.16015625, + "kl": 0.14590217918157578, + "learning_rate": 1.767401849787357e-05, + "loss": 0.0058, + "reward": 6.87451696395874, + "reward_std": 1.5293593406677246, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.219367265701294, + "rewards/no_repetition_reward_func": -0.34485042095184326, + "rewards/verse_reward_func": 0.0, + "step": 794 + }, + { + "completion_length": 514.453125, + "epoch": 6.36, + "grad_norm": 0.1611328125, + "kl": 0.18428754806518555, + "learning_rate": 1.7607298748898842e-05, + "loss": 0.0074, + "reward": 6.588886260986328, + "reward_std": 3.0521217584609985, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.9130024909973145, + "rewards/no_repetition_reward_func": -0.3163035959005356, + "rewards/verse_reward_func": -0.0078125, + "step": 795 + }, + { + "completion_length": 515.90625, + "epoch": 6.368, + "grad_norm": 0.16796875, + "kl": 0.19154923781752586, + "learning_rate": 1.7540636649693496e-05, + "loss": 0.0077, + "reward": 5.86194634437561, + "reward_std": 2.6436686515808105, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.190975189208984, + "rewards/no_repetition_reward_func": -0.31340403854846954, + "rewards/verse_reward_func": -0.015625, + "step": 796 + }, + { + "completion_length": 516.0, + "epoch": 6.376, + "grad_norm": 0.1806640625, + "kl": 0.1705680564045906, + "learning_rate": 1.747403272010199e-05, + "loss": 0.0068, + "reward": 6.7879626750946045, + "reward_std": 3.0870789289474487, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 7.149655103683472, + "rewards/no_repetition_reward_func": -0.33825506269931793, + "rewards/verse_reward_func": -0.0078125, + "step": 797 + }, + { + "completion_length": 516.0, + "epoch": 6.384, + "grad_norm": 0.1767578125, + "kl": 0.1280275508761406, + "learning_rate": 1.7407487479515147e-05, + "loss": 0.0051, + "reward": 6.427568435668945, + "reward_std": 3.1393665075302124, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.793744325637817, + "rewards/no_repetition_reward_func": -0.35836298763751984, + "rewards/verse_reward_func": -0.0078125, + "step": 798 + }, + { + "completion_length": 514.65625, + "epoch": 6.392, + "grad_norm": 0.25390625, + "kl": 0.19892165064811707, + "learning_rate": 1.73410014468661e-05, + "loss": 0.008, + "reward": 6.544047594070435, + "reward_std": 2.0198206305503845, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.8678812980651855, + "rewards/no_repetition_reward_func": -0.3082083761692047, + "rewards/verse_reward_func": -0.015625, + "step": 799 + }, + { + "completion_length": 515.40625, + "epoch": 6.4, + "grad_norm": 0.185546875, + "kl": 0.15979530662298203, + "learning_rate": 1.7274575140626318e-05, + "loss": 0.0064, + "reward": 6.712156534194946, + "reward_std": 2.9789888858795166, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.069041013717651, + "rewards/no_repetition_reward_func": -0.34907166659832, + "rewards/verse_reward_func": -0.0078125, + "step": 800 + }, + { + "completion_length": 513.28125, + "epoch": 6.408, + "grad_norm": 0.1865234375, + "kl": 0.15607761591672897, + "learning_rate": 1.7208209078801454e-05, + "loss": 0.0062, + "reward": 6.8695759773254395, + "reward_std": 2.3897876739501953, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.20291805267334, + "rewards/no_repetition_reward_func": -0.3333416134119034, + "rewards/verse_reward_func": 0.0, + "step": 801 + }, + { + "completion_length": 506.15625, + "epoch": 6.416, + "grad_norm": 0.2265625, + "kl": 0.1375354528427124, + "learning_rate": 1.7141903778927406e-05, + "loss": 0.0055, + "reward": 6.482586145401001, + "reward_std": 2.887751340866089, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.827888250350952, + "rewards/no_repetition_reward_func": -0.3296768516302109, + "rewards/verse_reward_func": -0.015625, + "step": 802 + }, + { + "completion_length": 515.0, + "epoch": 6.424, + "grad_norm": 0.1826171875, + "kl": 0.13385267555713654, + "learning_rate": 1.7075659758066208e-05, + "loss": 0.0054, + "reward": 6.963380813598633, + "reward_std": 2.934823751449585, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 7.3342978954315186, + "rewards/no_repetition_reward_func": -0.3474792391061783, + "rewards/verse_reward_func": -0.0078125, + "step": 803 + }, + { + "completion_length": 516.0, + "epoch": 6.432, + "grad_norm": 0.1787109375, + "kl": 0.27043259143829346, + "learning_rate": 1.7009477532802054e-05, + "loss": 0.0108, + "reward": 5.265694856643677, + "reward_std": 2.924737811088562, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.5371012687683105, + "rewards/no_repetition_reward_func": -0.25578125566244125, + "rewards/verse_reward_func": -0.015625, + "step": 804 + }, + { + "completion_length": 500.390625, + "epoch": 6.44, + "grad_norm": 0.328125, + "kl": 0.3204628676176071, + "learning_rate": 1.6943357619237226e-05, + "loss": 0.0128, + "reward": 5.1291210651397705, + "reward_std": 3.403800129890442, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.416815280914307, + "rewards/no_repetition_reward_func": -0.24081959575414658, + "rewards/verse_reward_func": -0.046875, + "step": 805 + }, + { + "completion_length": 516.0, + "epoch": 6.448, + "grad_norm": 0.16015625, + "kl": 0.19001927226781845, + "learning_rate": 1.6877300532988094e-05, + "loss": 0.0076, + "reward": 6.7490763664245605, + "reward_std": 2.85036301612854, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.0734522342681885, + "rewards/no_repetition_reward_func": -0.3165638744831085, + "rewards/verse_reward_func": -0.0078125, + "step": 806 + }, + { + "completion_length": 516.0, + "epoch": 6.456, + "grad_norm": 0.1787109375, + "kl": 0.24828723073005676, + "learning_rate": 1.681130678918108e-05, + "loss": 0.0099, + "reward": 5.0962454080581665, + "reward_std": 3.6821091175079346, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.402203321456909, + "rewards/no_repetition_reward_func": -0.2747075781226158, + "rewards/verse_reward_func": -0.03125, + "step": 807 + }, + { + "completion_length": 516.0, + "epoch": 6.464, + "grad_norm": 0.1630859375, + "kl": 0.1777566857635975, + "learning_rate": 1.6745376902448656e-05, + "loss": 0.0071, + "reward": 5.9705681800842285, + "reward_std": 3.108624577522278, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.28438401222229, + "rewards/no_repetition_reward_func": -0.3138158768415451, + "rewards/verse_reward_func": 0.0, + "step": 808 + }, + { + "completion_length": 508.421875, + "epoch": 6.4719999999999995, + "grad_norm": 0.494140625, + "kl": 0.15380962938070297, + "learning_rate": 1.6679511386925337e-05, + "loss": 0.0062, + "reward": 7.500708103179932, + "reward_std": 2.3143975734710693, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.857238054275513, + "rewards/no_repetition_reward_func": -0.3565298020839691, + "rewards/verse_reward_func": 0.0, + "step": 809 + }, + { + "completion_length": 516.0, + "epoch": 6.48, + "grad_norm": 0.17578125, + "kl": 0.2625695765018463, + "learning_rate": 1.6613710756243626e-05, + "loss": 0.0105, + "reward": 5.180132865905762, + "reward_std": 2.598453938961029, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.459207057952881, + "rewards/no_repetition_reward_func": -0.2634493410587311, + "rewards/verse_reward_func": -0.015625, + "step": 810 + }, + { + "completion_length": 516.0, + "epoch": 6.4879999999999995, + "grad_norm": 0.1796875, + "kl": 0.19329293072223663, + "learning_rate": 1.6547975523530075e-05, + "loss": 0.0077, + "reward": 5.967850208282471, + "reward_std": 2.924728035926819, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.2919921875, + "rewards/no_repetition_reward_func": -0.3163294792175293, + "rewards/verse_reward_func": -0.0078125, + "step": 811 + }, + { + "completion_length": 508.96875, + "epoch": 6.496, + "grad_norm": 0.19921875, + "kl": 0.24054941534996033, + "learning_rate": 1.648230620140121e-05, + "loss": 0.0096, + "reward": 5.453131675720215, + "reward_std": 3.5251433849334717, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.741034507751465, + "rewards/no_repetition_reward_func": -0.2879025489091873, + "rewards/verse_reward_func": 0.0, + "step": 812 + }, + { + "completion_length": 510.296875, + "epoch": 6.504, + "grad_norm": 0.2275390625, + "kl": 0.19387678802013397, + "learning_rate": 1.6416703301959622e-05, + "loss": 0.0078, + "reward": 6.382873296737671, + "reward_std": 3.2961524724960327, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.698626518249512, + "rewards/no_repetition_reward_func": -0.30794061720371246, + "rewards/verse_reward_func": -0.0078125, + "step": 813 + }, + { + "completion_length": 509.34375, + "epoch": 6.5120000000000005, + "grad_norm": 0.1826171875, + "kl": 0.19028941541910172, + "learning_rate": 1.635116733678988e-05, + "loss": 0.0076, + "reward": 6.523875713348389, + "reward_std": 2.9537227153778076, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.8579230308532715, + "rewards/no_repetition_reward_func": -0.3184220790863037, + "rewards/verse_reward_func": 0.0, + "step": 814 + }, + { + "completion_length": 512.0, + "epoch": 6.52, + "grad_norm": 0.1875, + "kl": 0.19803021103143692, + "learning_rate": 1.6285698816954624e-05, + "loss": 0.0079, + "reward": 6.2412965297698975, + "reward_std": 2.5011229515075684, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.544342756271362, + "rewards/no_repetition_reward_func": -0.3030463010072708, + "rewards/verse_reward_func": 0.0, + "step": 815 + }, + { + "completion_length": 511.25, + "epoch": 6.5280000000000005, + "grad_norm": 0.1904296875, + "kl": 0.1865399330854416, + "learning_rate": 1.6220298252990502e-05, + "loss": 0.0075, + "reward": 6.40861701965332, + "reward_std": 3.1091984510421753, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.7313392162323, + "rewards/no_repetition_reward_func": -0.3149094581604004, + "rewards/verse_reward_func": -0.0078125, + "step": 816 + }, + { + "completion_length": 516.0, + "epoch": 6.536, + "grad_norm": 0.1923828125, + "kl": 0.16641809046268463, + "learning_rate": 1.6154966154904265e-05, + "loss": 0.0067, + "reward": 6.419846534729004, + "reward_std": 2.992170810699463, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.741972923278809, + "rewards/no_repetition_reward_func": -0.3221266269683838, + "rewards/verse_reward_func": 0.0, + "step": 817 + }, + { + "completion_length": 516.0, + "epoch": 6.5440000000000005, + "grad_norm": 0.189453125, + "kl": 0.17070432007312775, + "learning_rate": 1.6089703032168733e-05, + "loss": 0.0068, + "reward": 6.257592678070068, + "reward_std": 3.050842523574829, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.584623575210571, + "rewards/no_repetition_reward_func": -0.3270307034254074, + "rewards/verse_reward_func": 0.0, + "step": 818 + }, + { + "completion_length": 516.0, + "epoch": 6.552, + "grad_norm": 0.17578125, + "kl": 0.21284641325473785, + "learning_rate": 1.6024509393718844e-05, + "loss": 0.0085, + "reward": 6.85333776473999, + "reward_std": 1.926945447921753, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.178687572479248, + "rewards/no_repetition_reward_func": -0.31753748655319214, + "rewards/verse_reward_func": -0.0078125, + "step": 819 + }, + { + "completion_length": 501.515625, + "epoch": 6.5600000000000005, + "grad_norm": 0.248046875, + "kl": 0.31943847239017487, + "learning_rate": 1.5959385747947698e-05, + "loss": 0.0128, + "reward": 4.512991666793823, + "reward_std": 3.1887874603271484, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.759637713432312, + "rewards/no_repetition_reward_func": -0.2153959795832634, + "rewards/verse_reward_func": -0.03125, + "step": 820 + }, + { + "completion_length": 510.4375, + "epoch": 6.568, + "grad_norm": 0.1923828125, + "kl": 0.22479645907878876, + "learning_rate": 1.5894332602702545e-05, + "loss": 0.009, + "reward": 5.744458913803101, + "reward_std": 3.015574097633362, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.051399230957031, + "rewards/no_repetition_reward_func": -0.2835026830434799, + "rewards/verse_reward_func": -0.0234375, + "step": 821 + }, + { + "completion_length": 516.0, + "epoch": 6.576, + "grad_norm": 0.2001953125, + "kl": 0.17785222083330154, + "learning_rate": 1.58293504652809e-05, + "loss": 0.0071, + "reward": 6.729619979858398, + "reward_std": 3.000239849090576, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 7.075102806091309, + "rewards/no_repetition_reward_func": -0.32985737919807434, + "rewards/verse_reward_func": 0.0, + "step": 822 + }, + { + "completion_length": 513.0, + "epoch": 6.584, + "grad_norm": 0.1982421875, + "kl": 0.24895483255386353, + "learning_rate": 1.5764439842426515e-05, + "loss": 0.01, + "reward": 5.289915323257446, + "reward_std": 2.861257314682007, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.572822093963623, + "rewards/no_repetition_reward_func": -0.2594688981771469, + "rewards/verse_reward_func": -0.0234375, + "step": 823 + }, + { + "completion_length": 515.078125, + "epoch": 6.592, + "grad_norm": 0.1845703125, + "kl": 0.21277977526187897, + "learning_rate": 1.5699601240325474e-05, + "loss": 0.0085, + "reward": 5.437778949737549, + "reward_std": 3.3216097354888916, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.754423141479492, + "rewards/no_repetition_reward_func": -0.3010195642709732, + "rewards/verse_reward_func": -0.015625, + "step": 824 + }, + { + "completion_length": 516.0, + "epoch": 6.6, + "grad_norm": 0.19921875, + "kl": 0.2247595191001892, + "learning_rate": 1.56348351646022e-05, + "loss": 0.009, + "reward": 6.184688568115234, + "reward_std": 3.2960084676742554, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.487991809844971, + "rewards/no_repetition_reward_func": -0.29549045860767365, + "rewards/verse_reward_func": -0.0078125, + "step": 825 + }, + { + "completion_length": 515.859375, + "epoch": 6.608, + "grad_norm": 0.1494140625, + "kl": 0.1636379286646843, + "learning_rate": 1.557014212031559e-05, + "loss": 0.0065, + "reward": 7.506812572479248, + "reward_std": 2.2365167140960693, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.847954034805298, + "rewards/no_repetition_reward_func": -0.3411414474248886, + "rewards/verse_reward_func": 0.0, + "step": 826 + }, + { + "completion_length": 508.125, + "epoch": 6.616, + "grad_norm": 0.4765625, + "kl": 0.2162659615278244, + "learning_rate": 1.5505522611954975e-05, + "loss": 0.0087, + "reward": 6.322905540466309, + "reward_std": 3.5107332468032837, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.656800270080566, + "rewards/no_repetition_reward_func": -0.31045734882354736, + "rewards/verse_reward_func": -0.0234375, + "step": 827 + }, + { + "completion_length": 506.8125, + "epoch": 6.624, + "grad_norm": 0.291015625, + "kl": 0.2481691762804985, + "learning_rate": 1.544097714343627e-05, + "loss": 0.0099, + "reward": 5.848810434341431, + "reward_std": 3.0803704261779785, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.168421030044556, + "rewards/no_repetition_reward_func": -0.2961735427379608, + "rewards/verse_reward_func": -0.0234375, + "step": 828 + }, + { + "completion_length": 516.0, + "epoch": 6.632, + "grad_norm": 0.158203125, + "kl": 0.1694183498620987, + "learning_rate": 1.5376506218098015e-05, + "loss": 0.0068, + "reward": 6.73761510848999, + "reward_std": 3.088104248046875, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.082449913024902, + "rewards/no_repetition_reward_func": -0.3370222896337509, + "rewards/verse_reward_func": -0.0078125, + "step": 829 + }, + { + "completion_length": 516.0, + "epoch": 6.64, + "grad_norm": 0.216796875, + "kl": 0.14702265709638596, + "learning_rate": 1.5312110338697426e-05, + "loss": 0.0059, + "reward": 7.191505670547485, + "reward_std": 2.323424220085144, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.561346054077148, + "rewards/no_repetition_reward_func": -0.3464028984308243, + "rewards/verse_reward_func": -0.0234375, + "step": 830 + }, + { + "completion_length": 515.40625, + "epoch": 6.648, + "grad_norm": 0.1826171875, + "kl": 0.17514482140541077, + "learning_rate": 1.524779000740651e-05, + "loss": 0.007, + "reward": 6.685232162475586, + "reward_std": 2.948024034500122, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.026321887969971, + "rewards/no_repetition_reward_func": -0.32546477019786835, + "rewards/verse_reward_func": -0.015625, + "step": 831 + }, + { + "completion_length": 516.0, + "epoch": 6.656, + "grad_norm": 0.16796875, + "kl": 0.17346914112567902, + "learning_rate": 1.5183545725808127e-05, + "loss": 0.0069, + "reward": 6.44541597366333, + "reward_std": 3.099383592605591, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.779925346374512, + "rewards/no_repetition_reward_func": -0.3266966789960861, + "rewards/verse_reward_func": -0.0078125, + "step": 832 + }, + { + "completion_length": 508.890625, + "epoch": 6.664, + "grad_norm": 0.1884765625, + "kl": 0.2654127776622772, + "learning_rate": 1.5119377994892094e-05, + "loss": 0.0106, + "reward": 5.665979623794556, + "reward_std": 1.9201484620571136, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.949611663818359, + "rewards/no_repetition_reward_func": -0.26800744235515594, + "rewards/verse_reward_func": -0.015625, + "step": 833 + }, + { + "completion_length": 515.59375, + "epoch": 6.672, + "grad_norm": 0.2060546875, + "kl": 0.15293572843074799, + "learning_rate": 1.505528731505126e-05, + "loss": 0.0061, + "reward": 7.220753908157349, + "reward_std": 2.6118332147598267, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.577713251113892, + "rewards/no_repetition_reward_func": -0.35695941746234894, + "rewards/verse_reward_func": 0.0, + "step": 834 + }, + { + "completion_length": 512.46875, + "epoch": 6.68, + "grad_norm": 0.171875, + "kl": 0.2698884457349777, + "learning_rate": 1.4991274186077632e-05, + "loss": 0.0108, + "reward": 5.324091672897339, + "reward_std": 2.9827611446380615, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.607865571975708, + "rewards/no_repetition_reward_func": -0.2603365629911423, + "rewards/verse_reward_func": -0.0234375, + "step": 835 + }, + { + "completion_length": 516.0, + "epoch": 6.688, + "grad_norm": 0.1689453125, + "kl": 0.17023492604494095, + "learning_rate": 1.4927339107158437e-05, + "loss": 0.0068, + "reward": 6.8210790157318115, + "reward_std": 3.194422125816345, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.1577935218811035, + "rewards/no_repetition_reward_func": -0.32890188694000244, + "rewards/verse_reward_func": -0.0078125, + "step": 836 + }, + { + "completion_length": 510.65625, + "epoch": 6.696, + "grad_norm": 0.236328125, + "kl": 0.24818094074726105, + "learning_rate": 1.4863482576872275e-05, + "loss": 0.0099, + "reward": 5.9993298053741455, + "reward_std": 2.7619487047195435, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.308370590209961, + "rewards/no_repetition_reward_func": -0.2856028974056244, + "rewards/verse_reward_func": -0.0078125, + "step": 837 + }, + { + "completion_length": 510.609375, + "epoch": 6.704, + "grad_norm": 0.2138671875, + "kl": 0.29697054624557495, + "learning_rate": 1.4799705093185181e-05, + "loss": 0.0119, + "reward": 5.4482667446136475, + "reward_std": 3.44553279876709, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.750333786010742, + "rewards/no_repetition_reward_func": -0.286441907286644, + "rewards/verse_reward_func": -0.015625, + "step": 838 + }, + { + "completion_length": 516.0, + "epoch": 6.712, + "grad_norm": 0.1826171875, + "kl": 0.1611843705177307, + "learning_rate": 1.4736007153446801e-05, + "loss": 0.0064, + "reward": 6.95662260055542, + "reward_std": 2.516318202018738, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.2942280769348145, + "rewards/no_repetition_reward_func": -0.3376053273677826, + "rewards/verse_reward_func": 0.0, + "step": 839 + }, + { + "completion_length": 501.921875, + "epoch": 6.72, + "grad_norm": 0.2197265625, + "kl": 0.24554511904716492, + "learning_rate": 1.467238925438646e-05, + "loss": 0.0098, + "reward": 5.226513624191284, + "reward_std": 3.1207586526870728, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.49443793296814, + "rewards/no_repetition_reward_func": -0.26011189818382263, + "rewards/verse_reward_func": -0.0078125, + "step": 840 + }, + { + "completion_length": 516.0, + "epoch": 6.728, + "grad_norm": 0.193359375, + "kl": 0.2225652039051056, + "learning_rate": 1.4608851892109304e-05, + "loss": 0.0089, + "reward": 6.232762336730957, + "reward_std": 2.499190866947174, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.55870246887207, + "rewards/no_repetition_reward_func": -0.3103153854608536, + "rewards/verse_reward_func": 0.0, + "step": 841 + }, + { + "completion_length": 511.71875, + "epoch": 6.736, + "grad_norm": 0.1650390625, + "kl": 0.19146110862493515, + "learning_rate": 1.4545395562092468e-05, + "loss": 0.0077, + "reward": 6.3420469760894775, + "reward_std": 2.971831440925598, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.653954029083252, + "rewards/no_repetition_reward_func": -0.31190723180770874, + "rewards/verse_reward_func": 0.0, + "step": 842 + }, + { + "completion_length": 515.09375, + "epoch": 6.744, + "grad_norm": 0.1962890625, + "kl": 0.29601840674877167, + "learning_rate": 1.4482020759181135e-05, + "loss": 0.0118, + "reward": 4.497159957885742, + "reward_std": 3.007424831390381, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.738631963729858, + "rewards/no_repetition_reward_func": -0.2414720505475998, + "rewards/verse_reward_func": 0.0, + "step": 843 + }, + { + "completion_length": 516.0, + "epoch": 6.752, + "grad_norm": 0.1865234375, + "kl": 0.13232539594173431, + "learning_rate": 1.4418727977584774e-05, + "loss": 0.0053, + "reward": 7.12213659286499, + "reward_std": 2.731326460838318, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.47395920753479, + "rewards/no_repetition_reward_func": -0.3518224358558655, + "rewards/verse_reward_func": 0.0, + "step": 844 + }, + { + "completion_length": 510.34375, + "epoch": 6.76, + "grad_norm": 0.25390625, + "kl": 0.2280101627111435, + "learning_rate": 1.4355517710873184e-05, + "loss": 0.0091, + "reward": 5.935611248016357, + "reward_std": 3.5109397172927856, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.2597081661224365, + "rewards/no_repetition_reward_func": -0.3084724098443985, + "rewards/verse_reward_func": -0.015625, + "step": 845 + }, + { + "completion_length": 516.0, + "epoch": 6.768, + "grad_norm": 0.18359375, + "kl": 0.1647563800215721, + "learning_rate": 1.4292390451972745e-05, + "loss": 0.0066, + "reward": 6.668372392654419, + "reward_std": 3.1512975692749023, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.02032470703125, + "rewards/no_repetition_reward_func": -0.3363274335861206, + "rewards/verse_reward_func": -0.015625, + "step": 846 + }, + { + "completion_length": 516.0, + "epoch": 6.776, + "grad_norm": 0.1669921875, + "kl": 0.139445960521698, + "learning_rate": 1.42293466931625e-05, + "loss": 0.0056, + "reward": 7.463610887527466, + "reward_std": 2.2317153215408325, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.822521686553955, + "rewards/no_repetition_reward_func": -0.35891062021255493, + "rewards/verse_reward_func": 0.0, + "step": 847 + }, + { + "completion_length": 511.03125, + "epoch": 6.784, + "grad_norm": 0.18359375, + "kl": 0.16591069847345352, + "learning_rate": 1.4166386926070322e-05, + "loss": 0.0066, + "reward": 6.126009702682495, + "reward_std": 3.7284984588623047, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.477410554885864, + "rewards/no_repetition_reward_func": -0.3357759416103363, + "rewards/verse_reward_func": 0.0, + "step": 848 + }, + { + "completion_length": 516.0, + "epoch": 6.792, + "grad_norm": 0.181640625, + "kl": 0.2832646518945694, + "learning_rate": 1.4103511641669152e-05, + "loss": 0.0113, + "reward": 5.144346714019775, + "reward_std": 3.3437581062316895, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.4228515625, + "rewards/no_repetition_reward_func": -0.24725457280874252, + "rewards/verse_reward_func": -0.03125, + "step": 849 + }, + { + "completion_length": 516.0, + "epoch": 6.8, + "grad_norm": 0.2041015625, + "kl": 0.29926450550556183, + "learning_rate": 1.4040721330273062e-05, + "loss": 0.012, + "reward": 5.20207142829895, + "reward_std": 2.9660509824752808, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.461638927459717, + "rewards/no_repetition_reward_func": -0.24394240230321884, + "rewards/verse_reward_func": -0.015625, + "step": 850 + }, + { + "completion_length": 515.34375, + "epoch": 6.808, + "grad_norm": 0.1630859375, + "kl": 0.2128991186618805, + "learning_rate": 1.397801648153354e-05, + "loss": 0.0085, + "reward": 5.8121864795684814, + "reward_std": 2.7756924629211426, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.131340026855469, + "rewards/no_repetition_reward_func": -0.2957165092229843, + "rewards/verse_reward_func": -0.0234375, + "step": 851 + }, + { + "completion_length": 515.0625, + "epoch": 6.816, + "grad_norm": 0.19921875, + "kl": 0.18035408109426498, + "learning_rate": 1.3915397584435563e-05, + "loss": 0.0072, + "reward": 6.475473403930664, + "reward_std": 3.3597841262817383, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.795717000961304, + "rewards/no_repetition_reward_func": -0.3124309182167053, + "rewards/verse_reward_func": -0.0078125, + "step": 852 + }, + { + "completion_length": 515.796875, + "epoch": 6.824, + "grad_norm": 0.162109375, + "kl": 0.146553136408329, + "learning_rate": 1.3852865127293902e-05, + "loss": 0.0059, + "reward": 7.069700479507446, + "reward_std": 2.8513022661209106, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 7.445460319519043, + "rewards/no_repetition_reward_func": -0.360134482383728, + "rewards/verse_reward_func": 0.0, + "step": 853 + }, + { + "completion_length": 510.09375, + "epoch": 6.832, + "grad_norm": 0.232421875, + "kl": 0.20125699788331985, + "learning_rate": 1.3790419597749199e-05, + "loss": 0.0081, + "reward": 6.460810661315918, + "reward_std": 2.6549630165100098, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.763975381851196, + "rewards/no_repetition_reward_func": -0.30316443741321564, + "rewards/verse_reward_func": 0.0, + "step": 854 + }, + { + "completion_length": 516.0, + "epoch": 6.84, + "grad_norm": 0.1923828125, + "kl": 0.19869322329759598, + "learning_rate": 1.3728061482764238e-05, + "loss": 0.0079, + "reward": 6.017510652542114, + "reward_std": 2.4990289211273193, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.354944229125977, + "rewards/no_repetition_reward_func": -0.29837101697921753, + "rewards/verse_reward_func": -0.0234375, + "step": 855 + }, + { + "completion_length": 511.296875, + "epoch": 6.848, + "grad_norm": 0.1884765625, + "kl": 0.226016104221344, + "learning_rate": 1.366579126862012e-05, + "loss": 0.009, + "reward": 5.531196117401123, + "reward_std": 3.8815345764160156, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.843848943710327, + "rewards/no_repetition_reward_func": -0.28140250593423843, + "rewards/verse_reward_func": -0.03125, + "step": 856 + }, + { + "completion_length": 516.0, + "epoch": 6.856, + "grad_norm": 0.216796875, + "kl": 0.24504418671131134, + "learning_rate": 1.3603609440912507e-05, + "loss": 0.0098, + "reward": 5.571438312530518, + "reward_std": 2.8026922941207886, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.87706184387207, + "rewards/no_repetition_reward_func": -0.2743729501962662, + "rewards/verse_reward_func": -0.03125, + "step": 857 + }, + { + "completion_length": 516.0, + "epoch": 6.864, + "grad_norm": 0.169921875, + "kl": 0.2577463388442993, + "learning_rate": 1.3541516484547753e-05, + "loss": 0.0103, + "reward": 5.470128536224365, + "reward_std": 3.256219983100891, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.753244638442993, + "rewards/no_repetition_reward_func": -0.2831161767244339, + "rewards/verse_reward_func": 0.0, + "step": 858 + }, + { + "completion_length": 511.484375, + "epoch": 6.872, + "grad_norm": 0.1767578125, + "kl": 0.20346342027187347, + "learning_rate": 1.3479512883739232e-05, + "loss": 0.0081, + "reward": 6.275417804718018, + "reward_std": 2.7641009092330933, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.570582389831543, + "rewards/no_repetition_reward_func": -0.295164555311203, + "rewards/verse_reward_func": 0.0, + "step": 859 + }, + { + "completion_length": 516.0, + "epoch": 6.88, + "grad_norm": 0.1796875, + "kl": 0.2323712855577469, + "learning_rate": 1.3417599122003464e-05, + "loss": 0.0093, + "reward": 5.456499338150024, + "reward_std": 3.0018726587295532, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.7483885288238525, + "rewards/no_repetition_reward_func": -0.2840769290924072, + "rewards/verse_reward_func": -0.0078125, + "step": 860 + }, + { + "completion_length": 516.0, + "epoch": 6.888, + "grad_norm": 0.2041015625, + "kl": 0.197649247944355, + "learning_rate": 1.3355775682156393e-05, + "loss": 0.0079, + "reward": 6.546603679656982, + "reward_std": 2.737797260284424, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.879337787628174, + "rewards/no_repetition_reward_func": -0.3171091675758362, + "rewards/verse_reward_func": 0.0, + "step": 861 + }, + { + "completion_length": 514.6875, + "epoch": 6.896, + "grad_norm": 0.1875, + "kl": 0.3204898536205292, + "learning_rate": 1.329404304630964e-05, + "loss": 0.0128, + "reward": 4.393243312835693, + "reward_std": 3.6332204341888428, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.641370058059692, + "rewards/no_repetition_reward_func": -0.2325015813112259, + "rewards/verse_reward_func": -0.015625, + "step": 862 + }, + { + "completion_length": 516.0, + "epoch": 6.904, + "grad_norm": 0.1689453125, + "kl": 0.25045672059059143, + "learning_rate": 1.3232401695866687e-05, + "loss": 0.01, + "reward": 5.4434874057769775, + "reward_std": 3.118802070617676, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.7422168254852295, + "rewards/no_repetition_reward_func": -0.28310446441173553, + "rewards/verse_reward_func": -0.015625, + "step": 863 + }, + { + "completion_length": 516.0, + "epoch": 6.912, + "grad_norm": 0.16796875, + "kl": 0.22837073355913162, + "learning_rate": 1.3170852111519175e-05, + "loss": 0.0091, + "reward": 6.083322286605835, + "reward_std": 2.6811845302581787, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.389477491378784, + "rewards/no_repetition_reward_func": -0.2905304878950119, + "rewards/verse_reward_func": -0.015625, + "step": 864 + }, + { + "completion_length": 508.46875, + "epoch": 6.92, + "grad_norm": 0.2216796875, + "kl": 0.22444022446870804, + "learning_rate": 1.3109394773243117e-05, + "loss": 0.009, + "reward": 5.325010061264038, + "reward_std": 3.8559160232543945, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.6392598152160645, + "rewards/no_repetition_reward_func": -0.2986246943473816, + "rewards/verse_reward_func": 0.0, + "step": 865 + }, + { + "completion_length": 516.0, + "epoch": 6.928, + "grad_norm": 0.185546875, + "kl": 0.2932434380054474, + "learning_rate": 1.3048030160295196e-05, + "loss": 0.0117, + "reward": 4.9611968994140625, + "reward_std": 3.5135830640792847, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.2209553718566895, + "rewards/no_repetition_reward_func": -0.24413372576236725, + "rewards/verse_reward_func": -0.015625, + "step": 866 + }, + { + "completion_length": 509.9375, + "epoch": 6.936, + "grad_norm": 0.203125, + "kl": 0.34182223677635193, + "learning_rate": 1.2986758751208983e-05, + "loss": 0.0137, + "reward": 3.681870222091675, + "reward_std": 2.435154676437378, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 3.9519604444503784, + "rewards/no_repetition_reward_func": -0.20759014785289764, + "rewards/verse_reward_func": -0.046875, + "step": 867 + }, + { + "completion_length": 516.0, + "epoch": 6.944, + "grad_norm": 0.2275390625, + "kl": 0.17250486463308334, + "learning_rate": 1.292558102379124e-05, + "loss": 0.0069, + "reward": 6.727075099945068, + "reward_std": 3.109068512916565, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.039540767669678, + "rewards/no_repetition_reward_func": -0.312465101480484, + "rewards/verse_reward_func": 0.0, + "step": 868 + }, + { + "completion_length": 507.375, + "epoch": 6.952, + "grad_norm": 0.2119140625, + "kl": 0.21277575194835663, + "learning_rate": 1.2864497455118152e-05, + "loss": 0.0085, + "reward": 6.063355922698975, + "reward_std": 2.942259430885315, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.372440814971924, + "rewards/no_repetition_reward_func": -0.3012722283601761, + "rewards/verse_reward_func": -0.0078125, + "step": 869 + }, + { + "completion_length": 515.15625, + "epoch": 6.96, + "grad_norm": 0.203125, + "kl": 0.20480011403560638, + "learning_rate": 1.280350852153168e-05, + "loss": 0.0082, + "reward": 6.251322269439697, + "reward_std": 3.465595006942749, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.551779747009277, + "rewards/no_repetition_reward_func": -0.300457701086998, + "rewards/verse_reward_func": 0.0, + "step": 870 + }, + { + "completion_length": 516.0, + "epoch": 6.968, + "grad_norm": 0.1630859375, + "kl": 0.15482491254806519, + "learning_rate": 1.2742614698635782e-05, + "loss": 0.0062, + "reward": 6.388710975646973, + "reward_std": 3.2078664302825928, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.772089004516602, + "rewards/no_repetition_reward_func": -0.3443157225847244, + "rewards/verse_reward_func": -0.0390625, + "step": 871 + }, + { + "completion_length": 515.109375, + "epoch": 6.976, + "grad_norm": 0.1943359375, + "kl": 0.18400952219963074, + "learning_rate": 1.2681816461292715e-05, + "loss": 0.0074, + "reward": 6.921034336090088, + "reward_std": 2.815249800682068, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.254708766937256, + "rewards/no_repetition_reward_func": -0.3258618712425232, + "rewards/verse_reward_func": -0.0078125, + "step": 872 + }, + { + "completion_length": 515.53125, + "epoch": 6.984, + "grad_norm": 0.1728515625, + "kl": 0.21698392927646637, + "learning_rate": 1.2621114283619345e-05, + "loss": 0.0087, + "reward": 6.489520072937012, + "reward_std": 3.0512701272964478, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.814892053604126, + "rewards/no_repetition_reward_func": -0.30974726378917694, + "rewards/verse_reward_func": -0.015625, + "step": 873 + }, + { + "completion_length": 516.0, + "epoch": 6.992, + "grad_norm": 0.1728515625, + "kl": 0.1690705567598343, + "learning_rate": 1.2560508638983437e-05, + "loss": 0.0068, + "reward": 7.143735408782959, + "reward_std": 2.3558915853500366, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.4846436977386475, + "rewards/no_repetition_reward_func": -0.3409079909324646, + "rewards/verse_reward_func": 0.0, + "step": 874 + }, + { + "completion_length": 516.0, + "epoch": 7.0, + "grad_norm": 0.2490234375, + "kl": 0.21342819929122925, + "learning_rate": 1.2500000000000006e-05, + "loss": 0.0085, + "reward": 6.240611791610718, + "reward_std": 3.117835283279419, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.5285279750823975, + "rewards/no_repetition_reward_func": -0.28791651129722595, + "rewards/verse_reward_func": 0.0, + "step": 875 + }, + { + "completion_length": 501.234375, + "epoch": 7.008, + "grad_norm": 0.52734375, + "kl": 0.21716012433171272, + "learning_rate": 1.243958883852755e-05, + "loss": 0.0087, + "reward": 7.149141311645508, + "reward_std": 2.7121100425720215, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.51563835144043, + "rewards/no_repetition_reward_func": -0.35868409276008606, + "rewards/verse_reward_func": -0.0078125, + "step": 876 + }, + { + "completion_length": 516.0, + "epoch": 7.016, + "grad_norm": 0.173828125, + "kl": 0.33708302676677704, + "learning_rate": 1.2379275625664461e-05, + "loss": 0.0135, + "reward": 4.229436278343201, + "reward_std": 2.4882028102874756, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.4646559953689575, + "rewards/no_repetition_reward_func": -0.2117825150489807, + "rewards/verse_reward_func": -0.0234375, + "step": 877 + }, + { + "completion_length": 515.71875, + "epoch": 7.024, + "grad_norm": 0.181640625, + "kl": 0.23155120015144348, + "learning_rate": 1.2319060831745272e-05, + "loss": 0.0093, + "reward": 6.084773063659668, + "reward_std": 3.1889907121658325, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.3941192626953125, + "rewards/no_repetition_reward_func": -0.3015340566635132, + "rewards/verse_reward_func": -0.0078125, + "step": 878 + }, + { + "completion_length": 509.0625, + "epoch": 7.032, + "grad_norm": 0.310546875, + "kl": 0.17633900046348572, + "learning_rate": 1.2258944926337057e-05, + "loss": 0.0071, + "reward": 6.197081565856934, + "reward_std": 3.368439197540283, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.536088705062866, + "rewards/no_repetition_reward_func": -0.32338254153728485, + "rewards/verse_reward_func": -0.015625, + "step": 879 + }, + { + "completion_length": 507.5625, + "epoch": 7.04, + "grad_norm": 0.2392578125, + "kl": 0.21122759580612183, + "learning_rate": 1.2198928378235716e-05, + "loss": 0.0084, + "reward": 6.787936449050903, + "reward_std": 3.1802785396575928, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.120457410812378, + "rewards/no_repetition_reward_func": -0.32470859587192535, + "rewards/verse_reward_func": -0.0078125, + "step": 880 + }, + { + "completion_length": 516.0, + "epoch": 7.048, + "grad_norm": 0.1650390625, + "kl": 0.24790892004966736, + "learning_rate": 1.2139011655462337e-05, + "loss": 0.0099, + "reward": 5.855077266693115, + "reward_std": 2.555476665496826, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.160896301269531, + "rewards/no_repetition_reward_func": -0.29800644516944885, + "rewards/verse_reward_func": -0.0078125, + "step": 881 + }, + { + "completion_length": 513.53125, + "epoch": 7.056, + "grad_norm": 0.1728515625, + "kl": 0.19415901601314545, + "learning_rate": 1.2079195225259579e-05, + "loss": 0.0078, + "reward": 6.604831218719482, + "reward_std": 2.4778029918670654, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.953097820281982, + "rewards/no_repetition_reward_func": -0.3170166313648224, + "rewards/verse_reward_func": -0.015625, + "step": 882 + }, + { + "completion_length": 514.9375, + "epoch": 7.064, + "grad_norm": 0.19140625, + "kl": 0.20971039682626724, + "learning_rate": 1.2019479554087964e-05, + "loss": 0.0084, + "reward": 6.23832631111145, + "reward_std": 2.8419041633605957, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.573937177658081, + "rewards/no_repetition_reward_func": -0.3043612688779831, + "rewards/verse_reward_func": -0.015625, + "step": 883 + }, + { + "completion_length": 513.078125, + "epoch": 7.072, + "grad_norm": 0.16015625, + "kl": 0.1363869085907936, + "learning_rate": 1.1959865107622307e-05, + "loss": 0.0055, + "reward": 7.061327934265137, + "reward_std": 3.0586901903152466, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.425154209136963, + "rewards/no_repetition_reward_func": -0.3560136556625366, + "rewards/verse_reward_func": -0.0078125, + "step": 884 + }, + { + "completion_length": 509.125, + "epoch": 7.08, + "grad_norm": 0.193359375, + "kl": 0.23321808129549026, + "learning_rate": 1.1900352350748026e-05, + "loss": 0.0093, + "reward": 6.11139702796936, + "reward_std": 3.2361204624176025, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.424077987670898, + "rewards/no_repetition_reward_func": -0.2892434298992157, + "rewards/verse_reward_func": -0.0078125, + "step": 885 + }, + { + "completion_length": 508.625, + "epoch": 7.088, + "grad_norm": 0.18359375, + "kl": 0.27085475623607635, + "learning_rate": 1.1840941747557558e-05, + "loss": 0.0108, + "reward": 5.324520826339722, + "reward_std": 3.592620849609375, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.588459730148315, + "rewards/no_repetition_reward_func": -0.25612659752368927, + "rewards/verse_reward_func": -0.0078125, + "step": 886 + }, + { + "completion_length": 516.0, + "epoch": 7.096, + "grad_norm": 0.1875, + "kl": 0.13728976249694824, + "learning_rate": 1.1781633761346707e-05, + "loss": 0.0055, + "reward": 7.216569185256958, + "reward_std": 2.3233503103256226, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.570141077041626, + "rewards/no_repetition_reward_func": -0.34575919806957245, + "rewards/verse_reward_func": -0.0078125, + "step": 887 + }, + { + "completion_length": 514.328125, + "epoch": 7.104, + "grad_norm": 0.1884765625, + "kl": 0.21864160895347595, + "learning_rate": 1.172242885461109e-05, + "loss": 0.0087, + "reward": 6.3264243602752686, + "reward_std": 3.2665061950683594, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.644644737243652, + "rewards/no_repetition_reward_func": -0.3025956451892853, + "rewards/verse_reward_func": -0.015625, + "step": 888 + }, + { + "completion_length": 505.484375, + "epoch": 7.112, + "grad_norm": 0.17578125, + "kl": 0.2545260339975357, + "learning_rate": 1.1663327489042435e-05, + "loss": 0.0102, + "reward": 5.450331687927246, + "reward_std": 3.640956997871399, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.741487741470337, + "rewards/no_repetition_reward_func": -0.2755310535430908, + "rewards/verse_reward_func": 0.0, + "step": 889 + }, + { + "completion_length": 511.125, + "epoch": 7.12, + "grad_norm": 0.1650390625, + "kl": 0.2256334275007248, + "learning_rate": 1.1604330125525079e-05, + "loss": 0.009, + "reward": 5.8226728439331055, + "reward_std": 2.5328832864761353, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.141903400421143, + "rewards/no_repetition_reward_func": -0.30360570549964905, + "rewards/verse_reward_func": -0.015625, + "step": 890 + }, + { + "completion_length": 513.625, + "epoch": 7.128, + "grad_norm": 0.216796875, + "kl": 0.19805681705474854, + "learning_rate": 1.1545437224132318e-05, + "loss": 0.0079, + "reward": 5.726328372955322, + "reward_std": 3.696807384490967, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.045193672180176, + "rewards/no_repetition_reward_func": -0.311053067445755, + "rewards/verse_reward_func": -0.0078125, + "step": 891 + }, + { + "completion_length": 515.5625, + "epoch": 7.136, + "grad_norm": 0.181640625, + "kl": 0.15098332986235619, + "learning_rate": 1.1486649244122824e-05, + "loss": 0.006, + "reward": 7.099849224090576, + "reward_std": 2.7554492950439453, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.455130338668823, + "rewards/no_repetition_reward_func": -0.34746895730495453, + "rewards/verse_reward_func": -0.0078125, + "step": 892 + }, + { + "completion_length": 516.0, + "epoch": 7.144, + "grad_norm": 0.1962890625, + "kl": 0.2609659880399704, + "learning_rate": 1.1427966643937069e-05, + "loss": 0.0104, + "reward": 5.538159132003784, + "reward_std": 2.5020891427993774, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.814048767089844, + "rewards/no_repetition_reward_func": -0.2680772617459297, + "rewards/verse_reward_func": -0.0078125, + "step": 893 + }, + { + "completion_length": 509.5625, + "epoch": 7.152, + "grad_norm": 0.173828125, + "kl": 0.20435188710689545, + "learning_rate": 1.1369389881193749e-05, + "loss": 0.0082, + "reward": 5.855881452560425, + "reward_std": 2.779667854309082, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.17923903465271, + "rewards/no_repetition_reward_func": -0.29992012679576874, + "rewards/verse_reward_func": -0.0234375, + "step": 894 + }, + { + "completion_length": 516.0, + "epoch": 7.16, + "grad_norm": 0.1611328125, + "kl": 0.22203732281923294, + "learning_rate": 1.1310919412686247e-05, + "loss": 0.0089, + "reward": 6.021417617797852, + "reward_std": 3.439122200012207, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.338422060012817, + "rewards/no_repetition_reward_func": -0.2935671955347061, + "rewards/verse_reward_func": -0.0078125, + "step": 895 + }, + { + "completion_length": 514.34375, + "epoch": 7.168, + "grad_norm": 0.212890625, + "kl": 0.3742239773273468, + "learning_rate": 1.1252555694379006e-05, + "loss": 0.015, + "reward": 3.9691076278686523, + "reward_std": 3.242581844329834, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.179956674575806, + "rewards/no_repetition_reward_func": -0.1952238231897354, + "rewards/verse_reward_func": -0.015625, + "step": 896 + }, + { + "completion_length": 514.296875, + "epoch": 7.176, + "grad_norm": 0.1728515625, + "kl": 0.1948346570134163, + "learning_rate": 1.1194299181404036e-05, + "loss": 0.0078, + "reward": 6.191808700561523, + "reward_std": 2.651533842086792, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.509184122085571, + "rewards/no_repetition_reward_func": -0.2939378619194031, + "rewards/verse_reward_func": -0.0078125, + "step": 897 + }, + { + "completion_length": 508.96875, + "epoch": 7.184, + "grad_norm": 0.2158203125, + "kl": 0.18703240901231766, + "learning_rate": 1.1136150328057324e-05, + "loss": 0.0075, + "reward": 6.965450048446655, + "reward_std": 2.2580090761184692, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.292325496673584, + "rewards/no_repetition_reward_func": -0.32687535881996155, + "rewards/verse_reward_func": 0.0, + "step": 898 + }, + { + "completion_length": 512.453125, + "epoch": 7.192, + "grad_norm": 0.1826171875, + "kl": 0.23275882005691528, + "learning_rate": 1.107810958779531e-05, + "loss": 0.0093, + "reward": 5.854774475097656, + "reward_std": 3.1740180253982544, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.165790557861328, + "rewards/no_repetition_reward_func": -0.29539068043231964, + "rewards/verse_reward_func": -0.015625, + "step": 899 + }, + { + "completion_length": 516.0, + "epoch": 7.2, + "grad_norm": 0.166015625, + "kl": 0.1741258054971695, + "learning_rate": 1.1020177413231334e-05, + "loss": 0.007, + "reward": 6.6739301681518555, + "reward_std": 3.1057496070861816, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 7.036861419677734, + "rewards/no_repetition_reward_func": -0.33949385583400726, + "rewards/verse_reward_func": -0.0078125, + "step": 900 + }, + { + "completion_length": 511.1875, + "epoch": 7.208, + "grad_norm": 0.208984375, + "kl": 0.33953939378261566, + "learning_rate": 1.0962354256132141e-05, + "loss": 0.0136, + "reward": 4.729617595672607, + "reward_std": 3.3324332237243652, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.966574430465698, + "rewards/no_repetition_reward_func": -0.21351917833089828, + "rewards/verse_reward_func": -0.0234375, + "step": 901 + }, + { + "completion_length": 516.0, + "epoch": 7.216, + "grad_norm": 0.220703125, + "kl": 0.16550758481025696, + "learning_rate": 1.0904640567414332e-05, + "loss": 0.0066, + "reward": 6.964184284210205, + "reward_std": 3.032602548599243, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.319129228591919, + "rewards/no_repetition_reward_func": -0.3549443781375885, + "rewards/verse_reward_func": 0.0, + "step": 902 + }, + { + "completion_length": 513.046875, + "epoch": 7.224, + "grad_norm": 0.1787109375, + "kl": 0.1780056580901146, + "learning_rate": 1.0847036797140831e-05, + "loss": 0.0071, + "reward": 6.959910869598389, + "reward_std": 2.813833236694336, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.298203229904175, + "rewards/no_repetition_reward_func": -0.3382926434278488, + "rewards/verse_reward_func": 0.0, + "step": 903 + }, + { + "completion_length": 516.0, + "epoch": 7.232, + "grad_norm": 0.1767578125, + "kl": 0.17464629560709, + "learning_rate": 1.0789543394517435e-05, + "loss": 0.007, + "reward": 6.336129665374756, + "reward_std": 3.0510412454605103, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.687818765640259, + "rewards/no_repetition_reward_func": -0.328251913189888, + "rewards/verse_reward_func": -0.0078125, + "step": 904 + }, + { + "completion_length": 516.0, + "epoch": 7.24, + "grad_norm": 0.154296875, + "kl": 0.17026565968990326, + "learning_rate": 1.0732160807889211e-05, + "loss": 0.0068, + "reward": 6.263676404953003, + "reward_std": 2.853097677230835, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.6193928718566895, + "rewards/no_repetition_reward_func": -0.34790369868278503, + "rewards/verse_reward_func": -0.0078125, + "step": 905 + }, + { + "completion_length": 515.546875, + "epoch": 7.248, + "grad_norm": 0.162109375, + "kl": 0.1578650362789631, + "learning_rate": 1.0674889484737125e-05, + "loss": 0.0063, + "reward": 6.657803773880005, + "reward_std": 2.9578635692596436, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.006585597991943, + "rewards/no_repetition_reward_func": -0.34096992015838623, + "rewards/verse_reward_func": -0.0078125, + "step": 906 + }, + { + "completion_length": 500.828125, + "epoch": 7.256, + "grad_norm": 0.29296875, + "kl": 0.2120385766029358, + "learning_rate": 1.0617729871674436e-05, + "loss": 0.0085, + "reward": 6.210769414901733, + "reward_std": 3.164361000061035, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.518244981765747, + "rewards/no_repetition_reward_func": -0.2918505370616913, + "rewards/verse_reward_func": 0.0, + "step": 907 + }, + { + "completion_length": 515.6875, + "epoch": 7.264, + "grad_norm": 0.169921875, + "kl": 0.14408966898918152, + "learning_rate": 1.0560682414443315e-05, + "loss": 0.0058, + "reward": 7.313984155654907, + "reward_std": 2.191602945327759, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.667505502700806, + "rewards/no_repetition_reward_func": -0.35352152585983276, + "rewards/verse_reward_func": 0.0, + "step": 908 + }, + { + "completion_length": 516.0, + "epoch": 7.272, + "grad_norm": 0.1591796875, + "kl": 0.2051902487874031, + "learning_rate": 1.050374755791127e-05, + "loss": 0.0082, + "reward": 6.111940145492554, + "reward_std": 2.5479227900505066, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.458571910858154, + "rewards/no_repetition_reward_func": -0.32319407165050507, + "rewards/verse_reward_func": -0.0234375, + "step": 909 + }, + { + "completion_length": 509.703125, + "epoch": 7.28, + "grad_norm": 0.2197265625, + "kl": 0.18348229676485062, + "learning_rate": 1.0446925746067768e-05, + "loss": 0.0073, + "reward": 6.529842138290405, + "reward_std": 2.930924892425537, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.848118782043457, + "rewards/no_repetition_reward_func": -0.3182770758867264, + "rewards/verse_reward_func": 0.0, + "step": 910 + }, + { + "completion_length": 516.0, + "epoch": 7.288, + "grad_norm": 0.1748046875, + "kl": 0.1566411554813385, + "learning_rate": 1.03902174220207e-05, + "loss": 0.0063, + "reward": 6.772393703460693, + "reward_std": 3.073458194732666, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.128962993621826, + "rewards/no_repetition_reward_func": -0.34875695407390594, + "rewards/verse_reward_func": -0.0078125, + "step": 911 + }, + { + "completion_length": 514.765625, + "epoch": 7.296, + "grad_norm": 0.1630859375, + "kl": 0.19032565504312515, + "learning_rate": 1.033362302799297e-05, + "loss": 0.0076, + "reward": 5.780538558959961, + "reward_std": 3.8341927528381348, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.109714508056641, + "rewards/no_repetition_reward_func": -0.3213634788990021, + "rewards/verse_reward_func": -0.0078125, + "step": 912 + }, + { + "completion_length": 502.828125, + "epoch": 7.304, + "grad_norm": 0.216796875, + "kl": 0.16527453064918518, + "learning_rate": 1.0277143005319038e-05, + "loss": 0.0066, + "reward": 6.687789678573608, + "reward_std": 3.0586137771606445, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.043074369430542, + "rewards/no_repetition_reward_func": -0.3396594673395157, + "rewards/verse_reward_func": -0.015625, + "step": 913 + }, + { + "completion_length": 511.140625, + "epoch": 7.312, + "grad_norm": 0.2119140625, + "kl": 0.14845828711986542, + "learning_rate": 1.022077779444145e-05, + "loss": 0.0059, + "reward": 7.293567895889282, + "reward_std": 3.097896933555603, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.650259494781494, + "rewards/no_repetition_reward_func": -0.3566918224096298, + "rewards/verse_reward_func": 0.0, + "step": 914 + }, + { + "completion_length": 506.0, + "epoch": 7.32, + "grad_norm": 0.17578125, + "kl": 0.2845080569386482, + "learning_rate": 1.0164527834907467e-05, + "loss": 0.0114, + "reward": 5.188232898712158, + "reward_std": 2.211182177066803, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.456395149230957, + "rewards/no_repetition_reward_func": -0.2525375634431839, + "rewards/verse_reward_func": -0.015625, + "step": 915 + }, + { + "completion_length": 512.046875, + "epoch": 7.328, + "grad_norm": 0.181640625, + "kl": 0.19434872269630432, + "learning_rate": 1.0108393565365551e-05, + "loss": 0.0078, + "reward": 6.1234354972839355, + "reward_std": 2.7143211364746094, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.447207689285278, + "rewards/no_repetition_reward_func": -0.30814751982688904, + "rewards/verse_reward_func": 0.0, + "step": 916 + }, + { + "completion_length": 514.171875, + "epoch": 7.336, + "grad_norm": 0.1943359375, + "kl": 0.18423223495483398, + "learning_rate": 1.0052375423562038e-05, + "loss": 0.0074, + "reward": 6.698188304901123, + "reward_std": 3.0210464000701904, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.032918930053711, + "rewards/no_repetition_reward_func": -0.3269183188676834, + "rewards/verse_reward_func": -0.0078125, + "step": 917 + }, + { + "completion_length": 508.484375, + "epoch": 7.344, + "grad_norm": 0.181640625, + "kl": 0.2838383838534355, + "learning_rate": 9.996473846337614e-06, + "loss": 0.0114, + "reward": 5.2121593952178955, + "reward_std": 2.823826789855957, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.506510257720947, + "rewards/no_repetition_reward_func": -0.255288302898407, + "rewards/verse_reward_func": -0.0390625, + "step": 918 + }, + { + "completion_length": 515.984375, + "epoch": 7.352, + "grad_norm": 0.1748046875, + "kl": 0.16578752547502518, + "learning_rate": 9.94068926962404e-06, + "loss": 0.0066, + "reward": 6.81140398979187, + "reward_std": 3.014499545097351, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.1716835498809814, + "rewards/no_repetition_reward_func": -0.35246700048446655, + "rewards/verse_reward_func": -0.0078125, + "step": 919 + }, + { + "completion_length": 514.25, + "epoch": 7.36, + "grad_norm": 0.19921875, + "kl": 0.17773693799972534, + "learning_rate": 9.88502212844063e-06, + "loss": 0.0071, + "reward": 6.510131359100342, + "reward_std": 2.9803746938705444, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.844125270843506, + "rewards/no_repetition_reward_func": -0.32618145644664764, + "rewards/verse_reward_func": -0.0078125, + "step": 920 + }, + { + "completion_length": 506.421875, + "epoch": 7.368, + "grad_norm": 0.1982421875, + "kl": 0.2092471569776535, + "learning_rate": 9.829472856890942e-06, + "loss": 0.0084, + "reward": 5.835445880889893, + "reward_std": 3.6856677532196045, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.128945827484131, + "rewards/no_repetition_reward_func": -0.2934999614953995, + "rewards/verse_reward_func": 0.0, + "step": 921 + }, + { + "completion_length": 514.71875, + "epoch": 7.376, + "grad_norm": 0.1708984375, + "kl": 0.19242015480995178, + "learning_rate": 9.774041888159364e-06, + "loss": 0.0077, + "reward": 6.629598617553711, + "reward_std": 3.0743606090545654, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.957818984985352, + "rewards/no_repetition_reward_func": -0.31259530782699585, + "rewards/verse_reward_func": -0.015625, + "step": 922 + }, + { + "completion_length": 516.0, + "epoch": 7.384, + "grad_norm": 0.169921875, + "kl": 0.16764061152935028, + "learning_rate": 9.718729654507713e-06, + "loss": 0.0067, + "reward": 6.465218544006348, + "reward_std": 2.9450913667678833, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.821408748626709, + "rewards/no_repetition_reward_func": -0.3405652195215225, + "rewards/verse_reward_func": -0.015625, + "step": 923 + }, + { + "completion_length": 504.40625, + "epoch": 7.392, + "grad_norm": 0.2421875, + "kl": 0.18023264035582542, + "learning_rate": 9.663536587271902e-06, + "loss": 0.0072, + "reward": 6.03523588180542, + "reward_std": 3.0663022994995117, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.361032485961914, + "rewards/no_repetition_reward_func": -0.3179839849472046, + "rewards/verse_reward_func": -0.0078125, + "step": 924 + }, + { + "completion_length": 514.921875, + "epoch": 7.4, + "grad_norm": 0.1650390625, + "kl": 0.22598200291395187, + "learning_rate": 9.608463116858542e-06, + "loss": 0.009, + "reward": 6.1995978355407715, + "reward_std": 2.8352295756340027, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.507456541061401, + "rewards/no_repetition_reward_func": -0.3000466972589493, + "rewards/verse_reward_func": -0.0078125, + "step": 925 + }, + { + "completion_length": 510.53125, + "epoch": 7.408, + "grad_norm": 0.185546875, + "kl": 0.2463657408952713, + "learning_rate": 9.553509672741645e-06, + "loss": 0.0099, + "reward": 5.513822078704834, + "reward_std": 2.869176506996155, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.8215086460113525, + "rewards/no_repetition_reward_func": -0.2764364629983902, + "rewards/verse_reward_func": -0.015625, + "step": 926 + }, + { + "completion_length": 516.0, + "epoch": 7.416, + "grad_norm": 0.2060546875, + "kl": 0.17982115596532822, + "learning_rate": 9.498676683459185e-06, + "loss": 0.0072, + "reward": 6.973018169403076, + "reward_std": 2.361541986465454, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.295591831207275, + "rewards/no_repetition_reward_func": -0.314761221408844, + "rewards/verse_reward_func": -0.0078125, + "step": 927 + }, + { + "completion_length": 516.0, + "epoch": 7.424, + "grad_norm": 0.1923828125, + "kl": 0.19594306498765945, + "learning_rate": 9.443964576609843e-06, + "loss": 0.0078, + "reward": 5.588392972946167, + "reward_std": 3.267909049987793, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.912254810333252, + "rewards/no_repetition_reward_func": -0.30823662877082825, + "rewards/verse_reward_func": -0.015625, + "step": 928 + }, + { + "completion_length": 516.0, + "epoch": 7.432, + "grad_norm": 0.1533203125, + "kl": 0.19216333329677582, + "learning_rate": 9.389373778849612e-06, + "loss": 0.0077, + "reward": 5.913788795471191, + "reward_std": 3.103072762489319, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.269168376922607, + "rewards/no_repetition_reward_func": -0.33194220066070557, + "rewards/verse_reward_func": -0.0078125, + "step": 929 + }, + { + "completion_length": 514.4375, + "epoch": 7.44, + "grad_norm": 0.2060546875, + "kl": 0.212739959359169, + "learning_rate": 9.334904715888495e-06, + "loss": 0.0085, + "reward": 6.165511846542358, + "reward_std": 2.571821093559265, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.49566125869751, + "rewards/no_repetition_reward_func": -0.306711807847023, + "rewards/verse_reward_func": -0.0078125, + "step": 930 + }, + { + "completion_length": 514.75, + "epoch": 7.448, + "grad_norm": 0.240234375, + "kl": 0.2240227460861206, + "learning_rate": 9.280557812487188e-06, + "loss": 0.009, + "reward": 5.896513938903809, + "reward_std": 2.6835951805114746, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.216015815734863, + "rewards/no_repetition_reward_func": -0.288251668214798, + "rewards/verse_reward_func": -0.015625, + "step": 931 + }, + { + "completion_length": 516.0, + "epoch": 7.456, + "grad_norm": 0.1611328125, + "kl": 0.13625653833150864, + "learning_rate": 9.22633349245376e-06, + "loss": 0.0055, + "reward": 6.6090288162231445, + "reward_std": 3.080623745918274, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.970985651016235, + "rewards/no_repetition_reward_func": -0.36195676028728485, + "rewards/verse_reward_func": 0.0, + "step": 932 + }, + { + "completion_length": 515.34375, + "epoch": 7.464, + "grad_norm": 0.1416015625, + "kl": 0.1868286058306694, + "learning_rate": 9.17223217864036e-06, + "loss": 0.0075, + "reward": 6.573328018188477, + "reward_std": 2.5928887128829956, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.920299053192139, + "rewards/no_repetition_reward_func": -0.32353314757347107, + "rewards/verse_reward_func": -0.0234375, + "step": 933 + }, + { + "completion_length": 516.0, + "epoch": 7.4719999999999995, + "grad_norm": 0.169921875, + "kl": 0.18739356845617294, + "learning_rate": 9.11825429293989e-06, + "loss": 0.0075, + "reward": 6.382275819778442, + "reward_std": 2.731178402900696, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.711693525314331, + "rewards/no_repetition_reward_func": -0.321605384349823, + "rewards/verse_reward_func": -0.0078125, + "step": 934 + }, + { + "completion_length": 516.0, + "epoch": 7.48, + "grad_norm": 0.15625, + "kl": 0.14584288001060486, + "learning_rate": 9.064400256282757e-06, + "loss": 0.0058, + "reward": 7.020452260971069, + "reward_std": 2.636118173599243, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 7.3784613609313965, + "rewards/no_repetition_reward_func": -0.34238438308238983, + "rewards/verse_reward_func": 0.0, + "step": 935 + }, + { + "completion_length": 511.890625, + "epoch": 7.4879999999999995, + "grad_norm": 0.1875, + "kl": 0.2010415941476822, + "learning_rate": 9.010670488633552e-06, + "loss": 0.008, + "reward": 5.881787300109863, + "reward_std": 3.3058449029922485, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.205791711807251, + "rewards/no_repetition_reward_func": -0.324004128575325, + "rewards/verse_reward_func": 0.0, + "step": 936 + }, + { + "completion_length": 515.0, + "epoch": 7.496, + "grad_norm": 0.1875, + "kl": 0.24094480276107788, + "learning_rate": 8.957065408987797e-06, + "loss": 0.0096, + "reward": 5.768712043762207, + "reward_std": 3.414099097251892, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.062878370285034, + "rewards/no_repetition_reward_func": -0.27854152768850327, + "rewards/verse_reward_func": -0.015625, + "step": 937 + }, + { + "completion_length": 516.0, + "epoch": 7.504, + "grad_norm": 0.171875, + "kl": 0.15746916830539703, + "learning_rate": 8.903585435368658e-06, + "loss": 0.0063, + "reward": 7.718474388122559, + "reward_std": 1.917959451675415, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 8.072185039520264, + "rewards/no_repetition_reward_func": -0.353710874915123, + "rewards/verse_reward_func": 0.0, + "step": 938 + }, + { + "completion_length": 513.078125, + "epoch": 7.5120000000000005, + "grad_norm": 0.1796875, + "kl": 0.2386726438999176, + "learning_rate": 8.850230984823735e-06, + "loss": 0.0095, + "reward": 6.080239295959473, + "reward_std": 3.5497301816940308, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.3693928718566895, + "rewards/no_repetition_reward_func": -0.28134098649024963, + "rewards/verse_reward_func": -0.0078125, + "step": 939 + }, + { + "completion_length": 515.03125, + "epoch": 7.52, + "grad_norm": 0.18359375, + "kl": 0.25136613845825195, + "learning_rate": 8.797002473421728e-06, + "loss": 0.0101, + "reward": 5.631449937820435, + "reward_std": 2.1681337356567383, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.9381022453308105, + "rewards/no_repetition_reward_func": -0.28321459144353867, + "rewards/verse_reward_func": -0.0234375, + "step": 940 + }, + { + "completion_length": 510.734375, + "epoch": 7.5280000000000005, + "grad_norm": 0.1669921875, + "kl": 0.1673746258020401, + "learning_rate": 8.743900316249273e-06, + "loss": 0.0067, + "reward": 6.385242462158203, + "reward_std": 3.1046674251556396, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.724510192871094, + "rewards/no_repetition_reward_func": -0.33926771581172943, + "rewards/verse_reward_func": 0.0, + "step": 941 + }, + { + "completion_length": 516.0, + "epoch": 7.536, + "grad_norm": 0.1611328125, + "kl": 0.1589796505868435, + "learning_rate": 8.690924927407679e-06, + "loss": 0.0064, + "reward": 6.1393141746521, + "reward_std": 3.2155516147613525, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.48680305480957, + "rewards/no_repetition_reward_func": -0.3474889546632767, + "rewards/verse_reward_func": 0.0, + "step": 942 + }, + { + "completion_length": 516.0, + "epoch": 7.5440000000000005, + "grad_norm": 0.158203125, + "kl": 0.18716009706258774, + "learning_rate": 8.63807672000963e-06, + "loss": 0.0075, + "reward": 6.9843668937683105, + "reward_std": 2.2717650532722473, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.3250412940979, + "rewards/no_repetition_reward_func": -0.32504914700984955, + "rewards/verse_reward_func": -0.015625, + "step": 943 + }, + { + "completion_length": 507.234375, + "epoch": 7.552, + "grad_norm": 0.19140625, + "kl": 0.22850479185581207, + "learning_rate": 8.585356106176094e-06, + "loss": 0.0091, + "reward": 6.644760847091675, + "reward_std": 2.962646007537842, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.964919328689575, + "rewards/no_repetition_reward_func": -0.3123456835746765, + "rewards/verse_reward_func": -0.0078125, + "step": 944 + }, + { + "completion_length": 514.5, + "epoch": 7.5600000000000005, + "grad_norm": 0.1767578125, + "kl": 0.25042538344860077, + "learning_rate": 8.532763497032987e-06, + "loss": 0.01, + "reward": 5.844624996185303, + "reward_std": 3.3242640495300293, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.1569108963012695, + "rewards/no_repetition_reward_func": -0.28103555738925934, + "rewards/verse_reward_func": -0.03125, + "step": 945 + }, + { + "completion_length": 504.578125, + "epoch": 7.568, + "grad_norm": 0.26171875, + "kl": 0.19726386666297913, + "learning_rate": 8.480299302708059e-06, + "loss": 0.0079, + "reward": 5.337730884552002, + "reward_std": 3.5245684385299683, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.666630983352661, + "rewards/no_repetition_reward_func": -0.3054623603820801, + "rewards/verse_reward_func": -0.0234375, + "step": 946 + }, + { + "completion_length": 515.78125, + "epoch": 7.576, + "grad_norm": 0.1826171875, + "kl": 0.25120044499635696, + "learning_rate": 8.42796393232762e-06, + "loss": 0.01, + "reward": 4.7899651527404785, + "reward_std": 2.894296407699585, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.069559812545776, + "rewards/no_repetition_reward_func": -0.27178171277046204, + "rewards/verse_reward_func": -0.0078125, + "step": 947 + }, + { + "completion_length": 507.78125, + "epoch": 7.584, + "grad_norm": 0.1904296875, + "kl": 0.28514528274536133, + "learning_rate": 8.375757794013414e-06, + "loss": 0.0114, + "reward": 4.497965335845947, + "reward_std": 3.3387176990509033, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 4.77599310874939, + "rewards/no_repetition_reward_func": -0.24677765369415283, + "rewards/verse_reward_func": -0.03125, + "step": 948 + }, + { + "completion_length": 516.0, + "epoch": 7.592, + "grad_norm": 0.185546875, + "kl": 0.16404947638511658, + "learning_rate": 8.323681294879394e-06, + "loss": 0.0066, + "reward": 6.911858320236206, + "reward_std": 2.8808830976486206, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.2644805908203125, + "rewards/no_repetition_reward_func": -0.3526221364736557, + "rewards/verse_reward_func": 0.0, + "step": 949 + }, + { + "completion_length": 511.84375, + "epoch": 7.6, + "grad_norm": 0.18359375, + "kl": 0.2631699964404106, + "learning_rate": 8.271734841028553e-06, + "loss": 0.0105, + "reward": 5.977914333343506, + "reward_std": 2.9207078218460083, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.289936780929565, + "rewards/no_repetition_reward_func": -0.28077271580696106, + "rewards/verse_reward_func": -0.015625, + "step": 950 + }, + { + "completion_length": 516.0, + "epoch": 7.608, + "grad_norm": 0.20703125, + "kl": 0.2033829316496849, + "learning_rate": 8.21991883754977e-06, + "loss": 0.0081, + "reward": 5.838162899017334, + "reward_std": 3.432569742202759, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.138234853744507, + "rewards/no_repetition_reward_func": -0.300071582198143, + "rewards/verse_reward_func": 0.0, + "step": 951 + }, + { + "completion_length": 506.296875, + "epoch": 7.616, + "grad_norm": 0.21484375, + "kl": 0.2601579427719116, + "learning_rate": 8.168233688514654e-06, + "loss": 0.0104, + "reward": 5.820450305938721, + "reward_std": 2.7211891412734985, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.122101783752441, + "rewards/no_repetition_reward_func": -0.2704017609357834, + "rewards/verse_reward_func": -0.03125, + "step": 952 + }, + { + "completion_length": 516.0, + "epoch": 7.624, + "grad_norm": 0.171875, + "kl": 0.229884535074234, + "learning_rate": 8.116679796974388e-06, + "loss": 0.0092, + "reward": 5.594229221343994, + "reward_std": 2.7615973949432373, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.906846046447754, + "rewards/no_repetition_reward_func": -0.28917960822582245, + "rewards/verse_reward_func": -0.0234375, + "step": 953 + }, + { + "completion_length": 512.640625, + "epoch": 7.632, + "grad_norm": 0.1806640625, + "kl": 0.19547390937805176, + "learning_rate": 8.06525756495657e-06, + "loss": 0.0078, + "reward": 6.120168685913086, + "reward_std": 3.457294225692749, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.462159872055054, + "rewards/no_repetition_reward_func": -0.3185536116361618, + "rewards/verse_reward_func": -0.0078125, + "step": 954 + }, + { + "completion_length": 513.359375, + "epoch": 7.64, + "grad_norm": 0.1845703125, + "kl": 0.2145848125219345, + "learning_rate": 8.013967393462094e-06, + "loss": 0.0086, + "reward": 5.469772100448608, + "reward_std": 3.728051543235779, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.7761054039001465, + "rewards/no_repetition_reward_func": -0.298521026968956, + "rewards/verse_reward_func": -0.0078125, + "step": 955 + }, + { + "completion_length": 516.0, + "epoch": 7.648, + "grad_norm": 0.1884765625, + "kl": 0.16177760809659958, + "learning_rate": 7.962809682462009e-06, + "loss": 0.0065, + "reward": 6.415769577026367, + "reward_std": 3.105261206626892, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.760792255401611, + "rewards/no_repetition_reward_func": -0.3450229614973068, + "rewards/verse_reward_func": 0.0, + "step": 956 + }, + { + "completion_length": 516.0, + "epoch": 7.656, + "grad_norm": 0.18359375, + "kl": 0.1882861852645874, + "learning_rate": 7.91178483089444e-06, + "loss": 0.0075, + "reward": 6.4853386878967285, + "reward_std": 3.0684503316879272, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.8236985206604, + "rewards/no_repetition_reward_func": -0.3227349817752838, + "rewards/verse_reward_func": -0.015625, + "step": 957 + }, + { + "completion_length": 513.90625, + "epoch": 7.664, + "grad_norm": 0.1728515625, + "kl": 0.16748661920428276, + "learning_rate": 7.860893236661412e-06, + "loss": 0.0067, + "reward": 6.726533651351929, + "reward_std": 1.97085702419281, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.089804172515869, + "rewards/no_repetition_reward_func": -0.3320203870534897, + "rewards/verse_reward_func": -0.03125, + "step": 958 + }, + { + "completion_length": 513.40625, + "epoch": 7.672, + "grad_norm": 0.26171875, + "kl": 0.27587568759918213, + "learning_rate": 7.810135296625818e-06, + "loss": 0.011, + "reward": 5.417375802993774, + "reward_std": 3.04726779460907, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.697129249572754, + "rewards/no_repetition_reward_func": -0.2641286998987198, + "rewards/verse_reward_func": -0.015625, + "step": 959 + }, + { + "completion_length": 516.0, + "epoch": 7.68, + "grad_norm": 0.173828125, + "kl": 0.1667083203792572, + "learning_rate": 7.759511406608255e-06, + "loss": 0.0067, + "reward": 6.6106343269348145, + "reward_std": 2.3094284534454346, + "rewards/check_divine_comedy_plagiarism": -0.03125, + "rewards/endecasillabo_reward_func": 6.987204551696777, + "rewards/no_repetition_reward_func": -0.33750833570957184, + "rewards/verse_reward_func": -0.0078125, + "step": 960 + }, + { + "completion_length": 516.0, + "epoch": 7.688, + "grad_norm": 0.1953125, + "kl": 0.20169949531555176, + "learning_rate": 7.709021961384e-06, + "loss": 0.0081, + "reward": 5.913158416748047, + "reward_std": 3.6654467582702637, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.222219705581665, + "rewards/no_repetition_reward_func": -0.3090614527463913, + "rewards/verse_reward_func": 0.0, + "step": 961 + }, + { + "completion_length": 516.0, + "epoch": 7.696, + "grad_norm": 0.1875, + "kl": 0.2402345836162567, + "learning_rate": 7.65866735467988e-06, + "loss": 0.0096, + "reward": 5.2433977127075195, + "reward_std": 3.456560730934143, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.532117605209351, + "rewards/no_repetition_reward_func": -0.2652825862169266, + "rewards/verse_reward_func": -0.0234375, + "step": 962 + }, + { + "completion_length": 506.015625, + "epoch": 7.704, + "grad_norm": 0.2099609375, + "kl": 0.2628094255924225, + "learning_rate": 7.608447979171229e-06, + "loss": 0.0105, + "reward": 5.377523422241211, + "reward_std": 3.954794406890869, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.662477970123291, + "rewards/no_repetition_reward_func": -0.2693289965391159, + "rewards/verse_reward_func": -0.015625, + "step": 963 + }, + { + "completion_length": 510.90625, + "epoch": 7.712, + "grad_norm": 0.1962890625, + "kl": 0.20766513049602509, + "learning_rate": 7.558364226478842e-06, + "loss": 0.0083, + "reward": 6.570536136627197, + "reward_std": 2.6823028326034546, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.933148622512817, + "rewards/no_repetition_reward_func": -0.3313625603914261, + "rewards/verse_reward_func": -0.015625, + "step": 964 + }, + { + "completion_length": 506.421875, + "epoch": 7.72, + "grad_norm": 0.421875, + "kl": 0.14496013522148132, + "learning_rate": 7.508416487165862e-06, + "loss": 0.0058, + "reward": 6.840256452560425, + "reward_std": 2.8856863975524902, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.185028076171875, + "rewards/no_repetition_reward_func": -0.3447718024253845, + "rewards/verse_reward_func": 0.0, + "step": 965 + }, + { + "completion_length": 516.0, + "epoch": 7.728, + "grad_norm": 0.1748046875, + "kl": 0.29118701815605164, + "learning_rate": 7.458605150734816e-06, + "loss": 0.0116, + "reward": 4.996778726577759, + "reward_std": 2.3840100169181824, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.282296419143677, + "rewards/no_repetition_reward_func": -0.2542674094438553, + "rewards/verse_reward_func": -0.03125, + "step": 966 + }, + { + "completion_length": 516.0, + "epoch": 7.736, + "grad_norm": 0.1865234375, + "kl": 0.15017058327794075, + "learning_rate": 7.408930605624498e-06, + "loss": 0.006, + "reward": 6.842717409133911, + "reward_std": 2.8965758085250854, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.193782567977905, + "rewards/no_repetition_reward_func": -0.3510653078556061, + "rewards/verse_reward_func": 0.0, + "step": 967 + }, + { + "completion_length": 508.953125, + "epoch": 7.744, + "grad_norm": 0.203125, + "kl": 0.2312842532992363, + "learning_rate": 7.359393239206991e-06, + "loss": 0.0093, + "reward": 6.200360298156738, + "reward_std": 2.3319121599197388, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.510737419128418, + "rewards/no_repetition_reward_func": -0.28693920373916626, + "rewards/verse_reward_func": -0.0078125, + "step": 968 + }, + { + "completion_length": 502.703125, + "epoch": 7.752, + "grad_norm": 0.234375, + "kl": 0.262625128030777, + "learning_rate": 7.309993437784624e-06, + "loss": 0.0105, + "reward": 4.8304362297058105, + "reward_std": 3.4979785680770874, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.121379137039185, + "rewards/no_repetition_reward_func": -0.25188013166189194, + "rewards/verse_reward_func": -0.0234375, + "step": 969 + }, + { + "completion_length": 509.921875, + "epoch": 7.76, + "grad_norm": 0.181640625, + "kl": 0.16863003373146057, + "learning_rate": 7.260731586586983e-06, + "loss": 0.0067, + "reward": 6.442395925521851, + "reward_std": 2.9732736349105835, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.772801399230957, + "rewards/no_repetition_reward_func": -0.3225926011800766, + "rewards/verse_reward_func": -0.0078125, + "step": 970 + }, + { + "completion_length": 510.46875, + "epoch": 7.768, + "grad_norm": 0.1748046875, + "kl": 0.27962905168533325, + "learning_rate": 7.211608069767867e-06, + "loss": 0.0112, + "reward": 5.654796123504639, + "reward_std": 2.2401435375213623, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.935122489929199, + "rewards/no_repetition_reward_func": -0.26470130681991577, + "rewards/verse_reward_func": -0.015625, + "step": 971 + }, + { + "completion_length": 502.984375, + "epoch": 7.776, + "grad_norm": 0.2138671875, + "kl": 0.1428181603550911, + "learning_rate": 7.162623270402335e-06, + "loss": 0.0057, + "reward": 6.3195977210998535, + "reward_std": 3.6245224475860596, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.6941797733306885, + "rewards/no_repetition_reward_func": -0.35114482045173645, + "rewards/verse_reward_func": -0.0078125, + "step": 972 + }, + { + "completion_length": 516.0, + "epoch": 7.784, + "grad_norm": 0.19921875, + "kl": 0.24445974081754684, + "learning_rate": 7.113777570483701e-06, + "loss": 0.0098, + "reward": 5.378423690795898, + "reward_std": 2.237412244081497, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.689203977584839, + "rewards/no_repetition_reward_func": -0.2717171758413315, + "rewards/verse_reward_func": -0.0390625, + "step": 973 + }, + { + "completion_length": 506.8125, + "epoch": 7.792, + "grad_norm": 0.64453125, + "kl": 0.24509216845035553, + "learning_rate": 7.065071350920538e-06, + "loss": 0.0098, + "reward": 5.6093621253967285, + "reward_std": 2.658163905143738, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.961400747299194, + "rewards/no_repetition_reward_func": -0.2895386815071106, + "rewards/verse_reward_func": -0.046875, + "step": 974 + }, + { + "completion_length": 514.890625, + "epoch": 7.8, + "grad_norm": 0.1728515625, + "kl": 0.21094757318496704, + "learning_rate": 7.016504991533726e-06, + "loss": 0.0084, + "reward": 6.102145195007324, + "reward_std": 2.5561567544937134, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.393847703933716, + "rewards/no_repetition_reward_func": -0.2917025089263916, + "rewards/verse_reward_func": 0.0, + "step": 975 + }, + { + "completion_length": 516.0, + "epoch": 7.808, + "grad_norm": 0.1826171875, + "kl": 0.24674423038959503, + "learning_rate": 6.968078871053488e-06, + "loss": 0.0099, + "reward": 5.415365219116211, + "reward_std": 2.9804816246032715, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.723666667938232, + "rewards/no_repetition_reward_func": -0.2770514488220215, + "rewards/verse_reward_func": -0.03125, + "step": 976 + }, + { + "completion_length": 511.65625, + "epoch": 7.816, + "grad_norm": 0.177734375, + "kl": 0.23917318135499954, + "learning_rate": 6.919793367116453e-06, + "loss": 0.0096, + "reward": 5.6836206912994385, + "reward_std": 2.8374972343444824, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.000532388687134, + "rewards/no_repetition_reward_func": -0.29347431659698486, + "rewards/verse_reward_func": -0.0234375, + "step": 977 + }, + { + "completion_length": 508.109375, + "epoch": 7.824, + "grad_norm": 0.671875, + "kl": 0.21679607033729553, + "learning_rate": 6.871648856262666e-06, + "loss": 0.0087, + "reward": 6.065804958343506, + "reward_std": 2.674037218093872, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.373357057571411, + "rewards/no_repetition_reward_func": -0.299739807844162, + "rewards/verse_reward_func": -0.0078125, + "step": 978 + }, + { + "completion_length": 516.0, + "epoch": 7.832, + "grad_norm": 0.1845703125, + "kl": 0.1468428522348404, + "learning_rate": 6.823645713932708e-06, + "loss": 0.0059, + "reward": 7.490975856781006, + "reward_std": 2.207758665084839, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.848611831665039, + "rewards/no_repetition_reward_func": -0.3576360046863556, + "rewards/verse_reward_func": 0.0, + "step": 979 + }, + { + "completion_length": 515.296875, + "epoch": 7.84, + "grad_norm": 0.17578125, + "kl": 0.14377613365650177, + "learning_rate": 6.775784314464717e-06, + "loss": 0.0058, + "reward": 7.044445276260376, + "reward_std": 3.028616189956665, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.409189701080322, + "rewards/no_repetition_reward_func": -0.356932133436203, + "rewards/verse_reward_func": -0.0078125, + "step": 980 + }, + { + "completion_length": 506.171875, + "epoch": 7.848, + "grad_norm": 0.1806640625, + "kl": 0.20706836879253387, + "learning_rate": 6.7280650310915015e-06, + "loss": 0.0083, + "reward": 6.292295694351196, + "reward_std": 3.4093852043151855, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.613749980926514, + "rewards/no_repetition_reward_func": -0.3214544802904129, + "rewards/verse_reward_func": 0.0, + "step": 981 + }, + { + "completion_length": 516.0, + "epoch": 7.856, + "grad_norm": 0.1875, + "kl": 0.16937708854675293, + "learning_rate": 6.6804882359376126e-06, + "loss": 0.0068, + "reward": 7.060451030731201, + "reward_std": 2.9413474798202515, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.41961669921875, + "rewards/no_repetition_reward_func": -0.3435407727956772, + "rewards/verse_reward_func": -0.015625, + "step": 982 + }, + { + "completion_length": 509.25, + "epoch": 7.864, + "grad_norm": 0.2080078125, + "kl": 0.15034142136573792, + "learning_rate": 6.6330543000164645e-06, + "loss": 0.006, + "reward": 6.653038740158081, + "reward_std": 2.9458101987838745, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.00105094909668, + "rewards/no_repetition_reward_func": -0.34801194071769714, + "rewards/verse_reward_func": 0.0, + "step": 983 + }, + { + "completion_length": 516.0, + "epoch": 7.872, + "grad_norm": 0.1669921875, + "kl": 0.19536083191633224, + "learning_rate": 6.58576359322742e-06, + "loss": 0.0078, + "reward": 6.837286949157715, + "reward_std": 3.050396680831909, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.171454906463623, + "rewards/no_repetition_reward_func": -0.3185427337884903, + "rewards/verse_reward_func": -0.015625, + "step": 984 + }, + { + "completion_length": 516.0, + "epoch": 7.88, + "grad_norm": 0.1806640625, + "kl": 0.24209705740213394, + "learning_rate": 6.538616484352902e-06, + "loss": 0.0097, + "reward": 5.549838066101074, + "reward_std": 3.297152519226074, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.882108449935913, + "rewards/no_repetition_reward_func": -0.28539521992206573, + "rewards/verse_reward_func": -0.03125, + "step": 985 + }, + { + "completion_length": 508.4375, + "epoch": 7.888, + "grad_norm": 0.2490234375, + "kl": 0.2532542124390602, + "learning_rate": 6.4916133410555466e-06, + "loss": 0.0101, + "reward": 5.646351099014282, + "reward_std": 2.641343891620636, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 5.967700719833374, + "rewards/no_repetition_reward_func": -0.29009921848773956, + "rewards/verse_reward_func": -0.015625, + "step": 986 + }, + { + "completion_length": 513.96875, + "epoch": 7.896, + "grad_norm": 0.171875, + "kl": 0.20123514533042908, + "learning_rate": 6.444754529875302e-06, + "loss": 0.008, + "reward": 6.531996965408325, + "reward_std": 2.553738236427307, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.862635135650635, + "rewards/no_repetition_reward_func": -0.3228254020214081, + "rewards/verse_reward_func": -0.0078125, + "step": 987 + }, + { + "completion_length": 516.0, + "epoch": 7.904, + "grad_norm": 0.1708984375, + "kl": 0.1979285329580307, + "learning_rate": 6.398040416226592e-06, + "loss": 0.0079, + "reward": 6.243427038192749, + "reward_std": 3.656306505203247, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.572209119796753, + "rewards/no_repetition_reward_func": -0.3209698647260666, + "rewards/verse_reward_func": -0.0078125, + "step": 988 + }, + { + "completion_length": 507.53125, + "epoch": 7.912, + "grad_norm": 0.189453125, + "kl": 0.298298716545105, + "learning_rate": 6.3514713643954475e-06, + "loss": 0.0119, + "reward": 4.9394237995147705, + "reward_std": 3.3891143798828125, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.202345609664917, + "rewards/no_repetition_reward_func": -0.24729667603969574, + "rewards/verse_reward_func": -0.015625, + "step": 989 + }, + { + "completion_length": 516.0, + "epoch": 7.92, + "grad_norm": 0.1884765625, + "kl": 0.20151732861995697, + "learning_rate": 6.305047737536707e-06, + "loss": 0.0081, + "reward": 6.57289457321167, + "reward_std": 2.2777618765830994, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.889364719390869, + "rewards/no_repetition_reward_func": -0.3086576461791992, + "rewards/verse_reward_func": -0.0078125, + "step": 990 + }, + { + "completion_length": 513.5, + "epoch": 7.928, + "grad_norm": 0.1796875, + "kl": 0.22548329830169678, + "learning_rate": 6.258769897671124e-06, + "loss": 0.009, + "reward": 6.064760446548462, + "reward_std": 2.2412988543510437, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.373163461685181, + "rewards/no_repetition_reward_func": -0.284965381026268, + "rewards/verse_reward_func": -0.0234375, + "step": 991 + }, + { + "completion_length": 514.15625, + "epoch": 7.936, + "grad_norm": 0.171875, + "kl": 0.14622513949871063, + "learning_rate": 6.2126382056826e-06, + "loss": 0.0058, + "reward": 7.348103046417236, + "reward_std": 2.142330765724182, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.709853410720825, + "rewards/no_repetition_reward_func": -0.35393768548965454, + "rewards/verse_reward_func": -0.0078125, + "step": 992 + }, + { + "completion_length": 515.640625, + "epoch": 7.944, + "grad_norm": 0.171875, + "kl": 0.24103792756795883, + "learning_rate": 6.1666530213153355e-06, + "loss": 0.0096, + "reward": 6.128934383392334, + "reward_std": 1.9754563570022583, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.43005633354187, + "rewards/no_repetition_reward_func": -0.29330912232398987, + "rewards/verse_reward_func": -0.0078125, + "step": 993 + }, + { + "completion_length": 504.0625, + "epoch": 7.952, + "grad_norm": 0.1923828125, + "kl": 0.3362417221069336, + "learning_rate": 6.120814703171024e-06, + "loss": 0.0134, + "reward": 3.73512065410614, + "reward_std": 3.345029830932617, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 3.9596078395843506, + "rewards/no_repetition_reward_func": -0.2088617980480194, + "rewards/verse_reward_func": -0.015625, + "step": 994 + }, + { + "completion_length": 514.359375, + "epoch": 7.96, + "grad_norm": 0.2080078125, + "kl": 0.149591863155365, + "learning_rate": 6.075123608706093e-06, + "loss": 0.006, + "reward": 7.335120916366577, + "reward_std": 2.4030555486679077, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.691418170928955, + "rewards/no_repetition_reward_func": -0.3562973290681839, + "rewards/verse_reward_func": 0.0, + "step": 995 + }, + { + "completion_length": 516.0, + "epoch": 7.968, + "grad_norm": 0.1962890625, + "kl": 0.16793400794267654, + "learning_rate": 6.029580094228862e-06, + "loss": 0.0067, + "reward": 6.735406160354614, + "reward_std": 3.0066243410110474, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 7.068373918533325, + "rewards/no_repetition_reward_func": -0.3329680860042572, + "rewards/verse_reward_func": 0.0, + "step": 996 + }, + { + "completion_length": 516.0, + "epoch": 7.976, + "grad_norm": 0.16796875, + "kl": 0.23349980264902115, + "learning_rate": 5.9841845148968204e-06, + "loss": 0.0093, + "reward": 5.8805694580078125, + "reward_std": 3.243261933326721, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.167189598083496, + "rewards/no_repetition_reward_func": -0.2866196781396866, + "rewards/verse_reward_func": 0.0, + "step": 997 + }, + { + "completion_length": 509.9375, + "epoch": 7.984, + "grad_norm": 0.1923828125, + "kl": 0.2516597956418991, + "learning_rate": 5.9389372247138e-06, + "loss": 0.0101, + "reward": 5.4727702140808105, + "reward_std": 3.617272138595581, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 5.760960102081299, + "rewards/no_repetition_reward_func": -0.2803771644830704, + "rewards/verse_reward_func": -0.0078125, + "step": 998 + }, + { + "completion_length": 516.0, + "epoch": 7.992, + "grad_norm": 0.1591796875, + "kl": 0.21637116372585297, + "learning_rate": 5.893838576527275e-06, + "loss": 0.0087, + "reward": 6.538309574127197, + "reward_std": 2.79458224773407, + "rewards/check_divine_comedy_plagiarism": 0.0, + "rewards/endecasillabo_reward_func": 6.858302116394043, + "rewards/no_repetition_reward_func": -0.30436716973781586, + "rewards/verse_reward_func": -0.015625, + "step": 999 + }, + { + "completion_length": 516.0, + "epoch": 8.0, + "grad_norm": 0.3515625, + "kl": 0.21909940242767334, + "learning_rate": 5.848888922025553e-06, + "loss": 0.0088, + "reward": 6.009196519851685, + "reward_std": 3.209109902381897, + "rewards/check_divine_comedy_plagiarism": -0.015625, + "rewards/endecasillabo_reward_func": 6.337465047836304, + "rewards/no_repetition_reward_func": -0.297018364071846, + "rewards/verse_reward_func": -0.015625, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}