{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.06417455478902614, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 89.1796875, "epoch": 6.417455478902615e-05, "grad_norm": 6.457693642993236, "kl": 0.0, "learning_rate": 9.99967911692979e-07, "loss": 0.0, "reward": 2.8125, "reward_std": 0.5811586081981659, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.2265625, "rewards/format_reward": 0.9296875, "step": 1 }, { "completion_length": 100.859375, "epoch": 0.0001283491095780523, "grad_norm": 3.7199479867427576, "kl": 0.0006103515625, "learning_rate": 9.999358233859582e-07, "loss": 0.0, "reward": 2.95703125, "reward_std": 0.9970237612724304, "rewards/accuracy_reward": 0.5, "rewards/format_count_numbers": 1.53515625, "rewards/format_reward": 0.921875, "step": 2 }, { "completion_length": 99.9609375, "epoch": 0.00019252366436707844, "grad_norm": 4.667048062421479, "kl": 0.000865936279296875, "learning_rate": 9.999037350789372e-07, "loss": 0.0, "reward": 2.75, "reward_std": 0.6299314796924591, "rewards/accuracy_reward": 0.3984375, "rewards/format_count_numbers": 1.40625, "rewards/format_reward": 0.9453125, "step": 3 }, { "completion_length": 105.375, "epoch": 0.0002566982191561046, "grad_norm": 6.61841287599563, "kl": 0.000850677490234375, "learning_rate": 9.998716467719162e-07, "loss": 0.0, "reward": 2.6484375, "reward_std": 0.6614057421684265, "rewards/accuracy_reward": 0.4453125, "rewards/format_count_numbers": 1.2890625, "rewards/format_reward": 0.9140625, "step": 4 }, { "completion_length": 98.4921875, "epoch": 0.00032087277394513073, "grad_norm": 10.226707577648735, "kl": 0.001094818115234375, "learning_rate": 9.998395584648954e-07, "loss": 0.0, "reward": 2.7734375, "reward_std": 0.684965580701828, "rewards/accuracy_reward": 0.53125, "rewards/format_count_numbers": 1.3125, "rewards/format_reward": 0.9296875, "step": 5 }, { "completion_length": 105.0078125, "epoch": 0.0003850473287341569, "grad_norm": 16.540801471606787, "kl": 0.00214385986328125, "learning_rate": 9.998074701578744e-07, "loss": 0.0001, "reward": 2.38671875, "reward_std": 0.6537165194749832, "rewards/accuracy_reward": 0.4140625, "rewards/format_count_numbers": 1.08984375, "rewards/format_reward": 0.8828125, "step": 6 }, { "completion_length": 96.6484375, "epoch": 0.0004492218835231831, "grad_norm": 4.007831804571436, "kl": 0.003238677978515625, "learning_rate": 9.997753818508536e-07, "loss": 0.0001, "reward": 2.85546875, "reward_std": 0.5957659184932709, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.23828125, "rewards/format_reward": 0.953125, "step": 7 }, { "completion_length": 94.6328125, "epoch": 0.0005133964383122092, "grad_norm": 3.3765246174936343, "kl": 0.00238037109375, "learning_rate": 9.997432935438326e-07, "loss": 0.0001, "reward": 3.05078125, "reward_std": 0.5704643428325653, "rewards/accuracy_reward": 0.515625, "rewards/format_count_numbers": 1.58203125, "rewards/format_reward": 0.953125, "step": 8 }, { "completion_length": 95.71875, "epoch": 0.0005775709931012354, "grad_norm": 3.445754640313553, "kl": 0.0037994384765625, "learning_rate": 9.997112052368116e-07, "loss": 0.0002, "reward": 2.921875, "reward_std": 0.7348538041114807, "rewards/accuracy_reward": 0.53125, "rewards/format_count_numbers": 1.4453125, "rewards/format_reward": 0.9453125, "step": 9 }, { "completion_length": 81.859375, "epoch": 0.0006417455478902615, "grad_norm": 13.788009049584414, "kl": 0.00360870361328125, "learning_rate": 9.996791169297908e-07, "loss": 0.0001, "reward": 2.41015625, "reward_std": 0.3873346596956253, "rewards/accuracy_reward": 0.46875, "rewards/format_count_numbers": 0.97265625, "rewards/format_reward": 0.96875, "step": 10 }, { "completion_length": 95.3515625, "epoch": 0.0007059201026792877, "grad_norm": 3.802264533705271, "kl": 0.007354736328125, "learning_rate": 9.996470286227698e-07, "loss": 0.0003, "reward": 3.0703125, "reward_std": 0.5251666307449341, "rewards/accuracy_reward": 0.5234375, "rewards/format_count_numbers": 1.5546875, "rewards/format_reward": 0.9921875, "step": 11 }, { "completion_length": 91.515625, "epoch": 0.0007700946574683138, "grad_norm": 7.025555072435911, "kl": 0.0046234130859375, "learning_rate": 9.996149403157488e-07, "loss": 0.0002, "reward": 3.0, "reward_std": 0.433403342962265, "rewards/accuracy_reward": 0.4765625, "rewards/format_count_numbers": 1.5390625, "rewards/format_reward": 0.984375, "step": 12 }, { "completion_length": 82.625, "epoch": 0.00083426921225734, "grad_norm": 8.089653038056108, "kl": 0.0070648193359375, "learning_rate": 9.99582852008728e-07, "loss": 0.0003, "reward": 3.3828125, "reward_std": 0.47115227580070496, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.7109375, "rewards/format_reward": 0.9921875, "step": 13 }, { "completion_length": 82.984375, "epoch": 0.0008984437670463662, "grad_norm": 2.6921754114824137, "kl": 0.00640869140625, "learning_rate": 9.99550763701707e-07, "loss": 0.0003, "reward": 3.25390625, "reward_std": 0.37268710136413574, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.53515625, "rewards/format_reward": 0.984375, "step": 14 }, { "completion_length": 86.34375, "epoch": 0.0009626183218353923, "grad_norm": 5.3694361415616685, "kl": 0.014495849609375, "learning_rate": 9.995186753946862e-07, "loss": 0.0006, "reward": 2.9453125, "reward_std": 0.3975609838962555, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.234375, "rewards/format_reward": 0.9921875, "step": 15 }, { "completion_length": 73.1171875, "epoch": 0.0010267928766244184, "grad_norm": 3.920415856048503, "kl": 0.011138916015625, "learning_rate": 9.994865870876652e-07, "loss": 0.0004, "reward": 3.05859375, "reward_std": 0.3202301412820816, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.42578125, "rewards/format_reward": 0.9921875, "step": 16 }, { "completion_length": 77.875, "epoch": 0.0010909674314134447, "grad_norm": 4.105012934454526, "kl": 0.007293701171875, "learning_rate": 9.994544987806442e-07, "loss": 0.0003, "reward": 2.80859375, "reward_std": 0.36826513707637787, "rewards/accuracy_reward": 0.5390625, "rewards/format_count_numbers": 1.26953125, "rewards/format_reward": 1.0, "step": 17 }, { "completion_length": 74.1640625, "epoch": 0.0011551419862024708, "grad_norm": 3.025079315189922, "kl": 0.009002685546875, "learning_rate": 9.994224104736234e-07, "loss": 0.0004, "reward": 2.72265625, "reward_std": 0.3272075057029724, "rewards/accuracy_reward": 0.5625, "rewards/format_count_numbers": 1.16015625, "rewards/format_reward": 1.0, "step": 18 }, { "completion_length": 75.703125, "epoch": 0.0012193165409914968, "grad_norm": 2.702032953911433, "kl": 0.0196685791015625, "learning_rate": 9.993903221666024e-07, "loss": 0.0008, "reward": 2.99609375, "reward_std": 0.35988467931747437, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.39453125, "rewards/format_reward": 0.984375, "step": 19 }, { "completion_length": 72.75, "epoch": 0.001283491095780523, "grad_norm": 6.672377706898142, "kl": 0.00946044921875, "learning_rate": 9.993582338595814e-07, "loss": 0.0004, "reward": 2.78515625, "reward_std": 0.3711431473493576, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.14453125, "rewards/format_reward": 0.9921875, "step": 20 }, { "completion_length": 72.234375, "epoch": 0.0013476656505695492, "grad_norm": 3.21648017083967, "kl": 0.01513671875, "learning_rate": 9.993261455525607e-07, "loss": 0.0006, "reward": 3.3046875, "reward_std": 0.3110102415084839, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.6484375, "rewards/format_reward": 0.9765625, "step": 21 }, { "completion_length": 81.3203125, "epoch": 0.0014118402053585753, "grad_norm": 5.2036438638538405, "kl": 0.020416259765625, "learning_rate": 9.992940572455397e-07, "loss": 0.0008, "reward": 2.83203125, "reward_std": 0.3504672795534134, "rewards/accuracy_reward": 0.5390625, "rewards/format_count_numbers": 1.30078125, "rewards/format_reward": 0.9921875, "step": 22 }, { "completion_length": 77.3515625, "epoch": 0.0014760147601476014, "grad_norm": 4.546185134633502, "kl": 0.013824462890625, "learning_rate": 9.992619689385189e-07, "loss": 0.0006, "reward": 3.34375, "reward_std": 0.3390200138092041, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.6328125, "rewards/format_reward": 1.0, "step": 23 }, { "completion_length": 78.234375, "epoch": 0.0015401893149366275, "grad_norm": 3.663495400021626, "kl": 0.015289306640625, "learning_rate": 9.992298806314979e-07, "loss": 0.0006, "reward": 3.02734375, "reward_std": 0.3106808215379715, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.41015625, "rewards/format_reward": 1.0, "step": 24 }, { "completion_length": 85.5078125, "epoch": 0.0016043638697256538, "grad_norm": 4.017477734738997, "kl": 0.009735107421875, "learning_rate": 9.991977923244769e-07, "loss": 0.0004, "reward": 3.02734375, "reward_std": 0.3911897838115692, "rewards/accuracy_reward": 0.5, "rewards/format_count_numbers": 1.55078125, "rewards/format_reward": 0.9765625, "step": 25 }, { "completion_length": 70.953125, "epoch": 0.00166853842451468, "grad_norm": 131.7420509388314, "kl": 0.01788330078125, "learning_rate": 9.99165704017456e-07, "loss": 0.0007, "reward": 3.32421875, "reward_std": 0.3377445787191391, "rewards/accuracy_reward": 0.8125, "rewards/format_count_numbers": 1.51953125, "rewards/format_reward": 0.9921875, "step": 26 }, { "completion_length": 83.9140625, "epoch": 0.001732712979303706, "grad_norm": 8.203637941977545, "kl": 0.012908935546875, "learning_rate": 9.99133615710435e-07, "loss": 0.0005, "reward": 3.140625, "reward_std": 0.31107497215270996, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.53125, "rewards/format_reward": 1.0, "step": 27 }, { "completion_length": 87.546875, "epoch": 0.0017968875340927323, "grad_norm": 3.074634047919227, "kl": 0.01849365234375, "learning_rate": 9.99101527403414e-07, "loss": 0.0007, "reward": 3.26171875, "reward_std": 0.3248459994792938, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 0.984375, "step": 28 }, { "completion_length": 83.1015625, "epoch": 0.0018610620888817584, "grad_norm": 1.7996263974927855, "kl": 0.01739501953125, "learning_rate": 9.990694390963933e-07, "loss": 0.0007, "reward": 3.36328125, "reward_std": 0.34072481095790863, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.67578125, "rewards/format_reward": 0.984375, "step": 29 }, { "completion_length": 82.0390625, "epoch": 0.0019252366436707845, "grad_norm": 5.044487990967384, "kl": 0.01666259765625, "learning_rate": 9.990373507893723e-07, "loss": 0.0007, "reward": 3.1328125, "reward_std": 0.3294168561697006, "rewards/accuracy_reward": 0.59375, "rewards/format_count_numbers": 1.546875, "rewards/format_reward": 0.9921875, "step": 30 }, { "completion_length": 75.6640625, "epoch": 0.0019894111984598106, "grad_norm": 4.303535913004868, "kl": 0.02020263671875, "learning_rate": 9.990052624823513e-07, "loss": 0.0008, "reward": 3.14453125, "reward_std": 0.3391089290380478, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.43359375, "rewards/format_reward": 0.9921875, "step": 31 }, { "completion_length": 91.375, "epoch": 0.0020535857532488367, "grad_norm": 3.867777936702832, "kl": 0.0233154296875, "learning_rate": 9.989731741753305e-07, "loss": 0.0009, "reward": 3.17578125, "reward_std": 0.3817155063152313, "rewards/accuracy_reward": 0.53125, "rewards/format_count_numbers": 1.65234375, "rewards/format_reward": 0.9921875, "step": 32 }, { "completion_length": 80.0859375, "epoch": 0.002117760308037863, "grad_norm": 5.660841872987584, "kl": 0.0257568359375, "learning_rate": 9.989410858683095e-07, "loss": 0.001, "reward": 3.13671875, "reward_std": 0.3630076050758362, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 0.9921875, "step": 33 }, { "completion_length": 72.8828125, "epoch": 0.0021819348628268893, "grad_norm": 4.882900614418831, "kl": 0.02276611328125, "learning_rate": 9.989089975612887e-07, "loss": 0.0009, "reward": 2.640625, "reward_std": 0.44960109889507294, "rewards/accuracy_reward": 0.484375, "rewards/format_count_numbers": 1.1640625, "rewards/format_reward": 0.9921875, "step": 34 }, { "completion_length": 80.9296875, "epoch": 0.0022461094176159154, "grad_norm": 4.3133118366946475, "kl": 0.02490234375, "learning_rate": 9.988769092542677e-07, "loss": 0.001, "reward": 3.453125, "reward_std": 0.3391571342945099, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.7734375, "rewards/format_reward": 1.0, "step": 35 }, { "completion_length": 89.9609375, "epoch": 0.0023102839724049415, "grad_norm": 6.955721387397482, "kl": 0.01922607421875, "learning_rate": 9.988448209472467e-07, "loss": 0.0008, "reward": 2.87890625, "reward_std": 0.2806504964828491, "rewards/accuracy_reward": 0.4765625, "rewards/format_count_numbers": 1.40234375, "rewards/format_reward": 1.0, "step": 36 }, { "completion_length": 74.5390625, "epoch": 0.0023744585271939676, "grad_norm": 4.861590830565515, "kl": 0.02423095703125, "learning_rate": 9.988127326402257e-07, "loss": 0.001, "reward": 3.06640625, "reward_std": 0.2550649642944336, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.43359375, "rewards/format_reward": 1.0, "step": 37 }, { "completion_length": 73.921875, "epoch": 0.0024386330819829937, "grad_norm": 2.854709944808263, "kl": 0.02410888671875, "learning_rate": 9.98780644333205e-07, "loss": 0.001, "reward": 2.8984375, "reward_std": 0.3886113613843918, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.296875, "rewards/format_reward": 0.984375, "step": 38 }, { "completion_length": 70.9609375, "epoch": 0.00250280763677202, "grad_norm": 3.5923630444215755, "kl": 0.0277099609375, "learning_rate": 9.98748556026184e-07, "loss": 0.0011, "reward": 3.2265625, "reward_std": 0.2659813463687897, "rewards/accuracy_reward": 0.765625, "rewards/format_count_numbers": 1.4609375, "rewards/format_reward": 1.0, "step": 39 }, { "completion_length": 66.390625, "epoch": 0.002566982191561046, "grad_norm": 3.041742830092947, "kl": 0.02459716796875, "learning_rate": 9.987164677191631e-07, "loss": 0.001, "reward": 3.296875, "reward_std": 0.2199605107307434, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.59375, "rewards/format_reward": 0.9921875, "step": 40 }, { "completion_length": 74.828125, "epoch": 0.0026311567463500724, "grad_norm": 3.3807511168475717, "kl": 0.0289306640625, "learning_rate": 9.986843794121421e-07, "loss": 0.0012, "reward": 3.30078125, "reward_std": 0.34865450859069824, "rewards/accuracy_reward": 0.6015625, "rewards/format_count_numbers": 1.69921875, "rewards/format_reward": 1.0, "step": 41 }, { "completion_length": 89.6953125, "epoch": 0.0026953313011390985, "grad_norm": 1.9360780618011337, "kl": 0.02545166015625, "learning_rate": 9.986522911051214e-07, "loss": 0.001, "reward": 3.07421875, "reward_std": 0.24038218706846237, "rewards/accuracy_reward": 0.546875, "rewards/format_count_numbers": 1.52734375, "rewards/format_reward": 1.0, "step": 42 }, { "completion_length": 71.140625, "epoch": 0.0027595058559281246, "grad_norm": 2.522942068290189, "kl": 0.033447265625, "learning_rate": 9.986202027981004e-07, "loss": 0.0013, "reward": 3.234375, "reward_std": 0.30629581212997437, "rewards/accuracy_reward": 0.5234375, "rewards/format_count_numbers": 1.7109375, "rewards/format_reward": 1.0, "step": 43 }, { "completion_length": 76.5703125, "epoch": 0.0028236804107171507, "grad_norm": 2.7470688462972572, "kl": 0.026123046875, "learning_rate": 9.985881144910794e-07, "loss": 0.001, "reward": 3.25, "reward_std": 0.30682672560214996, "rewards/accuracy_reward": 0.5625, "rewards/format_count_numbers": 1.7109375, "rewards/format_reward": 0.9765625, "step": 44 }, { "completion_length": 83.8046875, "epoch": 0.0028878549655061768, "grad_norm": 2.4739700610037985, "kl": 0.02978515625, "learning_rate": 9.985560261840584e-07, "loss": 0.0012, "reward": 3.05078125, "reward_std": 0.29518504440784454, "rewards/accuracy_reward": 0.578125, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 0.9921875, "step": 45 }, { "completion_length": 65.40625, "epoch": 0.002952029520295203, "grad_norm": 3.690551370084788, "kl": 0.0428466796875, "learning_rate": 9.985239378770376e-07, "loss": 0.0017, "reward": 3.17578125, "reward_std": 0.30267418175935745, "rewards/accuracy_reward": 0.859375, "rewards/format_count_numbers": 1.32421875, "rewards/format_reward": 0.9921875, "step": 46 }, { "completion_length": 69.03125, "epoch": 0.003016204075084229, "grad_norm": 4.382403144838998, "kl": 0.0341796875, "learning_rate": 9.984918495700166e-07, "loss": 0.0014, "reward": 3.484375, "reward_std": 0.23816770315170288, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.703125, "rewards/format_reward": 1.0, "step": 47 }, { "completion_length": 68.015625, "epoch": 0.003080378629873255, "grad_norm": 3.452349900499519, "kl": 0.0255126953125, "learning_rate": 9.984597612629958e-07, "loss": 0.001, "reward": 2.97265625, "reward_std": 0.32505670189857483, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.33984375, "rewards/format_reward": 1.0, "step": 48 }, { "completion_length": 75.1953125, "epoch": 0.0031445531846622816, "grad_norm": 3.392241748989927, "kl": 0.03125, "learning_rate": 9.984276729559748e-07, "loss": 0.0012, "reward": 3.10546875, "reward_std": 0.39815399050712585, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.42578125, "rewards/format_reward": 1.0, "step": 49 }, { "completion_length": 67.9765625, "epoch": 0.0032087277394513077, "grad_norm": 2.177945178884684, "kl": 0.047607421875, "learning_rate": 9.98395584648954e-07, "loss": 0.0019, "reward": 2.953125, "reward_std": 0.2759072184562683, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.34375, "rewards/format_reward": 1.0, "step": 50 }, { "completion_length": 78.390625, "epoch": 0.0032729022942403338, "grad_norm": 3.0857042332441456, "kl": 0.0269775390625, "learning_rate": 9.98363496341933e-07, "loss": 0.0011, "reward": 2.890625, "reward_std": 0.27773458510637283, "rewards/accuracy_reward": 0.4296875, "rewards/format_count_numbers": 1.4609375, "rewards/format_reward": 1.0, "step": 51 }, { "completion_length": 67.046875, "epoch": 0.00333707684902936, "grad_norm": 5.440745248958781, "kl": 0.13043212890625, "learning_rate": 9.98331408034912e-07, "loss": 0.0052, "reward": 2.921875, "reward_std": 0.3226177394390106, "rewards/accuracy_reward": 0.6015625, "rewards/format_count_numbers": 1.3359375, "rewards/format_reward": 0.984375, "step": 52 }, { "completion_length": 66.4375, "epoch": 0.003401251403818386, "grad_norm": 4.657297519762943, "kl": 0.041259765625, "learning_rate": 9.982993197278912e-07, "loss": 0.0017, "reward": 3.25, "reward_std": 0.3035288602113724, "rewards/accuracy_reward": 0.578125, "rewards/format_count_numbers": 1.671875, "rewards/format_reward": 1.0, "step": 53 }, { "completion_length": 70.2265625, "epoch": 0.003465425958607412, "grad_norm": 2.3261413114277727, "kl": 0.048583984375, "learning_rate": 9.982672314208702e-07, "loss": 0.0019, "reward": 3.48046875, "reward_std": 0.2077426016330719, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.82421875, "rewards/format_reward": 1.0, "step": 54 }, { "completion_length": 69.4765625, "epoch": 0.003529600513396438, "grad_norm": 12.890917932396162, "kl": 0.045166015625, "learning_rate": 9.982351431138492e-07, "loss": 0.0018, "reward": 3.34765625, "reward_std": 0.25980181246995926, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.70703125, "rewards/format_reward": 1.0, "step": 55 }, { "completion_length": 70.75, "epoch": 0.0035937750681854647, "grad_norm": 2.3924995403034814, "kl": 0.0390625, "learning_rate": 9.982030548068284e-07, "loss": 0.0016, "reward": 2.83203125, "reward_std": 0.2434411644935608, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.18359375, "rewards/format_reward": 1.0, "step": 56 }, { "completion_length": 78.265625, "epoch": 0.0036579496229744908, "grad_norm": 4.097074998616903, "kl": 0.0467529296875, "learning_rate": 9.981709664998074e-07, "loss": 0.0019, "reward": 3.16796875, "reward_std": 0.323630690574646, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.54296875, "rewards/format_reward": 1.0, "step": 57 }, { "completion_length": 64.4453125, "epoch": 0.003722124177763517, "grad_norm": 2.8596095528212815, "kl": 0.045166015625, "learning_rate": 9.981388781927866e-07, "loss": 0.0018, "reward": 2.99609375, "reward_std": 0.3102172762155533, "rewards/accuracy_reward": 0.5390625, "rewards/format_count_numbers": 1.45703125, "rewards/format_reward": 1.0, "step": 58 }, { "completion_length": 64.9375, "epoch": 0.003786298732552543, "grad_norm": 2.728196990192836, "kl": 0.0457763671875, "learning_rate": 9.981067898857656e-07, "loss": 0.0018, "reward": 2.9609375, "reward_std": 0.2459762617945671, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.3515625, "rewards/format_reward": 1.0, "step": 59 }, { "completion_length": 69.0625, "epoch": 0.003850473287341569, "grad_norm": 3.4406926961923587, "kl": 0.03759765625, "learning_rate": 9.980747015787446e-07, "loss": 0.0015, "reward": 2.87109375, "reward_std": 0.34723127633333206, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.21484375, "rewards/format_reward": 0.9921875, "step": 60 }, { "completion_length": 66.5390625, "epoch": 0.0039146478421305956, "grad_norm": 3.057262479255094, "kl": 0.043212890625, "learning_rate": 9.980426132717238e-07, "loss": 0.0017, "reward": 3.38671875, "reward_std": 0.22124166041612625, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.71484375, "rewards/format_reward": 1.0, "step": 61 }, { "completion_length": 84.8125, "epoch": 0.003978822396919621, "grad_norm": 6.416434548752605, "kl": 0.0321044921875, "learning_rate": 9.980105249647028e-07, "loss": 0.0013, "reward": 3.08984375, "reward_std": 0.32453496754169464, "rewards/accuracy_reward": 0.4453125, "rewards/format_count_numbers": 1.64453125, "rewards/format_reward": 1.0, "step": 62 }, { "completion_length": 77.1953125, "epoch": 0.004042996951708648, "grad_norm": 3.732769709996209, "kl": 0.037841796875, "learning_rate": 9.979784366576818e-07, "loss": 0.0015, "reward": 3.1015625, "reward_std": 0.35314419865608215, "rewards/accuracy_reward": 0.53125, "rewards/format_count_numbers": 1.5703125, "rewards/format_reward": 1.0, "step": 63 }, { "completion_length": 77.5, "epoch": 0.004107171506497673, "grad_norm": 2.5167684293651877, "kl": 0.0311279296875, "learning_rate": 9.979463483506608e-07, "loss": 0.0012, "reward": 3.10546875, "reward_std": 0.2587262690067291, "rewards/accuracy_reward": 0.5078125, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 0.9921875, "step": 64 }, { "completion_length": 70.609375, "epoch": 0.0041713460612867, "grad_norm": 7.497854848230895, "kl": 0.03466796875, "learning_rate": 9.9791426004364e-07, "loss": 0.0014, "reward": 3.12109375, "reward_std": 0.35663464665412903, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.45703125, "rewards/format_reward": 1.0, "step": 65 }, { "completion_length": 67.3359375, "epoch": 0.004235520616075726, "grad_norm": 12.830822241443972, "kl": 0.118408203125, "learning_rate": 9.97882171736619e-07, "loss": 0.0047, "reward": 3.08984375, "reward_std": 0.2815767228603363, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.36328125, "rewards/format_reward": 1.0, "step": 66 }, { "completion_length": 80.15625, "epoch": 0.004299695170864752, "grad_norm": 6.252660074852895, "kl": 0.03729248046875, "learning_rate": 9.978500834295983e-07, "loss": 0.0015, "reward": 3.5703125, "reward_std": 0.21104412525892258, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.9609375, "rewards/format_reward": 1.0, "step": 67 }, { "completion_length": 72.375, "epoch": 0.004363869725653779, "grad_norm": 2.346394602052095, "kl": 0.025146484375, "learning_rate": 9.978179951225773e-07, "loss": 0.001, "reward": 3.26171875, "reward_std": 0.2549284026026726, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.48828125, "rewards/format_reward": 1.0, "step": 68 }, { "completion_length": 80.0078125, "epoch": 0.004428044280442804, "grad_norm": 6.9747477147107775, "kl": 0.053955078125, "learning_rate": 9.977859068155565e-07, "loss": 0.0022, "reward": 3.1171875, "reward_std": 0.23806139826774597, "rewards/accuracy_reward": 0.5625, "rewards/format_count_numbers": 1.5546875, "rewards/format_reward": 1.0, "step": 69 }, { "completion_length": 76.78125, "epoch": 0.004492218835231831, "grad_norm": 4.378472876328936, "kl": 0.029541015625, "learning_rate": 9.977538185085355e-07, "loss": 0.0012, "reward": 2.93359375, "reward_std": 0.19287973642349243, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.33984375, "rewards/format_reward": 0.984375, "step": 70 }, { "completion_length": 74.171875, "epoch": 0.0045563933900208565, "grad_norm": 2.358606331460238, "kl": 0.0230712890625, "learning_rate": 9.977217302015145e-07, "loss": 0.0009, "reward": 3.54296875, "reward_std": 0.26491738110780716, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.73828125, "rewards/format_reward": 1.0, "step": 71 }, { "completion_length": 75.34375, "epoch": 0.004620567944809883, "grad_norm": 65.47310912953631, "kl": 0.03179931640625, "learning_rate": 9.976896418944935e-07, "loss": 0.0013, "reward": 2.78125, "reward_std": 0.29492397606372833, "rewards/accuracy_reward": 0.546875, "rewards/format_count_numbers": 1.234375, "rewards/format_reward": 1.0, "step": 72 }, { "completion_length": 76.921875, "epoch": 0.004684742499598909, "grad_norm": 3.27041300953387, "kl": 0.0367431640625, "learning_rate": 9.976575535874727e-07, "loss": 0.0015, "reward": 3.33203125, "reward_std": 0.2555892765522003, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.59765625, "rewards/format_reward": 1.0, "step": 73 }, { "completion_length": 67.484375, "epoch": 0.004748917054387935, "grad_norm": 5.812215039650067, "kl": 0.03240966796875, "learning_rate": 9.976254652804517e-07, "loss": 0.0013, "reward": 3.43359375, "reward_std": 0.20885366201400757, "rewards/accuracy_reward": 0.8125, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 74 }, { "completion_length": 82.4921875, "epoch": 0.004813091609176962, "grad_norm": 4.4277516048263506, "kl": 0.0313720703125, "learning_rate": 9.97593376973431e-07, "loss": 0.0013, "reward": 3.2734375, "reward_std": 0.19161942601203918, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.5625, "rewards/format_reward": 1.0, "step": 75 }, { "completion_length": 67.109375, "epoch": 0.004877266163965987, "grad_norm": 3.0595066417538415, "kl": 0.04052734375, "learning_rate": 9.9756128866641e-07, "loss": 0.0016, "reward": 3.5625, "reward_std": 0.32848016172647476, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.84375, "rewards/format_reward": 1.0, "step": 76 }, { "completion_length": 82.9921875, "epoch": 0.004941440718755014, "grad_norm": 3.999806146457781, "kl": 0.0372314453125, "learning_rate": 9.975292003593891e-07, "loss": 0.0015, "reward": 3.0546875, "reward_std": 0.2797150984406471, "rewards/accuracy_reward": 0.4921875, "rewards/format_count_numbers": 1.5703125, "rewards/format_reward": 0.9921875, "step": 77 }, { "completion_length": 75.265625, "epoch": 0.00500561527354404, "grad_norm": 2.4007654591592953, "kl": 0.02508544921875, "learning_rate": 9.974971120523681e-07, "loss": 0.001, "reward": 3.30859375, "reward_std": 0.21722427010536194, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.59765625, "rewards/format_reward": 1.0, "step": 78 }, { "completion_length": 69.8671875, "epoch": 0.005069789828333066, "grad_norm": 4.413870539754506, "kl": 0.0401611328125, "learning_rate": 9.974650237453471e-07, "loss": 0.0016, "reward": 3.09765625, "reward_std": 0.24370676279067993, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.46484375, "rewards/format_reward": 0.9921875, "step": 79 }, { "completion_length": 71.3828125, "epoch": 0.005133964383122092, "grad_norm": 2.7469356159448375, "kl": 0.03594970703125, "learning_rate": 9.974329354383261e-07, "loss": 0.0014, "reward": 3.359375, "reward_std": 0.28942976146936417, "rewards/accuracy_reward": 0.765625, "rewards/format_count_numbers": 1.59375, "rewards/format_reward": 1.0, "step": 80 }, { "completion_length": 71.6875, "epoch": 0.005198138937911118, "grad_norm": 4.694669178987162, "kl": 0.04248046875, "learning_rate": 9.974008471313053e-07, "loss": 0.0017, "reward": 3.09765625, "reward_std": 0.3313465863466263, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.46484375, "rewards/format_reward": 0.9921875, "step": 81 }, { "completion_length": 71.9375, "epoch": 0.005262313492700145, "grad_norm": 5.239839542228686, "kl": 0.0465087890625, "learning_rate": 9.973687588242843e-07, "loss": 0.0019, "reward": 3.40234375, "reward_std": 0.2555918022990227, "rewards/accuracy_reward": 0.5625, "rewards/format_count_numbers": 1.84765625, "rewards/format_reward": 0.9921875, "step": 82 }, { "completion_length": 66.59375, "epoch": 0.0053264880474891705, "grad_norm": 9.58228205262393, "kl": 0.03302001953125, "learning_rate": 9.973366705172635e-07, "loss": 0.0013, "reward": 3.328125, "reward_std": 0.27802956849336624, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 83 }, { "completion_length": 73.7734375, "epoch": 0.005390662602278197, "grad_norm": 5.487582516234903, "kl": 0.0311279296875, "learning_rate": 9.973045822102425e-07, "loss": 0.0012, "reward": 3.28515625, "reward_std": 0.24403482675552368, "rewards/accuracy_reward": 0.5625, "rewards/format_count_numbers": 1.72265625, "rewards/format_reward": 1.0, "step": 84 }, { "completion_length": 77.0859375, "epoch": 0.005454837157067223, "grad_norm": 2.8797110234241217, "kl": 0.03607177734375, "learning_rate": 9.972724939032218e-07, "loss": 0.0014, "reward": 3.4296875, "reward_std": 0.3189963102340698, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.734375, "rewards/format_reward": 0.984375, "step": 85 }, { "completion_length": 71.2734375, "epoch": 0.005519011711856249, "grad_norm": 3.495421949980677, "kl": 0.043212890625, "learning_rate": 9.972404055962008e-07, "loss": 0.0017, "reward": 3.0625, "reward_std": 0.30399875342845917, "rewards/accuracy_reward": 0.5859375, "rewards/format_count_numbers": 1.484375, "rewards/format_reward": 0.9921875, "step": 86 }, { "completion_length": 67.015625, "epoch": 0.005583186266645275, "grad_norm": 6.764316503209916, "kl": 0.0382080078125, "learning_rate": 9.972083172891798e-07, "loss": 0.0015, "reward": 2.8671875, "reward_std": 0.3235751837491989, "rewards/accuracy_reward": 0.546875, "rewards/format_count_numbers": 1.3359375, "rewards/format_reward": 0.984375, "step": 87 }, { "completion_length": 77.1796875, "epoch": 0.005647360821434301, "grad_norm": 2.0348391619602713, "kl": 0.046142578125, "learning_rate": 9.971762289821588e-07, "loss": 0.0018, "reward": 3.046875, "reward_std": 0.26907191798090935, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.4375, "rewards/format_reward": 1.0, "step": 88 }, { "completion_length": 69.859375, "epoch": 0.005711535376223327, "grad_norm": 2.4510593826235993, "kl": 0.044677734375, "learning_rate": 9.97144140675138e-07, "loss": 0.0018, "reward": 3.40625, "reward_std": 0.3325708657503128, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.7109375, "rewards/format_reward": 1.0, "step": 89 }, { "completion_length": 69.421875, "epoch": 0.0057757099310123535, "grad_norm": 7.386556285996494, "kl": 0.0384521484375, "learning_rate": 9.97112052368117e-07, "loss": 0.0015, "reward": 3.45703125, "reward_std": 0.18665644526481628, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.84765625, "rewards/format_reward": 1.0, "step": 90 }, { "completion_length": 76.546875, "epoch": 0.00583988448580138, "grad_norm": 10.07527340286235, "kl": 0.03741455078125, "learning_rate": 9.97079964061096e-07, "loss": 0.0015, "reward": 3.4765625, "reward_std": 0.17688900232315063, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.7265625, "rewards/format_reward": 1.0, "step": 91 }, { "completion_length": 67.1015625, "epoch": 0.005904059040590406, "grad_norm": 6.961725253066966, "kl": 0.0450439453125, "learning_rate": 9.970478757540752e-07, "loss": 0.0018, "reward": 3.15234375, "reward_std": 0.18702887743711472, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.34765625, "rewards/format_reward": 1.0, "step": 92 }, { "completion_length": 67.75, "epoch": 0.005968233595379432, "grad_norm": 4.665158893240586, "kl": 0.0504150390625, "learning_rate": 9.970157874470542e-07, "loss": 0.002, "reward": 3.32421875, "reward_std": 0.17975258082151413, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.61328125, "rewards/format_reward": 0.9921875, "step": 93 }, { "completion_length": 62.765625, "epoch": 0.006032408150168458, "grad_norm": 6.052438787803174, "kl": 0.0577392578125, "learning_rate": 9.969836991400334e-07, "loss": 0.0023, "reward": 2.96484375, "reward_std": 0.17256294190883636, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.35546875, "rewards/format_reward": 0.9921875, "step": 94 }, { "completion_length": 67.546875, "epoch": 0.0060965827049574844, "grad_norm": 3.8543628664738248, "kl": 0.1171875, "learning_rate": 9.969516108330124e-07, "loss": 0.0047, "reward": 2.75, "reward_std": 0.25491149723529816, "rewards/accuracy_reward": 0.5390625, "rewards/format_count_numbers": 1.21875, "rewards/format_reward": 0.9921875, "step": 95 }, { "completion_length": 57.5234375, "epoch": 0.00616075725974651, "grad_norm": 10.068717750488322, "kl": 0.050048828125, "learning_rate": 9.969195225259914e-07, "loss": 0.002, "reward": 3.34765625, "reward_std": 0.15529648214578629, "rewards/accuracy_reward": 0.859375, "rewards/format_count_numbers": 1.49609375, "rewards/format_reward": 0.9921875, "step": 96 }, { "completion_length": 77.4140625, "epoch": 0.006224931814535537, "grad_norm": 76.26414395404969, "kl": 0.0400390625, "learning_rate": 9.968874342189706e-07, "loss": 0.0016, "reward": 3.3046875, "reward_std": 0.18849123269319534, "rewards/accuracy_reward": 0.6015625, "rewards/format_count_numbers": 1.7109375, "rewards/format_reward": 0.9921875, "step": 97 }, { "completion_length": 64.359375, "epoch": 0.006289106369324563, "grad_norm": 6.324698594548965, "kl": 0.05126953125, "learning_rate": 9.968553459119496e-07, "loss": 0.0021, "reward": 3.09765625, "reward_std": 0.16071559116244316, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.34765625, "rewards/format_reward": 0.9921875, "step": 98 }, { "completion_length": 69.578125, "epoch": 0.006353280924113589, "grad_norm": 2.6642177820540494, "kl": 0.05078125, "learning_rate": 9.968232576049286e-07, "loss": 0.002, "reward": 3.20703125, "reward_std": 0.19081907719373703, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.57421875, "rewards/format_reward": 1.0, "step": 99 }, { "completion_length": 69.6171875, "epoch": 0.006417455478902615, "grad_norm": 3.1800877639787006, "kl": 0.04925537109375, "learning_rate": 9.967911692979078e-07, "loss": 0.002, "reward": 3.15625, "reward_std": 0.25464994460344315, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.46875, "rewards/format_reward": 1.0, "step": 100 }, { "completion_length": 69.296875, "epoch": 0.006481630033691641, "grad_norm": 5.47896444311625, "kl": 0.04052734375, "learning_rate": 9.967590809908868e-07, "loss": 0.0016, "reward": 3.16796875, "reward_std": 0.15877367183566093, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 1.0, "step": 101 }, { "completion_length": 66.703125, "epoch": 0.0065458045884806675, "grad_norm": 6.581183385830175, "kl": 0.0469970703125, "learning_rate": 9.96726992683866e-07, "loss": 0.0019, "reward": 3.359375, "reward_std": 0.2588741034269333, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 102 }, { "completion_length": 76.7109375, "epoch": 0.006609979143269693, "grad_norm": 5.79571682079651, "kl": 0.0438232421875, "learning_rate": 9.96694904376845e-07, "loss": 0.0018, "reward": 3.125, "reward_std": 0.23913496732711792, "rewards/accuracy_reward": 0.546875, "rewards/format_count_numbers": 1.5859375, "rewards/format_reward": 0.9921875, "step": 103 }, { "completion_length": 79.7578125, "epoch": 0.00667415369805872, "grad_norm": 3.1092072916938407, "kl": 0.0345458984375, "learning_rate": 9.966628160698242e-07, "loss": 0.0014, "reward": 3.2578125, "reward_std": 0.24152958393096924, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.5859375, "rewards/format_reward": 1.0, "step": 104 }, { "completion_length": 71.3046875, "epoch": 0.006738328252847746, "grad_norm": 8.094574176465539, "kl": 0.037841796875, "learning_rate": 9.966307277628032e-07, "loss": 0.0015, "reward": 3.24609375, "reward_std": 0.37829458713531494, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.59765625, "rewards/format_reward": 1.0, "step": 105 }, { "completion_length": 80.2578125, "epoch": 0.006802502807636772, "grad_norm": 7.53058354344353, "kl": 0.03106689453125, "learning_rate": 9.965986394557822e-07, "loss": 0.0012, "reward": 3.19921875, "reward_std": 0.15539800375699997, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.56640625, "rewards/format_reward": 0.984375, "step": 106 }, { "completion_length": 75.828125, "epoch": 0.006866677362425798, "grad_norm": 10.101290133760061, "kl": 0.0355224609375, "learning_rate": 9.965665511487612e-07, "loss": 0.0014, "reward": 3.59765625, "reward_std": 0.2908743619918823, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.95703125, "rewards/format_reward": 1.0, "step": 107 }, { "completion_length": 70.15625, "epoch": 0.006930851917214824, "grad_norm": 2.667528641039127, "kl": 0.02984619140625, "learning_rate": 9.965344628417405e-07, "loss": 0.0012, "reward": 3.109375, "reward_std": 0.21665052324533463, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.484375, "rewards/format_reward": 1.0, "step": 108 }, { "completion_length": 67.7265625, "epoch": 0.006995026472003851, "grad_norm": 1.7942554364390482, "kl": 0.03582763671875, "learning_rate": 9.965023745347195e-07, "loss": 0.0014, "reward": 3.16796875, "reward_std": 0.20552908629179, "rewards/accuracy_reward": 0.8125, "rewards/format_count_numbers": 1.35546875, "rewards/format_reward": 1.0, "step": 109 }, { "completion_length": 69.875, "epoch": 0.007059201026792876, "grad_norm": 3.9047862888632094, "kl": 0.035888671875, "learning_rate": 9.964702862276987e-07, "loss": 0.0014, "reward": 2.9296875, "reward_std": 0.21844128519296646, "rewards/accuracy_reward": 0.59375, "rewards/format_count_numbers": 1.3359375, "rewards/format_reward": 1.0, "step": 110 }, { "completion_length": 73.9296875, "epoch": 0.007123375581581903, "grad_norm": 1.9901205037970706, "kl": 0.0367431640625, "learning_rate": 9.964381979206777e-07, "loss": 0.0015, "reward": 3.80078125, "reward_std": 0.12836876511573792, "rewards/accuracy_reward": 0.8203125, "rewards/format_count_numbers": 1.98046875, "rewards/format_reward": 1.0, "step": 111 }, { "completion_length": 70.2265625, "epoch": 0.007187550136370929, "grad_norm": 2.6629757273994272, "kl": 0.029296875, "learning_rate": 9.964061096136569e-07, "loss": 0.0012, "reward": 2.9765625, "reward_std": 0.192819744348526, "rewards/accuracy_reward": 0.7421875, "rewards/format_count_numbers": 1.2421875, "rewards/format_reward": 0.9921875, "step": 112 }, { "completion_length": 89.484375, "epoch": 0.007251724691159955, "grad_norm": 2.966985281450592, "kl": 0.0301513671875, "learning_rate": 9.963740213066359e-07, "loss": 0.0012, "reward": 3.60546875, "reward_std": 0.2533341944217682, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.95703125, "rewards/format_reward": 0.984375, "step": 113 }, { "completion_length": 70.59375, "epoch": 0.0073158992459489815, "grad_norm": 2.7125397233136437, "kl": 0.02899169921875, "learning_rate": 9.963419329996149e-07, "loss": 0.0012, "reward": 3.4453125, "reward_std": 0.18551141023635864, "rewards/accuracy_reward": 0.8359375, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 114 }, { "completion_length": 70.8515625, "epoch": 0.007380073800738007, "grad_norm": 3.8788221946660197, "kl": 0.044189453125, "learning_rate": 9.963098446925939e-07, "loss": 0.0018, "reward": 3.2265625, "reward_std": 0.4408875107765198, "rewards/accuracy_reward": 0.5234375, "rewards/format_count_numbers": 1.7109375, "rewards/format_reward": 0.9921875, "step": 115 }, { "completion_length": 71.4375, "epoch": 0.007444248355527034, "grad_norm": 2.4238411680755836, "kl": 0.0294189453125, "learning_rate": 9.96277756385573e-07, "loss": 0.0012, "reward": 2.98046875, "reward_std": 0.3246304541826248, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.35546875, "rewards/format_reward": 0.9921875, "step": 116 }, { "completion_length": 75.3671875, "epoch": 0.007508422910316059, "grad_norm": 2.1672853496708027, "kl": 0.03167724609375, "learning_rate": 9.96245668078552e-07, "loss": 0.0013, "reward": 3.28515625, "reward_std": 0.16477391123771667, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.61328125, "rewards/format_reward": 0.9921875, "step": 117 }, { "completion_length": 72.8125, "epoch": 0.007572597465105086, "grad_norm": 4.153439569042706, "kl": 0.0335693359375, "learning_rate": 9.962135797715313e-07, "loss": 0.0013, "reward": 3.45703125, "reward_std": 0.1774558126926422, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 1.0, "step": 118 }, { "completion_length": 71.96875, "epoch": 0.007636772019894112, "grad_norm": 4.332907387044588, "kl": 0.0391845703125, "learning_rate": 9.961814914645103e-07, "loss": 0.0016, "reward": 2.921875, "reward_std": 0.26686903089284897, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.2109375, "rewards/format_reward": 1.0, "step": 119 }, { "completion_length": 80.3671875, "epoch": 0.007700946574683138, "grad_norm": 8.242677153517718, "kl": 0.0345458984375, "learning_rate": 9.961494031574895e-07, "loss": 0.0014, "reward": 3.37890625, "reward_std": 0.2326306775212288, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.72265625, "rewards/format_reward": 0.9921875, "step": 120 }, { "completion_length": 77.03125, "epoch": 0.007765121129472165, "grad_norm": 5.62963310635079, "kl": 0.0301513671875, "learning_rate": 9.961173148504685e-07, "loss": 0.0012, "reward": 3.38671875, "reward_std": 0.25540195405483246, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 1.0, "step": 121 }, { "completion_length": 72.515625, "epoch": 0.007829295684261191, "grad_norm": 5.019156549606577, "kl": 0.0352783203125, "learning_rate": 9.960852265434475e-07, "loss": 0.0014, "reward": 3.07421875, "reward_std": 0.17285499721765518, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.33984375, "rewards/format_reward": 1.0, "step": 122 }, { "completion_length": 72.046875, "epoch": 0.007893470239050217, "grad_norm": 172.86521654796587, "kl": 0.03363037109375, "learning_rate": 9.960531382364265e-07, "loss": 0.0013, "reward": 3.49609375, "reward_std": 0.18784459680318832, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 0.9921875, "step": 123 }, { "completion_length": 81.546875, "epoch": 0.007957644793839242, "grad_norm": 2.980011330259798, "kl": 0.0408935546875, "learning_rate": 9.960210499294057e-07, "loss": 0.0016, "reward": 3.41015625, "reward_std": 0.26077425479888916, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.72265625, "rewards/format_reward": 1.0, "step": 124 }, { "completion_length": 76.75, "epoch": 0.008021819348628268, "grad_norm": 11.648469118233269, "kl": 0.03179931640625, "learning_rate": 9.959889616223847e-07, "loss": 0.0013, "reward": 3.47265625, "reward_std": 0.2088487520813942, "rewards/accuracy_reward": 0.6015625, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 125 }, { "completion_length": 73.171875, "epoch": 0.008085993903417295, "grad_norm": 8.91710759838637, "kl": 0.03350830078125, "learning_rate": 9.959568733153637e-07, "loss": 0.0013, "reward": 3.13671875, "reward_std": 0.2274910733103752, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.35546875, "rewards/format_reward": 1.0, "step": 126 }, { "completion_length": 85.359375, "epoch": 0.008150168458206321, "grad_norm": 1.741816818144556, "kl": 0.0216064453125, "learning_rate": 9.95924785008343e-07, "loss": 0.0009, "reward": 2.6328125, "reward_std": 0.1547919102013111, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 0.984375, "rewards/format_reward": 1.0, "step": 127 }, { "completion_length": 74.5625, "epoch": 0.008214343012995347, "grad_norm": 2.6793007127892614, "kl": 0.0460205078125, "learning_rate": 9.95892696701322e-07, "loss": 0.0018, "reward": 3.48046875, "reward_std": 0.188736230134964, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.73828125, "rewards/format_reward": 0.9921875, "step": 128 }, { "completion_length": 80.5390625, "epoch": 0.008278517567784374, "grad_norm": 8.14669625843878, "kl": 0.03326416015625, "learning_rate": 9.958606083943011e-07, "loss": 0.0013, "reward": 3.59765625, "reward_std": 0.23143374174833298, "rewards/accuracy_reward": 0.7421875, "rewards/format_count_numbers": 1.86328125, "rewards/format_reward": 0.9921875, "step": 129 }, { "completion_length": 82.234375, "epoch": 0.0083426921225734, "grad_norm": 2.7245691995074965, "kl": 0.02880859375, "learning_rate": 9.958285200872801e-07, "loss": 0.0012, "reward": 3.0703125, "reward_std": 0.28687404096126556, "rewards/accuracy_reward": 0.5078125, "rewards/format_count_numbers": 1.5703125, "rewards/format_reward": 0.9921875, "step": 130 }, { "completion_length": 73.1328125, "epoch": 0.008406866677362426, "grad_norm": 5.3208068953625345, "kl": 0.03204345703125, "learning_rate": 9.957964317802592e-07, "loss": 0.0013, "reward": 3.03125, "reward_std": 0.19568345695734024, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.25, "rewards/format_reward": 1.0, "step": 131 }, { "completion_length": 73.2734375, "epoch": 0.008471041232151451, "grad_norm": 2.684705369760692, "kl": 0.03472900390625, "learning_rate": 9.957643434732384e-07, "loss": 0.0014, "reward": 3.34375, "reward_std": 0.1942191794514656, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.5625, "rewards/format_reward": 1.0, "step": 132 }, { "completion_length": 79.359375, "epoch": 0.008535215786940479, "grad_norm": 4.770259276932602, "kl": 0.03912353515625, "learning_rate": 9.957322551662174e-07, "loss": 0.0016, "reward": 3.38671875, "reward_std": 0.28095004707574844, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.72265625, "rewards/format_reward": 0.9921875, "step": 133 }, { "completion_length": 77.875, "epoch": 0.008599390341729504, "grad_norm": 2.007034772347311, "kl": 0.0369873046875, "learning_rate": 9.957001668591964e-07, "loss": 0.0015, "reward": 3.41796875, "reward_std": 0.15443194285035133, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.61328125, "rewards/format_reward": 1.0, "step": 134 }, { "completion_length": 76.3671875, "epoch": 0.00866356489651853, "grad_norm": 5.867057502447185, "kl": 0.03289794921875, "learning_rate": 9.956680785521756e-07, "loss": 0.0013, "reward": 3.31640625, "reward_std": 0.260734885931015, "rewards/accuracy_reward": 0.578125, "rewards/format_count_numbers": 1.73828125, "rewards/format_reward": 1.0, "step": 135 }, { "completion_length": 91.40625, "epoch": 0.008727739451307557, "grad_norm": 3.3264591723301042, "kl": 0.037841796875, "learning_rate": 9.956359902451546e-07, "loss": 0.0015, "reward": 3.01171875, "reward_std": 0.2456851825118065, "rewards/accuracy_reward": 0.421875, "rewards/format_count_numbers": 1.58984375, "rewards/format_reward": 1.0, "step": 136 }, { "completion_length": 73.9609375, "epoch": 0.008791914006096583, "grad_norm": 1.8525091716102906, "kl": 0.02886962890625, "learning_rate": 9.956039019381338e-07, "loss": 0.0012, "reward": 3.16796875, "reward_std": 0.054446361027657986, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.36328125, "rewards/format_reward": 1.0, "step": 137 }, { "completion_length": 82.421875, "epoch": 0.008856088560885609, "grad_norm": 3.221242319181372, "kl": 0.0504150390625, "learning_rate": 9.955718136311128e-07, "loss": 0.002, "reward": 3.38671875, "reward_std": 0.24750632792711258, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.72265625, "rewards/format_reward": 0.9921875, "step": 138 }, { "completion_length": 78.4296875, "epoch": 0.008920263115674634, "grad_norm": 2.908425383296693, "kl": 0.02789306640625, "learning_rate": 9.955397253240918e-07, "loss": 0.0011, "reward": 3.140625, "reward_std": 0.22043407708406448, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 0.9921875, "step": 139 }, { "completion_length": 76.2734375, "epoch": 0.008984437670463662, "grad_norm": 69.81281381804405, "kl": 0.037353515625, "learning_rate": 9.95507637017071e-07, "loss": 0.0015, "reward": 3.2265625, "reward_std": 0.25224410742521286, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.6015625, "rewards/format_reward": 0.9921875, "step": 140 }, { "completion_length": 84.6953125, "epoch": 0.009048612225252687, "grad_norm": 5.582225121353137, "kl": 0.0328369140625, "learning_rate": 9.9547554871005e-07, "loss": 0.0013, "reward": 3.0546875, "reward_std": 0.24409383535385132, "rewards/accuracy_reward": 0.578125, "rewards/format_count_numbers": 1.4765625, "rewards/format_reward": 1.0, "step": 141 }, { "completion_length": 86.4921875, "epoch": 0.009112786780041713, "grad_norm": 6.841154880421374, "kl": 0.1744384765625, "learning_rate": 9.95443460403029e-07, "loss": 0.007, "reward": 3.17578125, "reward_std": 0.16243606060743332, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.48828125, "rewards/format_reward": 1.0, "step": 142 }, { "completion_length": 84.5703125, "epoch": 0.00917696133483074, "grad_norm": 3.638970858152364, "kl": 0.0343017578125, "learning_rate": 9.954113720960082e-07, "loss": 0.0014, "reward": 3.23828125, "reward_std": 0.27304429560899734, "rewards/accuracy_reward": 0.515625, "rewards/format_count_numbers": 1.72265625, "rewards/format_reward": 1.0, "step": 143 }, { "completion_length": 83.40625, "epoch": 0.009241135889619766, "grad_norm": 4.522345160206695, "kl": 0.0487060546875, "learning_rate": 9.953792837889872e-07, "loss": 0.0019, "reward": 3.37109375, "reward_std": 0.2966039180755615, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 1.0, "step": 144 }, { "completion_length": 83.421875, "epoch": 0.009305310444408792, "grad_norm": 7.380300919018527, "kl": 0.02728271484375, "learning_rate": 9.953471954819664e-07, "loss": 0.0011, "reward": 3.1875, "reward_std": 0.21595831215381622, "rewards/accuracy_reward": 0.5625, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 145 }, { "completion_length": 87.9140625, "epoch": 0.009369484999197817, "grad_norm": 5.3843936760731435, "kl": 0.0396728515625, "learning_rate": 9.953151071749454e-07, "loss": 0.0016, "reward": 3.48828125, "reward_std": 0.29947739839553833, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.82421875, "rewards/format_reward": 1.0, "step": 146 }, { "completion_length": 84.15625, "epoch": 0.009433659553986845, "grad_norm": 2.10354290562444, "kl": 0.035400390625, "learning_rate": 9.952830188679244e-07, "loss": 0.0014, "reward": 3.2421875, "reward_std": 0.13888052850961685, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 147 }, { "completion_length": 85.2265625, "epoch": 0.00949783410877587, "grad_norm": 26.198299549627695, "kl": 0.0377197265625, "learning_rate": 9.952509305609036e-07, "loss": 0.0015, "reward": 3.19921875, "reward_std": 0.3435995280742645, "rewards/accuracy_reward": 0.484375, "rewards/format_count_numbers": 1.71484375, "rewards/format_reward": 1.0, "step": 148 }, { "completion_length": 77.6875, "epoch": 0.009562008663564896, "grad_norm": 2.2014767998798597, "kl": 0.04052734375, "learning_rate": 9.952188422538826e-07, "loss": 0.0016, "reward": 3.109375, "reward_std": 0.18990949541330338, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 149 }, { "completion_length": 86.5234375, "epoch": 0.009626183218353923, "grad_norm": 3.7167303088615733, "kl": 0.02752685546875, "learning_rate": 9.951867539468616e-07, "loss": 0.0011, "reward": 3.0703125, "reward_std": 0.3137922137975693, "rewards/accuracy_reward": 0.6015625, "rewards/format_count_numbers": 1.4765625, "rewards/format_reward": 0.9921875, "step": 150 }, { "completion_length": 85.1640625, "epoch": 0.009690357773142949, "grad_norm": 4.175840708663256, "kl": 0.0296630859375, "learning_rate": 9.951546656398408e-07, "loss": 0.0012, "reward": 3.41015625, "reward_std": 0.18387350719422102, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 0.9921875, "step": 151 }, { "completion_length": 78.6015625, "epoch": 0.009754532327931975, "grad_norm": 1.9822559184582953, "kl": 0.0369873046875, "learning_rate": 9.951225773328198e-07, "loss": 0.0015, "reward": 3.30078125, "reward_std": 0.22633200883865356, "rewards/accuracy_reward": 0.5703125, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 1.0, "step": 152 }, { "completion_length": 100.1953125, "epoch": 0.009818706882721, "grad_norm": 6.95505305805243, "kl": 0.0323486328125, "learning_rate": 9.950904890257988e-07, "loss": 0.0013, "reward": 3.3828125, "reward_std": 0.27803826332092285, "rewards/accuracy_reward": 0.5703125, "rewards/format_count_numbers": 1.8359375, "rewards/format_reward": 0.9765625, "step": 153 }, { "completion_length": 96.5703125, "epoch": 0.009882881437510028, "grad_norm": 3.906970922010899, "kl": 0.04315185546875, "learning_rate": 9.95058400718778e-07, "loss": 0.0017, "reward": 3.19140625, "reward_std": 0.4169163405895233, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.58984375, "rewards/format_reward": 0.9609375, "step": 154 }, { "completion_length": 80.3359375, "epoch": 0.009947055992299053, "grad_norm": 3.628791735089063, "kl": 0.02752685546875, "learning_rate": 9.95026312411757e-07, "loss": 0.0011, "reward": 2.9140625, "reward_std": 0.29985813796520233, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.25, "rewards/format_reward": 0.9921875, "step": 155 }, { "completion_length": 89.8046875, "epoch": 0.01001123054708808, "grad_norm": 2.993371763950907, "kl": 0.02532958984375, "learning_rate": 9.949942241047363e-07, "loss": 0.001, "reward": 3.5703125, "reward_std": 0.34663237631320953, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.9609375, "rewards/format_reward": 0.984375, "step": 156 }, { "completion_length": 82.5, "epoch": 0.010075405101877107, "grad_norm": 3.4536790618347486, "kl": 0.03021240234375, "learning_rate": 9.949621357977153e-07, "loss": 0.0012, "reward": 3.48828125, "reward_std": 0.2875918447971344, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 0.9921875, "step": 157 }, { "completion_length": 84.484375, "epoch": 0.010139579656666132, "grad_norm": 9.831040603261537, "kl": 0.220703125, "learning_rate": 9.949300474906943e-07, "loss": 0.0088, "reward": 3.58203125, "reward_std": 0.11652141809463501, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 158 }, { "completion_length": 85.0859375, "epoch": 0.010203754211455158, "grad_norm": 38.92993687592298, "kl": 0.0340576171875, "learning_rate": 9.948979591836735e-07, "loss": 0.0014, "reward": 3.5625, "reward_std": 0.15991678088903427, "rewards/accuracy_reward": 0.8203125, "rewards/format_count_numbers": 1.7421875, "rewards/format_reward": 1.0, "step": 159 }, { "completion_length": 79.046875, "epoch": 0.010267928766244184, "grad_norm": 4.0346708620650835, "kl": 0.02191162109375, "learning_rate": 9.948658708766525e-07, "loss": 0.0009, "reward": 2.9921875, "reward_std": 0.2673698216676712, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.2265625, "rewards/format_reward": 0.984375, "step": 160 }, { "completion_length": 93.453125, "epoch": 0.010332103321033211, "grad_norm": 1.8562842308505547, "kl": 0.037353515625, "learning_rate": 9.948337825696315e-07, "loss": 0.0015, "reward": 3.1875, "reward_std": 0.19912117719650269, "rewards/accuracy_reward": 0.6015625, "rewards/format_count_numbers": 1.5859375, "rewards/format_reward": 1.0, "step": 161 }, { "completion_length": 83.734375, "epoch": 0.010396277875822237, "grad_norm": 6.323565148375347, "kl": 0.02825927734375, "learning_rate": 9.948016942626107e-07, "loss": 0.0011, "reward": 3.34375, "reward_std": 0.21071936190128326, "rewards/accuracy_reward": 0.7421875, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 0.984375, "step": 162 }, { "completion_length": 81.3046875, "epoch": 0.010460452430611262, "grad_norm": 4.139572905745001, "kl": 0.029052734375, "learning_rate": 9.947696059555897e-07, "loss": 0.0012, "reward": 3.04296875, "reward_std": 0.29703205823898315, "rewards/accuracy_reward": 0.5625, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 1.0, "step": 163 }, { "completion_length": 76.125, "epoch": 0.01052462698540029, "grad_norm": 2.7648686464439822, "kl": 0.0252685546875, "learning_rate": 9.94737517648569e-07, "loss": 0.001, "reward": 3.390625, "reward_std": 0.17688900232315063, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 164 }, { "completion_length": 80.359375, "epoch": 0.010588801540189315, "grad_norm": 8.517375820803469, "kl": 0.03009033203125, "learning_rate": 9.94705429341548e-07, "loss": 0.0012, "reward": 3.12109375, "reward_std": 0.33453139662742615, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.46484375, "rewards/format_reward": 0.984375, "step": 165 }, { "completion_length": 74.140625, "epoch": 0.010652976094978341, "grad_norm": 13.798127005048082, "kl": 0.03436279296875, "learning_rate": 9.94673341034527e-07, "loss": 0.0014, "reward": 3.32421875, "reward_std": 0.1938636675477028, "rewards/accuracy_reward": 0.8515625, "rewards/format_count_numbers": 1.47265625, "rewards/format_reward": 1.0, "step": 166 }, { "completion_length": 89.6875, "epoch": 0.010717150649767367, "grad_norm": 1.4991333191981464, "kl": 0.031005859375, "learning_rate": 9.946412527275061e-07, "loss": 0.0012, "reward": 3.08984375, "reward_std": 0.256519578397274, "rewards/accuracy_reward": 0.53125, "rewards/format_count_numbers": 1.56640625, "rewards/format_reward": 0.9921875, "step": 167 }, { "completion_length": 75.0703125, "epoch": 0.010781325204556394, "grad_norm": 3.45279085370487, "kl": 0.0384521484375, "learning_rate": 9.946091644204851e-07, "loss": 0.0015, "reward": 3.671875, "reward_std": 0.25596361607313156, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.8671875, "rewards/format_reward": 1.0, "step": 168 }, { "completion_length": 83.171875, "epoch": 0.01084549975934542, "grad_norm": 3.2484122379759017, "kl": 0.03173828125, "learning_rate": 9.945770761134641e-07, "loss": 0.0013, "reward": 3.234375, "reward_std": 0.20317253470420837, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.453125, "rewards/format_reward": 1.0, "step": 169 }, { "completion_length": 82.7734375, "epoch": 0.010909674314134445, "grad_norm": 6.212774921649588, "kl": 0.0299072265625, "learning_rate": 9.945449878064433e-07, "loss": 0.0012, "reward": 3.49609375, "reward_std": 0.12981030344963074, "rewards/accuracy_reward": 0.765625, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 1.0, "step": 170 }, { "completion_length": 73.2578125, "epoch": 0.010973848868923473, "grad_norm": 2.7189307025534237, "kl": 0.03564453125, "learning_rate": 9.945128994994223e-07, "loss": 0.0014, "reward": 3.0546875, "reward_std": 0.10720711201429367, "rewards/accuracy_reward": 0.8203125, "rewards/format_count_numbers": 1.234375, "rewards/format_reward": 1.0, "step": 171 }, { "completion_length": 76.7578125, "epoch": 0.011038023423712498, "grad_norm": 2.9999929786732147, "kl": 0.03466796875, "learning_rate": 9.944808111924015e-07, "loss": 0.0014, "reward": 3.20703125, "reward_std": 0.2519870027899742, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.49609375, "rewards/format_reward": 1.0, "step": 172 }, { "completion_length": 76.9375, "epoch": 0.011102197978501524, "grad_norm": 2.873932479263256, "kl": 0.037109375, "learning_rate": 9.944487228853805e-07, "loss": 0.0015, "reward": 3.3046875, "reward_std": 0.2227931022644043, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 173 }, { "completion_length": 69.9296875, "epoch": 0.01116637253329055, "grad_norm": 6.091517398086225, "kl": 0.03564453125, "learning_rate": 9.944166345783595e-07, "loss": 0.0014, "reward": 3.5078125, "reward_std": 0.18962866812944412, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.734375, "rewards/format_reward": 1.0, "step": 174 }, { "completion_length": 80.5625, "epoch": 0.011230547088079577, "grad_norm": 2.820888222245219, "kl": 0.044189453125, "learning_rate": 9.943845462713388e-07, "loss": 0.0018, "reward": 2.97265625, "reward_std": 0.1426027175039053, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.33984375, "rewards/format_reward": 1.0, "step": 175 }, { "completion_length": 80.125, "epoch": 0.011294721642868603, "grad_norm": 2.2693076636170884, "kl": 0.05224609375, "learning_rate": 9.943524579643178e-07, "loss": 0.0021, "reward": 3.0703125, "reward_std": 0.27271443605422974, "rewards/accuracy_reward": 0.34375, "rewards/format_count_numbers": 1.7265625, "rewards/format_reward": 1.0, "step": 176 }, { "completion_length": 71.1640625, "epoch": 0.011358896197657628, "grad_norm": 5.291209352425651, "kl": 0.03515625, "learning_rate": 9.943203696572968e-07, "loss": 0.0014, "reward": 2.99609375, "reward_std": 0.3106383979320526, "rewards/accuracy_reward": 0.5, "rewards/format_count_numbers": 1.49609375, "rewards/format_reward": 1.0, "step": 177 }, { "completion_length": 73.984375, "epoch": 0.011423070752446654, "grad_norm": 5.5675141265794945, "kl": 0.039306640625, "learning_rate": 9.94288281350276e-07, "loss": 0.0016, "reward": 3.31640625, "reward_std": 0.2747081220149994, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 0.9921875, "step": 178 }, { "completion_length": 70.0703125, "epoch": 0.011487245307235681, "grad_norm": 6.985645753951893, "kl": 0.0343017578125, "learning_rate": 9.94256193043255e-07, "loss": 0.0014, "reward": 3.15234375, "reward_std": 0.17097023129463196, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.49609375, "rewards/format_reward": 1.0, "step": 179 }, { "completion_length": 71.8203125, "epoch": 0.011551419862024707, "grad_norm": 7.739954317659845, "kl": 0.041259765625, "learning_rate": 9.942241047362342e-07, "loss": 0.0017, "reward": 3.19140625, "reward_std": 0.16861121356487274, "rewards/accuracy_reward": 0.8203125, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 180 }, { "completion_length": 84.7265625, "epoch": 0.011615594416813733, "grad_norm": 9.13554190308794, "kl": 0.04150390625, "learning_rate": 9.941920164292132e-07, "loss": 0.0017, "reward": 3.16015625, "reward_std": 0.18764331191778183, "rewards/accuracy_reward": 0.4453125, "rewards/format_count_numbers": 1.72265625, "rewards/format_reward": 0.9921875, "step": 181 }, { "completion_length": 68.8125, "epoch": 0.01167976897160276, "grad_norm": 6.684857907458709, "kl": 0.037109375, "learning_rate": 9.941599281221922e-07, "loss": 0.0015, "reward": 3.42578125, "reward_std": 0.10340797528624535, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 182 }, { "completion_length": 62.53125, "epoch": 0.011743943526391786, "grad_norm": 20.276682439562684, "kl": 0.0408935546875, "learning_rate": 9.941278398151714e-07, "loss": 0.0016, "reward": 2.91796875, "reward_std": 0.2412276715040207, "rewards/accuracy_reward": 0.796875, "rewards/format_count_numbers": 1.12109375, "rewards/format_reward": 1.0, "step": 183 }, { "completion_length": 73.6484375, "epoch": 0.011808118081180811, "grad_norm": 1.2059821012195944, "kl": 0.036376953125, "learning_rate": 9.940957515081504e-07, "loss": 0.0015, "reward": 3.296875, "reward_std": 0.11225596815347672, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 184 }, { "completion_length": 63.8671875, "epoch": 0.011872292635969837, "grad_norm": 3.6775666032489127, "kl": 0.035888671875, "learning_rate": 9.940636632011294e-07, "loss": 0.0014, "reward": 3.5390625, "reward_std": 0.18884866684675217, "rewards/accuracy_reward": 0.7890625, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 1.0, "step": 185 }, { "completion_length": 72.78125, "epoch": 0.011936467190758865, "grad_norm": 4.4157184419582975, "kl": 0.041015625, "learning_rate": 9.940315748941084e-07, "loss": 0.0016, "reward": 3.25, "reward_std": 0.23224648088216782, "rewards/accuracy_reward": 0.7890625, "rewards/format_count_numbers": 1.4765625, "rewards/format_reward": 0.984375, "step": 186 }, { "completion_length": 71.859375, "epoch": 0.01200064174554789, "grad_norm": 9.668761760285799, "kl": 0.0509033203125, "learning_rate": 9.939994865870876e-07, "loss": 0.002, "reward": 3.3203125, "reward_std": 0.23759140819311142, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 187 }, { "completion_length": 80.5859375, "epoch": 0.012064816300336916, "grad_norm": 3.460844463396569, "kl": 0.0325927734375, "learning_rate": 9.939673982800666e-07, "loss": 0.0013, "reward": 2.9921875, "reward_std": 0.24632348865270615, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.3671875, "rewards/format_reward": 0.9921875, "step": 188 }, { "completion_length": 73.109375, "epoch": 0.012128990855125943, "grad_norm": 6.373996408717781, "kl": 0.03350830078125, "learning_rate": 9.939353099730458e-07, "loss": 0.0013, "reward": 3.39453125, "reward_std": 0.1775766797363758, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.73828125, "rewards/format_reward": 1.0, "step": 189 }, { "completion_length": 70.765625, "epoch": 0.012193165409914969, "grad_norm": 10.07291127206197, "kl": 0.0396728515625, "learning_rate": 9.939032216660248e-07, "loss": 0.0016, "reward": 3.1953125, "reward_std": 0.14389308914542198, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.484375, "rewards/format_reward": 1.0, "step": 190 }, { "completion_length": 87.0390625, "epoch": 0.012257339964703995, "grad_norm": 4.055183447098594, "kl": 0.0389404296875, "learning_rate": 9.93871133359004e-07, "loss": 0.0016, "reward": 3.34765625, "reward_std": 0.20216327160596848, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.72265625, "rewards/format_reward": 1.0, "step": 191 }, { "completion_length": 72.328125, "epoch": 0.01232151451949302, "grad_norm": 2.64817332710522, "kl": 0.056396484375, "learning_rate": 9.93839045051983e-07, "loss": 0.0023, "reward": 3.703125, "reward_std": 0.17855798825621605, "rewards/accuracy_reward": 0.8359375, "rewards/format_count_numbers": 1.8671875, "rewards/format_reward": 1.0, "step": 192 }, { "completion_length": 71.125, "epoch": 0.012385689074282048, "grad_norm": 2.216725629073317, "kl": 0.0364990234375, "learning_rate": 9.93806956744962e-07, "loss": 0.0015, "reward": 3.11328125, "reward_std": 0.20756448060274124, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.48828125, "rewards/format_reward": 1.0, "step": 193 }, { "completion_length": 75.53125, "epoch": 0.012449863629071073, "grad_norm": 11.231387091604994, "kl": 0.0382080078125, "learning_rate": 9.93774868437941e-07, "loss": 0.0015, "reward": 2.97265625, "reward_std": 0.2857213169336319, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.23828125, "rewards/format_reward": 1.0, "step": 194 }, { "completion_length": 81.8046875, "epoch": 0.012514038183860099, "grad_norm": 2.6049661013831455, "kl": 0.03643798828125, "learning_rate": 9.937427801309202e-07, "loss": 0.0015, "reward": 3.37890625, "reward_std": 0.3019161969423294, "rewards/accuracy_reward": 0.5546875, "rewards/format_count_numbers": 1.83984375, "rewards/format_reward": 0.984375, "step": 195 }, { "completion_length": 79.828125, "epoch": 0.012578212738649126, "grad_norm": 5.08982880552597, "kl": 0.044189453125, "learning_rate": 9.937106918238992e-07, "loss": 0.0018, "reward": 3.02734375, "reward_std": 0.26120351254940033, "rewards/accuracy_reward": 0.546875, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 1.0, "step": 196 }, { "completion_length": 87.5625, "epoch": 0.012642387293438152, "grad_norm": 2.5047610674660783, "kl": 0.02960205078125, "learning_rate": 9.936786035168785e-07, "loss": 0.0012, "reward": 3.0859375, "reward_std": 0.252044215798378, "rewards/accuracy_reward": 0.59375, "rewards/format_count_numbers": 1.4921875, "rewards/format_reward": 1.0, "step": 197 }, { "completion_length": 89.09375, "epoch": 0.012706561848227178, "grad_norm": 3.913485159688576, "kl": 0.0316162109375, "learning_rate": 9.936465152098575e-07, "loss": 0.0013, "reward": 3.2265625, "reward_std": 0.2882782071828842, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.59375, "rewards/format_reward": 1.0, "step": 198 }, { "completion_length": 96.53125, "epoch": 0.012770736403016203, "grad_norm": 3.69506908248979, "kl": 0.0374755859375, "learning_rate": 9.936144269028367e-07, "loss": 0.0015, "reward": 3.234375, "reward_std": 0.2950380742549896, "rewards/accuracy_reward": 0.5234375, "rewards/format_count_numbers": 1.7109375, "rewards/format_reward": 1.0, "step": 199 }, { "completion_length": 78.390625, "epoch": 0.01283491095780523, "grad_norm": 5.212393233018134, "kl": 0.0321044921875, "learning_rate": 9.935823385958157e-07, "loss": 0.0013, "reward": 3.40234375, "reward_std": 0.20039305090904236, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 1.0, "step": 200 }, { "completion_length": 83.125, "epoch": 0.012899085512594256, "grad_norm": 3.0260540550479265, "kl": 0.03851318359375, "learning_rate": 9.935502502887947e-07, "loss": 0.0015, "reward": 3.0625, "reward_std": 0.18484792113304138, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.453125, "rewards/format_reward": 1.0, "step": 201 }, { "completion_length": 81.5390625, "epoch": 0.012963260067383282, "grad_norm": 2.089500151762205, "kl": 0.034423828125, "learning_rate": 9.935181619817739e-07, "loss": 0.0014, "reward": 3.6171875, "reward_std": 0.18465957045555115, "rewards/accuracy_reward": 0.7421875, "rewards/format_count_numbers": 1.875, "rewards/format_reward": 1.0, "step": 202 }, { "completion_length": 90.375, "epoch": 0.01302743462217231, "grad_norm": 18.632411583304346, "kl": 0.0343017578125, "learning_rate": 9.934860736747529e-07, "loss": 0.0014, "reward": 3.25390625, "reward_std": 0.36827754974365234, "rewards/accuracy_reward": 0.53125, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 0.9921875, "step": 203 }, { "completion_length": 86.1328125, "epoch": 0.013091609176961335, "grad_norm": 5.396951218544426, "kl": 0.0369873046875, "learning_rate": 9.934539853677319e-07, "loss": 0.0015, "reward": 3.75390625, "reward_std": 0.23330486565828323, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.99609375, "rewards/format_reward": 1.0, "step": 204 }, { "completion_length": 91.9453125, "epoch": 0.01315578373175036, "grad_norm": 3.7175516651631333, "kl": 0.02880859375, "learning_rate": 9.93421897060711e-07, "loss": 0.0012, "reward": 2.97265625, "reward_std": 0.3149999529123306, "rewards/accuracy_reward": 0.5234375, "rewards/format_count_numbers": 1.45703125, "rewards/format_reward": 0.9921875, "step": 205 }, { "completion_length": 85.7578125, "epoch": 0.013219958286539386, "grad_norm": 3.488289812379485, "kl": 0.02825927734375, "learning_rate": 9.9338980875369e-07, "loss": 0.0011, "reward": 3.15625, "reward_std": 0.27411776781082153, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.484375, "rewards/format_reward": 1.0, "step": 206 }, { "completion_length": 73.015625, "epoch": 0.013284132841328414, "grad_norm": 2.2992155883271175, "kl": 0.03314208984375, "learning_rate": 9.933577204466693e-07, "loss": 0.0013, "reward": 2.78515625, "reward_std": 0.22006277740001678, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.08984375, "rewards/format_reward": 1.0, "step": 207 }, { "completion_length": 79.1796875, "epoch": 0.01334830739611744, "grad_norm": 1.7475216813790706, "kl": 0.028564453125, "learning_rate": 9.933256321396483e-07, "loss": 0.0011, "reward": 3.2265625, "reward_std": 0.21897923946380615, "rewards/accuracy_reward": 0.765625, "rewards/format_count_numbers": 1.4765625, "rewards/format_reward": 0.984375, "step": 208 }, { "completion_length": 75.984375, "epoch": 0.013412481950906465, "grad_norm": 1.9614565866853593, "kl": 0.03759765625, "learning_rate": 9.932935438326273e-07, "loss": 0.0015, "reward": 3.4453125, "reward_std": 0.1165238693356514, "rewards/accuracy_reward": 0.828125, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 209 }, { "completion_length": 81.4765625, "epoch": 0.013476656505695492, "grad_norm": 4.738512185607158, "kl": 0.07666015625, "learning_rate": 9.932614555256065e-07, "loss": 0.0031, "reward": 3.15234375, "reward_std": 0.25880295783281326, "rewards/accuracy_reward": 0.546875, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 1.0, "step": 210 }, { "completion_length": 78.6796875, "epoch": 0.013540831060484518, "grad_norm": 2.816917770812569, "kl": 0.0322265625, "learning_rate": 9.932293672185855e-07, "loss": 0.0013, "reward": 3.33984375, "reward_std": 0.1813633181154728, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 1.0, "step": 211 }, { "completion_length": 81.6171875, "epoch": 0.013605005615273544, "grad_norm": 2.299619557346593, "kl": 0.0479736328125, "learning_rate": 9.931972789115645e-07, "loss": 0.0019, "reward": 3.390625, "reward_std": 0.22563035786151886, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.7109375, "rewards/format_reward": 1.0, "step": 212 }, { "completion_length": 88.0, "epoch": 0.01366918017006257, "grad_norm": 2.6745631224068496, "kl": 0.0445556640625, "learning_rate": 9.931651906045435e-07, "loss": 0.0018, "reward": 3.19140625, "reward_std": 0.22151628136634827, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.55859375, "rewards/format_reward": 1.0, "step": 213 }, { "completion_length": 70.015625, "epoch": 0.013733354724851597, "grad_norm": 1.983832594429385, "kl": 0.0406494140625, "learning_rate": 9.931331022975227e-07, "loss": 0.0016, "reward": 3.359375, "reward_std": 0.15474097058176994, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.59375, "rewards/format_reward": 0.9921875, "step": 214 }, { "completion_length": 80.859375, "epoch": 0.013797529279640623, "grad_norm": 4.209250220360455, "kl": 0.035888671875, "learning_rate": 9.931010139905017e-07, "loss": 0.0014, "reward": 3.296875, "reward_std": 0.11562084779143333, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 0.9921875, "step": 215 }, { "completion_length": 90.9296875, "epoch": 0.013861703834429648, "grad_norm": 2.0324317831990673, "kl": 0.02642822265625, "learning_rate": 9.93068925683481e-07, "loss": 0.0011, "reward": 2.953125, "reward_std": 0.3179033100605011, "rewards/accuracy_reward": 0.5390625, "rewards/format_count_numbers": 1.4296875, "rewards/format_reward": 0.984375, "step": 216 }, { "completion_length": 71.28125, "epoch": 0.013925878389218676, "grad_norm": 2.7208204189033838, "kl": 0.0528564453125, "learning_rate": 9.9303683737646e-07, "loss": 0.0021, "reward": 3.1875, "reward_std": 0.18702642619609833, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.4765625, "rewards/format_reward": 1.0, "step": 217 }, { "completion_length": 74.203125, "epoch": 0.013990052944007701, "grad_norm": 2.020383449795589, "kl": 0.0452880859375, "learning_rate": 9.930047490694392e-07, "loss": 0.0018, "reward": 3.20703125, "reward_std": 0.19173656404018402, "rewards/accuracy_reward": 0.7421875, "rewards/format_count_numbers": 1.46484375, "rewards/format_reward": 1.0, "step": 218 }, { "completion_length": 74.890625, "epoch": 0.014054227498796727, "grad_norm": 2.1452041970682263, "kl": 0.03125, "learning_rate": 9.929726607624182e-07, "loss": 0.0012, "reward": 3.16796875, "reward_std": 0.22886022925376892, "rewards/accuracy_reward": 0.546875, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 219 }, { "completion_length": 82.90625, "epoch": 0.014118402053585753, "grad_norm": 2.2108964131799747, "kl": 0.0322265625, "learning_rate": 9.929405724553972e-07, "loss": 0.0013, "reward": 3.21875, "reward_std": 0.1173202209174633, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.6015625, "rewards/format_reward": 1.0, "step": 220 }, { "completion_length": 87.4140625, "epoch": 0.01418257660837478, "grad_norm": 2.37234676215025, "kl": 0.041748046875, "learning_rate": 9.929084841483762e-07, "loss": 0.0017, "reward": 3.34765625, "reward_std": 0.24297630041837692, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.73828125, "rewards/format_reward": 1.0, "step": 221 }, { "completion_length": 76.984375, "epoch": 0.014246751163163806, "grad_norm": 2.5509896311868525, "kl": 0.0364990234375, "learning_rate": 9.928763958413554e-07, "loss": 0.0015, "reward": 3.234375, "reward_std": 0.16097762063145638, "rewards/accuracy_reward": 0.7421875, "rewards/format_count_numbers": 1.4921875, "rewards/format_reward": 1.0, "step": 222 }, { "completion_length": 86.2109375, "epoch": 0.014310925717952831, "grad_norm": 13.342157254950608, "kl": 0.03802490234375, "learning_rate": 9.928443075343344e-07, "loss": 0.0015, "reward": 3.3203125, "reward_std": 0.29596562683582306, "rewards/accuracy_reward": 0.546875, "rewards/format_count_numbers": 1.7734375, "rewards/format_reward": 1.0, "step": 223 }, { "completion_length": 85.5546875, "epoch": 0.014375100272741859, "grad_norm": 5.121984918167353, "kl": 0.0318603515625, "learning_rate": 9.928122192273136e-07, "loss": 0.0013, "reward": 3.17578125, "reward_std": 0.20372388511896133, "rewards/accuracy_reward": 0.453125, "rewards/format_count_numbers": 1.72265625, "rewards/format_reward": 1.0, "step": 224 }, { "completion_length": 76.015625, "epoch": 0.014439274827530884, "grad_norm": 5.388234340493652, "kl": 0.0399169921875, "learning_rate": 9.927801309202926e-07, "loss": 0.0016, "reward": 3.265625, "reward_std": 0.2496304288506508, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 225 }, { "completion_length": 82.5390625, "epoch": 0.01450344938231991, "grad_norm": 2.230338889600805, "kl": 0.0885009765625, "learning_rate": 9.927480426132718e-07, "loss": 0.0035, "reward": 3.46484375, "reward_std": 0.15862105041742325, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.84765625, "rewards/format_reward": 0.984375, "step": 226 }, { "completion_length": 90.671875, "epoch": 0.014567623937108936, "grad_norm": 4.9625544558308405, "kl": 0.0323486328125, "learning_rate": 9.927159543062508e-07, "loss": 0.0013, "reward": 2.8671875, "reward_std": 0.2549026757478714, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.2109375, "rewards/format_reward": 1.0, "step": 227 }, { "completion_length": 77.640625, "epoch": 0.014631798491897963, "grad_norm": 2.218697853355151, "kl": 0.03369140625, "learning_rate": 9.926838659992298e-07, "loss": 0.0013, "reward": 3.04296875, "reward_std": 0.14651413541287184, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 228 }, { "completion_length": 83.7109375, "epoch": 0.014695973046686989, "grad_norm": 49.572074016222096, "kl": 0.03509521484375, "learning_rate": 9.926517776922088e-07, "loss": 0.0014, "reward": 2.734375, "reward_std": 0.33995233476161957, "rewards/accuracy_reward": 0.515625, "rewards/format_count_numbers": 1.21875, "rewards/format_reward": 1.0, "step": 229 }, { "completion_length": 81.796875, "epoch": 0.014760147601476014, "grad_norm": 7.643260460877982, "kl": 0.03240966796875, "learning_rate": 9.92619689385188e-07, "loss": 0.0013, "reward": 3.15234375, "reward_std": 0.33026222884655, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.47265625, "rewards/format_reward": 0.9921875, "step": 230 }, { "completion_length": 82.21875, "epoch": 0.014824322156265042, "grad_norm": 2.0228822011293968, "kl": 0.0330810546875, "learning_rate": 9.92587601078167e-07, "loss": 0.0013, "reward": 3.19140625, "reward_std": 0.19648226350545883, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.48828125, "rewards/format_reward": 0.9921875, "step": 231 }, { "completion_length": 78.6015625, "epoch": 0.014888496711054067, "grad_norm": 2.4279472390130254, "kl": 0.048095703125, "learning_rate": 9.925555127711462e-07, "loss": 0.0019, "reward": 3.078125, "reward_std": 0.2356146201491356, "rewards/accuracy_reward": 0.4765625, "rewards/format_count_numbers": 1.6015625, "rewards/format_reward": 1.0, "step": 232 }, { "completion_length": 82.0546875, "epoch": 0.014952671265843093, "grad_norm": 2.9152584134250503, "kl": 0.0286865234375, "learning_rate": 9.925234244641252e-07, "loss": 0.0011, "reward": 3.57421875, "reward_std": 0.1608278937637806, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.84765625, "rewards/format_reward": 1.0, "step": 233 }, { "completion_length": 77.2578125, "epoch": 0.015016845820632119, "grad_norm": 3.700958614512502, "kl": 0.031494140625, "learning_rate": 9.924913361571044e-07, "loss": 0.0013, "reward": 3.24609375, "reward_std": 0.20149482041597366, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.48828125, "rewards/format_reward": 1.0, "step": 234 }, { "completion_length": 79.9765625, "epoch": 0.015081020375421146, "grad_norm": 2.386057971853338, "kl": 0.03155517578125, "learning_rate": 9.924592478500834e-07, "loss": 0.0013, "reward": 2.9765625, "reward_std": 0.3054931163787842, "rewards/accuracy_reward": 0.5234375, "rewards/format_count_numbers": 1.4609375, "rewards/format_reward": 0.9921875, "step": 235 }, { "completion_length": 79.421875, "epoch": 0.015145194930210172, "grad_norm": 4.06565517625868, "kl": 0.03485107421875, "learning_rate": 9.924271595430624e-07, "loss": 0.0014, "reward": 3.4375, "reward_std": 0.2096475586295128, "rewards/accuracy_reward": 0.8203125, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 236 }, { "completion_length": 72.25, "epoch": 0.015209369484999197, "grad_norm": 3.1291867431846403, "kl": 0.02685546875, "learning_rate": 9.923950712360414e-07, "loss": 0.0011, "reward": 3.24609375, "reward_std": 0.1646866761147976, "rewards/accuracy_reward": 0.875, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 237 }, { "completion_length": 90.9140625, "epoch": 0.015273544039788225, "grad_norm": 10.051824124049453, "kl": 0.03564453125, "learning_rate": 9.923629829290206e-07, "loss": 0.0014, "reward": 3.3046875, "reward_std": 0.25956378132104874, "rewards/accuracy_reward": 0.59375, "rewards/format_count_numbers": 1.71875, "rewards/format_reward": 0.9921875, "step": 238 }, { "completion_length": 94.875, "epoch": 0.01533771859457725, "grad_norm": 1.9650437289569813, "kl": 0.02923583984375, "learning_rate": 9.923308946219996e-07, "loss": 0.0012, "reward": 3.33984375, "reward_std": 0.2180376648902893, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.72265625, "rewards/format_reward": 0.9921875, "step": 239 }, { "completion_length": 85.671875, "epoch": 0.015401893149366276, "grad_norm": 2.45382583124142, "kl": 0.03118896484375, "learning_rate": 9.922988063149789e-07, "loss": 0.0012, "reward": 3.1015625, "reward_std": 0.2903960943222046, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.453125, "rewards/format_reward": 1.0, "step": 240 }, { "completion_length": 79.7734375, "epoch": 0.015466067704155302, "grad_norm": 2.9995776830960565, "kl": 0.09619140625, "learning_rate": 9.922667180079579e-07, "loss": 0.0039, "reward": 3.33984375, "reward_std": 0.14363106340169907, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 1.0, "step": 241 }, { "completion_length": 80.9765625, "epoch": 0.01553024225894433, "grad_norm": 4.94136969321472, "kl": 0.04486083984375, "learning_rate": 9.92234629700937e-07, "loss": 0.0018, "reward": 2.91796875, "reward_std": 0.16105157136917114, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.23828125, "rewards/format_reward": 0.9921875, "step": 242 }, { "completion_length": 88.4140625, "epoch": 0.015594416813733355, "grad_norm": 2.0097180632107943, "kl": 0.03131103515625, "learning_rate": 9.92202541393916e-07, "loss": 0.0013, "reward": 3.20703125, "reward_std": 0.2625589966773987, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.59765625, "rewards/format_reward": 0.9921875, "step": 243 }, { "completion_length": 89.171875, "epoch": 0.015658591368522382, "grad_norm": 2.2364527490952644, "kl": 0.02789306640625, "learning_rate": 9.92170453086895e-07, "loss": 0.0011, "reward": 3.03125, "reward_std": 0.2627197951078415, "rewards/accuracy_reward": 0.453125, "rewards/format_count_numbers": 1.578125, "rewards/format_reward": 1.0, "step": 244 }, { "completion_length": 87.6796875, "epoch": 0.015722765923311406, "grad_norm": 4.431117597475641, "kl": 0.040283203125, "learning_rate": 9.92138364779874e-07, "loss": 0.0016, "reward": 3.36328125, "reward_std": 0.2981094866991043, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.69921875, "rewards/format_reward": 1.0, "step": 245 }, { "completion_length": 79.53125, "epoch": 0.015786940478100434, "grad_norm": 2.3922094802654787, "kl": 0.030029296875, "learning_rate": 9.921062764728533e-07, "loss": 0.0012, "reward": 3.22265625, "reward_std": 0.29023801535367966, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.46484375, "rewards/format_reward": 1.0, "step": 246 }, { "completion_length": 86.5546875, "epoch": 0.01585111503288946, "grad_norm": 4.076068596757619, "kl": 0.0361328125, "learning_rate": 9.920741881658323e-07, "loss": 0.0014, "reward": 3.515625, "reward_std": 0.4329284429550171, "rewards/accuracy_reward": 0.7890625, "rewards/format_count_numbers": 1.734375, "rewards/format_reward": 0.9921875, "step": 247 }, { "completion_length": 93.0859375, "epoch": 0.015915289587678485, "grad_norm": 5.220724437875426, "kl": 0.02685546875, "learning_rate": 9.920420998588113e-07, "loss": 0.0011, "reward": 3.390625, "reward_std": 0.3870125710964203, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.7578125, "rewards/format_reward": 0.984375, "step": 248 }, { "completion_length": 79.234375, "epoch": 0.015979464142467512, "grad_norm": 1.558563655652595, "kl": 0.035888671875, "learning_rate": 9.920100115517905e-07, "loss": 0.0014, "reward": 3.48828125, "reward_std": 0.15190556272864342, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.70703125, "rewards/format_reward": 1.0, "step": 249 }, { "completion_length": 81.71875, "epoch": 0.016043638697256536, "grad_norm": 5.108727998481249, "kl": 0.035888671875, "learning_rate": 9.919779232447695e-07, "loss": 0.0014, "reward": 3.02734375, "reward_std": 0.39648038148880005, "rewards/accuracy_reward": 0.5703125, "rewards/format_count_numbers": 1.45703125, "rewards/format_reward": 1.0, "step": 250 }, { "completion_length": 85.9375, "epoch": 0.016107813252045564, "grad_norm": 5.481068408854277, "kl": 0.0577392578125, "learning_rate": 9.919458349377487e-07, "loss": 0.0023, "reward": 3.02734375, "reward_std": 0.3454916924238205, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.41015625, "rewards/format_reward": 0.9921875, "step": 251 }, { "completion_length": 79.96875, "epoch": 0.01617198780683459, "grad_norm": 4.841711572865715, "kl": 0.03436279296875, "learning_rate": 9.919137466307277e-07, "loss": 0.0014, "reward": 3.171875, "reward_std": 0.4203125834465027, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.546875, "rewards/format_reward": 0.9921875, "step": 252 }, { "completion_length": 80.4609375, "epoch": 0.016236162361623615, "grad_norm": 1.2390412140353957, "kl": 0.033935546875, "learning_rate": 9.91881658323707e-07, "loss": 0.0014, "reward": 3.1796875, "reward_std": 0.13098490238189697, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.484375, "rewards/format_reward": 1.0, "step": 253 }, { "completion_length": 79.234375, "epoch": 0.016300336916412642, "grad_norm": 1.7497812408626021, "kl": 0.0391845703125, "learning_rate": 9.91849570016686e-07, "loss": 0.0016, "reward": 2.9609375, "reward_std": 0.1687931790947914, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.3515625, "rewards/format_reward": 1.0, "step": 254 }, { "completion_length": 74.65625, "epoch": 0.01636451147120167, "grad_norm": 2.837849244606871, "kl": 0.0318603515625, "learning_rate": 9.91817481709665e-07, "loss": 0.0013, "reward": 3.1328125, "reward_std": 0.24042697995901108, "rewards/accuracy_reward": 0.796875, "rewards/format_count_numbers": 1.3359375, "rewards/format_reward": 1.0, "step": 255 }, { "completion_length": 95.7109375, "epoch": 0.016428686025990694, "grad_norm": 4.590977335327697, "kl": 0.03216552734375, "learning_rate": 9.91785393402644e-07, "loss": 0.0013, "reward": 2.9375, "reward_std": 0.2701571136713028, "rewards/accuracy_reward": 0.5859375, "rewards/format_count_numbers": 1.359375, "rewards/format_reward": 0.9921875, "step": 256 }, { "completion_length": 74.4453125, "epoch": 0.01649286058077972, "grad_norm": 4.502151653886888, "kl": 0.03851318359375, "learning_rate": 9.917533050956231e-07, "loss": 0.0015, "reward": 3.56640625, "reward_std": 0.278650239109993, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.83984375, "rewards/format_reward": 1.0, "step": 257 }, { "completion_length": 83.6875, "epoch": 0.01655703513556875, "grad_norm": 3.651472762701162, "kl": 0.03314208984375, "learning_rate": 9.917212167886021e-07, "loss": 0.0013, "reward": 3.2578125, "reward_std": 0.16836363822221756, "rewards/accuracy_reward": 0.515625, "rewards/format_count_numbers": 1.7421875, "rewards/format_reward": 1.0, "step": 258 }, { "completion_length": 74.1953125, "epoch": 0.016621209690357772, "grad_norm": 2.91382750675099, "kl": 0.0406494140625, "learning_rate": 9.916891284815813e-07, "loss": 0.0016, "reward": 3.5546875, "reward_std": 0.35277700424194336, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.84375, "rewards/format_reward": 0.9921875, "step": 259 }, { "completion_length": 67.125, "epoch": 0.0166853842451468, "grad_norm": 3.701653196043323, "kl": 0.0372314453125, "learning_rate": 9.916570401745603e-07, "loss": 0.0015, "reward": 3.42578125, "reward_std": 0.18747156858444214, "rewards/accuracy_reward": 0.8203125, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 0.984375, "step": 260 }, { "completion_length": 79.96875, "epoch": 0.016749558799935827, "grad_norm": 67.79259570151449, "kl": 0.0269775390625, "learning_rate": 9.916249518675396e-07, "loss": 0.0011, "reward": 3.39453125, "reward_std": 0.16891109943389893, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.72265625, "rewards/format_reward": 1.0, "step": 261 }, { "completion_length": 78.234375, "epoch": 0.01681373335472485, "grad_norm": 1.804760902692092, "kl": 0.041748046875, "learning_rate": 9.915928635605186e-07, "loss": 0.0017, "reward": 3.1171875, "reward_std": 0.3680955320596695, "rewards/accuracy_reward": 0.578125, "rewards/format_count_numbers": 1.546875, "rewards/format_reward": 0.9921875, "step": 262 }, { "completion_length": 85.1953125, "epoch": 0.01687790790951388, "grad_norm": 1.7245268837655756, "kl": 0.03857421875, "learning_rate": 9.915607752534976e-07, "loss": 0.0015, "reward": 3.3671875, "reward_std": 0.20511799305677414, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.71875, "rewards/format_reward": 0.9921875, "step": 263 }, { "completion_length": 91.515625, "epoch": 0.016942082464302902, "grad_norm": 2.6735452027868036, "kl": 0.0347900390625, "learning_rate": 9.915286869464766e-07, "loss": 0.0014, "reward": 2.96875, "reward_std": 0.3016202747821808, "rewards/accuracy_reward": 0.5078125, "rewards/format_count_numbers": 1.46875, "rewards/format_reward": 0.9921875, "step": 264 }, { "completion_length": 78.8515625, "epoch": 0.01700625701909193, "grad_norm": 2.0634360763869863, "kl": 0.03857421875, "learning_rate": 9.914965986394558e-07, "loss": 0.0015, "reward": 3.46875, "reward_std": 0.3529767394065857, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.84375, "rewards/format_reward": 0.9921875, "step": 265 }, { "completion_length": 74.4375, "epoch": 0.017070431573880957, "grad_norm": 2.094251666175363, "kl": 0.03662109375, "learning_rate": 9.914645103324348e-07, "loss": 0.0015, "reward": 2.91015625, "reward_std": 0.19791889190673828, "rewards/accuracy_reward": 0.5546875, "rewards/format_count_numbers": 1.35546875, "rewards/format_reward": 1.0, "step": 266 }, { "completion_length": 73.2421875, "epoch": 0.01713460612866998, "grad_norm": 1.784234028672772, "kl": 0.0355224609375, "learning_rate": 9.91432422025414e-07, "loss": 0.0014, "reward": 3.4609375, "reward_std": 0.25816430151462555, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.859375, "rewards/format_reward": 0.9921875, "step": 267 }, { "completion_length": 66.6171875, "epoch": 0.01719878068345901, "grad_norm": 3.4239141536582665, "kl": 0.0313720703125, "learning_rate": 9.91400333718393e-07, "loss": 0.0013, "reward": 3.1953125, "reward_std": 0.13941731303930283, "rewards/accuracy_reward": 0.8359375, "rewards/format_count_numbers": 1.359375, "rewards/format_reward": 1.0, "step": 268 }, { "completion_length": 69.0703125, "epoch": 0.017262955238248036, "grad_norm": 2.568629155554493, "kl": 0.03173828125, "learning_rate": 9.913682454113722e-07, "loss": 0.0013, "reward": 2.9921875, "reward_std": 0.24329257756471634, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.359375, "rewards/format_reward": 0.9921875, "step": 269 }, { "completion_length": 72.1171875, "epoch": 0.01732712979303706, "grad_norm": 2.583715483632162, "kl": 0.0565185546875, "learning_rate": 9.913361571043512e-07, "loss": 0.0023, "reward": 3.25390625, "reward_std": 0.19358646124601364, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 270 }, { "completion_length": 82.8359375, "epoch": 0.017391304347826087, "grad_norm": 2.173185685429927, "kl": 0.038818359375, "learning_rate": 9.913040687973302e-07, "loss": 0.0016, "reward": 3.16796875, "reward_std": 0.1250661350786686, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 1.0, "step": 271 }, { "completion_length": 68.9140625, "epoch": 0.017455478902615115, "grad_norm": 2.3212357799194114, "kl": 0.0390625, "learning_rate": 9.912719804903092e-07, "loss": 0.0016, "reward": 3.1640625, "reward_std": 0.17544355243444443, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.4765625, "rewards/format_reward": 1.0, "step": 272 }, { "completion_length": 71.0, "epoch": 0.01751965345740414, "grad_norm": 2.6466961603040624, "kl": 0.03436279296875, "learning_rate": 9.912398921832884e-07, "loss": 0.0014, "reward": 3.390625, "reward_std": 0.16097761690616608, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 273 }, { "completion_length": 75.8359375, "epoch": 0.017583828012193166, "grad_norm": 19.155007704535887, "kl": 0.07861328125, "learning_rate": 9.912078038762674e-07, "loss": 0.0031, "reward": 3.05859375, "reward_std": 0.2724815607070923, "rewards/accuracy_reward": 0.59375, "rewards/format_count_numbers": 1.47265625, "rewards/format_reward": 0.9921875, "step": 274 }, { "completion_length": 75.6171875, "epoch": 0.017648002566982193, "grad_norm": 11.84390115481588, "kl": 0.06732177734375, "learning_rate": 9.911757155692464e-07, "loss": 0.0027, "reward": 3.6796875, "reward_std": 0.14981039240956306, "rewards/accuracy_reward": 0.8203125, "rewards/format_count_numbers": 1.859375, "rewards/format_reward": 1.0, "step": 275 }, { "completion_length": 67.5, "epoch": 0.017712177121771217, "grad_norm": 2.3077736174179453, "kl": 0.0391845703125, "learning_rate": 9.911436272622256e-07, "loss": 0.0016, "reward": 2.91796875, "reward_std": 0.1896619200706482, "rewards/accuracy_reward": 0.796875, "rewards/format_count_numbers": 1.12109375, "rewards/format_reward": 1.0, "step": 276 }, { "completion_length": 71.5859375, "epoch": 0.017776351676560245, "grad_norm": 3.7589568241803297, "kl": 0.051025390625, "learning_rate": 9.911115389552046e-07, "loss": 0.002, "reward": 3.67578125, "reward_std": 0.21515949815511703, "rewards/accuracy_reward": 0.8125, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 0.9921875, "step": 277 }, { "completion_length": 75.875, "epoch": 0.01784052623134927, "grad_norm": 3.6667770827072492, "kl": 0.0362548828125, "learning_rate": 9.910794506481838e-07, "loss": 0.0015, "reward": 2.96484375, "reward_std": 0.22461289167404175, "rewards/accuracy_reward": 0.6015625, "rewards/format_count_numbers": 1.36328125, "rewards/format_reward": 1.0, "step": 278 }, { "completion_length": 79.5703125, "epoch": 0.017904700786138296, "grad_norm": 18.18298150822486, "kl": 0.0372314453125, "learning_rate": 9.910473623411628e-07, "loss": 0.0015, "reward": 3.046875, "reward_std": 0.2908087372779846, "rewards/accuracy_reward": 0.5625, "rewards/format_count_numbers": 1.484375, "rewards/format_reward": 1.0, "step": 279 }, { "completion_length": 71.3359375, "epoch": 0.017968875340927323, "grad_norm": 5.2583988742664305, "kl": 0.0782470703125, "learning_rate": 9.910152740341418e-07, "loss": 0.0031, "reward": 3.40234375, "reward_std": 0.1862325295805931, "rewards/accuracy_reward": 0.796875, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 1.0, "step": 280 }, { "completion_length": 73.5078125, "epoch": 0.018033049895716347, "grad_norm": 4.692130098876798, "kl": 0.0389404296875, "learning_rate": 9.90983185727121e-07, "loss": 0.0016, "reward": 3.37109375, "reward_std": 0.1851307675242424, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 281 }, { "completion_length": 73.890625, "epoch": 0.018097224450505375, "grad_norm": 3.0322910534734855, "kl": 0.037109375, "learning_rate": 9.909510974201e-07, "loss": 0.0015, "reward": 3.125, "reward_std": 0.15650184452533722, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 282 }, { "completion_length": 79.234375, "epoch": 0.018161399005294402, "grad_norm": 3.5384331438688723, "kl": 0.1441650390625, "learning_rate": 9.90919009113079e-07, "loss": 0.0058, "reward": 3.5625, "reward_std": 0.17288463562726974, "rewards/accuracy_reward": 0.5859375, "rewards/format_count_numbers": 1.9765625, "rewards/format_reward": 1.0, "step": 283 }, { "completion_length": 68.46875, "epoch": 0.018225573560083426, "grad_norm": 1.969345915456077, "kl": 0.04248046875, "learning_rate": 9.908869208060583e-07, "loss": 0.0017, "reward": 3.0078125, "reward_std": 0.17282496392726898, "rewards/accuracy_reward": 0.765625, "rewards/format_count_numbers": 1.2421875, "rewards/format_reward": 1.0, "step": 284 }, { "completion_length": 73.328125, "epoch": 0.018289748114872453, "grad_norm": 6.895089542132447, "kl": 0.036376953125, "learning_rate": 9.908548324990373e-07, "loss": 0.0015, "reward": 3.34375, "reward_std": 0.2109457403421402, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 285 }, { "completion_length": 71.1953125, "epoch": 0.01835392266966148, "grad_norm": 3.167067371026583, "kl": 0.041015625, "learning_rate": 9.908227441920165e-07, "loss": 0.0016, "reward": 3.0390625, "reward_std": 0.21666546911001205, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.359375, "rewards/format_reward": 1.0, "step": 286 }, { "completion_length": 77.9453125, "epoch": 0.018418097224450505, "grad_norm": 4.140650419009445, "kl": 0.048583984375, "learning_rate": 9.907906558849955e-07, "loss": 0.0019, "reward": 3.53125, "reward_std": 0.18990949913859367, "rewards/accuracy_reward": 0.5390625, "rewards/format_count_numbers": 1.9921875, "rewards/format_reward": 1.0, "step": 287 }, { "completion_length": 70.9296875, "epoch": 0.018482271779239532, "grad_norm": 3.7880074419467253, "kl": 0.0570068359375, "learning_rate": 9.907585675779745e-07, "loss": 0.0023, "reward": 3.23828125, "reward_std": 0.20785115659236908, "rewards/accuracy_reward": 0.765625, "rewards/format_count_numbers": 1.47265625, "rewards/format_reward": 1.0, "step": 288 }, { "completion_length": 77.3125, "epoch": 0.018546446334028556, "grad_norm": 2.1786159083247503, "kl": 0.037109375, "learning_rate": 9.907264792709537e-07, "loss": 0.0015, "reward": 3.28125, "reward_std": 0.1378196980804205, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 289 }, { "completion_length": 71.4921875, "epoch": 0.018610620888817583, "grad_norm": 11.660900482778375, "kl": 0.0406494140625, "learning_rate": 9.906943909639327e-07, "loss": 0.0016, "reward": 3.06640625, "reward_std": 0.21359793841838837, "rewards/accuracy_reward": 0.5703125, "rewards/format_count_numbers": 1.49609375, "rewards/format_reward": 1.0, "step": 290 }, { "completion_length": 78.765625, "epoch": 0.01867479544360661, "grad_norm": 2.1412567237036115, "kl": 0.0399169921875, "learning_rate": 9.906623026569117e-07, "loss": 0.0016, "reward": 3.26953125, "reward_std": 0.1283687688410282, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.61328125, "rewards/format_reward": 1.0, "step": 291 }, { "completion_length": 74.4296875, "epoch": 0.018738969998395635, "grad_norm": 1.8969635884090137, "kl": 0.0435791015625, "learning_rate": 9.906302143498909e-07, "loss": 0.0017, "reward": 3.45703125, "reward_std": 0.1144671943038702, "rewards/accuracy_reward": 0.59375, "rewards/format_count_numbers": 1.86328125, "rewards/format_reward": 1.0, "step": 292 }, { "completion_length": 71.3125, "epoch": 0.018803144553184662, "grad_norm": 2.7972660092804604, "kl": 0.03564453125, "learning_rate": 9.905981260428699e-07, "loss": 0.0014, "reward": 2.796875, "reward_std": 0.20517178624868393, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.125, "rewards/format_reward": 0.9921875, "step": 293 }, { "completion_length": 77.9453125, "epoch": 0.01886731910797369, "grad_norm": 3.529780530402686, "kl": 0.03955078125, "learning_rate": 9.90566037735849e-07, "loss": 0.0016, "reward": 3.515625, "reward_std": 0.1828427091240883, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.8671875, "rewards/format_reward": 1.0, "step": 294 }, { "completion_length": 72.5625, "epoch": 0.018931493662762713, "grad_norm": 6.7166067029824195, "kl": 0.0443115234375, "learning_rate": 9.905339494288281e-07, "loss": 0.0018, "reward": 3.41796875, "reward_std": 0.11230521276593208, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.73828125, "rewards/format_reward": 1.0, "step": 295 }, { "completion_length": 76.5546875, "epoch": 0.01899566821755174, "grad_norm": 4.592194015590574, "kl": 0.0384521484375, "learning_rate": 9.905018611218071e-07, "loss": 0.0015, "reward": 2.76953125, "reward_std": 0.1638985425233841, "rewards/accuracy_reward": 0.5234375, "rewards/format_count_numbers": 1.24609375, "rewards/format_reward": 1.0, "step": 296 }, { "completion_length": 71.859375, "epoch": 0.019059842772340768, "grad_norm": 5.5829816952478355, "kl": 0.0391845703125, "learning_rate": 9.904697728147863e-07, "loss": 0.0016, "reward": 3.12109375, "reward_std": 0.2509261667728424, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 297 }, { "completion_length": 73.234375, "epoch": 0.019124017327129792, "grad_norm": 2.9063205757140564, "kl": 0.0496826171875, "learning_rate": 9.904376845077653e-07, "loss": 0.002, "reward": 3.05859375, "reward_std": 0.24533042311668396, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.36328125, "rewards/format_reward": 1.0, "step": 298 }, { "completion_length": 67.03125, "epoch": 0.01918819188191882, "grad_norm": 2.2820361285373605, "kl": 0.047119140625, "learning_rate": 9.904055962007443e-07, "loss": 0.0019, "reward": 3.04296875, "reward_std": 0.1699405387043953, "rewards/accuracy_reward": 0.796875, "rewards/format_count_numbers": 1.24609375, "rewards/format_reward": 1.0, "step": 299 }, { "completion_length": 70.5546875, "epoch": 0.019252366436707847, "grad_norm": 1.8516320000684716, "kl": 0.0401611328125, "learning_rate": 9.903735078937235e-07, "loss": 0.0016, "reward": 3.26953125, "reward_std": 0.13094642385840416, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.48828125, "rewards/format_reward": 1.0, "step": 300 }, { "completion_length": 82.03125, "epoch": 0.01931654099149687, "grad_norm": 2.918950923226392, "kl": 0.0537109375, "learning_rate": 9.903414195867025e-07, "loss": 0.0021, "reward": 3.49609375, "reward_std": 0.2231239750981331, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.85546875, "rewards/format_reward": 1.0, "step": 301 }, { "completion_length": 78.53125, "epoch": 0.019380715546285898, "grad_norm": 6.320771871325963, "kl": 0.0513916015625, "learning_rate": 9.903093312796815e-07, "loss": 0.0021, "reward": 2.7890625, "reward_std": 0.26743485033512115, "rewards/accuracy_reward": 0.578125, "rewards/format_count_numbers": 1.21875, "rewards/format_reward": 0.9921875, "step": 302 }, { "completion_length": 71.625, "epoch": 0.019444890101074922, "grad_norm": 5.484442758286644, "kl": 0.0445556640625, "learning_rate": 9.902772429726607e-07, "loss": 0.0018, "reward": 3.26171875, "reward_std": 0.20738627761602402, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.61328125, "rewards/format_reward": 1.0, "step": 303 }, { "completion_length": 72.6875, "epoch": 0.01950906465586395, "grad_norm": 2.954327464777133, "kl": 0.0428466796875, "learning_rate": 9.902451546656397e-07, "loss": 0.0017, "reward": 3.08984375, "reward_std": 0.1720261573791504, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 304 }, { "completion_length": 71.5, "epoch": 0.019573239210652977, "grad_norm": 2.0547572244190926, "kl": 0.05615234375, "learning_rate": 9.90213066358619e-07, "loss": 0.0022, "reward": 3.5859375, "reward_std": 0.111396424472332, "rewards/accuracy_reward": 0.84375, "rewards/format_count_numbers": 1.7421875, "rewards/format_reward": 1.0, "step": 305 }, { "completion_length": 75.0859375, "epoch": 0.019637413765442, "grad_norm": 3.299057766310909, "kl": 0.03662109375, "learning_rate": 9.90180978051598e-07, "loss": 0.0015, "reward": 3.265625, "reward_std": 0.2430691346526146, "rewards/accuracy_reward": 0.5234375, "rewards/format_count_numbers": 1.7421875, "rewards/format_reward": 1.0, "step": 306 }, { "completion_length": 79.1875, "epoch": 0.019701588320231028, "grad_norm": 7.7664930098759255, "kl": 0.0433349609375, "learning_rate": 9.90148889744577e-07, "loss": 0.0017, "reward": 3.015625, "reward_std": 0.14887069165706635, "rewards/accuracy_reward": 0.5234375, "rewards/format_count_numbers": 1.4921875, "rewards/format_reward": 1.0, "step": 307 }, { "completion_length": 74.6015625, "epoch": 0.019765762875020056, "grad_norm": 2.6906879742300136, "kl": 0.0386962890625, "learning_rate": 9.901168014375562e-07, "loss": 0.0015, "reward": 3.1015625, "reward_std": 0.2590838298201561, "rewards/accuracy_reward": 0.6015625, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 308 }, { "completion_length": 87.6171875, "epoch": 0.01982993742980908, "grad_norm": 4.127270571967414, "kl": 0.0447998046875, "learning_rate": 9.900847131305352e-07, "loss": 0.0018, "reward": 3.3359375, "reward_std": 0.28433138132095337, "rewards/accuracy_reward": 0.484375, "rewards/format_count_numbers": 1.8515625, "rewards/format_reward": 1.0, "step": 309 }, { "completion_length": 73.390625, "epoch": 0.019894111984598107, "grad_norm": 1.8553501914422261, "kl": 0.0362548828125, "learning_rate": 9.900526248235142e-07, "loss": 0.0015, "reward": 3.35546875, "reward_std": 0.08954200521111488, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 310 }, { "completion_length": 78.3828125, "epoch": 0.019958286539387134, "grad_norm": 2.3057394947293046, "kl": 0.040283203125, "learning_rate": 9.900205365164934e-07, "loss": 0.0016, "reward": 3.01953125, "reward_std": 0.25869518518447876, "rewards/accuracy_reward": 0.5703125, "rewards/format_count_numbers": 1.45703125, "rewards/format_reward": 0.9921875, "step": 311 }, { "completion_length": 72.5234375, "epoch": 0.02002246109417616, "grad_norm": 47.53406243743931, "kl": 0.04296875, "learning_rate": 9.899884482094724e-07, "loss": 0.0017, "reward": 3.375, "reward_std": 0.1820138692855835, "rewards/accuracy_reward": 0.765625, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 0.9921875, "step": 312 }, { "completion_length": 76.515625, "epoch": 0.020086635648965186, "grad_norm": 8.454647844248559, "kl": 0.042236328125, "learning_rate": 9.899563599024516e-07, "loss": 0.0017, "reward": 3.16796875, "reward_std": 0.20472782850265503, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 1.0, "step": 313 }, { "completion_length": 74.796875, "epoch": 0.020150810203754213, "grad_norm": 4.909440694030945, "kl": 0.0462646484375, "learning_rate": 9.899242715954306e-07, "loss": 0.0018, "reward": 3.078125, "reward_std": 0.1832578182220459, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.3671875, "rewards/format_reward": 1.0, "step": 314 }, { "completion_length": 70.25, "epoch": 0.020214984758543237, "grad_norm": 1.958650952843981, "kl": 0.0413818359375, "learning_rate": 9.898921832884096e-07, "loss": 0.0017, "reward": 2.984375, "reward_std": 0.11230766773223877, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.375, "rewards/format_reward": 1.0, "step": 315 }, { "completion_length": 72.515625, "epoch": 0.020279159313332264, "grad_norm": 3.7008919690258613, "kl": 0.0450439453125, "learning_rate": 9.898600949813888e-07, "loss": 0.0018, "reward": 3.23828125, "reward_std": 0.3325863182544708, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.61328125, "rewards/format_reward": 0.9921875, "step": 316 }, { "completion_length": 78.796875, "epoch": 0.02034333386812129, "grad_norm": 3.0016568315820926, "kl": 0.03271484375, "learning_rate": 9.898280066743678e-07, "loss": 0.0013, "reward": 3.3359375, "reward_std": 0.152285635471344, "rewards/accuracy_reward": 0.59375, "rewards/format_count_numbers": 1.7421875, "rewards/format_reward": 1.0, "step": 317 }, { "completion_length": 75.28125, "epoch": 0.020407508422910316, "grad_norm": 3.446637427977765, "kl": 0.0308837890625, "learning_rate": 9.897959183673468e-07, "loss": 0.0012, "reward": 3.6015625, "reward_std": 0.17806214094161987, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.875, "rewards/format_reward": 1.0, "step": 318 }, { "completion_length": 80.2734375, "epoch": 0.020471682977699343, "grad_norm": 5.527813715748061, "kl": 0.0364990234375, "learning_rate": 9.89763830060326e-07, "loss": 0.0015, "reward": 3.328125, "reward_std": 0.15570057928562164, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 319 }, { "completion_length": 79.734375, "epoch": 0.020535857532488367, "grad_norm": 2.1859437733730425, "kl": 0.03387451171875, "learning_rate": 9.89731741753305e-07, "loss": 0.0014, "reward": 3.55078125, "reward_std": 0.16537447273731232, "rewards/accuracy_reward": 0.8125, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 0.9921875, "step": 320 }, { "completion_length": 78.0859375, "epoch": 0.020600032087277394, "grad_norm": 1.6010612300372236, "kl": 0.032958984375, "learning_rate": 9.896996534462842e-07, "loss": 0.0013, "reward": 3.62109375, "reward_std": 0.12232652306556702, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 321 }, { "completion_length": 82.875, "epoch": 0.020664206642066422, "grad_norm": 2.402566504373887, "kl": 0.0333251953125, "learning_rate": 9.896675651392632e-07, "loss": 0.0013, "reward": 3.10546875, "reward_std": 0.2916644662618637, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.35546875, "rewards/format_reward": 0.9921875, "step": 322 }, { "completion_length": 90.234375, "epoch": 0.020728381196855446, "grad_norm": 2.898048967418061, "kl": 0.0467529296875, "learning_rate": 9.896354768322422e-07, "loss": 0.0019, "reward": 3.4765625, "reward_std": 0.2870207577943802, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.859375, "rewards/format_reward": 1.0, "step": 323 }, { "completion_length": 81.3828125, "epoch": 0.020792555751644473, "grad_norm": 2.7658387064990797, "kl": 0.0355224609375, "learning_rate": 9.896033885252214e-07, "loss": 0.0014, "reward": 3.20703125, "reward_std": 0.11390282958745956, "rewards/accuracy_reward": 0.59375, "rewards/format_count_numbers": 1.61328125, "rewards/format_reward": 1.0, "step": 324 }, { "completion_length": 78.859375, "epoch": 0.0208567303064335, "grad_norm": 2.077719229595578, "kl": 0.037109375, "learning_rate": 9.895713002182004e-07, "loss": 0.0015, "reward": 3.57421875, "reward_std": 0.11655256152153015, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 325 }, { "completion_length": 77.9765625, "epoch": 0.020920904861222524, "grad_norm": 1.5075463905730007, "kl": 0.03253173828125, "learning_rate": 9.895392119111794e-07, "loss": 0.0013, "reward": 3.421875, "reward_std": 0.20593319833278656, "rewards/accuracy_reward": 0.8125, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 326 }, { "completion_length": 83.3515625, "epoch": 0.020985079416011552, "grad_norm": 2.6014959551754204, "kl": 0.05548095703125, "learning_rate": 9.895071236041587e-07, "loss": 0.0022, "reward": 3.33984375, "reward_std": 0.2452201023697853, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 1.0, "step": 327 }, { "completion_length": 92.3203125, "epoch": 0.02104925397080058, "grad_norm": 2.1798827092517357, "kl": 0.0347900390625, "learning_rate": 9.894750352971377e-07, "loss": 0.0014, "reward": 3.51953125, "reward_std": 0.19358646124601364, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 328 }, { "completion_length": 80.703125, "epoch": 0.021113428525589603, "grad_norm": 2.2607423096277133, "kl": 0.0579833984375, "learning_rate": 9.894429469901169e-07, "loss": 0.0023, "reward": 3.58984375, "reward_std": 0.19753818958997726, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.86328125, "rewards/format_reward": 1.0, "step": 329 }, { "completion_length": 77.3671875, "epoch": 0.02117760308037863, "grad_norm": 26.08606556283715, "kl": 0.034423828125, "learning_rate": 9.894108586830959e-07, "loss": 0.0014, "reward": 3.3828125, "reward_std": 0.1830746978521347, "rewards/accuracy_reward": 0.765625, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 330 }, { "completion_length": 72.3984375, "epoch": 0.021241777635167654, "grad_norm": 2.021538298088485, "kl": 0.0582275390625, "learning_rate": 9.893787703760749e-07, "loss": 0.0023, "reward": 2.8828125, "reward_std": 0.2369818240404129, "rewards/accuracy_reward": 0.765625, "rewards/format_count_numbers": 1.125, "rewards/format_reward": 0.9921875, "step": 331 }, { "completion_length": 89.0625, "epoch": 0.021305952189956682, "grad_norm": 3.891239005431301, "kl": 0.0777587890625, "learning_rate": 9.89346682069054e-07, "loss": 0.0031, "reward": 3.44140625, "reward_std": 0.23322894424200058, "rewards/accuracy_reward": 0.5078125, "rewards/format_count_numbers": 1.93359375, "rewards/format_reward": 1.0, "step": 332 }, { "completion_length": 80.984375, "epoch": 0.02137012674474571, "grad_norm": 6.047211157592112, "kl": 0.0244140625, "learning_rate": 9.89314593762033e-07, "loss": 0.001, "reward": 3.28125, "reward_std": 0.23934084922075272, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 333 }, { "completion_length": 84.9296875, "epoch": 0.021434301299534733, "grad_norm": 4.132828130614972, "kl": 0.034423828125, "learning_rate": 9.89282505455012e-07, "loss": 0.0014, "reward": 3.46484375, "reward_std": 0.15334401279687881, "rewards/accuracy_reward": 0.59375, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 334 }, { "completion_length": 84.453125, "epoch": 0.02149847585432376, "grad_norm": 1.8729236163557068, "kl": 0.0316162109375, "learning_rate": 9.89250417147991e-07, "loss": 0.0013, "reward": 2.93359375, "reward_std": 0.16755038499832153, "rewards/accuracy_reward": 0.578125, "rewards/format_count_numbers": 1.35546875, "rewards/format_reward": 1.0, "step": 335 }, { "completion_length": 89.640625, "epoch": 0.021562650409112788, "grad_norm": 6.182621805508202, "kl": 0.03302001953125, "learning_rate": 9.892183288409703e-07, "loss": 0.0013, "reward": 3.19140625, "reward_std": 0.23702459782361984, "rewards/accuracy_reward": 0.578125, "rewards/format_count_numbers": 1.61328125, "rewards/format_reward": 1.0, "step": 336 }, { "completion_length": 85.0703125, "epoch": 0.021626824963901812, "grad_norm": 4.004387261278588, "kl": 0.0516357421875, "learning_rate": 9.891862405339493e-07, "loss": 0.0021, "reward": 3.46875, "reward_std": 0.2537073493003845, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.859375, "rewards/format_reward": 1.0, "step": 337 }, { "completion_length": 77.015625, "epoch": 0.02169099951869084, "grad_norm": 5.7836392880636325, "kl": 0.052978515625, "learning_rate": 9.891541522269285e-07, "loss": 0.0021, "reward": 3.08984375, "reward_std": 0.1573006436228752, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 1.0, "step": 338 }, { "completion_length": 76.9453125, "epoch": 0.021755174073479867, "grad_norm": 1.8469791329610443, "kl": 0.0301513671875, "learning_rate": 9.891220639199075e-07, "loss": 0.0012, "reward": 2.91796875, "reward_std": 0.1454532966017723, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.24609375, "rewards/format_reward": 1.0, "step": 339 }, { "completion_length": 86.71875, "epoch": 0.02181934862826889, "grad_norm": 2.768424291218591, "kl": 0.037353515625, "learning_rate": 9.890899756128867e-07, "loss": 0.0015, "reward": 3.546875, "reward_std": 0.2714823931455612, "rewards/accuracy_reward": 0.5703125, "rewards/format_count_numbers": 1.9765625, "rewards/format_reward": 1.0, "step": 340 }, { "completion_length": 75.578125, "epoch": 0.021883523183057918, "grad_norm": 6.769190335280211, "kl": 0.03369140625, "learning_rate": 9.890578873058657e-07, "loss": 0.0014, "reward": 2.875, "reward_std": 0.2525489032268524, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.2265625, "rewards/format_reward": 0.9921875, "step": 341 }, { "completion_length": 87.953125, "epoch": 0.021947697737846945, "grad_norm": 5.022933322582836, "kl": 0.044189453125, "learning_rate": 9.890257989988447e-07, "loss": 0.0018, "reward": 3.3984375, "reward_std": 0.2860850542783737, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.6875, "rewards/format_reward": 0.9921875, "step": 342 }, { "completion_length": 75.6796875, "epoch": 0.02201187229263597, "grad_norm": 2.065533615828042, "kl": 0.0367431640625, "learning_rate": 9.889937106918237e-07, "loss": 0.0015, "reward": 3.515625, "reward_std": 0.21914125978946686, "rewards/accuracy_reward": 0.7890625, "rewards/format_count_numbers": 1.7265625, "rewards/format_reward": 1.0, "step": 343 }, { "completion_length": 82.7578125, "epoch": 0.022076046847424997, "grad_norm": 5.073202282468144, "kl": 0.0523681640625, "learning_rate": 9.88961622384803e-07, "loss": 0.0021, "reward": 3.39453125, "reward_std": 0.267434298992157, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.71484375, "rewards/format_reward": 1.0, "step": 344 }, { "completion_length": 86.3203125, "epoch": 0.02214022140221402, "grad_norm": 6.389589774406354, "kl": 0.0360107421875, "learning_rate": 9.88929534077782e-07, "loss": 0.0014, "reward": 3.45703125, "reward_std": 0.175977885723114, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 0.9921875, "step": 345 }, { "completion_length": 93.5546875, "epoch": 0.022204395957003048, "grad_norm": 2.5643508129764534, "kl": 0.0343017578125, "learning_rate": 9.888974457707611e-07, "loss": 0.0014, "reward": 2.96875, "reward_std": 0.1921706572175026, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.3515625, "rewards/format_reward": 1.0, "step": 346 }, { "completion_length": 78.9140625, "epoch": 0.022268570511792075, "grad_norm": 3.961771698767157, "kl": 0.03070068359375, "learning_rate": 9.888653574637401e-07, "loss": 0.0012, "reward": 3.2578125, "reward_std": 0.17908401414752007, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.484375, "rewards/format_reward": 0.9921875, "step": 347 }, { "completion_length": 85.109375, "epoch": 0.0223327450665811, "grad_norm": 4.3000744893240554, "kl": 0.03497314453125, "learning_rate": 9.888332691567194e-07, "loss": 0.0014, "reward": 3.41796875, "reward_std": 0.3060422018170357, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.69921875, "rewards/format_reward": 0.9921875, "step": 348 }, { "completion_length": 74.5703125, "epoch": 0.022396919621370127, "grad_norm": 1.0954007853308396, "kl": 0.0369873046875, "learning_rate": 9.888011808496984e-07, "loss": 0.0015, "reward": 3.08984375, "reward_std": 0.0706173200160265, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 1.0, "step": 349 }, { "completion_length": 80.265625, "epoch": 0.022461094176159154, "grad_norm": 3.195366913379474, "kl": 0.0369873046875, "learning_rate": 9.887690925426774e-07, "loss": 0.0015, "reward": 3.08984375, "reward_std": 0.2496279776096344, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.35546875, "rewards/format_reward": 1.0, "step": 350 }, { "completion_length": 77.9765625, "epoch": 0.022525268730948178, "grad_norm": 2.435573538386913, "kl": 0.0467529296875, "learning_rate": 9.887370042356566e-07, "loss": 0.0019, "reward": 3.5390625, "reward_std": 0.13178616762161255, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.859375, "rewards/format_reward": 1.0, "step": 351 }, { "completion_length": 83.7890625, "epoch": 0.022589443285737205, "grad_norm": 5.382894502559734, "kl": 0.037109375, "learning_rate": 9.887049159286356e-07, "loss": 0.0015, "reward": 3.26171875, "reward_std": 0.18629190325737, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.59765625, "rewards/format_reward": 1.0, "step": 352 }, { "completion_length": 80.171875, "epoch": 0.022653617840526233, "grad_norm": 7.468233392690856, "kl": 0.0400390625, "learning_rate": 9.886728276216146e-07, "loss": 0.0016, "reward": 3.28125, "reward_std": 0.24748793244361877, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 353 }, { "completion_length": 67.59375, "epoch": 0.022717792395315257, "grad_norm": 2.3704195087172204, "kl": 0.0640869140625, "learning_rate": 9.886407393145938e-07, "loss": 0.0026, "reward": 3.25390625, "reward_std": 0.2137848511338234, "rewards/accuracy_reward": 0.7890625, "rewards/format_count_numbers": 1.47265625, "rewards/format_reward": 0.9921875, "step": 354 }, { "completion_length": 68.390625, "epoch": 0.022781966950104284, "grad_norm": 2.0290582746739325, "kl": 0.0289306640625, "learning_rate": 9.886086510075728e-07, "loss": 0.0012, "reward": 3.40625, "reward_std": 0.05444390885531902, "rewards/accuracy_reward": 0.90625, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 355 }, { "completion_length": 80.4140625, "epoch": 0.022846141504893308, "grad_norm": 4.008892753422357, "kl": 0.04443359375, "learning_rate": 9.88576562700552e-07, "loss": 0.0018, "reward": 3.2265625, "reward_std": 0.15082315355539322, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.46875, "rewards/format_reward": 1.0, "step": 356 }, { "completion_length": 77.3203125, "epoch": 0.022910316059682335, "grad_norm": 3.449518121825568, "kl": 0.0496826171875, "learning_rate": 9.88544474393531e-07, "loss": 0.002, "reward": 3.30078125, "reward_std": 0.21571563184261322, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.59765625, "rewards/format_reward": 1.0, "step": 357 }, { "completion_length": 81.7109375, "epoch": 0.022974490614471363, "grad_norm": 2.7126927864486685, "kl": 0.030517578125, "learning_rate": 9.8851238608651e-07, "loss": 0.0012, "reward": 3.10546875, "reward_std": 0.14452779106795788, "rewards/accuracy_reward": 0.8828125, "rewards/format_count_numbers": 1.23046875, "rewards/format_reward": 0.9921875, "step": 358 }, { "completion_length": 71.4765625, "epoch": 0.023038665169260387, "grad_norm": 28.981994390279645, "kl": 0.04144287109375, "learning_rate": 9.884802977794892e-07, "loss": 0.0017, "reward": 3.4140625, "reward_std": 0.1526160091161728, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 359 }, { "completion_length": 81.1171875, "epoch": 0.023102839724049414, "grad_norm": 5.102804568294057, "kl": 0.040283203125, "learning_rate": 9.884482094724682e-07, "loss": 0.0016, "reward": 3.51953125, "reward_std": 0.1749495342373848, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.73828125, "rewards/format_reward": 1.0, "step": 360 }, { "completion_length": 74.2578125, "epoch": 0.02316701427883844, "grad_norm": 1.872314059459667, "kl": 0.02880859375, "learning_rate": 9.884161211654472e-07, "loss": 0.0012, "reward": 3.6953125, "reward_std": 0.13838762417435646, "rewards/accuracy_reward": 0.84375, "rewards/format_count_numbers": 1.8515625, "rewards/format_reward": 1.0, "step": 361 }, { "completion_length": 81.640625, "epoch": 0.023231188833627466, "grad_norm": 2.836357396061621, "kl": 0.0380859375, "learning_rate": 9.883840328584262e-07, "loss": 0.0015, "reward": 3.3046875, "reward_std": 0.1952216997742653, "rewards/accuracy_reward": 0.59375, "rewards/format_count_numbers": 1.71875, "rewards/format_reward": 0.9921875, "step": 362 }, { "completion_length": 79.375, "epoch": 0.023295363388416493, "grad_norm": 5.632585133478851, "kl": 0.042724609375, "learning_rate": 9.883519445514054e-07, "loss": 0.0017, "reward": 3.41015625, "reward_std": 0.30673080682754517, "rewards/accuracy_reward": 0.5625, "rewards/format_count_numbers": 1.84765625, "rewards/format_reward": 1.0, "step": 363 }, { "completion_length": 81.734375, "epoch": 0.02335953794320552, "grad_norm": 5.785440511985008, "kl": 0.0400390625, "learning_rate": 9.883198562443844e-07, "loss": 0.0016, "reward": 3.33203125, "reward_std": 0.22772862017154694, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.59765625, "rewards/format_reward": 1.0, "step": 364 }, { "completion_length": 75.8671875, "epoch": 0.023423712497994544, "grad_norm": 2.072342748914024, "kl": 0.0384521484375, "learning_rate": 9.882877679373636e-07, "loss": 0.0015, "reward": 3.140625, "reward_std": 0.24147523939609528, "rewards/accuracy_reward": 0.796875, "rewards/format_count_numbers": 1.3515625, "rewards/format_reward": 0.9921875, "step": 365 }, { "completion_length": 87.3046875, "epoch": 0.02348788705278357, "grad_norm": 8.429294656694106, "kl": 0.0458984375, "learning_rate": 9.882556796303426e-07, "loss": 0.0018, "reward": 3.60546875, "reward_std": 0.16242551058530807, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.85546875, "rewards/format_reward": 0.9921875, "step": 366 }, { "completion_length": 72.5078125, "epoch": 0.0235520616075726, "grad_norm": 1.7309448366047746, "kl": 0.048583984375, "learning_rate": 9.882235913233218e-07, "loss": 0.0019, "reward": 3.02734375, "reward_std": 0.150530144572258, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.35546875, "rewards/format_reward": 1.0, "step": 367 }, { "completion_length": 80.2578125, "epoch": 0.023616236162361623, "grad_norm": 4.113255720949969, "kl": 0.03460693359375, "learning_rate": 9.881915030163008e-07, "loss": 0.0014, "reward": 3.3203125, "reward_std": 0.22647252678871155, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 0.9921875, "step": 368 }, { "completion_length": 76.3515625, "epoch": 0.02368041071715065, "grad_norm": 2.068165117131359, "kl": 0.034423828125, "learning_rate": 9.881594147092798e-07, "loss": 0.0014, "reward": 3.53515625, "reward_std": 0.1454532966017723, "rewards/accuracy_reward": 0.921875, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 0.9921875, "step": 369 }, { "completion_length": 79.4375, "epoch": 0.023744585271939674, "grad_norm": 2.068362319541115, "kl": 0.0377197265625, "learning_rate": 9.881273264022588e-07, "loss": 0.0015, "reward": 3.16015625, "reward_std": 0.14442361146211624, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.49609375, "rewards/format_reward": 1.0, "step": 370 }, { "completion_length": 80.5, "epoch": 0.0238087598267287, "grad_norm": 2.9625603898700557, "kl": 0.03759765625, "learning_rate": 9.88095238095238e-07, "loss": 0.0015, "reward": 3.38671875, "reward_std": 0.2820390909910202, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 1.0, "step": 371 }, { "completion_length": 85.7890625, "epoch": 0.02387293438151773, "grad_norm": 4.3856748965288705, "kl": 0.055908203125, "learning_rate": 9.88063149788217e-07, "loss": 0.0022, "reward": 3.05078125, "reward_std": 0.24741338193416595, "rewards/accuracy_reward": 0.578125, "rewards/format_count_numbers": 1.47265625, "rewards/format_reward": 1.0, "step": 372 }, { "completion_length": 80.484375, "epoch": 0.023937108936306753, "grad_norm": 3.719499126072363, "kl": 0.0572509765625, "learning_rate": 9.880310614811963e-07, "loss": 0.0023, "reward": 3.26953125, "reward_std": 0.1984347254037857, "rewards/accuracy_reward": 0.796875, "rewards/format_count_numbers": 1.47265625, "rewards/format_reward": 1.0, "step": 373 }, { "completion_length": 79.0, "epoch": 0.02400128349109578, "grad_norm": 1.69807131048397, "kl": 0.0361328125, "learning_rate": 9.879989731741753e-07, "loss": 0.0014, "reward": 3.70703125, "reward_std": 0.07769465446472168, "rewards/accuracy_reward": 0.8359375, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 374 }, { "completion_length": 80.90625, "epoch": 0.024065458045884808, "grad_norm": 3.9666777840340584, "kl": 0.0440673828125, "learning_rate": 9.879668848671545e-07, "loss": 0.0018, "reward": 3.22265625, "reward_std": 0.1638985350728035, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.61328125, "rewards/format_reward": 1.0, "step": 375 }, { "completion_length": 80.640625, "epoch": 0.02412963260067383, "grad_norm": 4.772780273588727, "kl": 0.03564453125, "learning_rate": 9.879347965601335e-07, "loss": 0.0014, "reward": 3.3984375, "reward_std": 0.2093481346964836, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 376 }, { "completion_length": 80.6484375, "epoch": 0.02419380715546286, "grad_norm": 2.7684971107051197, "kl": 0.0345458984375, "learning_rate": 9.879027082531125e-07, "loss": 0.0014, "reward": 3.0, "reward_std": 0.13204573839902878, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.3671875, "rewards/format_reward": 1.0, "step": 377 }, { "completion_length": 80.25, "epoch": 0.024257981710251886, "grad_norm": 1.700211970948363, "kl": 0.044921875, "learning_rate": 9.878706199460915e-07, "loss": 0.0018, "reward": 3.58203125, "reward_std": 0.10627167671918869, "rewards/accuracy_reward": 0.8359375, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 1.0, "step": 378 }, { "completion_length": 91.2109375, "epoch": 0.02432215626504091, "grad_norm": 3.9728263101272345, "kl": 0.040771484375, "learning_rate": 9.878385316390707e-07, "loss": 0.0016, "reward": 3.15625, "reward_std": 0.3403690755367279, "rewards/accuracy_reward": 0.5390625, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 379 }, { "completion_length": 71.453125, "epoch": 0.024386330819829938, "grad_norm": 1.1064012126062157, "kl": 0.036376953125, "learning_rate": 9.878064433320497e-07, "loss": 0.0015, "reward": 3.58984375, "reward_std": 0.09566336870193481, "rewards/accuracy_reward": 0.84375, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 1.0, "step": 380 }, { "completion_length": 75.6328125, "epoch": 0.024450505374618965, "grad_norm": 2.518562910866717, "kl": 0.0341796875, "learning_rate": 9.87774355025029e-07, "loss": 0.0014, "reward": 3.125, "reward_std": 0.2069891169667244, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 381 }, { "completion_length": 75.484375, "epoch": 0.02451467992940799, "grad_norm": 2.4399820960454397, "kl": 0.0595703125, "learning_rate": 9.87742266718008e-07, "loss": 0.0024, "reward": 3.34765625, "reward_std": 0.13679246604442596, "rewards/accuracy_reward": 0.8671875, "rewards/format_count_numbers": 1.48828125, "rewards/format_reward": 0.9921875, "step": 382 }, { "completion_length": 89.203125, "epoch": 0.024578854484197016, "grad_norm": 2.9276613125514883, "kl": 0.053955078125, "learning_rate": 9.877101784109871e-07, "loss": 0.0021, "reward": 3.4296875, "reward_std": 0.18542881309986115, "rewards/accuracy_reward": 0.8203125, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 383 }, { "completion_length": 77.328125, "epoch": 0.02464302903898604, "grad_norm": 2.830245204545985, "kl": 0.0355224609375, "learning_rate": 9.876780901039661e-07, "loss": 0.0014, "reward": 3.32421875, "reward_std": 0.19252563267946243, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 384 }, { "completion_length": 83.5703125, "epoch": 0.024707203593775068, "grad_norm": 7.127123191676637, "kl": 0.0469970703125, "learning_rate": 9.876460017969451e-07, "loss": 0.0019, "reward": 2.8515625, "reward_std": 0.22753334045410156, "rewards/accuracy_reward": 0.5, "rewards/format_count_numbers": 1.3671875, "rewards/format_reward": 0.984375, "step": 385 }, { "completion_length": 85.796875, "epoch": 0.024771378148564095, "grad_norm": 2.7550811760798566, "kl": 0.038818359375, "learning_rate": 9.876139134899241e-07, "loss": 0.0016, "reward": 3.37890625, "reward_std": 0.09077248722314835, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 386 }, { "completion_length": 89.40625, "epoch": 0.02483555270335312, "grad_norm": 2.0396310060598433, "kl": 0.03131103515625, "learning_rate": 9.875818251829033e-07, "loss": 0.0013, "reward": 3.26953125, "reward_std": 0.22882908582687378, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 1.0, "step": 387 }, { "completion_length": 89.2578125, "epoch": 0.024899727258142147, "grad_norm": 15.598148276516785, "kl": 0.0390625, "learning_rate": 9.875497368758823e-07, "loss": 0.0016, "reward": 3.63671875, "reward_std": 0.19242897629737854, "rewards/accuracy_reward": 0.7890625, "rewards/format_count_numbers": 1.84765625, "rewards/format_reward": 1.0, "step": 388 }, { "completion_length": 80.296875, "epoch": 0.024963901812931174, "grad_norm": 1.7362247625726366, "kl": 0.0408935546875, "learning_rate": 9.875176485688615e-07, "loss": 0.0016, "reward": 3.453125, "reward_std": 0.1357654631137848, "rewards/accuracy_reward": 0.8359375, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 389 }, { "completion_length": 88.0703125, "epoch": 0.025028076367720198, "grad_norm": 4.395136245010878, "kl": 0.0577392578125, "learning_rate": 9.874855602618405e-07, "loss": 0.0023, "reward": 3.578125, "reward_std": 0.16841552406549454, "rewards/accuracy_reward": 0.765625, "rewards/format_count_numbers": 1.828125, "rewards/format_reward": 0.984375, "step": 390 }, { "completion_length": 88.546875, "epoch": 0.025092250922509225, "grad_norm": 12.85465482179912, "kl": 0.0374755859375, "learning_rate": 9.874534719548198e-07, "loss": 0.0015, "reward": 3.375, "reward_std": 0.22008295357227325, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 391 }, { "completion_length": 91.46875, "epoch": 0.025156425477298253, "grad_norm": 2.36150273289245, "kl": 0.0423583984375, "learning_rate": 9.874213836477988e-07, "loss": 0.0017, "reward": 3.453125, "reward_std": 0.15650184452533722, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 0.9921875, "step": 392 }, { "completion_length": 92.59375, "epoch": 0.025220600032087277, "grad_norm": 7.83656228688999, "kl": 0.0408935546875, "learning_rate": 9.873892953407778e-07, "loss": 0.0016, "reward": 3.23046875, "reward_std": 0.2951347902417183, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 1.0, "step": 393 }, { "completion_length": 91.7734375, "epoch": 0.025284774586876304, "grad_norm": 2.5606651790718002, "kl": 0.044189453125, "learning_rate": 9.873572070337568e-07, "loss": 0.0018, "reward": 3.484375, "reward_std": 0.20617882907390594, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.859375, "rewards/format_reward": 0.9921875, "step": 394 }, { "completion_length": 89.65625, "epoch": 0.02534894914166533, "grad_norm": 3.4540044213852883, "kl": 0.0352783203125, "learning_rate": 9.87325118726736e-07, "loss": 0.0014, "reward": 3.1640625, "reward_std": 0.24595101922750473, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.484375, "rewards/format_reward": 1.0, "step": 395 }, { "completion_length": 93.421875, "epoch": 0.025413123696454355, "grad_norm": 2.5964730870490818, "kl": 0.0362548828125, "learning_rate": 9.87293030419715e-07, "loss": 0.0014, "reward": 3.27734375, "reward_std": 0.23519299179315567, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.59765625, "rewards/format_reward": 1.0, "step": 396 }, { "completion_length": 93.7734375, "epoch": 0.025477298251243383, "grad_norm": 28.22328884573873, "kl": 0.049072265625, "learning_rate": 9.87260942112694e-07, "loss": 0.002, "reward": 2.98046875, "reward_std": 0.27177654206752777, "rewards/accuracy_reward": 0.5234375, "rewards/format_count_numbers": 1.46484375, "rewards/format_reward": 0.9921875, "step": 397 }, { "completion_length": 85.09375, "epoch": 0.025541472806032407, "grad_norm": 2.3191135713676028, "kl": 0.060302734375, "learning_rate": 9.872288538056732e-07, "loss": 0.0024, "reward": 3.515625, "reward_std": 0.2294127270579338, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.84375, "rewards/format_reward": 0.9921875, "step": 398 }, { "completion_length": 81.3515625, "epoch": 0.025605647360821434, "grad_norm": 1.6905703769183331, "kl": 0.035400390625, "learning_rate": 9.871967654986522e-07, "loss": 0.0014, "reward": 3.44921875, "reward_std": 0.14091838151216507, "rewards/accuracy_reward": 0.828125, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 399 }, { "completion_length": 87.5703125, "epoch": 0.02566982191561046, "grad_norm": 4.512482526660794, "kl": 0.0390625, "learning_rate": 9.871646771916314e-07, "loss": 0.0016, "reward": 3.25390625, "reward_std": 0.16978827118873596, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.59765625, "rewards/format_reward": 1.0, "step": 400 }, { "completion_length": 81.3984375, "epoch": 0.025733996470399485, "grad_norm": 5.3715469115990135, "kl": 0.03326416015625, "learning_rate": 9.871325888846104e-07, "loss": 0.0013, "reward": 3.1328125, "reward_std": 0.2319277748465538, "rewards/accuracy_reward": 0.765625, "rewards/format_count_numbers": 1.3671875, "rewards/format_reward": 1.0, "step": 401 }, { "completion_length": 92.53125, "epoch": 0.025798171025188513, "grad_norm": 2.188901893000093, "kl": 0.03277587890625, "learning_rate": 9.871005005775896e-07, "loss": 0.0013, "reward": 3.515625, "reward_std": 0.2073073610663414, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.7421875, "rewards/format_reward": 0.9921875, "step": 402 }, { "completion_length": 90.1796875, "epoch": 0.02586234557997754, "grad_norm": 1.8161504861610147, "kl": 0.0440673828125, "learning_rate": 9.870684122705686e-07, "loss": 0.0018, "reward": 3.59765625, "reward_std": 0.07733980100601912, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.84765625, "rewards/format_reward": 1.0, "step": 403 }, { "completion_length": 85.8203125, "epoch": 0.025926520134766564, "grad_norm": 2.966138606812278, "kl": 0.0445556640625, "learning_rate": 9.870363239635476e-07, "loss": 0.0018, "reward": 3.0, "reward_std": 0.22305265814065933, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.2421875, "rewards/format_reward": 1.0, "step": 404 }, { "completion_length": 85.53125, "epoch": 0.02599069468955559, "grad_norm": 3.4599966220049114, "kl": 0.0679931640625, "learning_rate": 9.870042356565266e-07, "loss": 0.0027, "reward": 2.91015625, "reward_std": 0.3398706614971161, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.23046875, "rewards/format_reward": 1.0, "step": 405 }, { "completion_length": 82.8984375, "epoch": 0.02605486924434462, "grad_norm": 5.464528831634491, "kl": 0.0372314453125, "learning_rate": 9.869721473495058e-07, "loss": 0.0015, "reward": 3.61328125, "reward_std": 0.23227517306804657, "rewards/accuracy_reward": 0.7421875, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 406 }, { "completion_length": 85.2421875, "epoch": 0.026119043799133643, "grad_norm": 4.7420087335897305, "kl": 0.0384521484375, "learning_rate": 9.869400590424848e-07, "loss": 0.0015, "reward": 3.0625, "reward_std": 0.24975158274173737, "rewards/accuracy_reward": 0.59375, "rewards/format_count_numbers": 1.46875, "rewards/format_reward": 1.0, "step": 407 }, { "completion_length": 83.6171875, "epoch": 0.02618321835392267, "grad_norm": 3.5170385415420324, "kl": 0.0535888671875, "learning_rate": 9.86907970735464e-07, "loss": 0.0021, "reward": 3.58984375, "reward_std": 0.09442433342337608, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.86328125, "rewards/format_reward": 1.0, "step": 408 }, { "completion_length": 86.1640625, "epoch": 0.026247392908711698, "grad_norm": 2.985252580890065, "kl": 0.041015625, "learning_rate": 9.86875882428443e-07, "loss": 0.0016, "reward": 3.40625, "reward_std": 0.2109048217535019, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.7421875, "rewards/format_reward": 1.0, "step": 409 }, { "completion_length": 78.765625, "epoch": 0.02631156746350072, "grad_norm": 8.121963048952532, "kl": 0.0423583984375, "learning_rate": 9.868437941214222e-07, "loss": 0.0017, "reward": 3.64453125, "reward_std": 0.23565896600484848, "rewards/accuracy_reward": 0.7890625, "rewards/format_count_numbers": 1.85546875, "rewards/format_reward": 1.0, "step": 410 }, { "completion_length": 81.5234375, "epoch": 0.02637574201828975, "grad_norm": 2.5648741702963216, "kl": 0.04132080078125, "learning_rate": 9.868117058144012e-07, "loss": 0.0017, "reward": 3.48046875, "reward_std": 0.24417440593242645, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.85546875, "rewards/format_reward": 0.9921875, "step": 411 }, { "completion_length": 82.828125, "epoch": 0.026439916573078773, "grad_norm": 2.710591200094352, "kl": 0.035888671875, "learning_rate": 9.867796175073802e-07, "loss": 0.0014, "reward": 3.32421875, "reward_std": 0.1908865123987198, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.61328125, "rewards/format_reward": 1.0, "step": 412 }, { "completion_length": 72.8203125, "epoch": 0.0265040911278678, "grad_norm": 1.472211214866935, "kl": 0.03045654296875, "learning_rate": 9.867475292003592e-07, "loss": 0.0012, "reward": 3.09375, "reward_std": 0.14571532234549522, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.3671875, "rewards/format_reward": 1.0, "step": 413 }, { "completion_length": 74.59375, "epoch": 0.026568265682656828, "grad_norm": 3.887894164065186, "kl": 0.048828125, "learning_rate": 9.867154408933385e-07, "loss": 0.002, "reward": 2.8671875, "reward_std": 0.2101493924856186, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.2421875, "rewards/format_reward": 0.9921875, "step": 414 }, { "completion_length": 75.6171875, "epoch": 0.02663244023744585, "grad_norm": 1.5454401622146114, "kl": 0.03759765625, "learning_rate": 9.866833525863175e-07, "loss": 0.0015, "reward": 2.75390625, "reward_std": 0.07733980193734169, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.12109375, "rewards/format_reward": 1.0, "step": 415 }, { "completion_length": 73.5, "epoch": 0.02669661479223488, "grad_norm": 7.315806026305294, "kl": 0.0452880859375, "learning_rate": 9.866512642792967e-07, "loss": 0.0018, "reward": 3.30078125, "reward_std": 0.06744491867721081, "rewards/accuracy_reward": 0.9296875, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 416 }, { "completion_length": 79.4765625, "epoch": 0.026760789347023906, "grad_norm": 2.51053773560654, "kl": 0.03302001953125, "learning_rate": 9.866191759722757e-07, "loss": 0.0013, "reward": 2.953125, "reward_std": 0.20463500916957855, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.25, "rewards/format_reward": 0.9921875, "step": 417 }, { "completion_length": 89.3671875, "epoch": 0.02682496390181293, "grad_norm": 2.775506852789836, "kl": 0.0478515625, "learning_rate": 9.865870876652549e-07, "loss": 0.0019, "reward": 3.36328125, "reward_std": 0.35115696489810944, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.70703125, "rewards/format_reward": 1.0, "step": 418 }, { "completion_length": 76.7890625, "epoch": 0.026889138456601958, "grad_norm": 3.533410878723769, "kl": 0.048583984375, "learning_rate": 9.865549993582339e-07, "loss": 0.0019, "reward": 3.38671875, "reward_std": 0.22647008299827576, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 0.9921875, "step": 419 }, { "completion_length": 65.5546875, "epoch": 0.026953313011390985, "grad_norm": 1.18105011494968, "kl": 0.0538330078125, "learning_rate": 9.865229110512129e-07, "loss": 0.0022, "reward": 3.4375, "reward_std": 0.12179600074887276, "rewards/accuracy_reward": 0.953125, "rewards/format_count_numbers": 1.484375, "rewards/format_reward": 1.0, "step": 420 }, { "completion_length": 76.6796875, "epoch": 0.02701748756618001, "grad_norm": 11.069109197683012, "kl": 0.0330810546875, "learning_rate": 9.864908227441919e-07, "loss": 0.0013, "reward": 3.45703125, "reward_std": 0.16242552362382412, "rewards/accuracy_reward": 0.859375, "rewards/format_count_numbers": 1.59765625, "rewards/format_reward": 1.0, "step": 421 }, { "completion_length": 76.1171875, "epoch": 0.027081662120969036, "grad_norm": 1.8660133287401062, "kl": 0.0364990234375, "learning_rate": 9.86458734437171e-07, "loss": 0.0015, "reward": 3.73828125, "reward_std": 0.12049250304698944, "rewards/accuracy_reward": 0.8671875, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 422 }, { "completion_length": 70.1015625, "epoch": 0.02714583667575806, "grad_norm": 2.5636098551154, "kl": 0.0450439453125, "learning_rate": 9.8642664613015e-07, "loss": 0.0018, "reward": 3.1953125, "reward_std": 0.15527012944221497, "rewards/accuracy_reward": 0.828125, "rewards/format_count_numbers": 1.3671875, "rewards/format_reward": 1.0, "step": 423 }, { "completion_length": 79.6328125, "epoch": 0.027210011230547088, "grad_norm": 4.92049493045296, "kl": 0.0439453125, "learning_rate": 9.86394557823129e-07, "loss": 0.0018, "reward": 3.4140625, "reward_std": 0.2120065912604332, "rewards/accuracy_reward": 0.796875, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 424 }, { "completion_length": 78.9765625, "epoch": 0.027274185785336115, "grad_norm": 3.870214215039947, "kl": 0.0380859375, "learning_rate": 9.863624695161083e-07, "loss": 0.0015, "reward": 3.4921875, "reward_std": 0.14000925421714783, "rewards/accuracy_reward": 0.8828125, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 425 }, { "completion_length": 79.484375, "epoch": 0.02733836034012514, "grad_norm": 1.985863018004636, "kl": 0.0491943359375, "learning_rate": 9.863303812090873e-07, "loss": 0.002, "reward": 3.296875, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 0.9921875, "step": 426 }, { "completion_length": 85.0859375, "epoch": 0.027402534894914166, "grad_norm": 3.813102561584444, "kl": 0.0516357421875, "learning_rate": 9.862982929020665e-07, "loss": 0.0021, "reward": 3.51953125, "reward_std": 0.09548516571521759, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 1.0, "step": 427 }, { "completion_length": 91.9296875, "epoch": 0.027466709449703194, "grad_norm": 3.102663248754436, "kl": 0.0732421875, "learning_rate": 9.862662045950455e-07, "loss": 0.0029, "reward": 3.64453125, "reward_std": 0.21123892068862915, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 428 }, { "completion_length": 83.578125, "epoch": 0.027530884004492218, "grad_norm": 1.863544725445917, "kl": 0.037841796875, "learning_rate": 9.862341162880245e-07, "loss": 0.0015, "reward": 3.1484375, "reward_std": 0.13914500921964645, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.3671875, "rewards/format_reward": 1.0, "step": 429 }, { "completion_length": 88.0625, "epoch": 0.027595058559281245, "grad_norm": 1.1730984674473448, "kl": 0.03106689453125, "learning_rate": 9.862020279810037e-07, "loss": 0.0012, "reward": 3.12890625, "reward_std": 0.10456175170838833, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.48828125, "rewards/format_reward": 1.0, "step": 430 }, { "completion_length": 93.3046875, "epoch": 0.027659233114070272, "grad_norm": 3.564594368107555, "kl": 0.0413818359375, "learning_rate": 9.861699396739827e-07, "loss": 0.0017, "reward": 3.33203125, "reward_std": 0.1895616203546524, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.71484375, "rewards/format_reward": 1.0, "step": 431 }, { "completion_length": 80.6484375, "epoch": 0.027723407668859296, "grad_norm": 2.7443980524531213, "kl": 0.0426025390625, "learning_rate": 9.861378513669617e-07, "loss": 0.0017, "reward": 3.0703125, "reward_std": 0.15880915522575378, "rewards/accuracy_reward": 0.8359375, "rewards/format_count_numbers": 1.234375, "rewards/format_reward": 1.0, "step": 432 }, { "completion_length": 85.515625, "epoch": 0.027787582223648324, "grad_norm": 5.7358535792556715, "kl": 0.0582275390625, "learning_rate": 9.86105763059941e-07, "loss": 0.0023, "reward": 3.0859375, "reward_std": 0.1638217195868492, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.359375, "rewards/format_reward": 0.9921875, "step": 433 }, { "completion_length": 91.8984375, "epoch": 0.02785175677843735, "grad_norm": 5.455053528895965, "kl": 0.0362548828125, "learning_rate": 9.8607367475292e-07, "loss": 0.0014, "reward": 3.265625, "reward_std": 0.26223427802324295, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 434 }, { "completion_length": 87.1328125, "epoch": 0.027915931333226375, "grad_norm": 2.626452471893669, "kl": 0.0362548828125, "learning_rate": 9.860415864458992e-07, "loss": 0.0015, "reward": 3.3671875, "reward_std": 0.134404756128788, "rewards/accuracy_reward": 0.875, "rewards/format_count_numbers": 1.4921875, "rewards/format_reward": 1.0, "step": 435 }, { "completion_length": 94.9921875, "epoch": 0.027980105888015402, "grad_norm": 1.9803338381231637, "kl": 0.0380859375, "learning_rate": 9.860094981388782e-07, "loss": 0.0015, "reward": 3.44921875, "reward_std": 0.2482164204120636, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.73828125, "rewards/format_reward": 1.0, "step": 436 }, { "completion_length": 95.7890625, "epoch": 0.028044280442804426, "grad_norm": 1.2214412688137188, "kl": 0.0438232421875, "learning_rate": 9.859774098318572e-07, "loss": 0.0018, "reward": 3.09375, "reward_std": 0.12756996601819992, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.359375, "rewards/format_reward": 1.0, "step": 437 }, { "completion_length": 95.96875, "epoch": 0.028108454997593454, "grad_norm": 3.3151260660520854, "kl": 0.045166015625, "learning_rate": 9.859453215248364e-07, "loss": 0.0018, "reward": 3.515625, "reward_std": 0.22596783190965652, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.859375, "rewards/format_reward": 1.0, "step": 438 }, { "completion_length": 95.8125, "epoch": 0.02817262955238248, "grad_norm": 1.963700791220928, "kl": 0.042236328125, "learning_rate": 9.859132332178154e-07, "loss": 0.0017, "reward": 3.47265625, "reward_std": 0.18391259759664536, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 0.984375, "step": 439 }, { "completion_length": 101.2421875, "epoch": 0.028236804107171505, "grad_norm": 2.910174575314695, "kl": 0.044677734375, "learning_rate": 9.858811449107944e-07, "loss": 0.0018, "reward": 3.16015625, "reward_std": 0.30195021629333496, "rewards/accuracy_reward": 0.5625, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 0.9921875, "step": 440 }, { "completion_length": 91.2890625, "epoch": 0.028300978661960532, "grad_norm": 0.9872831031431698, "kl": 0.0367431640625, "learning_rate": 9.858490566037736e-07, "loss": 0.0015, "reward": 3.0546875, "reward_std": 0.09969891607761383, "rewards/accuracy_reward": 0.8125, "rewards/format_count_numbers": 1.25, "rewards/format_reward": 0.9921875, "step": 441 }, { "completion_length": 96.4375, "epoch": 0.02836515321674956, "grad_norm": 2.227958703216696, "kl": 0.0404052734375, "learning_rate": 9.858169682967526e-07, "loss": 0.0016, "reward": 2.8359375, "reward_std": 0.26726052910089493, "rewards/accuracy_reward": 0.4921875, "rewards/format_count_numbers": 1.3515625, "rewards/format_reward": 0.9921875, "step": 442 }, { "completion_length": 86.0, "epoch": 0.028429327771538584, "grad_norm": 3.445037448415752, "kl": 0.0452880859375, "learning_rate": 9.857848799897318e-07, "loss": 0.0018, "reward": 3.0234375, "reward_std": 0.26698727905750275, "rewards/accuracy_reward": 0.828125, "rewards/format_count_numbers": 1.203125, "rewards/format_reward": 0.9921875, "step": 443 }, { "completion_length": 90.59375, "epoch": 0.02849350232632761, "grad_norm": 1.622411467798214, "kl": 0.038330078125, "learning_rate": 9.857527916827108e-07, "loss": 0.0015, "reward": 3.390625, "reward_std": 0.1507278736680746, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 444 }, { "completion_length": 86.9296875, "epoch": 0.02855767688111664, "grad_norm": 1.8235861743118784, "kl": 0.033447265625, "learning_rate": 9.857207033756898e-07, "loss": 0.0013, "reward": 3.36328125, "reward_std": 0.13178371638059616, "rewards/accuracy_reward": 0.7421875, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 445 }, { "completion_length": 105.5546875, "epoch": 0.028621851435905662, "grad_norm": 3.0947309664837945, "kl": 0.039306640625, "learning_rate": 9.85688615068669e-07, "loss": 0.0016, "reward": 3.359375, "reward_std": 0.16665863245725632, "rewards/accuracy_reward": 0.375, "rewards/format_count_numbers": 1.984375, "rewards/format_reward": 1.0, "step": 446 }, { "completion_length": 94.3046875, "epoch": 0.02868602599069469, "grad_norm": 3.6185839140029166, "kl": 0.04412841796875, "learning_rate": 9.85656526761648e-07, "loss": 0.0018, "reward": 3.52734375, "reward_std": 0.14180145412683487, "rewards/accuracy_reward": 0.7890625, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 0.9921875, "step": 447 }, { "completion_length": 86.90625, "epoch": 0.028750200545483717, "grad_norm": 2.9667421575045885, "kl": 0.085693359375, "learning_rate": 9.85624438454627e-07, "loss": 0.0034, "reward": 3.28125, "reward_std": 0.15072788298130035, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 448 }, { "completion_length": 95.34375, "epoch": 0.02881437510027274, "grad_norm": 6.015346220687002, "kl": 0.038818359375, "learning_rate": 9.855923501476062e-07, "loss": 0.0016, "reward": 3.703125, "reward_std": 0.22043407708406448, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.984375, "rewards/format_reward": 0.9921875, "step": 449 }, { "completion_length": 91.6484375, "epoch": 0.02887854965506177, "grad_norm": 24.50034940349415, "kl": 0.04248046875, "learning_rate": 9.855602618405852e-07, "loss": 0.0017, "reward": 3.61328125, "reward_std": 0.21011091023683548, "rewards/accuracy_reward": 0.7421875, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 450 }, { "completion_length": 89.859375, "epoch": 0.028942724209850793, "grad_norm": 1.2978957173296415, "kl": 0.0406494140625, "learning_rate": 9.855281735335644e-07, "loss": 0.0016, "reward": 3.20703125, "reward_std": 0.11652141809463501, "rewards/accuracy_reward": 0.5859375, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 451 }, { "completion_length": 90.421875, "epoch": 0.02900689876463982, "grad_norm": 0.2494563258652741, "kl": 0.0352783203125, "learning_rate": 9.854960852265434e-07, "loss": 0.0014, "reward": 3.6875, "reward_std": 0.0, "rewards/accuracy_reward": 0.9375, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 1.0, "step": 452 }, { "completion_length": 84.421875, "epoch": 0.029071073319428847, "grad_norm": 8.121967586224253, "kl": 0.06005859375, "learning_rate": 9.854639969195226e-07, "loss": 0.0024, "reward": 3.421875, "reward_std": 0.15540007501840591, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 453 }, { "completion_length": 96.8671875, "epoch": 0.02913524787421787, "grad_norm": 2.3090295229853464, "kl": 0.0443115234375, "learning_rate": 9.854319086125016e-07, "loss": 0.0018, "reward": 3.73828125, "reward_std": 0.07733980193734169, "rewards/accuracy_reward": 0.875, "rewards/format_count_numbers": 1.86328125, "rewards/format_reward": 1.0, "step": 454 }, { "completion_length": 78.625, "epoch": 0.0291994224290069, "grad_norm": 1.6998335771926507, "kl": 0.0335693359375, "learning_rate": 9.853998203054806e-07, "loss": 0.0013, "reward": 3.359375, "reward_std": 0.0731260534375906, "rewards/accuracy_reward": 0.859375, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 455 }, { "completion_length": 86.2109375, "epoch": 0.029263596983795926, "grad_norm": 2.5520679434395706, "kl": 0.05712890625, "learning_rate": 9.853677319984596e-07, "loss": 0.0023, "reward": 3.515625, "reward_std": 0.2927774414420128, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.7265625, "rewards/format_reward": 0.984375, "step": 456 }, { "completion_length": 90.1875, "epoch": 0.02932777153858495, "grad_norm": 6.781072003161571, "kl": 0.0362548828125, "learning_rate": 9.853356436914388e-07, "loss": 0.0014, "reward": 3.53515625, "reward_std": 0.1851307563483715, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 457 }, { "completion_length": 93.4921875, "epoch": 0.029391946093373977, "grad_norm": 1.618505893904474, "kl": 0.033935546875, "learning_rate": 9.853035553844178e-07, "loss": 0.0014, "reward": 2.80859375, "reward_std": 0.20729636400938034, "rewards/accuracy_reward": 0.6015625, "rewards/format_count_numbers": 1.21484375, "rewards/format_reward": 0.9921875, "step": 458 }, { "completion_length": 81.734375, "epoch": 0.029456120648163005, "grad_norm": 2.084135210954287, "kl": 0.0380859375, "learning_rate": 9.852714670773969e-07, "loss": 0.0015, "reward": 3.32421875, "reward_std": 0.1430942788720131, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 459 }, { "completion_length": 87.1328125, "epoch": 0.02952029520295203, "grad_norm": 4.054914709393892, "kl": 0.03631591796875, "learning_rate": 9.85239378770376e-07, "loss": 0.0015, "reward": 3.5234375, "reward_std": 0.18253792077302933, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.875, "rewards/format_reward": 1.0, "step": 460 }, { "completion_length": 89.5625, "epoch": 0.029584469757741056, "grad_norm": 1.7184101351761263, "kl": 0.0367431640625, "learning_rate": 9.85207290463355e-07, "loss": 0.0015, "reward": 3.53515625, "reward_std": 0.17886094748973846, "rewards/accuracy_reward": 0.7890625, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 1.0, "step": 461 }, { "completion_length": 86.5234375, "epoch": 0.029648644312530083, "grad_norm": 17.796761488814926, "kl": 0.044189453125, "learning_rate": 9.851752021563343e-07, "loss": 0.0018, "reward": 3.1484375, "reward_std": 0.20801587402820587, "rewards/accuracy_reward": 0.7890625, "rewards/format_count_numbers": 1.359375, "rewards/format_reward": 1.0, "step": 462 }, { "completion_length": 92.1953125, "epoch": 0.029712818867319107, "grad_norm": 2.1383248057431063, "kl": 0.037353515625, "learning_rate": 9.851431138493133e-07, "loss": 0.0015, "reward": 3.18359375, "reward_std": 0.153128020465374, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.49609375, "rewards/format_reward": 0.9921875, "step": 463 }, { "completion_length": 84.6015625, "epoch": 0.029776993422108135, "grad_norm": 5.908724265779606, "kl": 0.04541015625, "learning_rate": 9.851110255422923e-07, "loss": 0.0018, "reward": 3.0, "reward_std": 0.21937815845012665, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.359375, "rewards/format_reward": 1.0, "step": 464 }, { "completion_length": 89.0625, "epoch": 0.02984116797689716, "grad_norm": 3.484869427750808, "kl": 0.040283203125, "learning_rate": 9.850789372352715e-07, "loss": 0.0016, "reward": 3.51171875, "reward_std": 0.25447896867990494, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.83984375, "rewards/format_reward": 1.0, "step": 465 }, { "completion_length": 81.265625, "epoch": 0.029905342531686186, "grad_norm": 2.2821940477936398, "kl": 0.0394287109375, "learning_rate": 9.850468489282505e-07, "loss": 0.0016, "reward": 3.12109375, "reward_std": 0.1754460111260414, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 466 }, { "completion_length": 83.7890625, "epoch": 0.029969517086475213, "grad_norm": 4.8968058959800915, "kl": 0.0384521484375, "learning_rate": 9.850147606212295e-07, "loss": 0.0015, "reward": 3.328125, "reward_std": 0.09753045253455639, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 467 }, { "completion_length": 93.2109375, "epoch": 0.030033691641264237, "grad_norm": 3.3606160149021393, "kl": 0.03466796875, "learning_rate": 9.849826723142087e-07, "loss": 0.0014, "reward": 2.78515625, "reward_std": 0.2625827267765999, "rewards/accuracy_reward": 0.578125, "rewards/format_count_numbers": 1.21484375, "rewards/format_reward": 0.9921875, "step": 468 }, { "completion_length": 85.3671875, "epoch": 0.030097866196053265, "grad_norm": 2.942210750599675, "kl": 0.03125, "learning_rate": 9.849505840071877e-07, "loss": 0.0013, "reward": 2.90625, "reward_std": 0.1712273545563221, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.125, "rewards/format_reward": 1.0, "step": 469 }, { "completion_length": 86.4296875, "epoch": 0.030162040750842292, "grad_norm": 4.115938277602418, "kl": 0.04150390625, "learning_rate": 9.84918495700167e-07, "loss": 0.0017, "reward": 3.44140625, "reward_std": 0.13644260168075562, "rewards/accuracy_reward": 0.8203125, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 470 }, { "completion_length": 82.421875, "epoch": 0.030226215305631316, "grad_norm": 1.773312551729988, "kl": 0.037841796875, "learning_rate": 9.84886407393146e-07, "loss": 0.0015, "reward": 3.4609375, "reward_std": 0.06664611399173737, "rewards/accuracy_reward": 0.84375, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 471 }, { "completion_length": 90.6640625, "epoch": 0.030290389860420344, "grad_norm": 3.8993124087847066, "kl": 0.0377197265625, "learning_rate": 9.84854319086125e-07, "loss": 0.0015, "reward": 3.0546875, "reward_std": 0.12415501847863197, "rewards/accuracy_reward": 0.5703125, "rewards/format_count_numbers": 1.484375, "rewards/format_reward": 1.0, "step": 472 }, { "completion_length": 79.375, "epoch": 0.03035456441520937, "grad_norm": 1.5677032665830635, "kl": 0.0477294921875, "learning_rate": 9.848222307791041e-07, "loss": 0.0019, "reward": 3.203125, "reward_std": 0.08337578922510147, "rewards/accuracy_reward": 0.8359375, "rewards/format_count_numbers": 1.3671875, "rewards/format_reward": 1.0, "step": 473 }, { "completion_length": 89.8671875, "epoch": 0.030418738969998395, "grad_norm": 2.6815432473628635, "kl": 0.04913330078125, "learning_rate": 9.847901424720831e-07, "loss": 0.002, "reward": 3.265625, "reward_std": 0.1173202246427536, "rewards/accuracy_reward": 0.765625, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 474 }, { "completion_length": 94.4140625, "epoch": 0.030482913524787422, "grad_norm": 1.281708883722509, "kl": 0.034423828125, "learning_rate": 9.847580541650621e-07, "loss": 0.0014, "reward": 3.6015625, "reward_std": 0.06629125773906708, "rewards/accuracy_reward": 0.7421875, "rewards/format_count_numbers": 1.859375, "rewards/format_reward": 1.0, "step": 475 }, { "completion_length": 97.1796875, "epoch": 0.03054708807957645, "grad_norm": 4.593069559954877, "kl": 0.03759765625, "learning_rate": 9.847259658580413e-07, "loss": 0.0015, "reward": 3.3359375, "reward_std": 0.20865952223539352, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 476 }, { "completion_length": 81.5, "epoch": 0.030611262634365474, "grad_norm": 4.25554079186872, "kl": 0.03656005859375, "learning_rate": 9.846938775510203e-07, "loss": 0.0015, "reward": 3.4453125, "reward_std": 0.16781240701675415, "rewards/accuracy_reward": 0.828125, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 0.9921875, "step": 477 }, { "completion_length": 93.515625, "epoch": 0.0306754371891545, "grad_norm": 2.150159472302353, "kl": 0.044921875, "learning_rate": 9.846617892439995e-07, "loss": 0.0018, "reward": 3.4453125, "reward_std": 0.21773582696914673, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.71875, "rewards/format_reward": 1.0, "step": 478 }, { "completion_length": 100.4375, "epoch": 0.030739611743943525, "grad_norm": 1.7999659192875432, "kl": 0.04345703125, "learning_rate": 9.846297009369785e-07, "loss": 0.0017, "reward": 2.8125, "reward_std": 0.17320566624403, "rewards/accuracy_reward": 0.5859375, "rewards/format_count_numbers": 1.234375, "rewards/format_reward": 0.9921875, "step": 479 }, { "completion_length": 80.015625, "epoch": 0.030803786298732552, "grad_norm": 3.1933718114956773, "kl": 0.0362548828125, "learning_rate": 9.845976126299575e-07, "loss": 0.0015, "reward": 3.26171875, "reward_std": 0.1283687688410282, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.61328125, "rewards/format_reward": 1.0, "step": 480 }, { "completion_length": 100.0859375, "epoch": 0.03086796085352158, "grad_norm": 2.2576935323731635, "kl": 0.046630859375, "learning_rate": 9.845655243229368e-07, "loss": 0.0019, "reward": 3.37109375, "reward_std": 0.20131288468837738, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 1.0, "step": 481 }, { "completion_length": 93.796875, "epoch": 0.030932135408310604, "grad_norm": 4.2287469663133725, "kl": 0.048828125, "learning_rate": 9.845334360159158e-07, "loss": 0.002, "reward": 3.09375, "reward_std": 0.1989966332912445, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.4921875, "rewards/format_reward": 0.9921875, "step": 482 }, { "completion_length": 87.1015625, "epoch": 0.03099630996309963, "grad_norm": 1.4757712554279336, "kl": 0.0391845703125, "learning_rate": 9.845013477088948e-07, "loss": 0.0016, "reward": 3.55078125, "reward_std": 0.1173502579331398, "rewards/accuracy_reward": 0.8125, "rewards/format_count_numbers": 1.73828125, "rewards/format_reward": 1.0, "step": 483 }, { "completion_length": 82.7109375, "epoch": 0.03106048451788866, "grad_norm": 1.2548592838498975, "kl": 0.046630859375, "learning_rate": 9.844692594018738e-07, "loss": 0.0019, "reward": 3.125, "reward_std": 0.1179121658205986, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.4921875, "rewards/format_reward": 1.0, "step": 484 }, { "completion_length": 80.6484375, "epoch": 0.031124659072677682, "grad_norm": 1.5905627551000663, "kl": 0.042724609375, "learning_rate": 9.84437171094853e-07, "loss": 0.0017, "reward": 3.5703125, "reward_std": 0.08679073117673397, "rewards/accuracy_reward": 0.8203125, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 1.0, "step": 485 }, { "completion_length": 79.1875, "epoch": 0.03118883362746671, "grad_norm": 3.6359008898576493, "kl": 0.0347900390625, "learning_rate": 9.84405082787832e-07, "loss": 0.0014, "reward": 3.3515625, "reward_std": 0.05639637541025877, "rewards/accuracy_reward": 0.859375, "rewards/format_count_numbers": 1.4921875, "rewards/format_reward": 1.0, "step": 486 }, { "completion_length": 90.5, "epoch": 0.03125300818225574, "grad_norm": 2.749936406954127, "kl": 0.042236328125, "learning_rate": 9.843729944808112e-07, "loss": 0.0017, "reward": 3.234375, "reward_std": 0.26507389545440674, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 0.9921875, "step": 487 }, { "completion_length": 88.875, "epoch": 0.031317182737044764, "grad_norm": 5.8432311617015555, "kl": 0.0400390625, "learning_rate": 9.843409061737902e-07, "loss": 0.0016, "reward": 3.28125, "reward_std": 0.21542152762413025, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 488 }, { "completion_length": 85.0078125, "epoch": 0.031381357291833785, "grad_norm": 1.8987303062585643, "kl": 0.055908203125, "learning_rate": 9.843088178667694e-07, "loss": 0.0022, "reward": 3.42578125, "reward_std": 0.20357174426317215, "rewards/accuracy_reward": 0.84375, "rewards/format_count_numbers": 1.58984375, "rewards/format_reward": 0.9921875, "step": 489 }, { "completion_length": 83.640625, "epoch": 0.03144553184662281, "grad_norm": 1.208951892848367, "kl": 0.0382080078125, "learning_rate": 9.842767295597484e-07, "loss": 0.0015, "reward": 3.28125, "reward_std": 0.05444390885531902, "rewards/accuracy_reward": 0.90625, "rewards/format_count_numbers": 1.375, "rewards/format_reward": 1.0, "step": 490 }, { "completion_length": 87.953125, "epoch": 0.03150970640141184, "grad_norm": 2.6978643204943387, "kl": 0.0433349609375, "learning_rate": 9.842446412527274e-07, "loss": 0.0017, "reward": 3.46875, "reward_std": 0.21947985142469406, "rewards/accuracy_reward": 0.8515625, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 491 }, { "completion_length": 84.5546875, "epoch": 0.03157388095620087, "grad_norm": 2.8536162367139895, "kl": 0.044189453125, "learning_rate": 9.842125529457064e-07, "loss": 0.0018, "reward": 3.4765625, "reward_std": 0.15991678833961487, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.7421875, "rewards/format_reward": 1.0, "step": 492 }, { "completion_length": 87.6796875, "epoch": 0.031638055510989894, "grad_norm": 2.752433248161783, "kl": 0.0462646484375, "learning_rate": 9.841804646386856e-07, "loss": 0.0019, "reward": 3.10546875, "reward_std": 0.2598777264356613, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 1.0, "step": 493 }, { "completion_length": 92.953125, "epoch": 0.03170223006577892, "grad_norm": 2.021124063069248, "kl": 0.0396728515625, "learning_rate": 9.841483763316646e-07, "loss": 0.0016, "reward": 3.24609375, "reward_std": 0.19890380650758743, "rewards/accuracy_reward": 0.765625, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 1.0, "step": 494 }, { "completion_length": 88.1953125, "epoch": 0.03176640462056794, "grad_norm": 2.2214564092630806, "kl": 0.049072265625, "learning_rate": 9.841162880246438e-07, "loss": 0.002, "reward": 2.94921875, "reward_std": 0.1933268904685974, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.23828125, "rewards/format_reward": 1.0, "step": 495 }, { "completion_length": 93.796875, "epoch": 0.03183057917535697, "grad_norm": 3.4651270706656874, "kl": 0.045166015625, "learning_rate": 9.840841997176228e-07, "loss": 0.0018, "reward": 3.42578125, "reward_std": 0.2593730390071869, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 496 }, { "completion_length": 92.625, "epoch": 0.031894753730146, "grad_norm": 4.095613526145911, "kl": 0.0465087890625, "learning_rate": 9.84052111410602e-07, "loss": 0.0019, "reward": 3.51953125, "reward_std": 0.24045299738645554, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.86328125, "rewards/format_reward": 1.0, "step": 497 }, { "completion_length": 90.1171875, "epoch": 0.031958928284935025, "grad_norm": 4.447475847323495, "kl": 0.052734375, "learning_rate": 9.84020023103581e-07, "loss": 0.0021, "reward": 3.4921875, "reward_std": 0.10691440850496292, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.734375, "rewards/format_reward": 1.0, "step": 498 }, { "completion_length": 87.84375, "epoch": 0.03202310283972405, "grad_norm": 2.666607863897794, "kl": 0.06103515625, "learning_rate": 9.8398793479656e-07, "loss": 0.0024, "reward": 3.1015625, "reward_std": 0.188182532787323, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.34375, "rewards/format_reward": 1.0, "step": 499 }, { "completion_length": 92.984375, "epoch": 0.03208727739451307, "grad_norm": 3.167936219121426, "kl": 0.04443359375, "learning_rate": 9.839558464895392e-07, "loss": 0.0018, "reward": 3.3203125, "reward_std": 0.15490422397851944, "rewards/accuracy_reward": 0.8203125, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 500 }, { "completion_length": 86.15625, "epoch": 0.0321514519493021, "grad_norm": 5.573564849949336, "kl": 0.0504150390625, "learning_rate": 9.839237581825182e-07, "loss": 0.002, "reward": 3.24609375, "reward_std": 0.2737399786710739, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.55859375, "rewards/format_reward": 1.0, "step": 501 }, { "completion_length": 73.328125, "epoch": 0.03221562650409113, "grad_norm": 1.3261674604360032, "kl": 0.045166015625, "learning_rate": 9.838916698754972e-07, "loss": 0.0018, "reward": 3.2109375, "reward_std": 0.11914245784282684, "rewards/accuracy_reward": 0.8515625, "rewards/format_count_numbers": 1.359375, "rewards/format_reward": 1.0, "step": 502 }, { "completion_length": 85.3125, "epoch": 0.032279801058880155, "grad_norm": 2.3952183318256424, "kl": 0.052001953125, "learning_rate": 9.838595815684765e-07, "loss": 0.0021, "reward": 3.07421875, "reward_std": 0.21622034162282944, "rewards/accuracy_reward": 0.59375, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 1.0, "step": 503 }, { "completion_length": 73.5625, "epoch": 0.03234397561366918, "grad_norm": 1.7165970251557385, "kl": 0.058837890625, "learning_rate": 9.838274932614555e-07, "loss": 0.0024, "reward": 2.87890625, "reward_std": 0.1768567766994238, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.24609375, "rewards/format_reward": 1.0, "step": 504 }, { "completion_length": 89.0078125, "epoch": 0.03240815016845821, "grad_norm": 10.589773589417028, "kl": 0.04833984375, "learning_rate": 9.837954049544347e-07, "loss": 0.0019, "reward": 3.2890625, "reward_std": 0.15756267309188843, "rewards/accuracy_reward": 0.7890625, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 505 }, { "completion_length": 78.5078125, "epoch": 0.03247232472324723, "grad_norm": 2.8166247259166863, "kl": 0.072998046875, "learning_rate": 9.837633166474137e-07, "loss": 0.0029, "reward": 2.90234375, "reward_std": 0.16332025080919266, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.23828125, "rewards/format_reward": 1.0, "step": 506 }, { "completion_length": 80.15625, "epoch": 0.03253649927803626, "grad_norm": 5.680797940236752, "kl": 0.0546875, "learning_rate": 9.837312283403927e-07, "loss": 0.0022, "reward": 3.19140625, "reward_std": 0.24567626416683197, "rewards/accuracy_reward": 0.5703125, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 507 }, { "completion_length": 94.6953125, "epoch": 0.032600673832825285, "grad_norm": 4.165270442333038, "kl": 0.041259765625, "learning_rate": 9.836991400333719e-07, "loss": 0.0016, "reward": 3.484375, "reward_std": 0.24655454605817795, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.84375, "rewards/format_reward": 0.984375, "step": 508 }, { "completion_length": 80.765625, "epoch": 0.03266484838761431, "grad_norm": 1.966464032191065, "kl": 0.0728759765625, "learning_rate": 9.836670517263509e-07, "loss": 0.0029, "reward": 3.4921875, "reward_std": 0.13098490796983242, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 0.9921875, "step": 509 }, { "completion_length": 85.2734375, "epoch": 0.03272902294240334, "grad_norm": 1.2153201396450348, "kl": 0.055908203125, "learning_rate": 9.836349634193299e-07, "loss": 0.0022, "reward": 3.29296875, "reward_std": 0.13006530329585075, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 1.0, "step": 510 }, { "completion_length": 79.3046875, "epoch": 0.03279319749719236, "grad_norm": 3.250410835769812, "kl": 0.0501708984375, "learning_rate": 9.83602875112309e-07, "loss": 0.002, "reward": 3.421875, "reward_std": 0.2829606309533119, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.71875, "rewards/format_reward": 0.9921875, "step": 511 }, { "completion_length": 81.3359375, "epoch": 0.03285737205198139, "grad_norm": 3.9121958468276614, "kl": 0.0565185546875, "learning_rate": 9.83570786805288e-07, "loss": 0.0023, "reward": 3.44140625, "reward_std": 0.10751071851700544, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 1.0, "step": 512 }, { "completion_length": 71.6640625, "epoch": 0.032921546606770415, "grad_norm": 13.118032252043873, "kl": 0.125732421875, "learning_rate": 9.835386984982673e-07, "loss": 0.005, "reward": 3.09765625, "reward_std": 0.16643784940242767, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 0.9921875, "step": 513 }, { "completion_length": 85.984375, "epoch": 0.03298572116155944, "grad_norm": 1.812014550189291, "kl": 0.0601806640625, "learning_rate": 9.835066101912463e-07, "loss": 0.0024, "reward": 3.33203125, "reward_std": 0.21151255816221237, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.56640625, "rewards/format_reward": 0.9921875, "step": 514 }, { "completion_length": 72.4453125, "epoch": 0.03304989571634847, "grad_norm": 2.0200607965425976, "kl": 0.09375, "learning_rate": 9.834745218842253e-07, "loss": 0.0038, "reward": 2.671875, "reward_std": 0.19044626876711845, "rewards/accuracy_reward": 0.578125, "rewards/format_count_numbers": 1.109375, "rewards/format_reward": 0.984375, "step": 515 }, { "completion_length": 74.9296875, "epoch": 0.0331140702711375, "grad_norm": 8.934865911792897, "kl": 0.0576171875, "learning_rate": 9.834424335772045e-07, "loss": 0.0023, "reward": 3.3515625, "reward_std": 0.2409384548664093, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.7265625, "rewards/format_reward": 1.0, "step": 516 }, { "completion_length": 73.3046875, "epoch": 0.03317824482592652, "grad_norm": 2.576453660813919, "kl": 0.0545654296875, "learning_rate": 9.834103452701835e-07, "loss": 0.0022, "reward": 3.1953125, "reward_std": 0.20938798785209656, "rewards/accuracy_reward": 0.7421875, "rewards/format_count_numbers": 1.4609375, "rewards/format_reward": 0.9921875, "step": 517 }, { "completion_length": 66.8515625, "epoch": 0.033242419380715545, "grad_norm": 16.506242045391964, "kl": 0.058837890625, "learning_rate": 9.833782569631625e-07, "loss": 0.0024, "reward": 3.83984375, "reward_std": 0.1822758913040161, "rewards/accuracy_reward": 0.84375, "rewards/format_count_numbers": 1.99609375, "rewards/format_reward": 1.0, "step": 518 }, { "completion_length": 70.4765625, "epoch": 0.03330659393550457, "grad_norm": 5.694455584128437, "kl": 0.0545654296875, "learning_rate": 9.833461686561415e-07, "loss": 0.0022, "reward": 3.30859375, "reward_std": 0.17156639695167542, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 1.0, "step": 519 }, { "completion_length": 78.59375, "epoch": 0.0333707684902936, "grad_norm": 4.441561232929554, "kl": 0.057861328125, "learning_rate": 9.833140803491207e-07, "loss": 0.0023, "reward": 3.25, "reward_std": 0.1217960026115179, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 0.9921875, "step": 520 }, { "completion_length": 79.1328125, "epoch": 0.03343494304508263, "grad_norm": 2.628065620771496, "kl": 0.0733642578125, "learning_rate": 9.832819920420997e-07, "loss": 0.0029, "reward": 3.265625, "reward_std": 0.1646973416209221, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.59375, "rewards/format_reward": 1.0, "step": 521 }, { "completion_length": 71.9921875, "epoch": 0.033499117599871654, "grad_norm": 6.62421566838726, "kl": 0.0404052734375, "learning_rate": 9.83249903735079e-07, "loss": 0.0016, "reward": 3.4609375, "reward_std": 0.09522313624620438, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.734375, "rewards/format_reward": 0.9921875, "step": 522 }, { "completion_length": 75.9921875, "epoch": 0.033563292154660675, "grad_norm": 2.6442635662197085, "kl": 0.0533447265625, "learning_rate": 9.83217815428058e-07, "loss": 0.0021, "reward": 3.39453125, "reward_std": 0.16310575604438782, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 523 }, { "completion_length": 71.9296875, "epoch": 0.0336274667094497, "grad_norm": 3.1132933311663438, "kl": 0.056396484375, "learning_rate": 9.831857271210372e-07, "loss": 0.0023, "reward": 3.4375, "reward_std": 0.16469734907150269, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.7421875, "rewards/format_reward": 1.0, "step": 524 }, { "completion_length": 69.4296875, "epoch": 0.03369164126423873, "grad_norm": 8.62869526303385, "kl": 0.0467529296875, "learning_rate": 9.831536388140162e-07, "loss": 0.0019, "reward": 3.2578125, "reward_std": 0.21882762759923935, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 0.9921875, "step": 525 }, { "completion_length": 81.046875, "epoch": 0.03375581581902776, "grad_norm": 1.981671923725136, "kl": 0.049560546875, "learning_rate": 9.831215505069952e-07, "loss": 0.002, "reward": 3.3125, "reward_std": 0.13443589583039284, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 526 }, { "completion_length": 74.3984375, "epoch": 0.033819990373816784, "grad_norm": 1.7206529642986308, "kl": 0.0531005859375, "learning_rate": 9.830894621999742e-07, "loss": 0.0021, "reward": 3.2421875, "reward_std": 0.14465449005365372, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 527 }, { "completion_length": 75.6796875, "epoch": 0.033884164928605805, "grad_norm": 14.06703923210226, "kl": 0.0511474609375, "learning_rate": 9.830573738929534e-07, "loss": 0.002, "reward": 3.66015625, "reward_std": 0.1346667818725109, "rewards/accuracy_reward": 0.7890625, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 528 }, { "completion_length": 73.3671875, "epoch": 0.03394833948339483, "grad_norm": 4.959546403603919, "kl": 0.0491943359375, "learning_rate": 9.830252855859324e-07, "loss": 0.002, "reward": 3.4453125, "reward_std": 0.21144159138202667, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.7265625, "rewards/format_reward": 1.0, "step": 529 }, { "completion_length": 71.3203125, "epoch": 0.03401251403818386, "grad_norm": 2.1668136369505726, "kl": 0.0592041015625, "learning_rate": 9.829931972789116e-07, "loss": 0.0024, "reward": 3.19140625, "reward_std": 0.21226860582828522, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.49609375, "rewards/format_reward": 0.9921875, "step": 530 }, { "completion_length": 72.6875, "epoch": 0.03407668859297289, "grad_norm": 1.547671682170786, "kl": 0.045654296875, "learning_rate": 9.829611089718906e-07, "loss": 0.0018, "reward": 3.1796875, "reward_std": 0.12597234547138214, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 531 }, { "completion_length": 73.9609375, "epoch": 0.034140863147761914, "grad_norm": 8.903380233549331, "kl": 0.066162109375, "learning_rate": 9.829290206648698e-07, "loss": 0.0026, "reward": 3.00390625, "reward_std": 0.2809188663959503, "rewards/accuracy_reward": 0.53125, "rewards/format_count_numbers": 1.49609375, "rewards/format_reward": 0.9765625, "step": 532 }, { "completion_length": 78.515625, "epoch": 0.03420503770255094, "grad_norm": 2.4211077215912145, "kl": 0.0498046875, "learning_rate": 9.828969323578488e-07, "loss": 0.002, "reward": 3.453125, "reward_std": 0.16097761318087578, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.7421875, "rewards/format_reward": 1.0, "step": 533 }, { "completion_length": 73.890625, "epoch": 0.03426921225733996, "grad_norm": 2.0152850991162965, "kl": 0.0760498046875, "learning_rate": 9.828648440508278e-07, "loss": 0.003, "reward": 3.640625, "reward_std": 0.07209636550396681, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.859375, "rewards/format_reward": 1.0, "step": 534 }, { "completion_length": 76.2421875, "epoch": 0.03433338681212899, "grad_norm": 2.7098296638071173, "kl": 0.044677734375, "learning_rate": 9.828327557438068e-07, "loss": 0.0018, "reward": 3.3046875, "reward_std": 0.17464229464530945, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 535 }, { "completion_length": 72.4375, "epoch": 0.03439756136691802, "grad_norm": 4.86550437869551, "kl": 0.04052734375, "learning_rate": 9.82800667436786e-07, "loss": 0.0016, "reward": 3.43359375, "reward_std": 0.16519136726856232, "rewards/accuracy_reward": 0.8125, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 536 }, { "completion_length": 82.9296875, "epoch": 0.034461735921707044, "grad_norm": 2.6344250867233208, "kl": 0.03955078125, "learning_rate": 9.82768579129765e-07, "loss": 0.0016, "reward": 3.2578125, "reward_std": 0.14796767383813858, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 537 }, { "completion_length": 80.5859375, "epoch": 0.03452591047649607, "grad_norm": 4.870982603059216, "kl": 0.0423583984375, "learning_rate": 9.827364908227442e-07, "loss": 0.0017, "reward": 3.453125, "reward_std": 0.20805485546588898, "rewards/accuracy_reward": 0.828125, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 538 }, { "completion_length": 79.1328125, "epoch": 0.03459008503128509, "grad_norm": 0.7814300107987665, "kl": 0.0523681640625, "learning_rate": 9.827044025157232e-07, "loss": 0.0021, "reward": 2.9296875, "reward_std": 0.06629125960171223, "rewards/accuracy_reward": 0.8125, "rewards/format_count_numbers": 1.125, "rewards/format_reward": 0.9921875, "step": 539 }, { "completion_length": 81.671875, "epoch": 0.03465425958607412, "grad_norm": 1.3357958518543576, "kl": 0.048583984375, "learning_rate": 9.826723142087024e-07, "loss": 0.0019, "reward": 3.2109375, "reward_std": 0.09522313997149467, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 540 }, { "completion_length": 92.4921875, "epoch": 0.03471843414086315, "grad_norm": 3.8099343400387924, "kl": 0.09423828125, "learning_rate": 9.826402259016814e-07, "loss": 0.0038, "reward": 3.50390625, "reward_std": 0.1344023011624813, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.86328125, "rewards/format_reward": 1.0, "step": 541 }, { "completion_length": 73.7578125, "epoch": 0.034782608695652174, "grad_norm": 5.517986654337228, "kl": 0.0455322265625, "learning_rate": 9.826081375946604e-07, "loss": 0.0018, "reward": 3.42578125, "reward_std": 0.039980421774089336, "rewards/accuracy_reward": 0.9296875, "rewards/format_count_numbers": 1.49609375, "rewards/format_reward": 1.0, "step": 542 }, { "completion_length": 69.9296875, "epoch": 0.0348467832504412, "grad_norm": 3.627473835145456, "kl": 0.0509033203125, "learning_rate": 9.825760492876394e-07, "loss": 0.002, "reward": 2.8828125, "reward_std": 0.1344047524034977, "rewards/accuracy_reward": 0.765625, "rewards/format_count_numbers": 1.125, "rewards/format_reward": 0.9921875, "step": 543 }, { "completion_length": 82.5625, "epoch": 0.03491095780523023, "grad_norm": 4.871616004926028, "kl": 0.04443359375, "learning_rate": 9.825439609806186e-07, "loss": 0.0018, "reward": 3.1015625, "reward_std": 0.16781240329146385, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.375, "rewards/format_reward": 1.0, "step": 544 }, { "completion_length": 82.8828125, "epoch": 0.03497513236001925, "grad_norm": 17.376186277263155, "kl": 0.22900390625, "learning_rate": 9.825118726735976e-07, "loss": 0.0091, "reward": 3.0546875, "reward_std": 0.16151439771056175, "rewards/accuracy_reward": 0.5859375, "rewards/format_count_numbers": 1.46875, "rewards/format_reward": 1.0, "step": 545 }, { "completion_length": 91.2265625, "epoch": 0.03503930691480828, "grad_norm": 2.7392230474120858, "kl": 0.0592041015625, "learning_rate": 9.824797843665766e-07, "loss": 0.0024, "reward": 3.3125, "reward_std": 0.22962788492441177, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 546 }, { "completion_length": 85.7421875, "epoch": 0.035103481469597304, "grad_norm": 3.060597320753774, "kl": 0.03369140625, "learning_rate": 9.824476960595559e-07, "loss": 0.0013, "reward": 3.65625, "reward_std": 0.09863808378577232, "rewards/accuracy_reward": 0.90625, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 1.0, "step": 547 }, { "completion_length": 83.0, "epoch": 0.03516765602438633, "grad_norm": 7.539280918398774, "kl": 0.050048828125, "learning_rate": 9.824156077525349e-07, "loss": 0.002, "reward": 3.31640625, "reward_std": 0.16608300060033798, "rewards/accuracy_reward": 0.8359375, "rewards/format_count_numbers": 1.48828125, "rewards/format_reward": 0.9921875, "step": 548 }, { "completion_length": 84.53125, "epoch": 0.03523183057917536, "grad_norm": 1.7609238246194963, "kl": 0.03759765625, "learning_rate": 9.82383519445514e-07, "loss": 0.0015, "reward": 3.25390625, "reward_std": 0.1812150627374649, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.49609375, "rewards/format_reward": 1.0, "step": 549 }, { "completion_length": 93.5390625, "epoch": 0.03529600513396439, "grad_norm": 4.262934633714605, "kl": 0.0390625, "learning_rate": 9.82351431138493e-07, "loss": 0.0016, "reward": 3.4921875, "reward_std": 0.11253517679870129, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.8671875, "rewards/format_reward": 1.0, "step": 550 }, { "completion_length": 89.1640625, "epoch": 0.03536017968875341, "grad_norm": 6.453059278434739, "kl": 0.0452880859375, "learning_rate": 9.823193428314723e-07, "loss": 0.0018, "reward": 3.33203125, "reward_std": 0.20691193640232086, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.71484375, "rewards/format_reward": 1.0, "step": 551 }, { "completion_length": 99.9765625, "epoch": 0.035424354243542434, "grad_norm": 148.58988453938693, "kl": 0.05078125, "learning_rate": 9.822872545244513e-07, "loss": 0.002, "reward": 3.23828125, "reward_std": 0.27657896280288696, "rewards/accuracy_reward": 0.546875, "rewards/format_count_numbers": 1.69140625, "rewards/format_reward": 1.0, "step": 552 }, { "completion_length": 85.1484375, "epoch": 0.03548852879833146, "grad_norm": 1.6729205860781227, "kl": 0.04638671875, "learning_rate": 9.822551662174303e-07, "loss": 0.0019, "reward": 3.50390625, "reward_std": 0.1456998586654663, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 1.0, "step": 553 }, { "completion_length": 83.4296875, "epoch": 0.03555270335312049, "grad_norm": 5.753612413488577, "kl": 0.0384521484375, "learning_rate": 9.822230779104093e-07, "loss": 0.0015, "reward": 3.265625, "reward_std": 0.20552664250135422, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.4921875, "rewards/format_reward": 0.9921875, "step": 554 }, { "completion_length": 83.0234375, "epoch": 0.03561687790790952, "grad_norm": 14.295780292248967, "kl": 0.04833984375, "learning_rate": 9.821909896033885e-07, "loss": 0.0019, "reward": 2.859375, "reward_std": 0.27976007759571075, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.234375, "rewards/format_reward": 1.0, "step": 555 }, { "completion_length": 103.2265625, "epoch": 0.03568105246269854, "grad_norm": 3.8411848601289638, "kl": 0.039794921875, "learning_rate": 9.821589012963675e-07, "loss": 0.0016, "reward": 3.33203125, "reward_std": 0.23845895379781723, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.71484375, "rewards/format_reward": 1.0, "step": 556 }, { "completion_length": 79.6953125, "epoch": 0.035745227017487564, "grad_norm": 2.512693775685972, "kl": 0.048583984375, "learning_rate": 9.821268129893467e-07, "loss": 0.0019, "reward": 3.26953125, "reward_std": 0.14660941064357758, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.48828125, "rewards/format_reward": 1.0, "step": 557 }, { "completion_length": 82.71875, "epoch": 0.03580940157227659, "grad_norm": 2.596070751293012, "kl": 0.0440673828125, "learning_rate": 9.820947246823257e-07, "loss": 0.0018, "reward": 3.33203125, "reward_std": 0.1716754287481308, "rewards/accuracy_reward": 0.84375, "rewards/format_count_numbers": 1.48828125, "rewards/format_reward": 1.0, "step": 558 }, { "completion_length": 88.328125, "epoch": 0.03587357612706562, "grad_norm": 3.943536215799186, "kl": 0.047119140625, "learning_rate": 9.82062636375305e-07, "loss": 0.0019, "reward": 2.98046875, "reward_std": 0.25665540248155594, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.36328125, "rewards/format_reward": 0.9921875, "step": 559 }, { "completion_length": 91.7109375, "epoch": 0.03593775068185465, "grad_norm": 5.034386305545289, "kl": 0.0394287109375, "learning_rate": 9.82030548068284e-07, "loss": 0.0016, "reward": 3.375, "reward_std": 0.20457638427615166, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.703125, "rewards/format_reward": 1.0, "step": 560 }, { "completion_length": 94.5546875, "epoch": 0.036001925236643674, "grad_norm": 3.5196211718401105, "kl": 0.04541015625, "learning_rate": 9.81998459761263e-07, "loss": 0.0018, "reward": 2.98828125, "reward_std": 0.2169996052980423, "rewards/accuracy_reward": 0.5234375, "rewards/format_count_numbers": 1.46484375, "rewards/format_reward": 1.0, "step": 561 }, { "completion_length": 92.0546875, "epoch": 0.036066099791432694, "grad_norm": 5.990679277480426, "kl": 0.0458984375, "learning_rate": 9.81966371454242e-07, "loss": 0.0018, "reward": 2.9609375, "reward_std": 0.17662061005830765, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.3515625, "rewards/format_reward": 1.0, "step": 562 }, { "completion_length": 104.1015625, "epoch": 0.03613027434622172, "grad_norm": 5.378758654421011, "kl": 0.06005859375, "learning_rate": 9.819342831472211e-07, "loss": 0.0024, "reward": 2.97265625, "reward_std": 0.256045326590538, "rewards/accuracy_reward": 0.5078125, "rewards/format_count_numbers": 1.46484375, "rewards/format_reward": 1.0, "step": 563 }, { "completion_length": 86.8984375, "epoch": 0.03619444890101075, "grad_norm": 3.6512722704960052, "kl": 0.0418701171875, "learning_rate": 9.819021948402001e-07, "loss": 0.0017, "reward": 3.4296875, "reward_std": 0.1978272944688797, "rewards/accuracy_reward": 0.8203125, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 564 }, { "completion_length": 84.4375, "epoch": 0.03625862345579978, "grad_norm": 4.6696886797544845, "kl": 0.0482177734375, "learning_rate": 9.818701065331793e-07, "loss": 0.0019, "reward": 3.09765625, "reward_std": 0.21152067184448242, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.33984375, "rewards/format_reward": 1.0, "step": 565 }, { "completion_length": 92.125, "epoch": 0.036322798010588804, "grad_norm": 1.9197019584500314, "kl": 0.057861328125, "learning_rate": 9.818380182261583e-07, "loss": 0.0023, "reward": 3.26953125, "reward_std": 0.21264750510454178, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 1.0, "step": 566 }, { "completion_length": 85.4765625, "epoch": 0.036386972565377824, "grad_norm": 6.243194870246088, "kl": 0.052734375, "learning_rate": 9.818059299191376e-07, "loss": 0.0021, "reward": 3.00390625, "reward_std": 0.2988901436328888, "rewards/accuracy_reward": 0.7421875, "rewards/format_count_numbers": 1.27734375, "rewards/format_reward": 0.984375, "step": 567 }, { "completion_length": 83.7265625, "epoch": 0.03645114712016685, "grad_norm": 1.8109394124949953, "kl": 0.0703125, "learning_rate": 9.817738416121166e-07, "loss": 0.0028, "reward": 3.3828125, "reward_std": 0.1213759332895279, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 0.9921875, "step": 568 }, { "completion_length": 81.4140625, "epoch": 0.03651532167495588, "grad_norm": 3.240609635417276, "kl": 0.0611572265625, "learning_rate": 9.817417533050956e-07, "loss": 0.0024, "reward": 2.9921875, "reward_std": 0.2044658064842224, "rewards/accuracy_reward": 0.875, "rewards/format_count_numbers": 1.1171875, "rewards/format_reward": 1.0, "step": 569 }, { "completion_length": 88.796875, "epoch": 0.03657949622974491, "grad_norm": 4.281970917321178, "kl": 0.0556640625, "learning_rate": 9.817096649980746e-07, "loss": 0.0022, "reward": 3.265625, "reward_std": 0.11993881314992905, "rewards/accuracy_reward": 0.796875, "rewards/format_count_numbers": 1.46875, "rewards/format_reward": 1.0, "step": 570 }, { "completion_length": 89.5703125, "epoch": 0.036643670784533934, "grad_norm": 2.7180933139063965, "kl": 0.0758056640625, "learning_rate": 9.816775766910538e-07, "loss": 0.003, "reward": 2.859375, "reward_std": 0.21178314462304115, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.109375, "rewards/format_reward": 0.9921875, "step": 571 }, { "completion_length": 98.59375, "epoch": 0.03670784533932296, "grad_norm": 80.19727956304367, "kl": 0.0509033203125, "learning_rate": 9.816454883840328e-07, "loss": 0.002, "reward": 3.17578125, "reward_std": 0.1707644686102867, "rewards/accuracy_reward": 0.5625, "rewards/format_count_numbers": 1.61328125, "rewards/format_reward": 1.0, "step": 572 }, { "completion_length": 79.078125, "epoch": 0.03677201989411198, "grad_norm": 2.309367711529073, "kl": 0.0576171875, "learning_rate": 9.81613400077012e-07, "loss": 0.0023, "reward": 3.3984375, "reward_std": 0.15756267867982388, "rewards/accuracy_reward": 0.7890625, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 573 }, { "completion_length": 88.5546875, "epoch": 0.03683619444890101, "grad_norm": 3.0805224272052345, "kl": 0.077880859375, "learning_rate": 9.81581311769991e-07, "loss": 0.0031, "reward": 3.14453125, "reward_std": 0.18967504054307938, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.48828125, "rewards/format_reward": 0.9921875, "step": 574 }, { "completion_length": 96.0234375, "epoch": 0.03690036900369004, "grad_norm": 1.816456722530229, "kl": 0.05517578125, "learning_rate": 9.8154922346297e-07, "loss": 0.0022, "reward": 3.390625, "reward_std": 0.09442678652703762, "rewards/accuracy_reward": 0.796875, "rewards/format_count_numbers": 1.6015625, "rewards/format_reward": 0.9921875, "step": 575 }, { "completion_length": 101.859375, "epoch": 0.036964543558479064, "grad_norm": 3.608324026726574, "kl": 0.0625, "learning_rate": 9.815171351559492e-07, "loss": 0.0025, "reward": 3.1953125, "reward_std": 0.3073201924562454, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.5859375, "rewards/format_reward": 0.9921875, "step": 576 }, { "completion_length": 88.9453125, "epoch": 0.03702871811326809, "grad_norm": 1.9002319775455399, "kl": 0.0771484375, "learning_rate": 9.814850468489282e-07, "loss": 0.0031, "reward": 3.1171875, "reward_std": 0.2170867845416069, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.3515625, "rewards/format_reward": 0.9921875, "step": 577 }, { "completion_length": 94.0078125, "epoch": 0.03709289266805711, "grad_norm": 7.856552438388562, "kl": 0.076171875, "learning_rate": 9.814529585419072e-07, "loss": 0.003, "reward": 3.58203125, "reward_std": 0.13633228093385696, "rewards/accuracy_reward": 0.8515625, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 1.0, "step": 578 }, { "completion_length": 84.4296875, "epoch": 0.03715706722284614, "grad_norm": 2.2674788963389316, "kl": 0.07763671875, "learning_rate": 9.814208702348864e-07, "loss": 0.0031, "reward": 3.25, "reward_std": 0.17513228952884674, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.484375, "rewards/format_reward": 0.984375, "step": 579 }, { "completion_length": 87.15625, "epoch": 0.03722124177763517, "grad_norm": 12.813722530337529, "kl": 0.09765625, "learning_rate": 9.813887819278654e-07, "loss": 0.0039, "reward": 3.1796875, "reward_std": 0.13019345700740814, "rewards/accuracy_reward": 0.8125, "rewards/format_count_numbers": 1.3671875, "rewards/format_reward": 1.0, "step": 580 }, { "completion_length": 92.8515625, "epoch": 0.037285416332424194, "grad_norm": 3.278984331550509, "kl": 0.065185546875, "learning_rate": 9.813566936208444e-07, "loss": 0.0026, "reward": 3.1875, "reward_std": 0.2238539308309555, "rewards/accuracy_reward": 0.5625, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 581 }, { "completion_length": 98.5078125, "epoch": 0.03734959088721322, "grad_norm": 2.3978684942627773, "kl": 0.0501708984375, "learning_rate": 9.813246053138236e-07, "loss": 0.002, "reward": 3.6328125, "reward_std": 0.14855197630822659, "rewards/accuracy_reward": 0.765625, "rewards/format_count_numbers": 1.8671875, "rewards/format_reward": 1.0, "step": 582 }, { "completion_length": 97.078125, "epoch": 0.03741376544200225, "grad_norm": 1.9986415595914089, "kl": 0.0904541015625, "learning_rate": 9.812925170068026e-07, "loss": 0.0036, "reward": 3.49609375, "reward_std": 0.21177569031715393, "rewards/accuracy_reward": 0.765625, "rewards/format_count_numbers": 1.73828125, "rewards/format_reward": 0.9921875, "step": 583 }, { "completion_length": 87.8203125, "epoch": 0.03747793999679127, "grad_norm": 2.2266492619242815, "kl": 0.079833984375, "learning_rate": 9.812604286997818e-07, "loss": 0.0032, "reward": 3.06640625, "reward_std": 0.24279320240020752, "rewards/accuracy_reward": 0.6015625, "rewards/format_count_numbers": 1.47265625, "rewards/format_reward": 0.9921875, "step": 584 }, { "completion_length": 89.703125, "epoch": 0.0375421145515803, "grad_norm": 2.1986516460500187, "kl": 0.066162109375, "learning_rate": 9.812283403927608e-07, "loss": 0.0026, "reward": 3.578125, "reward_std": 0.16097761690616608, "rewards/accuracy_reward": 0.84375, "rewards/format_count_numbers": 1.734375, "rewards/format_reward": 1.0, "step": 585 }, { "completion_length": 87.59375, "epoch": 0.037606289106369324, "grad_norm": 2.9901848114764418, "kl": 0.139892578125, "learning_rate": 9.811962520857398e-07, "loss": 0.0056, "reward": 3.0546875, "reward_std": 0.15756267309188843, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.3671875, "rewards/format_reward": 1.0, "step": 586 }, { "completion_length": 102.3046875, "epoch": 0.03767046366115835, "grad_norm": 4.912948371071768, "kl": 0.05810546875, "learning_rate": 9.81164163778719e-07, "loss": 0.0023, "reward": 3.2578125, "reward_std": 0.14966704696416855, "rewards/accuracy_reward": 0.53125, "rewards/format_count_numbers": 1.7265625, "rewards/format_reward": 1.0, "step": 587 }, { "completion_length": 89.421875, "epoch": 0.03773463821594738, "grad_norm": 4.2154349658241275, "kl": 0.10400390625, "learning_rate": 9.81132075471698e-07, "loss": 0.0042, "reward": 3.19921875, "reward_std": 0.2003859579563141, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 0.9921875, "step": 588 }, { "completion_length": 92.359375, "epoch": 0.037798812770736406, "grad_norm": 4.787728597542038, "kl": 0.087646484375, "learning_rate": 9.81099987164677e-07, "loss": 0.0035, "reward": 3.2890625, "reward_std": 0.2664504796266556, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 0.984375, "step": 589 }, { "completion_length": 96.5625, "epoch": 0.03786298732552543, "grad_norm": 1.522396554571698, "kl": 0.0594482421875, "learning_rate": 9.810678988576563e-07, "loss": 0.0024, "reward": 3.37890625, "reward_std": 0.09548516571521759, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 1.0, "step": 590 }, { "completion_length": 90.6953125, "epoch": 0.037927161880314454, "grad_norm": 1.2698943812774897, "kl": 0.0506591796875, "learning_rate": 9.810358105506353e-07, "loss": 0.002, "reward": 3.6875, "reward_std": 0.11048542708158493, "rewards/accuracy_reward": 0.8203125, "rewards/format_count_numbers": 1.8671875, "rewards/format_reward": 1.0, "step": 591 }, { "completion_length": 99.734375, "epoch": 0.03799133643510348, "grad_norm": 2.48912535785192, "kl": 0.052734375, "learning_rate": 9.810037222436145e-07, "loss": 0.0021, "reward": 3.421875, "reward_std": 0.17934399843215942, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.734375, "rewards/format_reward": 0.9921875, "step": 592 }, { "completion_length": 92.96875, "epoch": 0.03805551098989251, "grad_norm": 18.734367422289175, "kl": 0.061767578125, "learning_rate": 9.809716339365935e-07, "loss": 0.0025, "reward": 3.49609375, "reward_std": 0.1612396463751793, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 1.0, "step": 593 }, { "completion_length": 89.671875, "epoch": 0.038119685544681536, "grad_norm": 3.3959745544794706, "kl": 0.083251953125, "learning_rate": 9.809395456295725e-07, "loss": 0.0033, "reward": 2.8828125, "reward_std": 0.2301519438624382, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.25, "rewards/format_reward": 1.0, "step": 594 }, { "completion_length": 92.3359375, "epoch": 0.03818386009947056, "grad_norm": 2.3455894950838814, "kl": 0.0623779296875, "learning_rate": 9.809074573225517e-07, "loss": 0.0025, "reward": 3.15625, "reward_std": 0.14187801629304886, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.484375, "rewards/format_reward": 1.0, "step": 595 }, { "completion_length": 97.3671875, "epoch": 0.038248034654259584, "grad_norm": 5.655439426422941, "kl": 0.064697265625, "learning_rate": 9.808753690155307e-07, "loss": 0.0026, "reward": 3.3203125, "reward_std": 0.10547287575900555, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 596 }, { "completion_length": 98.890625, "epoch": 0.03831220920904861, "grad_norm": 2.9401346824466117, "kl": 0.06494140625, "learning_rate": 9.808432807085097e-07, "loss": 0.0026, "reward": 3.24609375, "reward_std": 0.25476987659931183, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 1.0, "step": 597 }, { "completion_length": 94.8046875, "epoch": 0.03837638376383764, "grad_norm": 3.764741307799039, "kl": 0.089599609375, "learning_rate": 9.80811192401489e-07, "loss": 0.0036, "reward": 2.91015625, "reward_std": 0.21827884018421173, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.20703125, "rewards/format_reward": 0.9921875, "step": 598 }, { "completion_length": 92.9609375, "epoch": 0.038440558318626666, "grad_norm": 2.562748235354907, "kl": 0.035888671875, "learning_rate": 9.80779104094468e-07, "loss": 0.0014, "reward": 3.6328125, "reward_std": 0.07996084541082382, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.875, "rewards/format_reward": 1.0, "step": 599 }, { "completion_length": 81.734375, "epoch": 0.038504732873415694, "grad_norm": 2.2290934936102484, "kl": 0.0400390625, "learning_rate": 9.807470157874471e-07, "loss": 0.0016, "reward": 3.6640625, "reward_std": 0.1173202246427536, "rewards/accuracy_reward": 0.796875, "rewards/format_count_numbers": 1.8671875, "rewards/format_reward": 1.0, "step": 600 }, { "completion_length": 92.7109375, "epoch": 0.038568907428204714, "grad_norm": 2.224207883902399, "kl": 0.05078125, "learning_rate": 9.807149274804261e-07, "loss": 0.002, "reward": 3.47265625, "reward_std": 0.09548516198992729, "rewards/accuracy_reward": 0.8515625, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 601 }, { "completion_length": 97.2890625, "epoch": 0.03863308198299374, "grad_norm": 3.31083752151982, "kl": 0.0521240234375, "learning_rate": 9.806828391734053e-07, "loss": 0.0021, "reward": 3.34765625, "reward_std": 0.23728906363248825, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 1.0, "step": 602 }, { "completion_length": 78.8828125, "epoch": 0.03869725653778277, "grad_norm": 1.8587056767026766, "kl": 0.0712890625, "learning_rate": 9.806507508663843e-07, "loss": 0.0029, "reward": 3.44921875, "reward_std": 0.1644664630293846, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.72265625, "rewards/format_reward": 1.0, "step": 603 }, { "completion_length": 95.28125, "epoch": 0.038761431092571796, "grad_norm": 2.9493571681528046, "kl": 0.0501708984375, "learning_rate": 9.806186625593633e-07, "loss": 0.002, "reward": 3.375, "reward_std": 0.17300995439291, "rewards/accuracy_reward": 0.765625, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 604 }, { "completion_length": 86.921875, "epoch": 0.038825605647360824, "grad_norm": 1.9754569188630644, "kl": 0.0810546875, "learning_rate": 9.805865742523423e-07, "loss": 0.0032, "reward": 3.04296875, "reward_std": 0.22988500446081161, "rewards/accuracy_reward": 0.8125, "rewards/format_count_numbers": 1.23046875, "rewards/format_reward": 1.0, "step": 605 }, { "completion_length": 77.375, "epoch": 0.038889780202149844, "grad_norm": 2.67416517903953, "kl": 0.041259765625, "learning_rate": 9.805544859453215e-07, "loss": 0.0017, "reward": 3.3671875, "reward_std": 0.022097086533904076, "rewards/accuracy_reward": 0.8671875, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 606 }, { "completion_length": 80.1796875, "epoch": 0.03895395475693887, "grad_norm": 4.256708301631226, "kl": 0.061279296875, "learning_rate": 9.805223976383005e-07, "loss": 0.0025, "reward": 3.59375, "reward_std": 0.16996126621961594, "rewards/accuracy_reward": 0.875, "rewards/format_count_numbers": 1.7265625, "rewards/format_reward": 0.9921875, "step": 607 }, { "completion_length": 88.328125, "epoch": 0.0390181293117279, "grad_norm": 1.4121702076717546, "kl": 0.0703125, "learning_rate": 9.804903093312795e-07, "loss": 0.0028, "reward": 3.42578125, "reward_std": 0.12335620820522308, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 1.0, "step": 608 }, { "completion_length": 89.7109375, "epoch": 0.039082303866516926, "grad_norm": 2.9289517813985047, "kl": 0.0479736328125, "learning_rate": 9.804582210242587e-07, "loss": 0.0019, "reward": 3.25390625, "reward_std": 0.2535797134041786, "rewards/accuracy_reward": 0.546875, "rewards/format_count_numbers": 1.70703125, "rewards/format_reward": 1.0, "step": 609 }, { "completion_length": 84.6640625, "epoch": 0.039146478421305954, "grad_norm": 4.27371756541338, "kl": 0.0869140625, "learning_rate": 9.804261327172377e-07, "loss": 0.0035, "reward": 2.6640625, "reward_std": 0.1173202246427536, "rewards/accuracy_reward": 0.546875, "rewards/format_count_numbers": 1.1171875, "rewards/format_reward": 1.0, "step": 610 }, { "completion_length": 82.1875, "epoch": 0.03921065297609498, "grad_norm": 3.824006349390961, "kl": 0.075927734375, "learning_rate": 9.80394044410217e-07, "loss": 0.003, "reward": 3.03515625, "reward_std": 0.11104921251535416, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.34765625, "rewards/format_reward": 0.9921875, "step": 611 }, { "completion_length": 87.109375, "epoch": 0.039274827530884, "grad_norm": 4.503082702515632, "kl": 0.04345703125, "learning_rate": 9.80361956103196e-07, "loss": 0.0017, "reward": 3.32421875, "reward_std": 0.14886823296546936, "rewards/accuracy_reward": 0.4765625, "rewards/format_count_numbers": 1.84765625, "rewards/format_reward": 1.0, "step": 612 }, { "completion_length": 80.8046875, "epoch": 0.03933900208567303, "grad_norm": 1.8807482121390109, "kl": 0.063720703125, "learning_rate": 9.80329867796175e-07, "loss": 0.0025, "reward": 3.23828125, "reward_std": 0.18422836065292358, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 1.0, "step": 613 }, { "completion_length": 76.8984375, "epoch": 0.039403176640462056, "grad_norm": 2.798533094933186, "kl": 0.11376953125, "learning_rate": 9.802977794891542e-07, "loss": 0.0045, "reward": 3.53125, "reward_std": 0.23725903034210205, "rewards/accuracy_reward": 0.8203125, "rewards/format_count_numbers": 1.7109375, "rewards/format_reward": 1.0, "step": 614 }, { "completion_length": 79.6328125, "epoch": 0.039467351195251084, "grad_norm": 1.9071240598272174, "kl": 0.065185546875, "learning_rate": 9.802656911821332e-07, "loss": 0.0026, "reward": 3.44921875, "reward_std": 0.14180145412683487, "rewards/accuracy_reward": 0.828125, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 615 }, { "completion_length": 84.3125, "epoch": 0.03953152575004011, "grad_norm": 3.201389273457516, "kl": 0.060302734375, "learning_rate": 9.802336028751122e-07, "loss": 0.0024, "reward": 3.4296875, "reward_std": 0.20251823961734772, "rewards/accuracy_reward": 0.8125, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 0.9921875, "step": 616 }, { "completion_length": 75.03125, "epoch": 0.03959570030482914, "grad_norm": 2.8516891022891038, "kl": 0.042236328125, "learning_rate": 9.802015145680914e-07, "loss": 0.0017, "reward": 3.64453125, "reward_std": 0.08758953958749771, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 617 }, { "completion_length": 89.4140625, "epoch": 0.03965987485961816, "grad_norm": 2.280386323323077, "kl": 0.057373046875, "learning_rate": 9.801694262610704e-07, "loss": 0.0023, "reward": 3.5546875, "reward_std": 0.16507087647914886, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.8359375, "rewards/format_reward": 1.0, "step": 618 }, { "completion_length": 76.6328125, "epoch": 0.039724049414407187, "grad_norm": 3.1205969765963535, "kl": 0.0528564453125, "learning_rate": 9.801373379540496e-07, "loss": 0.0021, "reward": 3.46875, "reward_std": 0.09863808378577232, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 1.0, "step": 619 }, { "completion_length": 74.21875, "epoch": 0.039788223969196214, "grad_norm": 1.632888418174939, "kl": 0.0555419921875, "learning_rate": 9.801052496470286e-07, "loss": 0.0022, "reward": 3.296875, "reward_std": 0.1325825210660696, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 620 }, { "completion_length": 71.0703125, "epoch": 0.03985239852398524, "grad_norm": 1.3407098911291901, "kl": 0.081298828125, "learning_rate": 9.800731613400076e-07, "loss": 0.0033, "reward": 3.31640625, "reward_std": 0.10932306572794914, "rewards/accuracy_reward": 0.8359375, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 1.0, "step": 621 }, { "completion_length": 72.734375, "epoch": 0.03991657307877427, "grad_norm": 13.674818372699388, "kl": 0.0574951171875, "learning_rate": 9.800410730329868e-07, "loss": 0.0023, "reward": 3.4609375, "reward_std": 0.07232969999313354, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.7421875, "rewards/format_reward": 0.9921875, "step": 622 }, { "completion_length": 74.015625, "epoch": 0.03998074763356329, "grad_norm": 5.844236281645039, "kl": 0.05712890625, "learning_rate": 9.800089847259658e-07, "loss": 0.0023, "reward": 3.3046875, "reward_std": 0.17407146096229553, "rewards/accuracy_reward": 0.8203125, "rewards/format_count_numbers": 1.484375, "rewards/format_reward": 1.0, "step": 623 }, { "completion_length": 80.0859375, "epoch": 0.04004492218835232, "grad_norm": 1.2947404569939251, "kl": 0.0579833984375, "learning_rate": 9.799768964189448e-07, "loss": 0.0023, "reward": 3.5, "reward_std": 0.09863808564841747, "rewards/accuracy_reward": 0.875, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 624 }, { "completion_length": 98.4765625, "epoch": 0.040109096743141344, "grad_norm": 69.40734693983866, "kl": 0.0667724609375, "learning_rate": 9.79944808111924e-07, "loss": 0.0027, "reward": 3.10546875, "reward_std": 0.19772130250930786, "rewards/accuracy_reward": 0.4921875, "rewards/format_count_numbers": 1.61328125, "rewards/format_reward": 1.0, "step": 625 }, { "completion_length": 93.359375, "epoch": 0.04017327129793037, "grad_norm": 1.9851920883248664, "kl": 0.0653076171875, "learning_rate": 9.79912719804903e-07, "loss": 0.0026, "reward": 3.0703125, "reward_std": 0.16781241446733475, "rewards/accuracy_reward": 0.484375, "rewards/format_count_numbers": 1.5859375, "rewards/format_reward": 1.0, "step": 626 }, { "completion_length": 90.546875, "epoch": 0.0402374458527194, "grad_norm": 2.6499168723184616, "kl": 0.0499267578125, "learning_rate": 9.798806314978822e-07, "loss": 0.002, "reward": 3.67578125, "reward_std": 0.2121821790933609, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.98046875, "rewards/format_reward": 1.0, "step": 627 }, { "completion_length": 80.8359375, "epoch": 0.040301620407508426, "grad_norm": 4.027910005077781, "kl": 0.077880859375, "learning_rate": 9.798485431908612e-07, "loss": 0.0031, "reward": 2.953125, "reward_std": 0.2561129778623581, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.3203125, "rewards/format_reward": 1.0, "step": 628 }, { "completion_length": 85.1015625, "epoch": 0.04036579496229745, "grad_norm": 4.2835870422485325, "kl": 0.0543212890625, "learning_rate": 9.798164548838402e-07, "loss": 0.0022, "reward": 3.2421875, "reward_std": 0.2436375617980957, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.6015625, "rewards/format_reward": 0.9921875, "step": 629 }, { "completion_length": 76.6328125, "epoch": 0.040429969517086474, "grad_norm": 2.2798679587397177, "kl": 0.06201171875, "learning_rate": 9.797843665768194e-07, "loss": 0.0025, "reward": 2.90625, "reward_std": 0.1065337061882019, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.234375, "rewards/format_reward": 1.0, "step": 630 }, { "completion_length": 79.9765625, "epoch": 0.0404941440718755, "grad_norm": 2.1199908611910936, "kl": 0.0528564453125, "learning_rate": 9.797522782697984e-07, "loss": 0.0021, "reward": 3.1796875, "reward_std": 0.212006576359272, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 0.9921875, "step": 631 }, { "completion_length": 85.5234375, "epoch": 0.04055831862666453, "grad_norm": 13.600562795923006, "kl": 0.0450439453125, "learning_rate": 9.797201899627774e-07, "loss": 0.0018, "reward": 3.33984375, "reward_std": 0.1549416333436966, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 632 }, { "completion_length": 92.4453125, "epoch": 0.040622493181453556, "grad_norm": 6.884750270762488, "kl": 0.0655517578125, "learning_rate": 9.796881016557564e-07, "loss": 0.0026, "reward": 3.3125, "reward_std": 0.23911622166633606, "rewards/accuracy_reward": 0.5859375, "rewards/format_count_numbers": 1.7265625, "rewards/format_reward": 1.0, "step": 633 }, { "completion_length": 81.703125, "epoch": 0.04068666773624258, "grad_norm": 2.034886030936257, "kl": 0.0517578125, "learning_rate": 9.796560133487357e-07, "loss": 0.0021, "reward": 3.4921875, "reward_std": 0.08679073303937912, "rewards/accuracy_reward": 0.8671875, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 634 }, { "completion_length": 87.2265625, "epoch": 0.040750842291031604, "grad_norm": 2.4766031925863, "kl": 0.0635986328125, "learning_rate": 9.796239250417147e-07, "loss": 0.0025, "reward": 3.41796875, "reward_std": 0.15649937838315964, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.72265625, "rewards/format_reward": 1.0, "step": 635 }, { "completion_length": 91.921875, "epoch": 0.04081501684582063, "grad_norm": 3.118820757471236, "kl": 0.1627197265625, "learning_rate": 9.795918367346939e-07, "loss": 0.0065, "reward": 3.38671875, "reward_std": 0.19100818037986755, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 0.9921875, "step": 636 }, { "completion_length": 88.859375, "epoch": 0.04087919140060966, "grad_norm": 1.6435688099260775, "kl": 0.0537109375, "learning_rate": 9.795597484276729e-07, "loss": 0.0021, "reward": 3.33203125, "reward_std": 0.0999559760093689, "rewards/accuracy_reward": 0.84375, "rewards/format_count_numbers": 1.49609375, "rewards/format_reward": 0.9921875, "step": 637 }, { "completion_length": 83.65625, "epoch": 0.040943365955398686, "grad_norm": 3.576066007625262, "kl": 0.067626953125, "learning_rate": 9.79527660120652e-07, "loss": 0.0027, "reward": 2.953125, "reward_std": 0.30455560982227325, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.2421875, "rewards/format_reward": 0.9921875, "step": 638 }, { "completion_length": 84.484375, "epoch": 0.041007540510187714, "grad_norm": 2.95792809900302, "kl": 0.04632568359375, "learning_rate": 9.79495571813631e-07, "loss": 0.0018, "reward": 3.41796875, "reward_std": 0.22145751863718033, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 1.0, "step": 639 }, { "completion_length": 85.3984375, "epoch": 0.041071715064976734, "grad_norm": 3.2376070144230735, "kl": 0.07861328125, "learning_rate": 9.7946348350661e-07, "loss": 0.0031, "reward": 2.85546875, "reward_std": 0.38650763034820557, "rewards/accuracy_reward": 0.5546875, "rewards/format_count_numbers": 1.32421875, "rewards/format_reward": 0.9765625, "step": 640 }, { "completion_length": 102.1484375, "epoch": 0.04113588961976576, "grad_norm": 2.9056670983664445, "kl": 0.0859375, "learning_rate": 9.79431395199589e-07, "loss": 0.0034, "reward": 2.984375, "reward_std": 0.3881564885377884, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.3203125, "rewards/format_reward": 0.9765625, "step": 641 }, { "completion_length": 98.0625, "epoch": 0.04120006417455479, "grad_norm": 5.735504484158051, "kl": 0.06494140625, "learning_rate": 9.793993068925683e-07, "loss": 0.0026, "reward": 2.8671875, "reward_std": 0.36327123641967773, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.1796875, "rewards/format_reward": 0.9921875, "step": 642 }, { "completion_length": 94.046875, "epoch": 0.041264238729343816, "grad_norm": 3.1757372660213625, "kl": 0.052490234375, "learning_rate": 9.793672185855473e-07, "loss": 0.0021, "reward": 3.390625, "reward_std": 0.40001727640628815, "rewards/accuracy_reward": 0.7421875, "rewards/format_count_numbers": 1.6875, "rewards/format_reward": 0.9609375, "step": 643 }, { "completion_length": 88.5859375, "epoch": 0.041328413284132844, "grad_norm": 2.4766909083710376, "kl": 0.068115234375, "learning_rate": 9.793351302785265e-07, "loss": 0.0027, "reward": 3.23046875, "reward_std": 0.3160104900598526, "rewards/accuracy_reward": 0.7421875, "rewards/format_count_numbers": 1.51171875, "rewards/format_reward": 0.9765625, "step": 644 }, { "completion_length": 91.78125, "epoch": 0.041392587838921864, "grad_norm": 1.2457023256053124, "kl": 0.041259765625, "learning_rate": 9.793030419715055e-07, "loss": 0.0016, "reward": 3.43359375, "reward_std": 0.11310647707432508, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 1.0, "step": 645 }, { "completion_length": 83.90625, "epoch": 0.04145676239371089, "grad_norm": 1.8415046708993632, "kl": 0.072509765625, "learning_rate": 9.792709536644847e-07, "loss": 0.0029, "reward": 3.2578125, "reward_std": 0.4359375387430191, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.6484375, "rewards/format_reward": 0.96875, "step": 646 }, { "completion_length": 98.6953125, "epoch": 0.04152093694849992, "grad_norm": 3.7682170728512294, "kl": 0.05224609375, "learning_rate": 9.792388653574637e-07, "loss": 0.0021, "reward": 3.2109375, "reward_std": 0.2856694236397743, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.59375, "rewards/format_reward": 0.984375, "step": 647 }, { "completion_length": 87.328125, "epoch": 0.041585111503288946, "grad_norm": 2.690650568412293, "kl": 0.0830078125, "learning_rate": 9.792067770504427e-07, "loss": 0.0033, "reward": 2.6484375, "reward_std": 0.23495376855134964, "rewards/accuracy_reward": 0.5546875, "rewards/format_count_numbers": 1.1015625, "rewards/format_reward": 0.9921875, "step": 648 }, { "completion_length": 94.3671875, "epoch": 0.041649286058077974, "grad_norm": 1.8609407340880912, "kl": 0.0712890625, "learning_rate": 9.79174688743422e-07, "loss": 0.0028, "reward": 3.26171875, "reward_std": 0.15423564612865448, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 0.9921875, "step": 649 }, { "completion_length": 101.8984375, "epoch": 0.041713460612867, "grad_norm": 1.8731081217249252, "kl": 0.0511474609375, "learning_rate": 9.79142600436401e-07, "loss": 0.002, "reward": 3.4140625, "reward_std": 0.11914245784282684, "rewards/accuracy_reward": 0.7890625, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 650 }, { "completion_length": 86.5546875, "epoch": 0.04177763516765602, "grad_norm": 7.674630318942799, "kl": 0.0438232421875, "learning_rate": 9.7911051212938e-07, "loss": 0.0018, "reward": 3.6640625, "reward_std": 0.35141897946596146, "rewards/accuracy_reward": 0.84375, "rewards/format_count_numbers": 1.828125, "rewards/format_reward": 0.9921875, "step": 651 }, { "completion_length": 91.9140625, "epoch": 0.04184180972244505, "grad_norm": 2.643378245996653, "kl": 0.04638671875, "learning_rate": 9.790784238223591e-07, "loss": 0.0019, "reward": 3.27734375, "reward_std": 0.1568402722477913, "rewards/accuracy_reward": 0.5546875, "rewards/format_count_numbers": 1.72265625, "rewards/format_reward": 1.0, "step": 652 }, { "completion_length": 91.4921875, "epoch": 0.041905984277234076, "grad_norm": 2.3247277814717604, "kl": 0.057861328125, "learning_rate": 9.790463355153381e-07, "loss": 0.0023, "reward": 3.1328125, "reward_std": 0.23424497246742249, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.484375, "rewards/format_reward": 0.9921875, "step": 653 }, { "completion_length": 84.734375, "epoch": 0.041970158832023104, "grad_norm": 1.2679423421212694, "kl": 0.047119140625, "learning_rate": 9.790142472083174e-07, "loss": 0.0019, "reward": 3.32421875, "reward_std": 0.11004147678613663, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 1.0, "step": 654 }, { "completion_length": 94.9140625, "epoch": 0.04203433338681213, "grad_norm": 2.7222298370277844, "kl": 0.134765625, "learning_rate": 9.789821589012964e-07, "loss": 0.0054, "reward": 3.46484375, "reward_std": 0.1573006510734558, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 0.9921875, "step": 655 }, { "completion_length": 82.15625, "epoch": 0.04209850794160116, "grad_norm": 2.188110835333146, "kl": 0.056884765625, "learning_rate": 9.789500705942754e-07, "loss": 0.0023, "reward": 2.8671875, "reward_std": 0.12863079458475113, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.25, "rewards/format_reward": 1.0, "step": 656 }, { "completion_length": 91.9921875, "epoch": 0.04216268249639018, "grad_norm": 1.6980481745780014, "kl": 0.05859375, "learning_rate": 9.789179822872546e-07, "loss": 0.0023, "reward": 3.49609375, "reward_std": 0.12889280915260315, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 1.0, "step": 657 }, { "completion_length": 78.84375, "epoch": 0.042226857051179206, "grad_norm": 1.0466912360806655, "kl": 0.051025390625, "learning_rate": 9.788858939802336e-07, "loss": 0.002, "reward": 3.2890625, "reward_std": 0.0765409953892231, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 0.9921875, "step": 658 }, { "completion_length": 90.7421875, "epoch": 0.042291031605968234, "grad_norm": 12.879523056389981, "kl": 0.0521240234375, "learning_rate": 9.788538056732126e-07, "loss": 0.0021, "reward": 3.25, "reward_std": 0.13994135707616806, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 0.9921875, "step": 659 }, { "completion_length": 83.7890625, "epoch": 0.04235520616075726, "grad_norm": 23.495317479551133, "kl": 0.0386962890625, "learning_rate": 9.788217173661918e-07, "loss": 0.0016, "reward": 3.58203125, "reward_std": 0.17544110864400864, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 660 }, { "completion_length": 86.4140625, "epoch": 0.04241938071554629, "grad_norm": 2.318114026771669, "kl": 0.0538330078125, "learning_rate": 9.787896290591708e-07, "loss": 0.0022, "reward": 3.19921875, "reward_std": 0.11993636563420296, "rewards/accuracy_reward": 0.828125, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 661 }, { "completion_length": 79.3671875, "epoch": 0.04248355527033531, "grad_norm": 5.95676898450184, "kl": 0.0498046875, "learning_rate": 9.7875754075215e-07, "loss": 0.002, "reward": 3.01953125, "reward_std": 0.19784298539161682, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.24609375, "rewards/format_reward": 0.9921875, "step": 662 }, { "completion_length": 78.8359375, "epoch": 0.042547729825124336, "grad_norm": 2.1080482602573176, "kl": 0.05078125, "learning_rate": 9.78725452445129e-07, "loss": 0.002, "reward": 3.390625, "reward_std": 0.07760182581841946, "rewards/accuracy_reward": 0.890625, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 663 }, { "completion_length": 77.375, "epoch": 0.042611904379913364, "grad_norm": 4.334413325598048, "kl": 0.0443115234375, "learning_rate": 9.78693364138108e-07, "loss": 0.0018, "reward": 3.2734375, "reward_std": 0.14966705441474915, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 664 }, { "completion_length": 88.0, "epoch": 0.04267607893470239, "grad_norm": 4.373968131677572, "kl": 0.040283203125, "learning_rate": 9.786612758310872e-07, "loss": 0.0016, "reward": 3.5078125, "reward_std": 0.061278700828552246, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 1.0, "step": 665 }, { "completion_length": 81.4453125, "epoch": 0.04274025348949142, "grad_norm": 6.074286952789377, "kl": 0.0526123046875, "learning_rate": 9.786291875240662e-07, "loss": 0.0021, "reward": 2.99609375, "reward_std": 0.11365771293640137, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.24609375, "rewards/format_reward": 1.0, "step": 666 }, { "completion_length": 88.3125, "epoch": 0.042804428044280446, "grad_norm": 6.487879034658459, "kl": 0.040771484375, "learning_rate": 9.785970992170452e-07, "loss": 0.0016, "reward": 3.03515625, "reward_std": 0.11994126252830029, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 667 }, { "completion_length": 85.890625, "epoch": 0.042868602599069466, "grad_norm": 4.73154969948121, "kl": 0.04638671875, "learning_rate": 9.785650109100242e-07, "loss": 0.0019, "reward": 3.5, "reward_std": 0.26427949219942093, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.8515625, "rewards/format_reward": 1.0, "step": 668 }, { "completion_length": 71.625, "epoch": 0.042932777153858494, "grad_norm": 0.9973950008604898, "kl": 0.043212890625, "learning_rate": 9.785329226030034e-07, "loss": 0.0017, "reward": 3.4921875, "reward_std": 0.022097086533904076, "rewards/accuracy_reward": 0.8671875, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 669 }, { "completion_length": 78.2890625, "epoch": 0.04299695170864752, "grad_norm": 2.843976443037679, "kl": 0.050537109375, "learning_rate": 9.785008342959824e-07, "loss": 0.002, "reward": 3.17578125, "reward_std": 0.17096532881259918, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.49609375, "rewards/format_reward": 1.0, "step": 670 }, { "completion_length": 78.8984375, "epoch": 0.04306112626343655, "grad_norm": 6.991700346720553, "kl": 0.0391845703125, "learning_rate": 9.784687459889616e-07, "loss": 0.0016, "reward": 3.4609375, "reward_std": 0.12415502220392227, "rewards/accuracy_reward": 0.8359375, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 671 }, { "completion_length": 80.4765625, "epoch": 0.043125300818225576, "grad_norm": 2.603205995042236, "kl": 0.046142578125, "learning_rate": 9.784366576819406e-07, "loss": 0.0018, "reward": 3.34765625, "reward_std": 0.20834405720233917, "rewards/accuracy_reward": 0.7421875, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 1.0, "step": 672 }, { "completion_length": 77.1171875, "epoch": 0.043189475373014596, "grad_norm": 9.77029944610399, "kl": 0.051513671875, "learning_rate": 9.784045693749198e-07, "loss": 0.0021, "reward": 3.640625, "reward_std": 0.12756995856761932, "rewards/accuracy_reward": 0.90625, "rewards/format_count_numbers": 1.734375, "rewards/format_reward": 1.0, "step": 673 }, { "completion_length": 91.3203125, "epoch": 0.043253649927803624, "grad_norm": 2.0910705044297875, "kl": 0.0416259765625, "learning_rate": 9.783724810678988e-07, "loss": 0.0017, "reward": 3.171875, "reward_std": 0.1430967329069972, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.46875, "rewards/format_reward": 1.0, "step": 674 }, { "completion_length": 77.3515625, "epoch": 0.04331782448259265, "grad_norm": 2.034224292830113, "kl": 0.0487060546875, "learning_rate": 9.783403927608778e-07, "loss": 0.002, "reward": 3.25, "reward_std": 0.0731260534375906, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 675 }, { "completion_length": 73.296875, "epoch": 0.04338199903738168, "grad_norm": 3.702549868219202, "kl": 0.057373046875, "learning_rate": 9.783083044538568e-07, "loss": 0.0023, "reward": 3.296875, "reward_std": 0.1615143921226263, "rewards/accuracy_reward": 0.828125, "rewards/format_count_numbers": 1.46875, "rewards/format_reward": 1.0, "step": 676 }, { "completion_length": 88.1953125, "epoch": 0.043446173592170706, "grad_norm": 2.6150920726172227, "kl": 0.05126953125, "learning_rate": 9.78276216146836e-07, "loss": 0.0021, "reward": 3.3984375, "reward_std": 0.16834919154644012, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 0.9921875, "step": 677 }, { "completion_length": 82.2421875, "epoch": 0.04351034814695973, "grad_norm": 3.127804850363183, "kl": 0.0435791015625, "learning_rate": 9.78244127839815e-07, "loss": 0.0017, "reward": 3.49609375, "reward_std": 0.23770058155059814, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 0.9921875, "step": 678 }, { "completion_length": 92.4375, "epoch": 0.043574522701748754, "grad_norm": 2.3704147916671388, "kl": 0.048095703125, "learning_rate": 9.782120395327943e-07, "loss": 0.0019, "reward": 3.37109375, "reward_std": 0.21028479933738708, "rewards/accuracy_reward": 0.53125, "rewards/format_count_numbers": 1.85546875, "rewards/format_reward": 0.984375, "step": 679 }, { "completion_length": 84.71875, "epoch": 0.04363869725653778, "grad_norm": 1.706418603885426, "kl": 0.047119140625, "learning_rate": 9.781799512257733e-07, "loss": 0.0019, "reward": 3.56640625, "reward_std": 0.10662653297185898, "rewards/accuracy_reward": 0.8203125, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 1.0, "step": 680 }, { "completion_length": 82.265625, "epoch": 0.04370287181132681, "grad_norm": 33.04712004657233, "kl": 0.0469970703125, "learning_rate": 9.781478629187525e-07, "loss": 0.0019, "reward": 3.3046875, "reward_std": 0.1525501161813736, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 681 }, { "completion_length": 81.5234375, "epoch": 0.043767046366115836, "grad_norm": 4.236278696115693, "kl": 0.0489501953125, "learning_rate": 9.781157746117315e-07, "loss": 0.002, "reward": 2.9765625, "reward_std": 0.22225631400942802, "rewards/accuracy_reward": 0.4921875, "rewards/format_count_numbers": 1.484375, "rewards/format_reward": 1.0, "step": 682 }, { "completion_length": 78.5078125, "epoch": 0.04383122092090486, "grad_norm": 3.0745168997406482, "kl": 0.03662109375, "learning_rate": 9.780836863047105e-07, "loss": 0.0015, "reward": 3.5390625, "reward_std": 0.154142826795578, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.734375, "rewards/format_reward": 1.0, "step": 683 }, { "completion_length": 80.578125, "epoch": 0.04389539547569389, "grad_norm": 9.008747058591027, "kl": 0.0574951171875, "learning_rate": 9.780515979976895e-07, "loss": 0.0023, "reward": 3.48828125, "reward_std": 0.19965489953756332, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 1.0, "step": 684 }, { "completion_length": 84.984375, "epoch": 0.04395957003048291, "grad_norm": 1.970732222436413, "kl": 0.050537109375, "learning_rate": 9.780195096906687e-07, "loss": 0.002, "reward": 3.09375, "reward_std": 0.1462521031498909, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.3671875, "rewards/format_reward": 1.0, "step": 685 }, { "completion_length": 74.15625, "epoch": 0.04402374458527194, "grad_norm": 5.804151455812569, "kl": 0.0478515625, "learning_rate": 9.779874213836477e-07, "loss": 0.0019, "reward": 3.1171875, "reward_std": 0.24218494445085526, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.375, "rewards/format_reward": 0.9921875, "step": 686 }, { "completion_length": 82.90625, "epoch": 0.044087919140060966, "grad_norm": 2.126710220152752, "kl": 0.0556640625, "learning_rate": 9.77955333076627e-07, "loss": 0.0022, "reward": 3.2265625, "reward_std": 0.16583409160375595, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 687 }, { "completion_length": 79.015625, "epoch": 0.04415209369484999, "grad_norm": 7.022789652844672, "kl": 0.057373046875, "learning_rate": 9.77923244769606e-07, "loss": 0.0023, "reward": 3.16015625, "reward_std": 0.16896606981754303, "rewards/accuracy_reward": 0.7890625, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 688 }, { "completion_length": 81.6796875, "epoch": 0.04421626824963902, "grad_norm": 1.9302629928834274, "kl": 0.0457763671875, "learning_rate": 9.778911564625851e-07, "loss": 0.0018, "reward": 3.60546875, "reward_std": 0.11882129311561584, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 689 }, { "completion_length": 86.7109375, "epoch": 0.04428044280442804, "grad_norm": 2.7379221117216606, "kl": 0.0474853515625, "learning_rate": 9.778590681555641e-07, "loss": 0.0019, "reward": 3.328125, "reward_std": 0.13994135707616806, "rewards/accuracy_reward": 0.5859375, "rewards/format_count_numbers": 1.7421875, "rewards/format_reward": 1.0, "step": 690 }, { "completion_length": 76.578125, "epoch": 0.04434461735921707, "grad_norm": 1.3823349057089354, "kl": 0.041015625, "learning_rate": 9.778269798485431e-07, "loss": 0.0016, "reward": 3.6484375, "reward_std": 0.03234682232141495, "rewards/accuracy_reward": 0.8984375, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 1.0, "step": 691 }, { "completion_length": 96.0, "epoch": 0.044408791914006096, "grad_norm": 2.922908300130553, "kl": 0.04248046875, "learning_rate": 9.777948915415221e-07, "loss": 0.0017, "reward": 3.0390625, "reward_std": 0.2597939074039459, "rewards/accuracy_reward": 0.5703125, "rewards/format_count_numbers": 1.46875, "rewards/format_reward": 1.0, "step": 692 }, { "completion_length": 81.2578125, "epoch": 0.04447296646879512, "grad_norm": 1.2524866373426822, "kl": 0.0406494140625, "learning_rate": 9.777628032345013e-07, "loss": 0.0016, "reward": 3.12890625, "reward_std": 0.07745211198925972, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.35546875, "rewards/format_reward": 1.0, "step": 693 }, { "completion_length": 75.5703125, "epoch": 0.04453714102358415, "grad_norm": 1.760452108233736, "kl": 0.052490234375, "learning_rate": 9.777307149274803e-07, "loss": 0.0021, "reward": 3.68359375, "reward_std": 0.12737222015857697, "rewards/accuracy_reward": 0.828125, "rewards/format_count_numbers": 1.85546875, "rewards/format_reward": 1.0, "step": 694 }, { "completion_length": 79.1640625, "epoch": 0.04460131557837318, "grad_norm": 2.477662063766542, "kl": 0.069091796875, "learning_rate": 9.776986266204593e-07, "loss": 0.0028, "reward": 3.0703125, "reward_std": 0.16834918782114983, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.375, "rewards/format_reward": 0.9921875, "step": 695 }, { "completion_length": 90.2578125, "epoch": 0.0446654901331622, "grad_norm": 4.834333348846118, "kl": 0.06591796875, "learning_rate": 9.776665383134385e-07, "loss": 0.0026, "reward": 3.3125, "reward_std": 0.342337891459465, "rewards/accuracy_reward": 0.6015625, "rewards/format_count_numbers": 1.71875, "rewards/format_reward": 0.9921875, "step": 696 }, { "completion_length": 90.046875, "epoch": 0.044729664687951226, "grad_norm": 3.8343522026995163, "kl": 0.057373046875, "learning_rate": 9.776344500064175e-07, "loss": 0.0023, "reward": 3.24609375, "reward_std": 0.24745208770036697, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 0.9921875, "step": 697 }, { "completion_length": 86.6015625, "epoch": 0.044793839242740253, "grad_norm": 4.492106801917972, "kl": 0.064453125, "learning_rate": 9.776023616993968e-07, "loss": 0.0026, "reward": 3.37109375, "reward_std": 0.22860007733106613, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 1.0, "step": 698 }, { "completion_length": 84.734375, "epoch": 0.04485801379752928, "grad_norm": 1.4842812890013282, "kl": 0.0684814453125, "learning_rate": 9.775702733923758e-07, "loss": 0.0027, "reward": 3.1484375, "reward_std": 0.16834919899702072, "rewards/accuracy_reward": 0.5390625, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 699 }, { "completion_length": 91.0390625, "epoch": 0.04492218835231831, "grad_norm": 3.6438368977702007, "kl": 0.0732421875, "learning_rate": 9.77538185085355e-07, "loss": 0.0029, "reward": 3.14453125, "reward_std": 0.21199019998311996, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.48828125, "rewards/format_reward": 1.0, "step": 700 }, { "completion_length": 74.109375, "epoch": 0.04498636290710733, "grad_norm": 0.6697792381691832, "kl": 0.052734375, "learning_rate": 9.77506096778334e-07, "loss": 0.0021, "reward": 3.65625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.9375, "rewards/format_count_numbers": 1.71875, "rewards/format_reward": 1.0, "step": 701 }, { "completion_length": 86.78125, "epoch": 0.045050537461896356, "grad_norm": 2.3303029405561952, "kl": 0.069091796875, "learning_rate": 9.77474008471313e-07, "loss": 0.0028, "reward": 3.484375, "reward_std": 0.12551573291420937, "rewards/accuracy_reward": 0.8671875, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 702 }, { "completion_length": 89.890625, "epoch": 0.045114712016685383, "grad_norm": 3.3540186879337237, "kl": 0.048095703125, "learning_rate": 9.77441920164292e-07, "loss": 0.0019, "reward": 3.55859375, "reward_std": 0.21857933700084686, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 703 }, { "completion_length": 93.8046875, "epoch": 0.04517888657147441, "grad_norm": 2.7018468803088758, "kl": 0.048583984375, "learning_rate": 9.774098318572712e-07, "loss": 0.0019, "reward": 3.2734375, "reward_std": 0.21336729824543, "rewards/accuracy_reward": 0.40625, "rewards/format_count_numbers": 1.8671875, "rewards/format_reward": 1.0, "step": 704 }, { "completion_length": 90.890625, "epoch": 0.04524306112626344, "grad_norm": 32.10436587653831, "kl": 0.0479736328125, "learning_rate": 9.773777435502502e-07, "loss": 0.0019, "reward": 3.39453125, "reward_std": 0.19859902560710907, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 1.0, "step": 705 }, { "completion_length": 89.453125, "epoch": 0.045307235681052466, "grad_norm": 1.327059158558712, "kl": 0.065673828125, "learning_rate": 9.773456552432294e-07, "loss": 0.0026, "reward": 3.2890625, "reward_std": 0.09916212782263756, "rewards/accuracy_reward": 0.7890625, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 706 }, { "completion_length": 102.171875, "epoch": 0.045371410235841486, "grad_norm": 1.4027037327355352, "kl": 0.059814453125, "learning_rate": 9.773135669362084e-07, "loss": 0.0024, "reward": 3.01171875, "reward_std": 0.18933620303869247, "rewards/accuracy_reward": 0.4296875, "rewards/format_count_numbers": 1.58203125, "rewards/format_reward": 1.0, "step": 707 }, { "completion_length": 103.984375, "epoch": 0.045435584790630514, "grad_norm": 3.7264451637963187, "kl": 0.055908203125, "learning_rate": 9.772814786291876e-07, "loss": 0.0022, "reward": 3.29296875, "reward_std": 0.211207777261734, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 708 }, { "completion_length": 89.3046875, "epoch": 0.04549975934541954, "grad_norm": 1.6190023297414338, "kl": 0.0780029296875, "learning_rate": 9.772493903221666e-07, "loss": 0.0031, "reward": 3.19140625, "reward_std": 0.1437433697283268, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.48828125, "rewards/format_reward": 1.0, "step": 709 }, { "completion_length": 98.59375, "epoch": 0.04556393390020857, "grad_norm": 3.0391119265889435, "kl": 0.053955078125, "learning_rate": 9.772173020151456e-07, "loss": 0.0022, "reward": 3.6328125, "reward_std": 0.29602084308862686, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.8515625, "rewards/format_reward": 1.0, "step": 710 }, { "completion_length": 88.1171875, "epoch": 0.045628108454997596, "grad_norm": 2.1096280181602496, "kl": 0.0616455078125, "learning_rate": 9.771852137081246e-07, "loss": 0.0025, "reward": 3.38671875, "reward_std": 0.17669834941625595, "rewards/accuracy_reward": 0.765625, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 711 }, { "completion_length": 87.9609375, "epoch": 0.045692283009786616, "grad_norm": 37.55014215575033, "kl": 0.0540771484375, "learning_rate": 9.771531254011038e-07, "loss": 0.0022, "reward": 3.1484375, "reward_std": 0.14230038225650787, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 712 }, { "completion_length": 94.4609375, "epoch": 0.045756457564575644, "grad_norm": 3.5860640935648296, "kl": 0.0576171875, "learning_rate": 9.771210370940828e-07, "loss": 0.0023, "reward": 2.82421875, "reward_std": 0.1838735118508339, "rewards/accuracy_reward": 0.578125, "rewards/format_count_numbers": 1.24609375, "rewards/format_reward": 1.0, "step": 713 }, { "completion_length": 105.4296875, "epoch": 0.04582063211936467, "grad_norm": 2.0000594963509513, "kl": 0.072021484375, "learning_rate": 9.77088948787062e-07, "loss": 0.0029, "reward": 2.93359375, "reward_std": 0.2624771222472191, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.23046875, "rewards/format_reward": 0.9921875, "step": 714 }, { "completion_length": 100.921875, "epoch": 0.0458848066741537, "grad_norm": 1.7473624891349069, "kl": 0.044677734375, "learning_rate": 9.77056860480041e-07, "loss": 0.0018, "reward": 3.33203125, "reward_std": 0.19809433817863464, "rewards/accuracy_reward": 0.5859375, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 1.0, "step": 715 }, { "completion_length": 95.0234375, "epoch": 0.045948981228942726, "grad_norm": 4.847402920368904, "kl": 0.0557861328125, "learning_rate": 9.770247721730202e-07, "loss": 0.0022, "reward": 3.5078125, "reward_std": 0.21160002797842026, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.7265625, "rewards/format_reward": 1.0, "step": 716 }, { "completion_length": 103.34375, "epoch": 0.04601315578373175, "grad_norm": 1.7571886472712293, "kl": 0.0511474609375, "learning_rate": 9.769926838659992e-07, "loss": 0.002, "reward": 3.2734375, "reward_std": 0.14807433634996414, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 717 }, { "completion_length": 103.46875, "epoch": 0.046077330338520774, "grad_norm": 2.3164085058090884, "kl": 0.0472412109375, "learning_rate": 9.769605955589782e-07, "loss": 0.0019, "reward": 3.41796875, "reward_std": 0.16383001953363419, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.72265625, "rewards/format_reward": 1.0, "step": 718 }, { "completion_length": 105.3515625, "epoch": 0.0461415048933098, "grad_norm": 6.445146414989472, "kl": 0.0477294921875, "learning_rate": 9.769285072519572e-07, "loss": 0.0019, "reward": 3.2265625, "reward_std": 0.23857943713665009, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 719 }, { "completion_length": 102.5, "epoch": 0.04620567944809883, "grad_norm": 4.753051753431614, "kl": 0.04638671875, "learning_rate": 9.768964189449365e-07, "loss": 0.0019, "reward": 3.5, "reward_std": 0.24276069551706314, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.875, "rewards/format_reward": 1.0, "step": 720 }, { "completion_length": 93.8046875, "epoch": 0.046269854002887856, "grad_norm": 2.8267845746675944, "kl": 0.103759765625, "learning_rate": 9.768643306379155e-07, "loss": 0.0042, "reward": 3.296875, "reward_std": 0.08337578736245632, "rewards/accuracy_reward": 0.796875, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 721 }, { "completion_length": 96.7890625, "epoch": 0.04633402855767688, "grad_norm": 1.8345026018441861, "kl": 0.0489501953125, "learning_rate": 9.768322423308947e-07, "loss": 0.002, "reward": 3.12890625, "reward_std": 0.2040724754333496, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.48828125, "rewards/format_reward": 0.9921875, "step": 722 }, { "completion_length": 96.734375, "epoch": 0.04639820311246591, "grad_norm": 1.7020604179535077, "kl": 0.074951171875, "learning_rate": 9.768001540238737e-07, "loss": 0.003, "reward": 3.296875, "reward_std": 0.14873018115758896, "rewards/accuracy_reward": 0.8125, "rewards/format_count_numbers": 1.484375, "rewards/format_reward": 1.0, "step": 723 }, { "completion_length": 99.2109375, "epoch": 0.04646237766725493, "grad_norm": 1.7053794521094234, "kl": 0.04443359375, "learning_rate": 9.767680657168529e-07, "loss": 0.0018, "reward": 3.359375, "reward_std": 0.11230766773223877, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 724 }, { "completion_length": 90.7421875, "epoch": 0.04652655222204396, "grad_norm": 1.0777985219506854, "kl": 0.0458984375, "learning_rate": 9.767359774098319e-07, "loss": 0.0018, "reward": 2.87109375, "reward_std": 0.08417459577322006, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.24609375, "rewards/format_reward": 0.9921875, "step": 725 }, { "completion_length": 96.21875, "epoch": 0.046590726776832986, "grad_norm": 2.723424650541465, "kl": 0.0509033203125, "learning_rate": 9.767038891028109e-07, "loss": 0.002, "reward": 3.2109375, "reward_std": 0.20411095768213272, "rewards/accuracy_reward": 0.46875, "rewards/format_count_numbers": 1.7421875, "rewards/format_reward": 1.0, "step": 726 }, { "completion_length": 93.4296875, "epoch": 0.04665490133162201, "grad_norm": 3.1063958624567545, "kl": 0.052490234375, "learning_rate": 9.766718007957899e-07, "loss": 0.0021, "reward": 3.203125, "reward_std": 0.2406623512506485, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.453125, "rewards/format_reward": 0.9921875, "step": 727 }, { "completion_length": 90.65625, "epoch": 0.04671907588641104, "grad_norm": 5.143270191547739, "kl": 0.0633544921875, "learning_rate": 9.76639712488769e-07, "loss": 0.0025, "reward": 3.51171875, "reward_std": 0.1457892656326294, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 0.9921875, "step": 728 }, { "completion_length": 93.0859375, "epoch": 0.04678325044120006, "grad_norm": 2.379972901533909, "kl": 0.048583984375, "learning_rate": 9.76607624181748e-07, "loss": 0.0019, "reward": 3.5546875, "reward_std": 0.16293007880449295, "rewards/accuracy_reward": 0.8125, "rewards/format_count_numbers": 1.7421875, "rewards/format_reward": 1.0, "step": 729 }, { "completion_length": 87.015625, "epoch": 0.04684742499598909, "grad_norm": 2.510002069564492, "kl": 0.075927734375, "learning_rate": 9.76575535874727e-07, "loss": 0.003, "reward": 3.41015625, "reward_std": 0.12700175493955612, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.71484375, "rewards/format_reward": 0.9921875, "step": 730 }, { "completion_length": 83.2421875, "epoch": 0.046911599550778116, "grad_norm": 2.4145120528523463, "kl": 0.0538330078125, "learning_rate": 9.765434475677063e-07, "loss": 0.0022, "reward": 3.46875, "reward_std": 0.1422954797744751, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 1.0, "step": 731 }, { "completion_length": 86.765625, "epoch": 0.04697577410556714, "grad_norm": 1.82688614699028, "kl": 0.0411376953125, "learning_rate": 9.765113592606853e-07, "loss": 0.0017, "reward": 3.55078125, "reward_std": 0.07232724130153656, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 732 }, { "completion_length": 91.828125, "epoch": 0.04703994866035617, "grad_norm": 12.249281049665607, "kl": 0.04150390625, "learning_rate": 9.764792709536645e-07, "loss": 0.0017, "reward": 2.953125, "reward_std": 0.26538965851068497, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.234375, "rewards/format_reward": 1.0, "step": 733 }, { "completion_length": 90.84375, "epoch": 0.0471041232151452, "grad_norm": 2.5008445544592486, "kl": 0.0693359375, "learning_rate": 9.764471826466435e-07, "loss": 0.0028, "reward": 3.43359375, "reward_std": 0.20663955807685852, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.81640625, "rewards/format_reward": 1.0, "step": 734 }, { "completion_length": 87.1328125, "epoch": 0.04716829776993422, "grad_norm": 2.161386475058229, "kl": 0.0443115234375, "learning_rate": 9.764150943396225e-07, "loss": 0.0018, "reward": 2.9765625, "reward_std": 0.27872658520936966, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.3359375, "rewards/format_reward": 0.9921875, "step": 735 }, { "completion_length": 85.875, "epoch": 0.047232472324723246, "grad_norm": 10.607954487383465, "kl": 0.0498046875, "learning_rate": 9.763830060326017e-07, "loss": 0.002, "reward": 3.01953125, "reward_std": 0.2600190341472626, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.35546875, "rewards/format_reward": 0.9921875, "step": 736 }, { "completion_length": 79.8125, "epoch": 0.04729664687951227, "grad_norm": 1.5255666442758538, "kl": 0.0972900390625, "learning_rate": 9.763509177255807e-07, "loss": 0.0039, "reward": 3.796875, "reward_std": 0.08075719699263573, "rewards/accuracy_reward": 0.8203125, "rewards/format_count_numbers": 1.9765625, "rewards/format_reward": 1.0, "step": 737 }, { "completion_length": 74.5390625, "epoch": 0.0473608214343013, "grad_norm": 2.6742535809979797, "kl": 0.038818359375, "learning_rate": 9.763188294185597e-07, "loss": 0.0016, "reward": 2.98828125, "reward_std": 0.2778538912534714, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 738 }, { "completion_length": 81.21875, "epoch": 0.04742499598909033, "grad_norm": 1.752278342051842, "kl": 0.048095703125, "learning_rate": 9.76286741111539e-07, "loss": 0.0019, "reward": 3.36328125, "reward_std": 0.10740040242671967, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.61328125, "rewards/format_reward": 1.0, "step": 739 }, { "completion_length": 83.6796875, "epoch": 0.04748917054387935, "grad_norm": 5.039206599212193, "kl": 0.0477294921875, "learning_rate": 9.76254652804518e-07, "loss": 0.0019, "reward": 3.19140625, "reward_std": 0.22987818717956543, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.58984375, "rewards/format_reward": 0.9921875, "step": 740 }, { "completion_length": 82.1953125, "epoch": 0.047553345098668376, "grad_norm": 2.4531251216839993, "kl": 0.0494384765625, "learning_rate": 9.762225644974972e-07, "loss": 0.002, "reward": 2.79296875, "reward_std": 0.20915354043245316, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.10546875, "rewards/format_reward": 1.0, "step": 741 }, { "completion_length": 74.9140625, "epoch": 0.0476175196534574, "grad_norm": 2.418833777726406, "kl": 0.0450439453125, "learning_rate": 9.761904761904762e-07, "loss": 0.0018, "reward": 3.30859375, "reward_std": 0.1448042094707489, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.59765625, "rewards/format_reward": 1.0, "step": 742 }, { "completion_length": 85.875, "epoch": 0.04768169420824643, "grad_norm": 3.8340138718189323, "kl": 0.045166015625, "learning_rate": 9.761583878834552e-07, "loss": 0.0018, "reward": 2.97265625, "reward_std": 0.2894367575645447, "rewards/accuracy_reward": 0.515625, "rewards/format_count_numbers": 1.45703125, "rewards/format_reward": 1.0, "step": 743 }, { "completion_length": 78.46875, "epoch": 0.04774586876303546, "grad_norm": 2.573033703950277, "kl": 0.052490234375, "learning_rate": 9.761262995764344e-07, "loss": 0.0021, "reward": 3.39453125, "reward_std": 0.3527545630931854, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.65234375, "rewards/format_reward": 0.9921875, "step": 744 }, { "completion_length": 89.8671875, "epoch": 0.047810043317824485, "grad_norm": 2.733001554494192, "kl": 0.0489501953125, "learning_rate": 9.760942112694134e-07, "loss": 0.002, "reward": 3.29296875, "reward_std": 0.287816658616066, "rewards/accuracy_reward": 0.59375, "rewards/format_count_numbers": 1.69921875, "rewards/format_reward": 1.0, "step": 745 }, { "completion_length": 76.3125, "epoch": 0.047874217872613506, "grad_norm": 2.879654987886879, "kl": 0.056396484375, "learning_rate": 9.760621229623924e-07, "loss": 0.0023, "reward": 3.33203125, "reward_std": 0.3126887083053589, "rewards/accuracy_reward": 0.5546875, "rewards/format_count_numbers": 1.77734375, "rewards/format_reward": 1.0, "step": 746 }, { "completion_length": 75.328125, "epoch": 0.04793839242740253, "grad_norm": 1.8237723506610835, "kl": 0.048583984375, "learning_rate": 9.760300346553716e-07, "loss": 0.0019, "reward": 3.16796875, "reward_std": 0.07574218884110451, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.49609375, "rewards/format_reward": 1.0, "step": 747 }, { "completion_length": 72.8828125, "epoch": 0.04800256698219156, "grad_norm": 3.5677244705536473, "kl": 0.060791015625, "learning_rate": 9.759979463483506e-07, "loss": 0.0024, "reward": 3.10546875, "reward_std": 0.26896222680807114, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.41796875, "rewards/format_reward": 0.9921875, "step": 748 }, { "completion_length": 78.890625, "epoch": 0.04806674153698059, "grad_norm": 1.942942747061593, "kl": 0.0360107421875, "learning_rate": 9.759658580413298e-07, "loss": 0.0014, "reward": 2.984375, "reward_std": 0.17700131237506866, "rewards/accuracy_reward": 0.7421875, "rewards/format_count_numbers": 1.2421875, "rewards/format_reward": 1.0, "step": 749 }, { "completion_length": 80.1484375, "epoch": 0.048130916091769615, "grad_norm": 3.8795055517270076, "kl": 0.048095703125, "learning_rate": 9.759337697343088e-07, "loss": 0.0019, "reward": 3.37109375, "reward_std": 0.20597058534622192, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.71484375, "rewards/format_reward": 1.0, "step": 750 }, { "completion_length": 73.65625, "epoch": 0.04819509064655864, "grad_norm": 2.7933572954167283, "kl": 0.1051025390625, "learning_rate": 9.75901681427288e-07, "loss": 0.0042, "reward": 3.1796875, "reward_std": 0.17684098333120346, "rewards/accuracy_reward": 0.8515625, "rewards/format_count_numbers": 1.3359375, "rewards/format_reward": 0.9921875, "step": 751 }, { "completion_length": 87.9140625, "epoch": 0.04825926520134766, "grad_norm": 1.4620075595853868, "kl": 0.041015625, "learning_rate": 9.75869593120267e-07, "loss": 0.0016, "reward": 3.03515625, "reward_std": 0.11074746213853359, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 752 }, { "completion_length": 88.703125, "epoch": 0.04832343975613669, "grad_norm": 3.371243328971925, "kl": 0.0413818359375, "learning_rate": 9.75837504813246e-07, "loss": 0.0017, "reward": 3.3203125, "reward_std": 0.1695174239575863, "rewards/accuracy_reward": 0.5859375, "rewards/format_count_numbers": 1.734375, "rewards/format_reward": 1.0, "step": 753 }, { "completion_length": 77.078125, "epoch": 0.04838761431092572, "grad_norm": 1.6225760268750145, "kl": 0.062744140625, "learning_rate": 9.75805416506225e-07, "loss": 0.0025, "reward": 3.74609375, "reward_std": 0.09442432783544064, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.99609375, "rewards/format_reward": 1.0, "step": 754 }, { "completion_length": 74.234375, "epoch": 0.048451788865714746, "grad_norm": 3.4149213515317864, "kl": 0.0499267578125, "learning_rate": 9.757733281992042e-07, "loss": 0.002, "reward": 2.984375, "reward_std": 0.13781969621777534, "rewards/accuracy_reward": 0.4921875, "rewards/format_count_numbers": 1.4921875, "rewards/format_reward": 1.0, "step": 755 }, { "completion_length": 74.5234375, "epoch": 0.04851596342050377, "grad_norm": 3.482856470607772, "kl": 0.03759765625, "learning_rate": 9.757412398921832e-07, "loss": 0.0015, "reward": 3.6796875, "reward_std": 0.0946863517165184, "rewards/accuracy_reward": 0.9296875, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 1.0, "step": 756 }, { "completion_length": 83.9296875, "epoch": 0.04858013797529279, "grad_norm": 11.033620905550427, "kl": 0.04443359375, "learning_rate": 9.757091515851622e-07, "loss": 0.0018, "reward": 3.0703125, "reward_std": 0.24539462849497795, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.3515625, "rewards/format_reward": 0.9921875, "step": 757 }, { "completion_length": 81.109375, "epoch": 0.04864431253008182, "grad_norm": 4.048362387606543, "kl": 0.037841796875, "learning_rate": 9.756770632781414e-07, "loss": 0.0015, "reward": 3.078125, "reward_std": 0.16545338928699493, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.375, "rewards/format_reward": 1.0, "step": 758 }, { "completion_length": 78.2578125, "epoch": 0.04870848708487085, "grad_norm": 2.0488221338734744, "kl": 0.037353515625, "learning_rate": 9.756449749711204e-07, "loss": 0.0015, "reward": 3.2578125, "reward_std": 0.1669941134750843, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 759 }, { "completion_length": 79.5390625, "epoch": 0.048772661639659876, "grad_norm": 1.8564694109885338, "kl": 0.0487060546875, "learning_rate": 9.756128866640996e-07, "loss": 0.0019, "reward": 3.4375, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.8359375, "rewards/format_count_numbers": 1.6015625, "rewards/format_reward": 1.0, "step": 760 }, { "completion_length": 92.78125, "epoch": 0.0488368361944489, "grad_norm": 2.239573687798494, "kl": 0.0498046875, "learning_rate": 9.755807983570786e-07, "loss": 0.002, "reward": 3.66796875, "reward_std": 0.2930232286453247, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.96484375, "rewards/format_reward": 1.0, "step": 761 }, { "completion_length": 82.5234375, "epoch": 0.04890101074923793, "grad_norm": 4.379819200312334, "kl": 0.0460205078125, "learning_rate": 9.755487100500576e-07, "loss": 0.0018, "reward": 3.2421875, "reward_std": 0.19332443922758102, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 762 }, { "completion_length": 81.359375, "epoch": 0.04896518530402695, "grad_norm": 2.3900592927394935, "kl": 0.0401611328125, "learning_rate": 9.755166217430369e-07, "loss": 0.0016, "reward": 3.453125, "reward_std": 0.15728970617055893, "rewards/accuracy_reward": 0.84375, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 763 }, { "completion_length": 77.5703125, "epoch": 0.04902935985881598, "grad_norm": 4.041045642518399, "kl": 0.04974365234375, "learning_rate": 9.754845334360159e-07, "loss": 0.002, "reward": 3.3125, "reward_std": 0.12179599329829216, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.6015625, "rewards/format_reward": 1.0, "step": 764 }, { "completion_length": 81.34375, "epoch": 0.049093534413605006, "grad_norm": 1.7621143536192156, "kl": 0.03656005859375, "learning_rate": 9.754524451289949e-07, "loss": 0.0015, "reward": 3.296875, "reward_std": 0.11230766773223877, "rewards/accuracy_reward": 0.796875, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 765 }, { "completion_length": 86.4609375, "epoch": 0.04915770896839403, "grad_norm": 2.139248342897888, "kl": 0.03271484375, "learning_rate": 9.75420356821974e-07, "loss": 0.0013, "reward": 3.4453125, "reward_std": 0.1207351740449667, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 1.0, "step": 766 }, { "completion_length": 88.25, "epoch": 0.04922188352318306, "grad_norm": 4.292408890270349, "kl": 0.0396728515625, "learning_rate": 9.75388268514953e-07, "loss": 0.0016, "reward": 3.36328125, "reward_std": 0.2719214856624603, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 0.9921875, "step": 767 }, { "completion_length": 91.8671875, "epoch": 0.04928605807797208, "grad_norm": 3.774067225845392, "kl": 0.0556640625, "learning_rate": 9.753561802079323e-07, "loss": 0.0022, "reward": 3.0703125, "reward_std": 0.24571412801742554, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.3671875, "rewards/format_reward": 1.0, "step": 768 }, { "completion_length": 81.5078125, "epoch": 0.04935023263276111, "grad_norm": 2.002237385509296, "kl": 0.0321044921875, "learning_rate": 9.753240919009113e-07, "loss": 0.0013, "reward": 3.6171875, "reward_std": 0.08891239762306213, "rewards/accuracy_reward": 0.8671875, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 1.0, "step": 769 }, { "completion_length": 95.2265625, "epoch": 0.049414407187550136, "grad_norm": 2.6505837204186804, "kl": 0.037841796875, "learning_rate": 9.752920035938903e-07, "loss": 0.0015, "reward": 3.05078125, "reward_std": 0.2060510218143463, "rewards/accuracy_reward": 0.5859375, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 0.984375, "step": 770 }, { "completion_length": 85.6953125, "epoch": 0.04947858174233916, "grad_norm": 10.03344809750991, "kl": 0.059814453125, "learning_rate": 9.752599152868695e-07, "loss": 0.0024, "reward": 3.203125, "reward_std": 0.23364391177892685, "rewards/accuracy_reward": 0.7421875, "rewards/format_count_numbers": 1.4609375, "rewards/format_reward": 1.0, "step": 771 }, { "completion_length": 102.390625, "epoch": 0.04954275629712819, "grad_norm": 42.60596848683872, "kl": 0.0452880859375, "learning_rate": 9.752278269798485e-07, "loss": 0.0018, "reward": 3.44140625, "reward_std": 0.18964748084545135, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 1.0, "step": 772 }, { "completion_length": 106.890625, "epoch": 0.04960693085191722, "grad_norm": 3.628495436974386, "kl": 0.0482177734375, "learning_rate": 9.751957386728275e-07, "loss": 0.0019, "reward": 3.42578125, "reward_std": 0.21908964589238167, "rewards/accuracy_reward": 0.4453125, "rewards/format_count_numbers": 1.98046875, "rewards/format_reward": 1.0, "step": 773 }, { "completion_length": 90.4296875, "epoch": 0.04967110540670624, "grad_norm": 2.4061516479626786, "kl": 0.0406494140625, "learning_rate": 9.751636503658067e-07, "loss": 0.0016, "reward": 3.5234375, "reward_std": 0.13941731117665768, "rewards/accuracy_reward": 0.7890625, "rewards/format_count_numbers": 1.734375, "rewards/format_reward": 1.0, "step": 774 }, { "completion_length": 96.0390625, "epoch": 0.049735279961495266, "grad_norm": 6.222902859831556, "kl": 0.0435791015625, "learning_rate": 9.751315620587857e-07, "loss": 0.0017, "reward": 2.83203125, "reward_std": 0.16413544118404388, "rewards/accuracy_reward": 0.46875, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 0.9921875, "step": 775 }, { "completion_length": 94.8203125, "epoch": 0.04979945451628429, "grad_norm": 0.8271362685750377, "kl": 0.03375244140625, "learning_rate": 9.75099473751765e-07, "loss": 0.0014, "reward": 3.30078125, "reward_std": 0.0881456807255745, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 776 }, { "completion_length": 95.609375, "epoch": 0.04986362907107332, "grad_norm": 3.7662345214127018, "kl": 0.0772705078125, "learning_rate": 9.75067385444744e-07, "loss": 0.0031, "reward": 3.19921875, "reward_std": 0.14006003737449646, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 1.0, "step": 777 }, { "completion_length": 83.4296875, "epoch": 0.04992780362586235, "grad_norm": 4.3448920376505935, "kl": 0.0423583984375, "learning_rate": 9.75035297137723e-07, "loss": 0.0017, "reward": 3.27734375, "reward_std": 0.27619215101003647, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 0.984375, "step": 778 }, { "completion_length": 94.25, "epoch": 0.04999197818065137, "grad_norm": 3.284293163206841, "kl": 0.03271484375, "learning_rate": 9.750032088307021e-07, "loss": 0.0013, "reward": 3.671875, "reward_std": 0.13781969621777534, "rewards/accuracy_reward": 0.796875, "rewards/format_count_numbers": 1.875, "rewards/format_reward": 1.0, "step": 779 }, { "completion_length": 90.609375, "epoch": 0.050056152735440396, "grad_norm": 13.187905600501297, "kl": 0.0406494140625, "learning_rate": 9.749711205236811e-07, "loss": 0.0016, "reward": 3.46875, "reward_std": 0.13258251827210188, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.8515625, "rewards/format_reward": 0.9921875, "step": 780 }, { "completion_length": 90.5078125, "epoch": 0.05012032729022942, "grad_norm": 5.033400578659343, "kl": 0.04931640625, "learning_rate": 9.749390322166601e-07, "loss": 0.002, "reward": 2.92578125, "reward_std": 0.16017881035804749, "rewards/accuracy_reward": 0.5546875, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 781 }, { "completion_length": 86.828125, "epoch": 0.05018450184501845, "grad_norm": 1.0687415489263772, "kl": 0.03271484375, "learning_rate": 9.749069439096393e-07, "loss": 0.0013, "reward": 3.125, "reward_std": 0.12179600074887276, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.359375, "rewards/format_reward": 0.9921875, "step": 782 }, { "completion_length": 85.8125, "epoch": 0.05024867639980748, "grad_norm": 4.2573121796565765, "kl": 0.0484619140625, "learning_rate": 9.748748556026183e-07, "loss": 0.0019, "reward": 3.23828125, "reward_std": 0.13261254876852036, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.48828125, "rewards/format_reward": 1.0, "step": 783 }, { "completion_length": 78.015625, "epoch": 0.050312850954596505, "grad_norm": 3.191937211894696, "kl": 0.038330078125, "learning_rate": 9.748427672955975e-07, "loss": 0.0015, "reward": 3.28125, "reward_std": 0.09863808378577232, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 784 }, { "completion_length": 86.234375, "epoch": 0.050377025509385526, "grad_norm": 1.5246320677776235, "kl": 0.0391845703125, "learning_rate": 9.748106789885765e-07, "loss": 0.0016, "reward": 3.1875, "reward_std": 0.09863808751106262, "rewards/accuracy_reward": 0.5625, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 785 }, { "completion_length": 81.9765625, "epoch": 0.05044120006417455, "grad_norm": 5.664654699027388, "kl": 0.05419921875, "learning_rate": 9.747785906815555e-07, "loss": 0.0022, "reward": 3.41015625, "reward_std": 0.17256293818354607, "rewards/accuracy_reward": 0.7890625, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 786 }, { "completion_length": 82.0625, "epoch": 0.05050537461896358, "grad_norm": 1.2932752514928887, "kl": 0.043701171875, "learning_rate": 9.747465023745348e-07, "loss": 0.0017, "reward": 3.16796875, "reward_std": 0.1331993918865919, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 0.9921875, "step": 787 }, { "completion_length": 93.8046875, "epoch": 0.05056954917375261, "grad_norm": 2.4390416794852636, "kl": 0.042236328125, "learning_rate": 9.747144140675138e-07, "loss": 0.0017, "reward": 3.3984375, "reward_std": 0.17826200276613235, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.734375, "rewards/format_reward": 1.0, "step": 788 }, { "completion_length": 82.8984375, "epoch": 0.050633723728541635, "grad_norm": 2.537556021533954, "kl": 0.0367431640625, "learning_rate": 9.746823257604928e-07, "loss": 0.0015, "reward": 3.69921875, "reward_std": 0.1520880162715912, "rewards/accuracy_reward": 0.8359375, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 0.9921875, "step": 789 }, { "completion_length": 88.6875, "epoch": 0.05069789828333066, "grad_norm": 1.0697321569013913, "kl": 0.04052734375, "learning_rate": 9.746502374534718e-07, "loss": 0.0016, "reward": 3.62109375, "reward_std": 0.06744491681456566, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 790 }, { "completion_length": 76.1328125, "epoch": 0.05076207283811968, "grad_norm": 1.7812823516441374, "kl": 0.039794921875, "learning_rate": 9.74618149146451e-07, "loss": 0.0016, "reward": 3.09375, "reward_std": 0.193861223757267, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.3671875, "rewards/format_reward": 1.0, "step": 791 }, { "completion_length": 75.765625, "epoch": 0.05082624739290871, "grad_norm": 4.213943599930693, "kl": 0.05126953125, "learning_rate": 9.7458606083943e-07, "loss": 0.002, "reward": 3.5390625, "reward_std": 0.06629125960171223, "rewards/accuracy_reward": 0.796875, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 0.9921875, "step": 792 }, { "completion_length": 79.234375, "epoch": 0.05089042194769774, "grad_norm": 2.1722175215531876, "kl": 0.0438232421875, "learning_rate": 9.745539725324092e-07, "loss": 0.0018, "reward": 3.0390625, "reward_std": 0.12073517218232155, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.375, "rewards/format_reward": 1.0, "step": 793 }, { "completion_length": 83.3125, "epoch": 0.050954596502486765, "grad_norm": 2.4212937280823303, "kl": 0.0413818359375, "learning_rate": 9.745218842253882e-07, "loss": 0.0017, "reward": 3.15625, "reward_std": 0.13781969994306564, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.484375, "rewards/format_reward": 1.0, "step": 794 }, { "completion_length": 69.25, "epoch": 0.05101877105727579, "grad_norm": 1.899100981145301, "kl": 0.0565185546875, "learning_rate": 9.744897959183674e-07, "loss": 0.0023, "reward": 3.1953125, "reward_std": 0.1054728738963604, "rewards/accuracy_reward": 0.9453125, "rewards/format_count_numbers": 1.25, "rewards/format_reward": 1.0, "step": 795 }, { "completion_length": 80.328125, "epoch": 0.05108294561206481, "grad_norm": 2.9494364699711997, "kl": 0.0506591796875, "learning_rate": 9.744577076113464e-07, "loss": 0.002, "reward": 3.3125, "reward_std": 0.14059045538306236, "rewards/accuracy_reward": 0.578125, "rewards/format_count_numbers": 1.734375, "rewards/format_reward": 1.0, "step": 796 }, { "completion_length": 84.8359375, "epoch": 0.05114712016685384, "grad_norm": 3.448696702833348, "kl": 0.04541015625, "learning_rate": 9.744256193043254e-07, "loss": 0.0018, "reward": 3.57421875, "reward_std": 0.1289592832326889, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.86328125, "rewards/format_reward": 1.0, "step": 797 }, { "completion_length": 80.2734375, "epoch": 0.05121129472164287, "grad_norm": 1.5091217695728871, "kl": 0.060791015625, "learning_rate": 9.743935309973046e-07, "loss": 0.0024, "reward": 3.73046875, "reward_std": 0.13546312972903252, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.97265625, "rewards/format_reward": 1.0, "step": 798 }, { "completion_length": 88.0703125, "epoch": 0.051275469276431895, "grad_norm": 1.154598080629475, "kl": 0.0400390625, "learning_rate": 9.743614426902836e-07, "loss": 0.0016, "reward": 3.16015625, "reward_std": 0.14363106712698936, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.45703125, "rewards/format_reward": 1.0, "step": 799 }, { "completion_length": 71.328125, "epoch": 0.05133964383122092, "grad_norm": 1.0204977586438955, "kl": 0.03369140625, "learning_rate": 9.743293543832626e-07, "loss": 0.0013, "reward": 3.53125, "reward_std": 0.08337578736245632, "rewards/accuracy_reward": 0.7890625, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 0.9921875, "step": 800 }, { "completion_length": 81.140625, "epoch": 0.05140381838600995, "grad_norm": 1.2107804364345305, "kl": 0.045654296875, "learning_rate": 9.742972660762418e-07, "loss": 0.0018, "reward": 3.640625, "reward_std": 0.09863808006048203, "rewards/accuracy_reward": 0.8984375, "rewards/format_count_numbers": 1.7421875, "rewards/format_reward": 1.0, "step": 801 }, { "completion_length": 76.6796875, "epoch": 0.05146799294079897, "grad_norm": 7.617241389719958, "kl": 0.0335693359375, "learning_rate": 9.742651777692208e-07, "loss": 0.0013, "reward": 3.65234375, "reward_std": 0.1430942788720131, "rewards/accuracy_reward": 0.921875, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 1.0, "step": 802 }, { "completion_length": 83.1953125, "epoch": 0.051532167495588, "grad_norm": 2.3803878213647764, "kl": 0.046875, "learning_rate": 9.742330894622e-07, "loss": 0.0019, "reward": 3.3671875, "reward_std": 0.22969217598438263, "rewards/accuracy_reward": 0.765625, "rewards/format_count_numbers": 1.6015625, "rewards/format_reward": 1.0, "step": 803 }, { "completion_length": 83.34375, "epoch": 0.051596342050377025, "grad_norm": 2.9746083982070513, "kl": 0.04296875, "learning_rate": 9.74201001155179e-07, "loss": 0.0017, "reward": 2.78125, "reward_std": 0.20998317748308182, "rewards/accuracy_reward": 0.4453125, "rewards/format_count_numbers": 1.34375, "rewards/format_reward": 0.9921875, "step": 804 }, { "completion_length": 65.4375, "epoch": 0.05166051660516605, "grad_norm": 2.928599035562777, "kl": 0.05029296875, "learning_rate": 9.74168912848158e-07, "loss": 0.002, "reward": 2.7734375, "reward_std": 0.061278700828552246, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.0, "rewards/format_reward": 1.0, "step": 805 }, { "completion_length": 74.515625, "epoch": 0.05172469115995508, "grad_norm": 2.182113883521443, "kl": 0.0535888671875, "learning_rate": 9.741368245411372e-07, "loss": 0.0021, "reward": 3.23828125, "reward_std": 0.03314562980085611, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.49609375, "rewards/format_reward": 0.9921875, "step": 806 }, { "completion_length": 74.15625, "epoch": 0.0517888657147441, "grad_norm": 1.832882725742519, "kl": 0.0478515625, "learning_rate": 9.741047362341162e-07, "loss": 0.0019, "reward": 3.109375, "reward_std": 0.12935256212949753, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.359375, "rewards/format_reward": 1.0, "step": 807 }, { "completion_length": 72.4453125, "epoch": 0.05185304026953313, "grad_norm": 23.093100376099713, "kl": 0.0611572265625, "learning_rate": 9.740726479270952e-07, "loss": 0.0025, "reward": 3.38671875, "reward_std": 0.19313160330057144, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 1.0, "step": 808 }, { "completion_length": 76.1328125, "epoch": 0.051917214824322155, "grad_norm": 4.868484407813333, "kl": 0.0428466796875, "learning_rate": 9.740405596200745e-07, "loss": 0.0017, "reward": 3.66015625, "reward_std": 0.1312469318509102, "rewards/accuracy_reward": 0.7890625, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 809 }, { "completion_length": 77.4375, "epoch": 0.05198138937911118, "grad_norm": 2.77077824334586, "kl": 0.0552978515625, "learning_rate": 9.740084713130535e-07, "loss": 0.0022, "reward": 3.1640625, "reward_std": 0.12863079831004143, "rewards/accuracy_reward": 0.546875, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 0.9921875, "step": 810 }, { "completion_length": 71.6328125, "epoch": 0.05204556393390021, "grad_norm": 1.036632062188383, "kl": 0.058349609375, "learning_rate": 9.739763830060327e-07, "loss": 0.0023, "reward": 3.55859375, "reward_std": 0.011048543266952038, "rewards/accuracy_reward": 0.8125, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 1.0, "step": 811 }, { "completion_length": 77.53125, "epoch": 0.05210973848868924, "grad_norm": 1.7784708093741393, "kl": 0.053466796875, "learning_rate": 9.739442946990117e-07, "loss": 0.0021, "reward": 2.98046875, "reward_std": 0.10968662612140179, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.24609375, "rewards/format_reward": 1.0, "step": 812 }, { "completion_length": 75.1015625, "epoch": 0.05217391304347826, "grad_norm": 3.2678259336322855, "kl": 0.0543212890625, "learning_rate": 9.739122063919907e-07, "loss": 0.0022, "reward": 3.14453125, "reward_std": 0.20437297970056534, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 813 }, { "completion_length": 71.3515625, "epoch": 0.052238087598267285, "grad_norm": 7.310265234833813, "kl": 0.0386962890625, "learning_rate": 9.738801180849699e-07, "loss": 0.0015, "reward": 3.2578125, "reward_std": 0.1054728776216507, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 814 }, { "completion_length": 79.9609375, "epoch": 0.05230226215305631, "grad_norm": 10.878508159090309, "kl": 0.0570068359375, "learning_rate": 9.738480297779489e-07, "loss": 0.0023, "reward": 3.3515625, "reward_std": 0.13098490983247757, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 0.9921875, "step": 815 }, { "completion_length": 79.09375, "epoch": 0.05236643670784534, "grad_norm": 1.955929620724961, "kl": 0.0567626953125, "learning_rate": 9.738159414709279e-07, "loss": 0.0023, "reward": 2.859375, "reward_std": 0.1433563008904457, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.2421875, "rewards/format_reward": 1.0, "step": 816 }, { "completion_length": 71.5859375, "epoch": 0.05243061126263437, "grad_norm": 1.9364048716740234, "kl": 0.0562744140625, "learning_rate": 9.737838531639069e-07, "loss": 0.0023, "reward": 3.8046875, "reward_std": 0.1344047486782074, "rewards/accuracy_reward": 0.9453125, "rewards/format_count_numbers": 1.859375, "rewards/format_reward": 1.0, "step": 817 }, { "completion_length": 77.1875, "epoch": 0.052494785817423395, "grad_norm": 2.5299674557539693, "kl": 0.041259765625, "learning_rate": 9.73751764856886e-07, "loss": 0.0017, "reward": 3.3671875, "reward_std": 0.1054728776216507, "rewards/accuracy_reward": 0.8671875, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 818 }, { "completion_length": 79.9765625, "epoch": 0.052558960372212415, "grad_norm": 1.1500829857231114, "kl": 0.0517578125, "learning_rate": 9.73719676549865e-07, "loss": 0.0021, "reward": 2.8359375, "reward_std": 0.09206776320934296, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.1171875, "rewards/format_reward": 1.0, "step": 819 }, { "completion_length": 81.0859375, "epoch": 0.05262313492700144, "grad_norm": 7.750414382521354, "kl": 0.041259765625, "learning_rate": 9.736875882428443e-07, "loss": 0.0016, "reward": 3.2890625, "reward_std": 0.051028965041041374, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 820 }, { "completion_length": 75.28125, "epoch": 0.05268730948179047, "grad_norm": 16.26389583010215, "kl": 0.041748046875, "learning_rate": 9.736554999358233e-07, "loss": 0.0017, "reward": 3.45703125, "reward_std": 0.1641305312514305, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 1.0, "step": 821 }, { "completion_length": 74.65625, "epoch": 0.0527514840365795, "grad_norm": 2.876327318553524, "kl": 0.0484619140625, "learning_rate": 9.736234116288025e-07, "loss": 0.0019, "reward": 3.3046875, "reward_std": 0.20912351459264755, "rewards/accuracy_reward": 0.84375, "rewards/format_count_numbers": 1.46875, "rewards/format_reward": 0.9921875, "step": 822 }, { "completion_length": 82.25, "epoch": 0.052815658591368525, "grad_norm": 2.6758968123380744, "kl": 0.0576171875, "learning_rate": 9.735913233217815e-07, "loss": 0.0023, "reward": 3.40625, "reward_std": 0.22832970321178436, "rewards/accuracy_reward": 0.5703125, "rewards/format_count_numbers": 1.8359375, "rewards/format_reward": 1.0, "step": 823 }, { "completion_length": 83.5, "epoch": 0.052879833146157545, "grad_norm": 9.37859747064933, "kl": 0.0479736328125, "learning_rate": 9.735592350147605e-07, "loss": 0.0019, "reward": 3.4140625, "reward_std": 0.15467960759997368, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 824 }, { "completion_length": 83.625, "epoch": 0.05294400770094657, "grad_norm": 4.512673209651643, "kl": 0.0562744140625, "learning_rate": 9.735271467077395e-07, "loss": 0.0023, "reward": 3.23046875, "reward_std": 0.2360311970114708, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.58984375, "rewards/format_reward": 0.9921875, "step": 825 }, { "completion_length": 74.59375, "epoch": 0.0530081822557356, "grad_norm": 2.3522828351396483, "kl": 0.064453125, "learning_rate": 9.734950584007187e-07, "loss": 0.0026, "reward": 3.29296875, "reward_std": 0.09943688660860062, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 1.0, "step": 826 }, { "completion_length": 73.9375, "epoch": 0.05307235681052463, "grad_norm": 4.829651628340411, "kl": 0.076171875, "learning_rate": 9.734629700936977e-07, "loss": 0.003, "reward": 3.0859375, "reward_std": 0.12073516845703125, "rewards/accuracy_reward": 0.84375, "rewards/format_count_numbers": 1.2421875, "rewards/format_reward": 1.0, "step": 827 }, { "completion_length": 89.515625, "epoch": 0.053136531365313655, "grad_norm": 2.27540981541513, "kl": 0.05078125, "learning_rate": 9.73430881786677e-07, "loss": 0.002, "reward": 3.07421875, "reward_std": 0.14812619984149933, "rewards/accuracy_reward": 0.59375, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 1.0, "step": 828 }, { "completion_length": 86.1953125, "epoch": 0.05320070592010268, "grad_norm": 4.928502902873957, "kl": 0.04638671875, "learning_rate": 9.73398793479656e-07, "loss": 0.0019, "reward": 3.5390625, "reward_std": 0.1273379623889923, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.8671875, "rewards/format_reward": 1.0, "step": 829 }, { "completion_length": 82.0, "epoch": 0.0532648804748917, "grad_norm": 9.42400913971223, "kl": 0.0543212890625, "learning_rate": 9.733667051726352e-07, "loss": 0.0022, "reward": 3.2890625, "reward_std": 0.15676141530275345, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 830 }, { "completion_length": 87.90625, "epoch": 0.05332905502968073, "grad_norm": 5.794983869888584, "kl": 0.05322265625, "learning_rate": 9.733346168656142e-07, "loss": 0.0021, "reward": 3.44921875, "reward_std": 0.17063158005475998, "rewards/accuracy_reward": 0.59375, "rewards/format_count_numbers": 1.85546875, "rewards/format_reward": 1.0, "step": 831 }, { "completion_length": 79.859375, "epoch": 0.05339322958446976, "grad_norm": 1.3191977383451108, "kl": 0.040283203125, "learning_rate": 9.733025285585932e-07, "loss": 0.0016, "reward": 3.08203125, "reward_std": 0.06207751017063856, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 832 }, { "completion_length": 78.984375, "epoch": 0.053457404139258785, "grad_norm": 2.1349914968679706, "kl": 0.048828125, "learning_rate": 9.732704402515722e-07, "loss": 0.0019, "reward": 3.4140625, "reward_std": 0.06629125960171223, "rewards/accuracy_reward": 0.921875, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 0.9921875, "step": 833 }, { "completion_length": 81.3984375, "epoch": 0.05352157869404781, "grad_norm": 1.7779325190888169, "kl": 0.040283203125, "learning_rate": 9.732383519445514e-07, "loss": 0.0016, "reward": 3.265625, "reward_std": 0.09235943108797073, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 834 }, { "completion_length": 80.34375, "epoch": 0.05358575324883683, "grad_norm": 2.3898336383368375, "kl": 0.037353515625, "learning_rate": 9.732062636375304e-07, "loss": 0.0015, "reward": 3.20703125, "reward_std": 0.26544642448425293, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 0.9921875, "step": 835 }, { "completion_length": 82.6953125, "epoch": 0.05364992780362586, "grad_norm": 1.7236616483794198, "kl": 0.0560302734375, "learning_rate": 9.731741753305096e-07, "loss": 0.0022, "reward": 3.71484375, "reward_std": 0.1310637667775154, "rewards/accuracy_reward": 0.8515625, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 0.9921875, "step": 836 }, { "completion_length": 79.4375, "epoch": 0.05371410235841489, "grad_norm": 1.5022895992148726, "kl": 0.0482177734375, "learning_rate": 9.731420870234886e-07, "loss": 0.0019, "reward": 3.32421875, "reward_std": 0.131551718339324, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 837 }, { "completion_length": 76.5, "epoch": 0.053778276913203915, "grad_norm": 1.054718452032548, "kl": 0.0430908203125, "learning_rate": 9.731099987164678e-07, "loss": 0.0017, "reward": 3.2578125, "reward_std": 0.09522313997149467, "rewards/accuracy_reward": 0.890625, "rewards/format_count_numbers": 1.375, "rewards/format_reward": 0.9921875, "step": 838 }, { "completion_length": 87.453125, "epoch": 0.05384245146799294, "grad_norm": 16.333134839228965, "kl": 0.0430908203125, "learning_rate": 9.730779104094468e-07, "loss": 0.0017, "reward": 3.1171875, "reward_std": 0.20753081142902374, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.484375, "rewards/format_reward": 1.0, "step": 839 }, { "completion_length": 82.40625, "epoch": 0.05390662602278197, "grad_norm": 2.592251263399831, "kl": 0.071533203125, "learning_rate": 9.730458221024258e-07, "loss": 0.0029, "reward": 2.92578125, "reward_std": 0.18443484604358673, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.24609375, "rewards/format_reward": 0.984375, "step": 840 }, { "completion_length": 82.296875, "epoch": 0.05397080057757099, "grad_norm": 2.6063368819301895, "kl": 0.0469970703125, "learning_rate": 9.730137337954048e-07, "loss": 0.0019, "reward": 2.984375, "reward_std": 0.11732023023068905, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.375, "rewards/format_reward": 1.0, "step": 841 }, { "completion_length": 89.6796875, "epoch": 0.05403497513236002, "grad_norm": 4.423349428405188, "kl": 0.0369873046875, "learning_rate": 9.72981645488384e-07, "loss": 0.0015, "reward": 3.23046875, "reward_std": 0.10968662612140179, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 842 }, { "completion_length": 77.8203125, "epoch": 0.054099149687149045, "grad_norm": 6.4538629750475796, "kl": 0.05126953125, "learning_rate": 9.72949557181363e-07, "loss": 0.002, "reward": 3.69921875, "reward_std": 0.03998042270541191, "rewards/accuracy_reward": 0.828125, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 843 }, { "completion_length": 86.703125, "epoch": 0.05416332424193807, "grad_norm": 1.2897524539754992, "kl": 0.037353515625, "learning_rate": 9.729174688743422e-07, "loss": 0.0015, "reward": 3.375, "reward_std": 0.0936255231499672, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 844 }, { "completion_length": 83.9140625, "epoch": 0.0542274987967271, "grad_norm": 1.9818995860065005, "kl": 0.0394287109375, "learning_rate": 9.728853805673212e-07, "loss": 0.0016, "reward": 3.0625, "reward_std": 0.08075719699263573, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.3671875, "rewards/format_reward": 1.0, "step": 845 }, { "completion_length": 90.8125, "epoch": 0.05429167335151612, "grad_norm": 3.9650879029805854, "kl": 0.0396728515625, "learning_rate": 9.728532922603004e-07, "loss": 0.0016, "reward": 3.47265625, "reward_std": 0.14992907270789146, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 1.0, "step": 846 }, { "completion_length": 94.25, "epoch": 0.05435584790630515, "grad_norm": 10.557543442195492, "kl": 0.0458984375, "learning_rate": 9.728212039532794e-07, "loss": 0.0018, "reward": 3.3203125, "reward_std": 0.09522313624620438, "rewards/accuracy_reward": 0.578125, "rewards/format_count_numbers": 1.7421875, "rewards/format_reward": 1.0, "step": 847 }, { "completion_length": 93.828125, "epoch": 0.054420022461094175, "grad_norm": 1.8965411932649405, "kl": 0.0382080078125, "learning_rate": 9.727891156462584e-07, "loss": 0.0015, "reward": 3.21875, "reward_std": 0.14806943386793137, "rewards/accuracy_reward": 0.6015625, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 848 }, { "completion_length": 85.3671875, "epoch": 0.0544841970158832, "grad_norm": 9.133551945340331, "kl": 0.0435791015625, "learning_rate": 9.727570273392376e-07, "loss": 0.0017, "reward": 3.38671875, "reward_std": 0.19412324577569962, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 1.0, "step": 849 }, { "completion_length": 89.0625, "epoch": 0.05454837157067223, "grad_norm": 8.782967862305972, "kl": 0.0821533203125, "learning_rate": 9.727249390322166e-07, "loss": 0.0033, "reward": 3.04296875, "reward_std": 0.055242715403437614, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 0.9921875, "step": 850 }, { "completion_length": 81.2265625, "epoch": 0.05461254612546126, "grad_norm": 5.276785387817552, "kl": 0.0340576171875, "learning_rate": 9.726928507251956e-07, "loss": 0.0014, "reward": 3.3046875, "reward_std": 0.08891239576041698, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 851 }, { "completion_length": 84.96875, "epoch": 0.05467672068025028, "grad_norm": 3.127782553778879, "kl": 0.03265380859375, "learning_rate": 9.726607624181746e-07, "loss": 0.0013, "reward": 3.0703125, "reward_std": 0.154142826795578, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.375, "rewards/format_reward": 1.0, "step": 852 }, { "completion_length": 84.5078125, "epoch": 0.054740895235039305, "grad_norm": 2.4470476849950145, "kl": 0.0345458984375, "learning_rate": 9.726286741111539e-07, "loss": 0.0014, "reward": 3.1796875, "reward_std": 0.13310657069087029, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 0.9921875, "step": 853 }, { "completion_length": 86.4140625, "epoch": 0.05480506978982833, "grad_norm": 3.647968888910215, "kl": 0.0631103515625, "learning_rate": 9.725965858041329e-07, "loss": 0.0025, "reward": 3.34375, "reward_std": 0.23805538564920425, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.6015625, "rewards/format_reward": 0.984375, "step": 854 }, { "completion_length": 93.9453125, "epoch": 0.05486924434461736, "grad_norm": 5.657064142919813, "kl": 0.0372314453125, "learning_rate": 9.72564497497112e-07, "loss": 0.0015, "reward": 3.65234375, "reward_std": 0.11085976660251617, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.98828125, "rewards/format_reward": 1.0, "step": 855 }, { "completion_length": 82.6484375, "epoch": 0.05493341889940639, "grad_norm": 2.446253990113906, "kl": 0.032958984375, "learning_rate": 9.72532409190091e-07, "loss": 0.0013, "reward": 3.484375, "reward_std": 0.08838834427297115, "rewards/accuracy_reward": 0.859375, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 856 }, { "completion_length": 77.359375, "epoch": 0.054997593454195415, "grad_norm": 1.9392386307901757, "kl": 0.0389404296875, "learning_rate": 9.725003208830703e-07, "loss": 0.0016, "reward": 3.08203125, "reward_std": 0.1312469244003296, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 857 }, { "completion_length": 82.75, "epoch": 0.055061768008984435, "grad_norm": 1.413714886737312, "kl": 0.0443115234375, "learning_rate": 9.724682325760493e-07, "loss": 0.0018, "reward": 3.703125, "reward_std": 0.09863808006048203, "rewards/accuracy_reward": 0.8359375, "rewards/format_count_numbers": 1.875, "rewards/format_reward": 0.9921875, "step": 858 }, { "completion_length": 83.359375, "epoch": 0.05512594256377346, "grad_norm": 3.109894806141791, "kl": 0.0880126953125, "learning_rate": 9.724361442690283e-07, "loss": 0.0035, "reward": 3.02734375, "reward_std": 0.2219942957162857, "rewards/accuracy_reward": 0.546875, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 1.0, "step": 859 }, { "completion_length": 83.2421875, "epoch": 0.05519011711856249, "grad_norm": 2.648407818592615, "kl": 0.0511474609375, "learning_rate": 9.724040559620073e-07, "loss": 0.002, "reward": 3.70703125, "reward_std": 0.16731838881969452, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.99609375, "rewards/format_reward": 1.0, "step": 860 }, { "completion_length": 73.6875, "epoch": 0.05525429167335152, "grad_norm": 2.638473664635188, "kl": 0.0352783203125, "learning_rate": 9.723719676549865e-07, "loss": 0.0014, "reward": 2.6015625, "reward_std": 0.05102896690368652, "rewards/accuracy_reward": 0.8515625, "rewards/format_count_numbers": 0.75, "rewards/format_reward": 1.0, "step": 861 }, { "completion_length": 82.0546875, "epoch": 0.055318466228140545, "grad_norm": 3.421435286626423, "kl": 0.0478515625, "learning_rate": 9.723398793479655e-07, "loss": 0.0019, "reward": 3.1640625, "reward_std": 0.1344047486782074, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 862 }, { "completion_length": 78.828125, "epoch": 0.055382640782929565, "grad_norm": 2.6110213886904896, "kl": 0.0533447265625, "learning_rate": 9.723077910409447e-07, "loss": 0.0021, "reward": 3.3046875, "reward_std": 0.18314750492572784, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.59375, "rewards/format_reward": 1.0, "step": 863 }, { "completion_length": 90.3984375, "epoch": 0.05544681533771859, "grad_norm": 1.7316148727916199, "kl": 0.038330078125, "learning_rate": 9.722757027339237e-07, "loss": 0.0015, "reward": 3.203125, "reward_std": 0.09863807819783688, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.4921875, "rewards/format_reward": 1.0, "step": 864 }, { "completion_length": 80.296875, "epoch": 0.05551098989250762, "grad_norm": 0.9673680442776675, "kl": 0.0335693359375, "learning_rate": 9.72243614426903e-07, "loss": 0.0013, "reward": 3.8046875, "reward_std": 0.022097086533904076, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 2.0, "rewards/format_reward": 1.0, "step": 865 }, { "completion_length": 81.484375, "epoch": 0.05557516444729665, "grad_norm": 6.231903624525518, "kl": 0.037353515625, "learning_rate": 9.72211526119882e-07, "loss": 0.0015, "reward": 3.3359375, "reward_std": 0.11336850188672543, "rewards/accuracy_reward": 0.8359375, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 866 }, { "completion_length": 82.984375, "epoch": 0.055639339002085675, "grad_norm": 2.920733582485878, "kl": 0.0369873046875, "learning_rate": 9.72179437812861e-07, "loss": 0.0015, "reward": 3.3125, "reward_std": 0.16564394533634186, "rewards/accuracy_reward": 0.8203125, "rewards/format_count_numbers": 1.4921875, "rewards/format_reward": 1.0, "step": 867 }, { "completion_length": 82.4296875, "epoch": 0.0557035135568747, "grad_norm": 6.5375975042212024, "kl": 0.0460205078125, "learning_rate": 9.7214734950584e-07, "loss": 0.0018, "reward": 3.55859375, "reward_std": 0.13861850649118423, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 868 }, { "completion_length": 85.765625, "epoch": 0.05576768811166372, "grad_norm": 2.818089044475151, "kl": 0.040771484375, "learning_rate": 9.721152611988191e-07, "loss": 0.0016, "reward": 3.21484375, "reward_std": 0.1573006436228752, "rewards/accuracy_reward": 0.7421875, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 0.9921875, "step": 869 }, { "completion_length": 80.0234375, "epoch": 0.05583186266645275, "grad_norm": 1.8541927436151793, "kl": 0.0372314453125, "learning_rate": 9.720831728917981e-07, "loss": 0.0015, "reward": 3.60546875, "reward_std": 0.09784417599439621, "rewards/accuracy_reward": 0.859375, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 1.0, "step": 870 }, { "completion_length": 83.375, "epoch": 0.05589603722124178, "grad_norm": 1.3838004628749112, "kl": 0.0357666015625, "learning_rate": 9.720510845847773e-07, "loss": 0.0014, "reward": 3.21484375, "reward_std": 0.08865036815404892, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.49609375, "rewards/format_reward": 1.0, "step": 871 }, { "completion_length": 90.9609375, "epoch": 0.055960211776030805, "grad_norm": 1.6760875406042908, "kl": 0.0462646484375, "learning_rate": 9.720189962777563e-07, "loss": 0.0019, "reward": 3.4453125, "reward_std": 0.12073516473174095, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.7265625, "rewards/format_reward": 1.0, "step": 872 }, { "completion_length": 85.25, "epoch": 0.05602438633081983, "grad_norm": 11.347828105140245, "kl": 0.04052734375, "learning_rate": 9.719869079707356e-07, "loss": 0.0016, "reward": 3.1796875, "reward_std": 0.20175685733556747, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 873 }, { "completion_length": 75.0859375, "epoch": 0.05608856088560885, "grad_norm": 6.6989656251001986, "kl": 0.0577392578125, "learning_rate": 9.719548196637146e-07, "loss": 0.0023, "reward": 3.0078125, "reward_std": 0.19581368193030357, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.2421875, "rewards/format_reward": 0.9921875, "step": 874 }, { "completion_length": 81.71875, "epoch": 0.05615273544039788, "grad_norm": 3.076421892136995, "kl": 0.0552978515625, "learning_rate": 9.719227313566936e-07, "loss": 0.0022, "reward": 3.24609375, "reward_std": 0.15911798179149628, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.61328125, "rewards/format_reward": 1.0, "step": 875 }, { "completion_length": 80.78125, "epoch": 0.05621690999518691, "grad_norm": 4.30235999647668, "kl": 0.0303955078125, "learning_rate": 9.718906430496726e-07, "loss": 0.0012, "reward": 3.3671875, "reward_std": 0.14465449005365372, "rewards/accuracy_reward": 0.7421875, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 876 }, { "completion_length": 81.0625, "epoch": 0.056281084549975935, "grad_norm": 1.167724702124958, "kl": 0.045654296875, "learning_rate": 9.718585547426518e-07, "loss": 0.0018, "reward": 3.3046875, "reward_std": 0.022097086533904076, "rewards/accuracy_reward": 0.9375, "rewards/format_count_numbers": 1.3671875, "rewards/format_reward": 1.0, "step": 877 }, { "completion_length": 78.171875, "epoch": 0.05634525910476496, "grad_norm": 4.381692241996944, "kl": 0.0465087890625, "learning_rate": 9.718264664356308e-07, "loss": 0.0019, "reward": 3.203125, "reward_std": 0.27145031094551086, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 878 }, { "completion_length": 89.03125, "epoch": 0.05640943365955399, "grad_norm": 1.2283774187867915, "kl": 0.0677490234375, "learning_rate": 9.717943781286098e-07, "loss": 0.0027, "reward": 3.16015625, "reward_std": 0.07733979821205139, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 1.0, "step": 879 }, { "completion_length": 85.0234375, "epoch": 0.05647360821434301, "grad_norm": 3.104141153646755, "kl": 0.0467529296875, "learning_rate": 9.71762289821589e-07, "loss": 0.0019, "reward": 3.1796875, "reward_std": 0.15308690071105957, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 0.9921875, "step": 880 }, { "completion_length": 86.703125, "epoch": 0.05653778276913204, "grad_norm": 3.212291890024413, "kl": 0.0462646484375, "learning_rate": 9.71730201514568e-07, "loss": 0.0019, "reward": 3.2578125, "reward_std": 0.14522241801023483, "rewards/accuracy_reward": 0.890625, "rewards/format_count_numbers": 1.3671875, "rewards/format_reward": 1.0, "step": 881 }, { "completion_length": 81.78125, "epoch": 0.056601957323921065, "grad_norm": 2.1653163824887027, "kl": 0.0382080078125, "learning_rate": 9.716981132075472e-07, "loss": 0.0015, "reward": 3.1875, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 882 }, { "completion_length": 83.6015625, "epoch": 0.05666613187871009, "grad_norm": 4.520383032312005, "kl": 0.0390625, "learning_rate": 9.716660249005262e-07, "loss": 0.0016, "reward": 3.26171875, "reward_std": 0.10467406641691923, "rewards/accuracy_reward": 0.890625, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 883 }, { "completion_length": 87.7578125, "epoch": 0.05673030643349912, "grad_norm": 5.4310621114316895, "kl": 0.052490234375, "learning_rate": 9.716339365935052e-07, "loss": 0.0021, "reward": 3.4453125, "reward_std": 0.19740503281354904, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.734375, "rewards/format_reward": 1.0, "step": 884 }, { "completion_length": 81.671875, "epoch": 0.05679448098828815, "grad_norm": 2.1882678738318897, "kl": 0.0523681640625, "learning_rate": 9.716018482864844e-07, "loss": 0.0021, "reward": 3.4453125, "reward_std": 0.14230038225650787, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 1.0, "step": 885 }, { "completion_length": 88.828125, "epoch": 0.05685865554307717, "grad_norm": 4.54755680336819, "kl": 0.044189453125, "learning_rate": 9.715697599794634e-07, "loss": 0.0018, "reward": 3.3359375, "reward_std": 0.1922685131430626, "rewards/accuracy_reward": 0.6015625, "rewards/format_count_numbers": 1.734375, "rewards/format_reward": 1.0, "step": 886 }, { "completion_length": 89.84375, "epoch": 0.056922830097866195, "grad_norm": 1.578246598370672, "kl": 0.04150390625, "learning_rate": 9.715376716724424e-07, "loss": 0.0017, "reward": 2.984375, "reward_std": 0.1462521031498909, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.359375, "rewards/format_reward": 1.0, "step": 887 }, { "completion_length": 81.640625, "epoch": 0.05698700465265522, "grad_norm": 2.8638943185598507, "kl": 0.0394287109375, "learning_rate": 9.715055833654216e-07, "loss": 0.0016, "reward": 3.1640625, "reward_std": 0.07654099725186825, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 888 }, { "completion_length": 91.1328125, "epoch": 0.05705117920744425, "grad_norm": 10.893811076528234, "kl": 0.034912109375, "learning_rate": 9.714734950584006e-07, "loss": 0.0014, "reward": 3.01171875, "reward_std": 0.09442432969808578, "rewards/accuracy_reward": 0.765625, "rewards/format_count_numbers": 1.24609375, "rewards/format_reward": 1.0, "step": 889 }, { "completion_length": 95.0390625, "epoch": 0.05711535376223328, "grad_norm": 3.6154903011657415, "kl": 0.09326171875, "learning_rate": 9.714414067513798e-07, "loss": 0.0037, "reward": 3.546875, "reward_std": 0.1667515691369772, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.8515625, "rewards/format_reward": 1.0, "step": 890 }, { "completion_length": 85.71875, "epoch": 0.0571795283170223, "grad_norm": 6.425250948311458, "kl": 0.042724609375, "learning_rate": 9.714093184443588e-07, "loss": 0.0017, "reward": 3.55078125, "reward_std": 0.18705645948648453, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 891 }, { "completion_length": 96.609375, "epoch": 0.057243702871811325, "grad_norm": 1.9969180864070988, "kl": 0.042236328125, "learning_rate": 9.713772301373378e-07, "loss": 0.0017, "reward": 2.93359375, "reward_std": 0.16943515092134476, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.23046875, "rewards/format_reward": 1.0, "step": 892 }, { "completion_length": 89.0, "epoch": 0.05730787742660035, "grad_norm": 2.40100308701835, "kl": 0.067138671875, "learning_rate": 9.71345141830317e-07, "loss": 0.0027, "reward": 3.25390625, "reward_std": 0.20299965143203735, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.58984375, "rewards/format_reward": 1.0, "step": 893 }, { "completion_length": 99.828125, "epoch": 0.05737205198138938, "grad_norm": 10.209658680290172, "kl": 0.0506591796875, "learning_rate": 9.71313053523296e-07, "loss": 0.002, "reward": 3.49609375, "reward_std": 0.22932492941617966, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.85546875, "rewards/format_reward": 1.0, "step": 894 }, { "completion_length": 84.71875, "epoch": 0.05743622653617841, "grad_norm": 2.3282050597381327, "kl": 0.072021484375, "learning_rate": 9.71280965216275e-07, "loss": 0.0029, "reward": 3.36328125, "reward_std": 0.16861122101545334, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.61328125, "rewards/format_reward": 1.0, "step": 895 }, { "completion_length": 87.421875, "epoch": 0.057500401090967435, "grad_norm": 2.391816179102512, "kl": 0.047607421875, "learning_rate": 9.712488769092543e-07, "loss": 0.0019, "reward": 3.39453125, "reward_std": 0.13417387753725052, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 0.9921875, "step": 896 }, { "completion_length": 87.40625, "epoch": 0.057564575645756455, "grad_norm": 1.3348022601352967, "kl": 0.03515625, "learning_rate": 9.712167886022333e-07, "loss": 0.0014, "reward": 3.66015625, "reward_std": 0.07733979821205139, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.85546875, "rewards/format_reward": 1.0, "step": 897 }, { "completion_length": 80.0078125, "epoch": 0.05762875020054548, "grad_norm": 5.905702305312464, "kl": 0.03955078125, "learning_rate": 9.711847002952125e-07, "loss": 0.0016, "reward": 2.73046875, "reward_std": 0.09890010580420494, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 0.99609375, "rewards/format_reward": 1.0, "step": 898 }, { "completion_length": 87.453125, "epoch": 0.05769292475533451, "grad_norm": 8.65210393846615, "kl": 0.042724609375, "learning_rate": 9.711526119881915e-07, "loss": 0.0017, "reward": 3.41015625, "reward_std": 0.20552846044301987, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.73828125, "rewards/format_reward": 1.0, "step": 899 }, { "completion_length": 94.671875, "epoch": 0.05775709931012354, "grad_norm": 1.8315102009128945, "kl": 0.03955078125, "learning_rate": 9.711205236811707e-07, "loss": 0.0016, "reward": 3.2734375, "reward_std": 0.11336850002408028, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 900 }, { "completion_length": 94.9296875, "epoch": 0.057821273864912565, "grad_norm": 2.7029965119389505, "kl": 0.03814697265625, "learning_rate": 9.710884353741497e-07, "loss": 0.0015, "reward": 3.69140625, "reward_std": 0.06891230028122663, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.99609375, "rewards/format_reward": 1.0, "step": 901 }, { "completion_length": 77.65625, "epoch": 0.057885448419701585, "grad_norm": 2.3238717836914162, "kl": 0.0465087890625, "learning_rate": 9.710563470671287e-07, "loss": 0.0019, "reward": 3.1953125, "reward_std": 0.08679073117673397, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 0.9921875, "step": 902 }, { "completion_length": 86.0546875, "epoch": 0.05794962297449061, "grad_norm": 3.1632666436303682, "kl": 0.041748046875, "learning_rate": 9.710242587601077e-07, "loss": 0.0017, "reward": 3.73046875, "reward_std": 0.10603968799114227, "rewards/accuracy_reward": 0.859375, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 1.0, "step": 903 }, { "completion_length": 77.5234375, "epoch": 0.05801379752927964, "grad_norm": 0.4481255861055262, "kl": 0.04248046875, "learning_rate": 9.70992170453087e-07, "loss": 0.0017, "reward": 3.3671875, "reward_std": 0.022097086533904076, "rewards/accuracy_reward": 0.8671875, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 904 }, { "completion_length": 91.8046875, "epoch": 0.05807797208406867, "grad_norm": 3.7271974557351695, "kl": 0.032958984375, "learning_rate": 9.70960082146066e-07, "loss": 0.0013, "reward": 3.35546875, "reward_std": 0.19703350216150284, "rewards/accuracy_reward": 0.7421875, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 0.9921875, "step": 905 }, { "completion_length": 100.765625, "epoch": 0.058142146638857695, "grad_norm": 4.571344799300008, "kl": 0.0396728515625, "learning_rate": 9.70927993839045e-07, "loss": 0.0016, "reward": 3.47265625, "reward_std": 0.18336882442235947, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.87109375, "rewards/format_reward": 0.9921875, "step": 906 }, { "completion_length": 85.09375, "epoch": 0.05820632119364672, "grad_norm": 0.7170139595084525, "kl": 0.037353515625, "learning_rate": 9.708959055320241e-07, "loss": 0.0015, "reward": 3.1875, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.9375, "rewards/format_count_numbers": 1.25, "rewards/format_reward": 1.0, "step": 907 }, { "completion_length": 82.8203125, "epoch": 0.05827049574843574, "grad_norm": 13.99705079106526, "kl": 0.0457763671875, "learning_rate": 9.708638172250031e-07, "loss": 0.0018, "reward": 3.51171875, "reward_std": 0.13861849904060364, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 1.0, "step": 908 }, { "completion_length": 91.40625, "epoch": 0.05833467030322477, "grad_norm": 1.9654341427656066, "kl": 0.0350341796875, "learning_rate": 9.708317289179823e-07, "loss": 0.0014, "reward": 2.87890625, "reward_std": 0.13520356267690659, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.24609375, "rewards/format_reward": 1.0, "step": 909 }, { "completion_length": 86.3828125, "epoch": 0.0583988448580138, "grad_norm": 0.6802703261654865, "kl": 0.040283203125, "learning_rate": 9.707996406109613e-07, "loss": 0.0016, "reward": 2.8984375, "reward_std": 0.05550473928451538, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.25, "rewards/format_reward": 1.0, "step": 910 }, { "completion_length": 92.875, "epoch": 0.058463019412802825, "grad_norm": 3.3157013341701926, "kl": 0.038818359375, "learning_rate": 9.707675523039403e-07, "loss": 0.0016, "reward": 2.8984375, "reward_std": 0.134404756128788, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.25, "rewards/format_reward": 1.0, "step": 911 }, { "completion_length": 89.0078125, "epoch": 0.05852719396759185, "grad_norm": 3.262006681968917, "kl": 0.0567626953125, "learning_rate": 9.707354639969195e-07, "loss": 0.0023, "reward": 3.32421875, "reward_std": 0.12836876511573792, "rewards/accuracy_reward": 0.8515625, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 0.9921875, "step": 912 }, { "completion_length": 101.140625, "epoch": 0.05859136852238088, "grad_norm": 13.84586416962107, "kl": 0.0523681640625, "learning_rate": 9.707033756898985e-07, "loss": 0.0021, "reward": 3.37890625, "reward_std": 0.22806766629219055, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.69921875, "rewards/format_reward": 1.0, "step": 913 }, { "completion_length": 92.265625, "epoch": 0.0586555430771699, "grad_norm": 5.227682934217266, "kl": 0.0509033203125, "learning_rate": 9.706712873828775e-07, "loss": 0.002, "reward": 3.43359375, "reward_std": 0.08417459763586521, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 1.0, "step": 914 }, { "completion_length": 83.28125, "epoch": 0.05871971763195893, "grad_norm": 2.4874620257369298, "kl": 0.0435791015625, "learning_rate": 9.706391990758567e-07, "loss": 0.0017, "reward": 3.28515625, "reward_std": 0.10627168416976929, "rewards/accuracy_reward": 0.7890625, "rewards/format_count_numbers": 1.49609375, "rewards/format_reward": 1.0, "step": 915 }, { "completion_length": 86.1015625, "epoch": 0.058783892186747955, "grad_norm": 8.041692940801862, "kl": 0.0546875, "learning_rate": 9.706071107688357e-07, "loss": 0.0022, "reward": 2.9296875, "reward_std": 0.13888052850961685, "rewards/accuracy_reward": 0.5703125, "rewards/format_count_numbers": 1.359375, "rewards/format_reward": 1.0, "step": 916 }, { "completion_length": 100.125, "epoch": 0.05884806674153698, "grad_norm": 2.037308374090791, "kl": 0.048095703125, "learning_rate": 9.70575022461815e-07, "loss": 0.0019, "reward": 2.796875, "reward_std": 0.2041158676147461, "rewards/accuracy_reward": 0.546875, "rewards/format_count_numbers": 1.25, "rewards/format_reward": 1.0, "step": 917 }, { "completion_length": 89.75, "epoch": 0.05891224129632601, "grad_norm": 6.100185084700387, "kl": 0.0439453125, "learning_rate": 9.70542934154794e-07, "loss": 0.0018, "reward": 3.5390625, "reward_std": 0.1938612163066864, "rewards/accuracy_reward": 0.796875, "rewards/format_count_numbers": 1.7421875, "rewards/format_reward": 1.0, "step": 918 }, { "completion_length": 98.7890625, "epoch": 0.05897641585111503, "grad_norm": 13.702696511915338, "kl": 0.0478515625, "learning_rate": 9.70510845847773e-07, "loss": 0.0019, "reward": 3.34375, "reward_std": 0.2997615784406662, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.6953125, "rewards/format_reward": 0.9921875, "step": 919 }, { "completion_length": 98.3515625, "epoch": 0.05904059040590406, "grad_norm": 1.9257244796606259, "kl": 0.0438232421875, "learning_rate": 9.704787575407522e-07, "loss": 0.0018, "reward": 3.6484375, "reward_std": 0.134404756128788, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.875, "rewards/format_reward": 0.9921875, "step": 920 }, { "completion_length": 92.171875, "epoch": 0.059104764960693085, "grad_norm": 3.0315472593712114, "kl": 0.0391845703125, "learning_rate": 9.704466692337312e-07, "loss": 0.0016, "reward": 3.29296875, "reward_std": 0.2252446785569191, "rewards/accuracy_reward": 0.578125, "rewards/format_count_numbers": 1.71484375, "rewards/format_reward": 1.0, "step": 921 }, { "completion_length": 84.7890625, "epoch": 0.05916893951548211, "grad_norm": 2.2229144939928647, "kl": 0.0484619140625, "learning_rate": 9.704145809267102e-07, "loss": 0.0019, "reward": 3.5078125, "reward_std": 0.20175684988498688, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.734375, "rewards/format_reward": 1.0, "step": 922 }, { "completion_length": 94.7578125, "epoch": 0.05923311407027114, "grad_norm": 2.0545641395642154, "kl": 0.0445556640625, "learning_rate": 9.703824926196894e-07, "loss": 0.0018, "reward": 2.76171875, "reward_std": 0.1432773917913437, "rewards/accuracy_reward": 0.3984375, "rewards/format_count_numbers": 1.36328125, "rewards/format_reward": 1.0, "step": 923 }, { "completion_length": 87.7890625, "epoch": 0.05929728862506017, "grad_norm": 3.5360398691980763, "kl": 0.0361328125, "learning_rate": 9.703504043126684e-07, "loss": 0.0014, "reward": 3.28515625, "reward_std": 0.07733980566263199, "rewards/accuracy_reward": 0.9140625, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 924 }, { "completion_length": 91.9375, "epoch": 0.05936146317984919, "grad_norm": 2.690829659184727, "kl": 0.0531005859375, "learning_rate": 9.703183160056476e-07, "loss": 0.0021, "reward": 3.37109375, "reward_std": 0.20670828223228455, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.58984375, "rewards/format_reward": 1.0, "step": 925 }, { "completion_length": 84.453125, "epoch": 0.059425637734638215, "grad_norm": 7.77848582475477, "kl": 0.0428466796875, "learning_rate": 9.702862276986266e-07, "loss": 0.0017, "reward": 3.171875, "reward_std": 0.10205793008208275, "rewards/accuracy_reward": 0.796875, "rewards/format_count_numbers": 1.375, "rewards/format_reward": 1.0, "step": 926 }, { "completion_length": 94.0390625, "epoch": 0.05948981228942724, "grad_norm": 2.8319672603083963, "kl": 0.0537109375, "learning_rate": 9.702541393916056e-07, "loss": 0.0021, "reward": 3.3125, "reward_std": 0.19727616012096405, "rewards/accuracy_reward": 0.59375, "rewards/format_count_numbers": 1.7265625, "rewards/format_reward": 0.9921875, "step": 927 }, { "completion_length": 95.09375, "epoch": 0.05955398684421627, "grad_norm": 13.42813714788352, "kl": 0.05859375, "learning_rate": 9.702220510845848e-07, "loss": 0.0024, "reward": 3.24609375, "reward_std": 0.2906537801027298, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.56640625, "rewards/format_reward": 1.0, "step": 928 }, { "completion_length": 94.6328125, "epoch": 0.0596181613990053, "grad_norm": 3.6503644685962113, "kl": 0.0440673828125, "learning_rate": 9.701899627775638e-07, "loss": 0.0018, "reward": 3.515625, "reward_std": 0.14283224940299988, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.8515625, "rewards/format_reward": 1.0, "step": 929 }, { "completion_length": 94.0234375, "epoch": 0.05968233595379432, "grad_norm": 2.8258862711560147, "kl": 0.0478515625, "learning_rate": 9.701578744705428e-07, "loss": 0.0019, "reward": 3.45703125, "reward_std": 0.11994126439094543, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 1.0, "step": 930 }, { "completion_length": 95.0390625, "epoch": 0.059746510508583345, "grad_norm": 3.331997820137994, "kl": 0.0413818359375, "learning_rate": 9.70125786163522e-07, "loss": 0.0017, "reward": 3.2265625, "reward_std": 0.1694779098033905, "rewards/accuracy_reward": 0.4921875, "rewards/format_count_numbers": 1.734375, "rewards/format_reward": 1.0, "step": 931 }, { "completion_length": 86.7265625, "epoch": 0.05981068506337237, "grad_norm": 1.7686734121690535, "kl": 0.0361328125, "learning_rate": 9.70093697856501e-07, "loss": 0.0014, "reward": 3.3359375, "reward_std": 0.09522314183413982, "rewards/accuracy_reward": 0.5859375, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 1.0, "step": 932 }, { "completion_length": 97.71875, "epoch": 0.0598748596181614, "grad_norm": 3.264220351557874, "kl": 0.043212890625, "learning_rate": 9.700616095494802e-07, "loss": 0.0017, "reward": 3.5078125, "reward_std": 0.16834919899702072, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 1.0, "step": 933 }, { "completion_length": 81.2265625, "epoch": 0.05993903417295043, "grad_norm": 1.3146926662505907, "kl": 0.0418701171875, "learning_rate": 9.700295212424592e-07, "loss": 0.0017, "reward": 3.3203125, "reward_std": 0.06629125960171223, "rewards/accuracy_reward": 0.8203125, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 934 }, { "completion_length": 88.28125, "epoch": 0.060003208727739454, "grad_norm": 2.4186666145506135, "kl": 0.0408935546875, "learning_rate": 9.699974329354382e-07, "loss": 0.0016, "reward": 3.42578125, "reward_std": 0.16303367167711258, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 1.0, "step": 935 }, { "completion_length": 89.28125, "epoch": 0.060067383282528475, "grad_norm": 3.4529571753121933, "kl": 0.0911865234375, "learning_rate": 9.699653446284174e-07, "loss": 0.0036, "reward": 3.20703125, "reward_std": 0.09783927351236343, "rewards/accuracy_reward": 0.8359375, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 936 }, { "completion_length": 84.359375, "epoch": 0.0601315578373175, "grad_norm": 1.6768599417825407, "kl": 0.0386962890625, "learning_rate": 9.699332563213964e-07, "loss": 0.0015, "reward": 2.84765625, "reward_std": 0.14992906898260117, "rewards/accuracy_reward": 0.7421875, "rewards/format_count_numbers": 1.10546875, "rewards/format_reward": 1.0, "step": 937 }, { "completion_length": 86.265625, "epoch": 0.06019573239210653, "grad_norm": 2.4454927691381934, "kl": 0.0465087890625, "learning_rate": 9.699011680143754e-07, "loss": 0.0019, "reward": 3.7734375, "reward_std": 0.11426013335585594, "rewards/accuracy_reward": 0.90625, "rewards/format_count_numbers": 1.8671875, "rewards/format_reward": 1.0, "step": 938 }, { "completion_length": 81.8203125, "epoch": 0.06025990694689556, "grad_norm": 1.5197817363015238, "kl": 0.038330078125, "learning_rate": 9.698690797073544e-07, "loss": 0.0015, "reward": 3.15625, "reward_std": 0.13781969994306564, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 939 }, { "completion_length": 90.3125, "epoch": 0.060324081501684584, "grad_norm": 1.227330558412233, "kl": 0.0469970703125, "learning_rate": 9.698369914003337e-07, "loss": 0.0019, "reward": 3.55859375, "reward_std": 0.12677115201950073, "rewards/accuracy_reward": 0.8125, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 1.0, "step": 940 }, { "completion_length": 84.1484375, "epoch": 0.060388256056473605, "grad_norm": 1.994867494352177, "kl": 0.036865234375, "learning_rate": 9.698049030933127e-07, "loss": 0.0015, "reward": 3.62890625, "reward_std": 0.13967934250831604, "rewards/accuracy_reward": 0.8828125, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 1.0, "step": 941 }, { "completion_length": 97.921875, "epoch": 0.06045243061126263, "grad_norm": 1.7723285905166963, "kl": 0.04248046875, "learning_rate": 9.697728147862919e-07, "loss": 0.0017, "reward": 3.234375, "reward_std": 0.07312605064362288, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 942 }, { "completion_length": 93.6953125, "epoch": 0.06051660516605166, "grad_norm": 2.249129636957678, "kl": 0.048583984375, "learning_rate": 9.697407264792709e-07, "loss": 0.0019, "reward": 3.15625, "reward_std": 0.18033315986394882, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.484375, "rewards/format_reward": 1.0, "step": 943 }, { "completion_length": 85.171875, "epoch": 0.06058077972084069, "grad_norm": 10.256650665544598, "kl": 0.0411376953125, "learning_rate": 9.6970863817225e-07, "loss": 0.0016, "reward": 3.19140625, "reward_std": 0.1488993838429451, "rewards/accuracy_reward": 0.8203125, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 944 }, { "completion_length": 84.3515625, "epoch": 0.060644954275629714, "grad_norm": 22.43478299619983, "kl": 0.0423583984375, "learning_rate": 9.69676549865229e-07, "loss": 0.0017, "reward": 3.25390625, "reward_std": 0.15570303052663803, "rewards/accuracy_reward": 0.8828125, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 945 }, { "completion_length": 79.1796875, "epoch": 0.06070912883041874, "grad_norm": 1.825483579298726, "kl": 0.055908203125, "learning_rate": 9.69644461558208e-07, "loss": 0.0022, "reward": 3.40625, "reward_std": 0.0731260534375906, "rewards/accuracy_reward": 0.90625, "rewards/format_count_numbers": 1.5, "rewards/format_reward": 1.0, "step": 946 }, { "completion_length": 84.421875, "epoch": 0.06077330338520776, "grad_norm": 1.6084734153960205, "kl": 0.03857421875, "learning_rate": 9.696123732511873e-07, "loss": 0.0015, "reward": 3.5625, "reward_std": 0.08785156160593033, "rewards/accuracy_reward": 0.8125, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 1.0, "step": 947 }, { "completion_length": 82.328125, "epoch": 0.06083747793999679, "grad_norm": 2.947354548767598, "kl": 0.0423583984375, "learning_rate": 9.695802849441663e-07, "loss": 0.0017, "reward": 3.20703125, "reward_std": 0.11331771314144135, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.47265625, "rewards/format_reward": 1.0, "step": 948 }, { "completion_length": 80.3984375, "epoch": 0.06090165249478582, "grad_norm": 3.1590147885522804, "kl": 0.0447998046875, "learning_rate": 9.695481966371453e-07, "loss": 0.0018, "reward": 3.1171875, "reward_std": 0.23012374341487885, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.4921875, "rewards/format_reward": 1.0, "step": 949 }, { "completion_length": 89.296875, "epoch": 0.060965827049574844, "grad_norm": 1.9585448432528862, "kl": 0.0367431640625, "learning_rate": 9.695161083301245e-07, "loss": 0.0015, "reward": 3.0625, "reward_std": 0.1865624412894249, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.359375, "rewards/format_reward": 1.0, "step": 950 }, { "completion_length": 92.546875, "epoch": 0.06103000160436387, "grad_norm": 9.048768103945655, "kl": 0.0592041015625, "learning_rate": 9.694840200231035e-07, "loss": 0.0024, "reward": 3.171875, "reward_std": 0.18452247232198715, "rewards/accuracy_reward": 0.5625, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 0.9921875, "step": 951 }, { "completion_length": 93.8203125, "epoch": 0.0610941761591529, "grad_norm": 4.087469842466786, "kl": 0.05224609375, "learning_rate": 9.694519317160827e-07, "loss": 0.0021, "reward": 3.359375, "reward_std": 0.19860084354877472, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.7265625, "rewards/format_reward": 1.0, "step": 952 }, { "completion_length": 76.5078125, "epoch": 0.06115835071394192, "grad_norm": 1.6424447371560398, "kl": 0.04736328125, "learning_rate": 9.694198434090617e-07, "loss": 0.0019, "reward": 3.1328125, "reward_std": 0.07959238067269325, "rewards/accuracy_reward": 0.890625, "rewards/format_count_numbers": 1.25, "rewards/format_reward": 0.9921875, "step": 953 }, { "completion_length": 81.1796875, "epoch": 0.06122252526873095, "grad_norm": 1.36869759809087, "kl": 0.042724609375, "learning_rate": 9.693877551020407e-07, "loss": 0.0017, "reward": 3.078125, "reward_std": 0.07312605530023575, "rewards/accuracy_reward": 0.828125, "rewards/format_count_numbers": 1.25, "rewards/format_reward": 1.0, "step": 954 }, { "completion_length": 84.59375, "epoch": 0.061286699823519974, "grad_norm": 3.178842783606729, "kl": 0.0634765625, "learning_rate": 9.6935566679502e-07, "loss": 0.0025, "reward": 3.67578125, "reward_std": 0.19365980848670006, "rewards/accuracy_reward": 0.8359375, "rewards/format_count_numbers": 1.83984375, "rewards/format_reward": 1.0, "step": 955 }, { "completion_length": 81.625, "epoch": 0.061350874378309, "grad_norm": 1.6903373143157647, "kl": 0.045654296875, "learning_rate": 9.69323578487999e-07, "loss": 0.0018, "reward": 3.18359375, "reward_std": 0.17939773201942444, "rewards/accuracy_reward": 0.8125, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 956 }, { "completion_length": 80.71875, "epoch": 0.06141504893309803, "grad_norm": 0.683941212800394, "kl": 0.04443359375, "learning_rate": 9.69291490180978e-07, "loss": 0.0018, "reward": 3.3671875, "reward_std": 0.022097086533904076, "rewards/accuracy_reward": 0.7421875, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 957 }, { "completion_length": 73.0234375, "epoch": 0.06147922348788705, "grad_norm": 0.6250258623730682, "kl": 0.0391845703125, "learning_rate": 9.692594018739571e-07, "loss": 0.0016, "reward": 3.046875, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.3671875, "rewards/format_reward": 1.0, "step": 958 }, { "completion_length": 83.7265625, "epoch": 0.06154339804267608, "grad_norm": 2.2300362073171622, "kl": 0.04931640625, "learning_rate": 9.692273135669361e-07, "loss": 0.002, "reward": 3.390625, "reward_std": 0.11678344011306763, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 1.0, "step": 959 }, { "completion_length": 74.3359375, "epoch": 0.061607572597465105, "grad_norm": 1.4073706789262752, "kl": 0.0438232421875, "learning_rate": 9.691952252599154e-07, "loss": 0.0018, "reward": 3.40234375, "reward_std": 0.10603968799114227, "rewards/accuracy_reward": 0.90625, "rewards/format_count_numbers": 1.49609375, "rewards/format_reward": 1.0, "step": 960 }, { "completion_length": 80.03125, "epoch": 0.06167174715225413, "grad_norm": 1.4844013107166356, "kl": 0.054931640625, "learning_rate": 9.691631369528944e-07, "loss": 0.0022, "reward": 3.3984375, "reward_std": 0.09704046696424484, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 0.9921875, "step": 961 }, { "completion_length": 79.359375, "epoch": 0.06173592170704316, "grad_norm": 1.026288788186902, "kl": 0.0499267578125, "learning_rate": 9.691310486458734e-07, "loss": 0.002, "reward": 2.86328125, "reward_std": 0.08647446520626545, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.24609375, "rewards/format_reward": 0.9921875, "step": 962 }, { "completion_length": 76.3359375, "epoch": 0.06180009626183219, "grad_norm": 1.9563863513431772, "kl": 0.0404052734375, "learning_rate": 9.690989603388526e-07, "loss": 0.0016, "reward": 3.23828125, "reward_std": 0.108589768409729, "rewards/accuracy_reward": 0.8671875, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 963 }, { "completion_length": 75.6875, "epoch": 0.06186427081662121, "grad_norm": 2.139021885010674, "kl": 0.048095703125, "learning_rate": 9.690668720318316e-07, "loss": 0.0019, "reward": 3.09375, "reward_std": 0.13541388511657715, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.375, "rewards/format_reward": 0.9921875, "step": 964 }, { "completion_length": 72.765625, "epoch": 0.061928445371410235, "grad_norm": 1.45507575787633, "kl": 0.042724609375, "learning_rate": 9.690347837248106e-07, "loss": 0.0017, "reward": 3.32421875, "reward_std": 0.0881456807255745, "rewards/accuracy_reward": 0.828125, "rewards/format_count_numbers": 1.49609375, "rewards/format_reward": 1.0, "step": 965 }, { "completion_length": 80.765625, "epoch": 0.06199261992619926, "grad_norm": 3.31649782971472, "kl": 0.0621337890625, "learning_rate": 9.690026954177896e-07, "loss": 0.0025, "reward": 3.28515625, "reward_std": 0.10771321505308151, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 1.0, "step": 966 }, { "completion_length": 80.5546875, "epoch": 0.06205679448098829, "grad_norm": 2.5228291106593277, "kl": 0.0462646484375, "learning_rate": 9.689706071107688e-07, "loss": 0.0018, "reward": 3.46875, "reward_std": 0.13781969994306564, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 1.0, "step": 967 }, { "completion_length": 76.9296875, "epoch": 0.06212096903577732, "grad_norm": 3.7313025091401686, "kl": 0.0411376953125, "learning_rate": 9.689385188037478e-07, "loss": 0.0016, "reward": 3.2734375, "reward_std": 0.1660095378756523, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.4921875, "rewards/format_reward": 1.0, "step": 968 }, { "completion_length": 73.953125, "epoch": 0.06218514359056634, "grad_norm": 1.309804034711807, "kl": 0.0360107421875, "learning_rate": 9.68906430496727e-07, "loss": 0.0014, "reward": 3.55078125, "reward_std": 0.0992048978805542, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.74609375, "rewards/format_reward": 1.0, "step": 969 }, { "completion_length": 77.359375, "epoch": 0.062249318145355365, "grad_norm": 2.509491499478309, "kl": 0.0606689453125, "learning_rate": 9.68874342189706e-07, "loss": 0.0024, "reward": 3.0390625, "reward_std": 0.10889272391796112, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.375, "rewards/format_reward": 0.9921875, "step": 970 }, { "completion_length": 85.671875, "epoch": 0.06231349270014439, "grad_norm": 3.004523587333302, "kl": 0.04443359375, "learning_rate": 9.688422538826852e-07, "loss": 0.0018, "reward": 3.3515625, "reward_std": 0.11336849629878998, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.625, "rewards/format_reward": 1.0, "step": 971 }, { "completion_length": 89.375, "epoch": 0.06237766725493342, "grad_norm": 2.4378002757724473, "kl": 0.048828125, "learning_rate": 9.688101655756642e-07, "loss": 0.002, "reward": 3.55859375, "reward_std": 0.2009580433368683, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.86328125, "rewards/format_reward": 1.0, "step": 972 }, { "completion_length": 78.84375, "epoch": 0.06244184180972245, "grad_norm": 0.946373795977536, "kl": 0.0499267578125, "learning_rate": 9.687780772686432e-07, "loss": 0.002, "reward": 3.19921875, "reward_std": 0.039980421774089336, "rewards/accuracy_reward": 0.578125, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 973 }, { "completion_length": 78.3984375, "epoch": 0.06250601636451147, "grad_norm": 2.7098544740949273, "kl": 0.0477294921875, "learning_rate": 9.687459889616222e-07, "loss": 0.0019, "reward": 3.20703125, "reward_std": 0.14775316417217255, "rewards/accuracy_reward": 0.8359375, "rewards/format_count_numbers": 1.37109375, "rewards/format_reward": 1.0, "step": 974 }, { "completion_length": 93.375, "epoch": 0.0625701909193005, "grad_norm": 5.084564561449596, "kl": 0.0526123046875, "learning_rate": 9.687139006546014e-07, "loss": 0.0021, "reward": 3.31640625, "reward_std": 0.13178370893001556, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 0.9921875, "step": 975 }, { "completion_length": 83.3359375, "epoch": 0.06263436547408953, "grad_norm": 2.711455289003609, "kl": 0.0458984375, "learning_rate": 9.686818123475804e-07, "loss": 0.0018, "reward": 3.34765625, "reward_std": 0.07733980193734169, "rewards/accuracy_reward": 0.8515625, "rewards/format_count_numbers": 1.49609375, "rewards/format_reward": 1.0, "step": 976 }, { "completion_length": 89.8359375, "epoch": 0.06269854002887855, "grad_norm": 4.758913874816007, "kl": 0.0863037109375, "learning_rate": 9.686497240405596e-07, "loss": 0.0035, "reward": 3.27734375, "reward_std": 0.3005019724369049, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.58203125, "rewards/format_reward": 1.0, "step": 977 }, { "completion_length": 91.8125, "epoch": 0.06276271458366757, "grad_norm": 4.659659140510254, "kl": 0.04150390625, "learning_rate": 9.686176357335386e-07, "loss": 0.0017, "reward": 3.4765625, "reward_std": 0.1054728701710701, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.7421875, "rewards/format_reward": 1.0, "step": 978 }, { "completion_length": 90.890625, "epoch": 0.0628268891384566, "grad_norm": 2.0174792957813263, "kl": 0.055419921875, "learning_rate": 9.685855474265178e-07, "loss": 0.0022, "reward": 3.42578125, "reward_std": 0.10945463180541992, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 979 }, { "completion_length": 79.1484375, "epoch": 0.06289106369324562, "grad_norm": 4.257200459750483, "kl": 0.04638671875, "learning_rate": 9.685534591194968e-07, "loss": 0.0019, "reward": 3.49609375, "reward_std": 0.06207750737667084, "rewards/accuracy_reward": 0.875, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 980 }, { "completion_length": 103.53125, "epoch": 0.06295523824803466, "grad_norm": 2.9753963368414054, "kl": 0.0443115234375, "learning_rate": 9.685213708124758e-07, "loss": 0.0018, "reward": 3.3828125, "reward_std": 0.12268763408064842, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.7421875, "rewards/format_reward": 1.0, "step": 981 }, { "completion_length": 99.84375, "epoch": 0.06301941280282368, "grad_norm": 1.525818114061489, "kl": 0.0631103515625, "learning_rate": 9.684892825054548e-07, "loss": 0.0025, "reward": 3.1484375, "reward_std": 0.11482449620962143, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.484375, "rewards/format_reward": 0.9921875, "step": 982 }, { "completion_length": 92.6328125, "epoch": 0.0630835873576127, "grad_norm": 2.0496514295180277, "kl": 0.051513671875, "learning_rate": 9.68457194198434e-07, "loss": 0.0021, "reward": 3.44921875, "reward_std": 0.1541452743113041, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.72265625, "rewards/format_reward": 0.9921875, "step": 983 }, { "completion_length": 94.0546875, "epoch": 0.06314776191240173, "grad_norm": 8.944777448696968, "kl": 0.0489501953125, "learning_rate": 9.68425105891413e-07, "loss": 0.002, "reward": 3.52734375, "reward_std": 0.11712249182164669, "rewards/accuracy_reward": 0.796875, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 1.0, "step": 984 }, { "completion_length": 90.3359375, "epoch": 0.06321193646719075, "grad_norm": 2.363675786699937, "kl": 0.0469970703125, "learning_rate": 9.683930175843923e-07, "loss": 0.0019, "reward": 2.9765625, "reward_std": 0.1888798028230667, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.3671875, "rewards/format_reward": 1.0, "step": 985 }, { "completion_length": 89.5703125, "epoch": 0.06327611102197979, "grad_norm": 9.572096103898028, "kl": 0.045166015625, "learning_rate": 9.683609292773713e-07, "loss": 0.0018, "reward": 3.51953125, "reward_std": 0.12458535842597485, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.85546875, "rewards/format_reward": 1.0, "step": 986 }, { "completion_length": 95.625, "epoch": 0.06334028557676881, "grad_norm": 2.231229679852149, "kl": 0.044677734375, "learning_rate": 9.683288409703505e-07, "loss": 0.0018, "reward": 3.2890625, "reward_std": 0.18648963794112206, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 987 }, { "completion_length": 99.8359375, "epoch": 0.06340446013155784, "grad_norm": 4.062331285793409, "kl": 0.0421142578125, "learning_rate": 9.682967526633295e-07, "loss": 0.0017, "reward": 3.26171875, "reward_std": 0.14886824414134026, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.48828125, "rewards/format_reward": 1.0, "step": 988 }, { "completion_length": 100.6015625, "epoch": 0.06346863468634686, "grad_norm": 1.80324736091925, "kl": 0.041015625, "learning_rate": 9.682646643563085e-07, "loss": 0.0016, "reward": 3.4296875, "reward_std": 0.0946863517165184, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 1.0, "step": 989 }, { "completion_length": 95.078125, "epoch": 0.06353280924113588, "grad_norm": 2.867165882028107, "kl": 0.0428466796875, "learning_rate": 9.682325760492875e-07, "loss": 0.0017, "reward": 3.4140625, "reward_std": 0.11914245784282684, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 1.0, "step": 990 }, { "completion_length": 94.203125, "epoch": 0.06359698379592492, "grad_norm": 3.4453849994158166, "kl": 0.0474853515625, "learning_rate": 9.682004877422667e-07, "loss": 0.0019, "reward": 3.34375, "reward_std": 0.18489694222807884, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.609375, "rewards/format_reward": 1.0, "step": 991 }, { "completion_length": 89.8828125, "epoch": 0.06366115835071394, "grad_norm": 1.216025113807148, "kl": 0.0472412109375, "learning_rate": 9.681683994352457e-07, "loss": 0.0019, "reward": 3.3984375, "reward_std": 0.1481909602880478, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.59375, "rewards/format_reward": 1.0, "step": 992 }, { "completion_length": 106.34375, "epoch": 0.06372533290550297, "grad_norm": 1.385692675813348, "kl": 0.03955078125, "learning_rate": 9.68136311128225e-07, "loss": 0.0016, "reward": 3.28125, "reward_std": 0.06549490615725517, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 993 }, { "completion_length": 95.359375, "epoch": 0.063789507460292, "grad_norm": 1.3697400408076814, "kl": 0.0513916015625, "learning_rate": 9.68104222821204e-07, "loss": 0.0021, "reward": 3.125, "reward_std": 0.11100947484374046, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.4921875, "rewards/format_reward": 1.0, "step": 994 }, { "completion_length": 90.2109375, "epoch": 0.06385368201508101, "grad_norm": 1.5458448041098067, "kl": 0.041015625, "learning_rate": 9.680721345141831e-07, "loss": 0.0016, "reward": 3.4609375, "reward_std": 0.11572261154651642, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.75, "rewards/format_reward": 1.0, "step": 995 }, { "completion_length": 92.9453125, "epoch": 0.06391785656987005, "grad_norm": 0.9410644686627334, "kl": 0.0418701171875, "learning_rate": 9.680400462071621e-07, "loss": 0.0017, "reward": 3.15625, "reward_std": 0.033407654613256454, "rewards/accuracy_reward": 0.90625, "rewards/format_count_numbers": 1.25, "rewards/format_reward": 1.0, "step": 996 }, { "completion_length": 103.5859375, "epoch": 0.06398203112465907, "grad_norm": 3.6924481491686576, "kl": 0.1009521484375, "learning_rate": 9.680079579001411e-07, "loss": 0.004, "reward": 3.14453125, "reward_std": 0.2409295253455639, "rewards/accuracy_reward": 0.5625, "rewards/format_count_numbers": 1.59765625, "rewards/format_reward": 0.984375, "step": 997 }, { "completion_length": 91.875, "epoch": 0.0640462056794481, "grad_norm": 0.9822843332069, "kl": 0.0570068359375, "learning_rate": 9.679758695931203e-07, "loss": 0.0023, "reward": 3.42578125, "reward_std": 0.06320622842758894, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.73046875, "rewards/format_reward": 1.0, "step": 998 }, { "completion_length": 92.46875, "epoch": 0.06411038023423712, "grad_norm": 6.039264441920903, "kl": 0.063720703125, "learning_rate": 9.679437812860993e-07, "loss": 0.0025, "reward": 3.19921875, "reward_std": 0.2414727881550789, "rewards/accuracy_reward": 0.6015625, "rewards/format_count_numbers": 1.59765625, "rewards/format_reward": 1.0, "step": 999 }, { "completion_length": 90.2734375, "epoch": 0.06417455478902614, "grad_norm": 3.2487587019837094, "kl": 0.03564453125, "learning_rate": 9.679116929790783e-07, "loss": 0.0014, "reward": 3.46484375, "reward_std": 0.1488682385534048, "rewards/accuracy_reward": 0.84375, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 1000 } ], "logging_steps": 1.0, "max_steps": 31164, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }