{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.006417455478902615, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 89.1796875, "epoch": 6.417455478902615e-05, "grad_norm": 6.457693642993236, "kl": 0.0, "learning_rate": 9.99967911692979e-07, "loss": 0.0, "reward": 2.8125, "reward_std": 0.5811586081981659, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.2265625, "rewards/format_reward": 0.9296875, "step": 1 }, { "completion_length": 100.859375, "epoch": 0.0001283491095780523, "grad_norm": 3.7199479867427576, "kl": 0.0006103515625, "learning_rate": 9.999358233859582e-07, "loss": 0.0, "reward": 2.95703125, "reward_std": 0.9970237612724304, "rewards/accuracy_reward": 0.5, "rewards/format_count_numbers": 1.53515625, "rewards/format_reward": 0.921875, "step": 2 }, { "completion_length": 99.9609375, "epoch": 0.00019252366436707844, "grad_norm": 4.667048062421479, "kl": 0.000865936279296875, "learning_rate": 9.999037350789372e-07, "loss": 0.0, "reward": 2.75, "reward_std": 0.6299314796924591, "rewards/accuracy_reward": 0.3984375, "rewards/format_count_numbers": 1.40625, "rewards/format_reward": 0.9453125, "step": 3 }, { "completion_length": 105.375, "epoch": 0.0002566982191561046, "grad_norm": 6.61841287599563, "kl": 0.000850677490234375, "learning_rate": 9.998716467719162e-07, "loss": 0.0, "reward": 2.6484375, "reward_std": 0.6614057421684265, "rewards/accuracy_reward": 0.4453125, "rewards/format_count_numbers": 1.2890625, "rewards/format_reward": 0.9140625, "step": 4 }, { "completion_length": 98.4921875, "epoch": 0.00032087277394513073, "grad_norm": 10.226707577648735, "kl": 0.001094818115234375, "learning_rate": 9.998395584648954e-07, "loss": 0.0, "reward": 2.7734375, "reward_std": 0.684965580701828, "rewards/accuracy_reward": 0.53125, "rewards/format_count_numbers": 1.3125, "rewards/format_reward": 0.9296875, "step": 5 }, { "completion_length": 105.0078125, "epoch": 0.0003850473287341569, "grad_norm": 16.540801471606787, "kl": 0.00214385986328125, "learning_rate": 9.998074701578744e-07, "loss": 0.0001, "reward": 2.38671875, "reward_std": 0.6537165194749832, "rewards/accuracy_reward": 0.4140625, "rewards/format_count_numbers": 1.08984375, "rewards/format_reward": 0.8828125, "step": 6 }, { "completion_length": 96.6484375, "epoch": 0.0004492218835231831, "grad_norm": 4.007831804571436, "kl": 0.003238677978515625, "learning_rate": 9.997753818508536e-07, "loss": 0.0001, "reward": 2.85546875, "reward_std": 0.5957659184932709, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.23828125, "rewards/format_reward": 0.953125, "step": 7 }, { "completion_length": 94.6328125, "epoch": 0.0005133964383122092, "grad_norm": 3.3765246174936343, "kl": 0.00238037109375, "learning_rate": 9.997432935438326e-07, "loss": 0.0001, "reward": 3.05078125, "reward_std": 0.5704643428325653, "rewards/accuracy_reward": 0.515625, "rewards/format_count_numbers": 1.58203125, "rewards/format_reward": 0.953125, "step": 8 }, { "completion_length": 95.71875, "epoch": 0.0005775709931012354, "grad_norm": 3.445754640313553, "kl": 0.0037994384765625, "learning_rate": 9.997112052368116e-07, "loss": 0.0002, "reward": 2.921875, "reward_std": 0.7348538041114807, "rewards/accuracy_reward": 0.53125, "rewards/format_count_numbers": 1.4453125, "rewards/format_reward": 0.9453125, "step": 9 }, { "completion_length": 81.859375, "epoch": 0.0006417455478902615, "grad_norm": 13.788009049584414, "kl": 0.00360870361328125, "learning_rate": 9.996791169297908e-07, "loss": 0.0001, "reward": 2.41015625, "reward_std": 0.3873346596956253, "rewards/accuracy_reward": 0.46875, "rewards/format_count_numbers": 0.97265625, "rewards/format_reward": 0.96875, "step": 10 }, { "completion_length": 95.3515625, "epoch": 0.0007059201026792877, "grad_norm": 3.802264533705271, "kl": 0.007354736328125, "learning_rate": 9.996470286227698e-07, "loss": 0.0003, "reward": 3.0703125, "reward_std": 0.5251666307449341, "rewards/accuracy_reward": 0.5234375, "rewards/format_count_numbers": 1.5546875, "rewards/format_reward": 0.9921875, "step": 11 }, { "completion_length": 91.515625, "epoch": 0.0007700946574683138, "grad_norm": 7.025555072435911, "kl": 0.0046234130859375, "learning_rate": 9.996149403157488e-07, "loss": 0.0002, "reward": 3.0, "reward_std": 0.433403342962265, "rewards/accuracy_reward": 0.4765625, "rewards/format_count_numbers": 1.5390625, "rewards/format_reward": 0.984375, "step": 12 }, { "completion_length": 82.625, "epoch": 0.00083426921225734, "grad_norm": 8.089653038056108, "kl": 0.0070648193359375, "learning_rate": 9.99582852008728e-07, "loss": 0.0003, "reward": 3.3828125, "reward_std": 0.47115227580070496, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.7109375, "rewards/format_reward": 0.9921875, "step": 13 }, { "completion_length": 82.984375, "epoch": 0.0008984437670463662, "grad_norm": 2.6921754114824137, "kl": 0.00640869140625, "learning_rate": 9.99550763701707e-07, "loss": 0.0003, "reward": 3.25390625, "reward_std": 0.37268710136413574, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.53515625, "rewards/format_reward": 0.984375, "step": 14 }, { "completion_length": 86.34375, "epoch": 0.0009626183218353923, "grad_norm": 5.3694361415616685, "kl": 0.014495849609375, "learning_rate": 9.995186753946862e-07, "loss": 0.0006, "reward": 2.9453125, "reward_std": 0.3975609838962555, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.234375, "rewards/format_reward": 0.9921875, "step": 15 }, { "completion_length": 73.1171875, "epoch": 0.0010267928766244184, "grad_norm": 3.920415856048503, "kl": 0.011138916015625, "learning_rate": 9.994865870876652e-07, "loss": 0.0004, "reward": 3.05859375, "reward_std": 0.3202301412820816, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.42578125, "rewards/format_reward": 0.9921875, "step": 16 }, { "completion_length": 77.875, "epoch": 0.0010909674314134447, "grad_norm": 4.105012934454526, "kl": 0.007293701171875, "learning_rate": 9.994544987806442e-07, "loss": 0.0003, "reward": 2.80859375, "reward_std": 0.36826513707637787, "rewards/accuracy_reward": 0.5390625, "rewards/format_count_numbers": 1.26953125, "rewards/format_reward": 1.0, "step": 17 }, { "completion_length": 74.1640625, "epoch": 0.0011551419862024708, "grad_norm": 3.025079315189922, "kl": 0.009002685546875, "learning_rate": 9.994224104736234e-07, "loss": 0.0004, "reward": 2.72265625, "reward_std": 0.3272075057029724, "rewards/accuracy_reward": 0.5625, "rewards/format_count_numbers": 1.16015625, "rewards/format_reward": 1.0, "step": 18 }, { "completion_length": 75.703125, "epoch": 0.0012193165409914968, "grad_norm": 2.702032953911433, "kl": 0.0196685791015625, "learning_rate": 9.993903221666024e-07, "loss": 0.0008, "reward": 2.99609375, "reward_std": 0.35988467931747437, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.39453125, "rewards/format_reward": 0.984375, "step": 19 }, { "completion_length": 72.75, "epoch": 0.001283491095780523, "grad_norm": 6.672377706898142, "kl": 0.00946044921875, "learning_rate": 9.993582338595814e-07, "loss": 0.0004, "reward": 2.78515625, "reward_std": 0.3711431473493576, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.14453125, "rewards/format_reward": 0.9921875, "step": 20 }, { "completion_length": 72.234375, "epoch": 0.0013476656505695492, "grad_norm": 3.21648017083967, "kl": 0.01513671875, "learning_rate": 9.993261455525607e-07, "loss": 0.0006, "reward": 3.3046875, "reward_std": 0.3110102415084839, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.6484375, "rewards/format_reward": 0.9765625, "step": 21 }, { "completion_length": 81.3203125, "epoch": 0.0014118402053585753, "grad_norm": 5.2036438638538405, "kl": 0.020416259765625, "learning_rate": 9.992940572455397e-07, "loss": 0.0008, "reward": 2.83203125, "reward_std": 0.3504672795534134, "rewards/accuracy_reward": 0.5390625, "rewards/format_count_numbers": 1.30078125, "rewards/format_reward": 0.9921875, "step": 22 }, { "completion_length": 77.3515625, "epoch": 0.0014760147601476014, "grad_norm": 4.546185134633502, "kl": 0.013824462890625, "learning_rate": 9.992619689385189e-07, "loss": 0.0006, "reward": 3.34375, "reward_std": 0.3390200138092041, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.6328125, "rewards/format_reward": 1.0, "step": 23 }, { "completion_length": 78.234375, "epoch": 0.0015401893149366275, "grad_norm": 3.663495400021626, "kl": 0.015289306640625, "learning_rate": 9.992298806314979e-07, "loss": 0.0006, "reward": 3.02734375, "reward_std": 0.3106808215379715, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.41015625, "rewards/format_reward": 1.0, "step": 24 }, { "completion_length": 85.5078125, "epoch": 0.0016043638697256538, "grad_norm": 4.017477734738997, "kl": 0.009735107421875, "learning_rate": 9.991977923244769e-07, "loss": 0.0004, "reward": 3.02734375, "reward_std": 0.3911897838115692, "rewards/accuracy_reward": 0.5, "rewards/format_count_numbers": 1.55078125, "rewards/format_reward": 0.9765625, "step": 25 }, { "completion_length": 70.953125, "epoch": 0.00166853842451468, "grad_norm": 131.7420509388314, "kl": 0.01788330078125, "learning_rate": 9.99165704017456e-07, "loss": 0.0007, "reward": 3.32421875, "reward_std": 0.3377445787191391, "rewards/accuracy_reward": 0.8125, "rewards/format_count_numbers": 1.51953125, "rewards/format_reward": 0.9921875, "step": 26 }, { "completion_length": 83.9140625, "epoch": 0.001732712979303706, "grad_norm": 8.203637941977545, "kl": 0.012908935546875, "learning_rate": 9.99133615710435e-07, "loss": 0.0005, "reward": 3.140625, "reward_std": 0.31107497215270996, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.53125, "rewards/format_reward": 1.0, "step": 27 }, { "completion_length": 87.546875, "epoch": 0.0017968875340927323, "grad_norm": 3.074634047919227, "kl": 0.01849365234375, "learning_rate": 9.99101527403414e-07, "loss": 0.0007, "reward": 3.26171875, "reward_std": 0.3248459994792938, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 0.984375, "step": 28 }, { "completion_length": 83.1015625, "epoch": 0.0018610620888817584, "grad_norm": 1.7996263974927855, "kl": 0.01739501953125, "learning_rate": 9.990694390963933e-07, "loss": 0.0007, "reward": 3.36328125, "reward_std": 0.34072481095790863, "rewards/accuracy_reward": 0.703125, "rewards/format_count_numbers": 1.67578125, "rewards/format_reward": 0.984375, "step": 29 }, { "completion_length": 82.0390625, "epoch": 0.0019252366436707845, "grad_norm": 5.044487990967384, "kl": 0.01666259765625, "learning_rate": 9.990373507893723e-07, "loss": 0.0007, "reward": 3.1328125, "reward_std": 0.3294168561697006, "rewards/accuracy_reward": 0.59375, "rewards/format_count_numbers": 1.546875, "rewards/format_reward": 0.9921875, "step": 30 }, { "completion_length": 75.6640625, "epoch": 0.0019894111984598106, "grad_norm": 4.303535913004868, "kl": 0.02020263671875, "learning_rate": 9.990052624823513e-07, "loss": 0.0008, "reward": 3.14453125, "reward_std": 0.3391089290380478, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.43359375, "rewards/format_reward": 0.9921875, "step": 31 }, { "completion_length": 91.375, "epoch": 0.0020535857532488367, "grad_norm": 3.867777936702832, "kl": 0.0233154296875, "learning_rate": 9.989731741753305e-07, "loss": 0.0009, "reward": 3.17578125, "reward_std": 0.3817155063152313, "rewards/accuracy_reward": 0.53125, "rewards/format_count_numbers": 1.65234375, "rewards/format_reward": 0.9921875, "step": 32 }, { "completion_length": 80.0859375, "epoch": 0.002117760308037863, "grad_norm": 5.660841872987584, "kl": 0.0257568359375, "learning_rate": 9.989410858683095e-07, "loss": 0.001, "reward": 3.13671875, "reward_std": 0.3630076050758362, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 0.9921875, "step": 33 }, { "completion_length": 72.8828125, "epoch": 0.0021819348628268893, "grad_norm": 4.882900614418831, "kl": 0.02276611328125, "learning_rate": 9.989089975612887e-07, "loss": 0.0009, "reward": 2.640625, "reward_std": 0.44960109889507294, "rewards/accuracy_reward": 0.484375, "rewards/format_count_numbers": 1.1640625, "rewards/format_reward": 0.9921875, "step": 34 }, { "completion_length": 80.9296875, "epoch": 0.0022461094176159154, "grad_norm": 4.3133118366946475, "kl": 0.02490234375, "learning_rate": 9.988769092542677e-07, "loss": 0.001, "reward": 3.453125, "reward_std": 0.3391571342945099, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.7734375, "rewards/format_reward": 1.0, "step": 35 }, { "completion_length": 89.9609375, "epoch": 0.0023102839724049415, "grad_norm": 6.955721387397482, "kl": 0.01922607421875, "learning_rate": 9.988448209472467e-07, "loss": 0.0008, "reward": 2.87890625, "reward_std": 0.2806504964828491, "rewards/accuracy_reward": 0.4765625, "rewards/format_count_numbers": 1.40234375, "rewards/format_reward": 1.0, "step": 36 }, { "completion_length": 74.5390625, "epoch": 0.0023744585271939676, "grad_norm": 4.861590830565515, "kl": 0.02423095703125, "learning_rate": 9.988127326402257e-07, "loss": 0.001, "reward": 3.06640625, "reward_std": 0.2550649642944336, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.43359375, "rewards/format_reward": 1.0, "step": 37 }, { "completion_length": 73.921875, "epoch": 0.0024386330819829937, "grad_norm": 2.854709944808263, "kl": 0.02410888671875, "learning_rate": 9.98780644333205e-07, "loss": 0.001, "reward": 2.8984375, "reward_std": 0.3886113613843918, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.296875, "rewards/format_reward": 0.984375, "step": 38 }, { "completion_length": 70.9609375, "epoch": 0.00250280763677202, "grad_norm": 3.5923630444215755, "kl": 0.0277099609375, "learning_rate": 9.98748556026184e-07, "loss": 0.0011, "reward": 3.2265625, "reward_std": 0.2659813463687897, "rewards/accuracy_reward": 0.765625, "rewards/format_count_numbers": 1.4609375, "rewards/format_reward": 1.0, "step": 39 }, { "completion_length": 66.390625, "epoch": 0.002566982191561046, "grad_norm": 3.041742830092947, "kl": 0.02459716796875, "learning_rate": 9.987164677191631e-07, "loss": 0.001, "reward": 3.296875, "reward_std": 0.2199605107307434, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.59375, "rewards/format_reward": 0.9921875, "step": 40 }, { "completion_length": 74.828125, "epoch": 0.0026311567463500724, "grad_norm": 3.3807511168475717, "kl": 0.0289306640625, "learning_rate": 9.986843794121421e-07, "loss": 0.0012, "reward": 3.30078125, "reward_std": 0.34865450859069824, "rewards/accuracy_reward": 0.6015625, "rewards/format_count_numbers": 1.69921875, "rewards/format_reward": 1.0, "step": 41 }, { "completion_length": 89.6953125, "epoch": 0.0026953313011390985, "grad_norm": 1.9360780618011337, "kl": 0.02545166015625, "learning_rate": 9.986522911051214e-07, "loss": 0.001, "reward": 3.07421875, "reward_std": 0.24038218706846237, "rewards/accuracy_reward": 0.546875, "rewards/format_count_numbers": 1.52734375, "rewards/format_reward": 1.0, "step": 42 }, { "completion_length": 71.140625, "epoch": 0.0027595058559281246, "grad_norm": 2.522942068290189, "kl": 0.033447265625, "learning_rate": 9.986202027981004e-07, "loss": 0.0013, "reward": 3.234375, "reward_std": 0.30629581212997437, "rewards/accuracy_reward": 0.5234375, "rewards/format_count_numbers": 1.7109375, "rewards/format_reward": 1.0, "step": 43 }, { "completion_length": 76.5703125, "epoch": 0.0028236804107171507, "grad_norm": 2.7470688462972572, "kl": 0.026123046875, "learning_rate": 9.985881144910794e-07, "loss": 0.001, "reward": 3.25, "reward_std": 0.30682672560214996, "rewards/accuracy_reward": 0.5625, "rewards/format_count_numbers": 1.7109375, "rewards/format_reward": 0.9765625, "step": 44 }, { "completion_length": 83.8046875, "epoch": 0.0028878549655061768, "grad_norm": 2.4739700610037985, "kl": 0.02978515625, "learning_rate": 9.985560261840584e-07, "loss": 0.0012, "reward": 3.05078125, "reward_std": 0.29518504440784454, "rewards/accuracy_reward": 0.578125, "rewards/format_count_numbers": 1.48046875, "rewards/format_reward": 0.9921875, "step": 45 }, { "completion_length": 65.40625, "epoch": 0.002952029520295203, "grad_norm": 3.690551370084788, "kl": 0.0428466796875, "learning_rate": 9.985239378770376e-07, "loss": 0.0017, "reward": 3.17578125, "reward_std": 0.30267418175935745, "rewards/accuracy_reward": 0.859375, "rewards/format_count_numbers": 1.32421875, "rewards/format_reward": 0.9921875, "step": 46 }, { "completion_length": 69.03125, "epoch": 0.003016204075084229, "grad_norm": 4.382403144838998, "kl": 0.0341796875, "learning_rate": 9.984918495700166e-07, "loss": 0.0014, "reward": 3.484375, "reward_std": 0.23816770315170288, "rewards/accuracy_reward": 0.78125, "rewards/format_count_numbers": 1.703125, "rewards/format_reward": 1.0, "step": 47 }, { "completion_length": 68.015625, "epoch": 0.003080378629873255, "grad_norm": 3.452349900499519, "kl": 0.0255126953125, "learning_rate": 9.984597612629958e-07, "loss": 0.001, "reward": 2.97265625, "reward_std": 0.32505670189857483, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.33984375, "rewards/format_reward": 1.0, "step": 48 }, { "completion_length": 75.1953125, "epoch": 0.0031445531846622816, "grad_norm": 3.392241748989927, "kl": 0.03125, "learning_rate": 9.984276729559748e-07, "loss": 0.0012, "reward": 3.10546875, "reward_std": 0.39815399050712585, "rewards/accuracy_reward": 0.6796875, "rewards/format_count_numbers": 1.42578125, "rewards/format_reward": 1.0, "step": 49 }, { "completion_length": 67.9765625, "epoch": 0.0032087277394513077, "grad_norm": 2.177945178884684, "kl": 0.047607421875, "learning_rate": 9.98395584648954e-07, "loss": 0.0019, "reward": 2.953125, "reward_std": 0.2759072184562683, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.34375, "rewards/format_reward": 1.0, "step": 50 }, { "completion_length": 78.390625, "epoch": 0.0032729022942403338, "grad_norm": 3.0857042332441456, "kl": 0.0269775390625, "learning_rate": 9.98363496341933e-07, "loss": 0.0011, "reward": 2.890625, "reward_std": 0.27773458510637283, "rewards/accuracy_reward": 0.4296875, "rewards/format_count_numbers": 1.4609375, "rewards/format_reward": 1.0, "step": 51 }, { "completion_length": 67.046875, "epoch": 0.00333707684902936, "grad_norm": 5.440745248958781, "kl": 0.13043212890625, "learning_rate": 9.98331408034912e-07, "loss": 0.0052, "reward": 2.921875, "reward_std": 0.3226177394390106, "rewards/accuracy_reward": 0.6015625, "rewards/format_count_numbers": 1.3359375, "rewards/format_reward": 0.984375, "step": 52 }, { "completion_length": 66.4375, "epoch": 0.003401251403818386, "grad_norm": 4.657297519762943, "kl": 0.041259765625, "learning_rate": 9.982993197278912e-07, "loss": 0.0017, "reward": 3.25, "reward_std": 0.3035288602113724, "rewards/accuracy_reward": 0.578125, "rewards/format_count_numbers": 1.671875, "rewards/format_reward": 1.0, "step": 53 }, { "completion_length": 70.2265625, "epoch": 0.003465425958607412, "grad_norm": 2.3261413114277727, "kl": 0.048583984375, "learning_rate": 9.982672314208702e-07, "loss": 0.0019, "reward": 3.48046875, "reward_std": 0.2077426016330719, "rewards/accuracy_reward": 0.65625, "rewards/format_count_numbers": 1.82421875, "rewards/format_reward": 1.0, "step": 54 }, { "completion_length": 69.4765625, "epoch": 0.003529600513396438, "grad_norm": 12.890917932396162, "kl": 0.045166015625, "learning_rate": 9.982351431138492e-07, "loss": 0.0018, "reward": 3.34765625, "reward_std": 0.25980181246995926, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.70703125, "rewards/format_reward": 1.0, "step": 55 }, { "completion_length": 70.75, "epoch": 0.0035937750681854647, "grad_norm": 2.3924995403034814, "kl": 0.0390625, "learning_rate": 9.982030548068284e-07, "loss": 0.0016, "reward": 2.83203125, "reward_std": 0.2434411644935608, "rewards/accuracy_reward": 0.6484375, "rewards/format_count_numbers": 1.18359375, "rewards/format_reward": 1.0, "step": 56 }, { "completion_length": 78.265625, "epoch": 0.0036579496229744908, "grad_norm": 4.097074998616903, "kl": 0.0467529296875, "learning_rate": 9.981709664998074e-07, "loss": 0.0019, "reward": 3.16796875, "reward_std": 0.323630690574646, "rewards/accuracy_reward": 0.625, "rewards/format_count_numbers": 1.54296875, "rewards/format_reward": 1.0, "step": 57 }, { "completion_length": 64.4453125, "epoch": 0.003722124177763517, "grad_norm": 2.8596095528212815, "kl": 0.045166015625, "learning_rate": 9.981388781927866e-07, "loss": 0.0018, "reward": 2.99609375, "reward_std": 0.3102172762155533, "rewards/accuracy_reward": 0.5390625, "rewards/format_count_numbers": 1.45703125, "rewards/format_reward": 1.0, "step": 58 }, { "completion_length": 64.9375, "epoch": 0.003786298732552543, "grad_norm": 2.728196990192836, "kl": 0.0457763671875, "learning_rate": 9.981067898857656e-07, "loss": 0.0018, "reward": 2.9609375, "reward_std": 0.2459762617945671, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.3515625, "rewards/format_reward": 1.0, "step": 59 }, { "completion_length": 69.0625, "epoch": 0.003850473287341569, "grad_norm": 3.4406926961923587, "kl": 0.03759765625, "learning_rate": 9.980747015787446e-07, "loss": 0.0015, "reward": 2.87109375, "reward_std": 0.34723127633333206, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.21484375, "rewards/format_reward": 0.9921875, "step": 60 }, { "completion_length": 66.5390625, "epoch": 0.0039146478421305956, "grad_norm": 3.057262479255094, "kl": 0.043212890625, "learning_rate": 9.980426132717238e-07, "loss": 0.0017, "reward": 3.38671875, "reward_std": 0.22124166041612625, "rewards/accuracy_reward": 0.671875, "rewards/format_count_numbers": 1.71484375, "rewards/format_reward": 1.0, "step": 61 }, { "completion_length": 84.8125, "epoch": 0.003978822396919621, "grad_norm": 6.416434548752605, "kl": 0.0321044921875, "learning_rate": 9.980105249647028e-07, "loss": 0.0013, "reward": 3.08984375, "reward_std": 0.32453496754169464, "rewards/accuracy_reward": 0.4453125, "rewards/format_count_numbers": 1.64453125, "rewards/format_reward": 1.0, "step": 62 }, { "completion_length": 77.1953125, "epoch": 0.004042996951708648, "grad_norm": 3.732769709996209, "kl": 0.037841796875, "learning_rate": 9.979784366576818e-07, "loss": 0.0015, "reward": 3.1015625, "reward_std": 0.35314419865608215, "rewards/accuracy_reward": 0.53125, "rewards/format_count_numbers": 1.5703125, "rewards/format_reward": 1.0, "step": 63 }, { "completion_length": 77.5, "epoch": 0.004107171506497673, "grad_norm": 2.5167684293651877, "kl": 0.0311279296875, "learning_rate": 9.979463483506608e-07, "loss": 0.0012, "reward": 3.10546875, "reward_std": 0.2587262690067291, "rewards/accuracy_reward": 0.5078125, "rewards/format_count_numbers": 1.60546875, "rewards/format_reward": 0.9921875, "step": 64 }, { "completion_length": 70.609375, "epoch": 0.0041713460612867, "grad_norm": 7.497854848230895, "kl": 0.03466796875, "learning_rate": 9.9791426004364e-07, "loss": 0.0014, "reward": 3.12109375, "reward_std": 0.35663464665412903, "rewards/accuracy_reward": 0.6640625, "rewards/format_count_numbers": 1.45703125, "rewards/format_reward": 1.0, "step": 65 }, { "completion_length": 67.3359375, "epoch": 0.004235520616075726, "grad_norm": 12.830822241443972, "kl": 0.118408203125, "learning_rate": 9.97882171736619e-07, "loss": 0.0047, "reward": 3.08984375, "reward_std": 0.2815767228603363, "rewards/accuracy_reward": 0.7265625, "rewards/format_count_numbers": 1.36328125, "rewards/format_reward": 1.0, "step": 66 }, { "completion_length": 80.15625, "epoch": 0.004299695170864752, "grad_norm": 6.252660074852895, "kl": 0.03729248046875, "learning_rate": 9.978500834295983e-07, "loss": 0.0015, "reward": 3.5703125, "reward_std": 0.21104412525892258, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.9609375, "rewards/format_reward": 1.0, "step": 67 }, { "completion_length": 72.375, "epoch": 0.004363869725653779, "grad_norm": 2.346394602052095, "kl": 0.025146484375, "learning_rate": 9.978179951225773e-07, "loss": 0.001, "reward": 3.26171875, "reward_std": 0.2549284026026726, "rewards/accuracy_reward": 0.7734375, "rewards/format_count_numbers": 1.48828125, "rewards/format_reward": 1.0, "step": 68 }, { "completion_length": 80.0078125, "epoch": 0.004428044280442804, "grad_norm": 6.9747477147107775, "kl": 0.053955078125, "learning_rate": 9.977859068155565e-07, "loss": 0.0022, "reward": 3.1171875, "reward_std": 0.23806139826774597, "rewards/accuracy_reward": 0.5625, "rewards/format_count_numbers": 1.5546875, "rewards/format_reward": 1.0, "step": 69 }, { "completion_length": 76.78125, "epoch": 0.004492218835231831, "grad_norm": 4.378472876328936, "kl": 0.029541015625, "learning_rate": 9.977538185085355e-07, "loss": 0.0012, "reward": 2.93359375, "reward_std": 0.19287973642349243, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.33984375, "rewards/format_reward": 0.984375, "step": 70 }, { "completion_length": 74.171875, "epoch": 0.0045563933900208565, "grad_norm": 2.358606331460238, "kl": 0.0230712890625, "learning_rate": 9.977217302015145e-07, "loss": 0.0009, "reward": 3.54296875, "reward_std": 0.26491738110780716, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.73828125, "rewards/format_reward": 1.0, "step": 71 }, { "completion_length": 75.34375, "epoch": 0.004620567944809883, "grad_norm": 65.47310912953631, "kl": 0.03179931640625, "learning_rate": 9.976896418944935e-07, "loss": 0.0013, "reward": 2.78125, "reward_std": 0.29492397606372833, "rewards/accuracy_reward": 0.546875, "rewards/format_count_numbers": 1.234375, "rewards/format_reward": 1.0, "step": 72 }, { "completion_length": 76.921875, "epoch": 0.004684742499598909, "grad_norm": 3.27041300953387, "kl": 0.0367431640625, "learning_rate": 9.976575535874727e-07, "loss": 0.0015, "reward": 3.33203125, "reward_std": 0.2555892765522003, "rewards/accuracy_reward": 0.734375, "rewards/format_count_numbers": 1.59765625, "rewards/format_reward": 1.0, "step": 73 }, { "completion_length": 67.484375, "epoch": 0.004748917054387935, "grad_norm": 5.812215039650067, "kl": 0.03240966796875, "learning_rate": 9.976254652804517e-07, "loss": 0.0013, "reward": 3.43359375, "reward_std": 0.20885366201400757, "rewards/accuracy_reward": 0.8125, "rewards/format_count_numbers": 1.62109375, "rewards/format_reward": 1.0, "step": 74 }, { "completion_length": 82.4921875, "epoch": 0.004813091609176962, "grad_norm": 4.4277516048263506, "kl": 0.0313720703125, "learning_rate": 9.97593376973431e-07, "loss": 0.0013, "reward": 3.2734375, "reward_std": 0.19161942601203918, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.5625, "rewards/format_reward": 1.0, "step": 75 }, { "completion_length": 67.109375, "epoch": 0.004877266163965987, "grad_norm": 3.0595066417538415, "kl": 0.04052734375, "learning_rate": 9.9756128866641e-07, "loss": 0.0016, "reward": 3.5625, "reward_std": 0.32848016172647476, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.84375, "rewards/format_reward": 1.0, "step": 76 }, { "completion_length": 82.9921875, "epoch": 0.004941440718755014, "grad_norm": 3.999806146457781, "kl": 0.0372314453125, "learning_rate": 9.975292003593891e-07, "loss": 0.0015, "reward": 3.0546875, "reward_std": 0.2797150984406471, "rewards/accuracy_reward": 0.4921875, "rewards/format_count_numbers": 1.5703125, "rewards/format_reward": 0.9921875, "step": 77 }, { "completion_length": 75.265625, "epoch": 0.00500561527354404, "grad_norm": 2.4007654591592953, "kl": 0.02508544921875, "learning_rate": 9.974971120523681e-07, "loss": 0.001, "reward": 3.30859375, "reward_std": 0.21722427010536194, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.59765625, "rewards/format_reward": 1.0, "step": 78 }, { "completion_length": 69.8671875, "epoch": 0.005069789828333066, "grad_norm": 4.413870539754506, "kl": 0.0401611328125, "learning_rate": 9.974650237453471e-07, "loss": 0.0016, "reward": 3.09765625, "reward_std": 0.24370676279067993, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.46484375, "rewards/format_reward": 0.9921875, "step": 79 }, { "completion_length": 71.3828125, "epoch": 0.005133964383122092, "grad_norm": 2.7469356159448375, "kl": 0.03594970703125, "learning_rate": 9.974329354383261e-07, "loss": 0.0014, "reward": 3.359375, "reward_std": 0.28942976146936417, "rewards/accuracy_reward": 0.765625, "rewards/format_count_numbers": 1.59375, "rewards/format_reward": 1.0, "step": 80 }, { "completion_length": 71.6875, "epoch": 0.005198138937911118, "grad_norm": 4.694669178987162, "kl": 0.04248046875, "learning_rate": 9.974008471313053e-07, "loss": 0.0017, "reward": 3.09765625, "reward_std": 0.3313465863466263, "rewards/accuracy_reward": 0.640625, "rewards/format_count_numbers": 1.46484375, "rewards/format_reward": 0.9921875, "step": 81 }, { "completion_length": 71.9375, "epoch": 0.005262313492700145, "grad_norm": 5.239839542228686, "kl": 0.0465087890625, "learning_rate": 9.973687588242843e-07, "loss": 0.0019, "reward": 3.40234375, "reward_std": 0.2555918022990227, "rewards/accuracy_reward": 0.5625, "rewards/format_count_numbers": 1.84765625, "rewards/format_reward": 0.9921875, "step": 82 }, { "completion_length": 66.59375, "epoch": 0.0053264880474891705, "grad_norm": 9.58228205262393, "kl": 0.03302001953125, "learning_rate": 9.973366705172635e-07, "loss": 0.0013, "reward": 3.328125, "reward_std": 0.27802956849336624, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.6171875, "rewards/format_reward": 1.0, "step": 83 }, { "completion_length": 73.7734375, "epoch": 0.005390662602278197, "grad_norm": 5.487582516234903, "kl": 0.0311279296875, "learning_rate": 9.973045822102425e-07, "loss": 0.0012, "reward": 3.28515625, "reward_std": 0.24403482675552368, "rewards/accuracy_reward": 0.5625, "rewards/format_count_numbers": 1.72265625, "rewards/format_reward": 1.0, "step": 84 }, { "completion_length": 77.0859375, "epoch": 0.005454837157067223, "grad_norm": 2.8797110234241217, "kl": 0.03607177734375, "learning_rate": 9.972724939032218e-07, "loss": 0.0014, "reward": 3.4296875, "reward_std": 0.3189963102340698, "rewards/accuracy_reward": 0.7109375, "rewards/format_count_numbers": 1.734375, "rewards/format_reward": 0.984375, "step": 85 }, { "completion_length": 71.2734375, "epoch": 0.005519011711856249, "grad_norm": 3.495421949980677, "kl": 0.043212890625, "learning_rate": 9.972404055962008e-07, "loss": 0.0017, "reward": 3.0625, "reward_std": 0.30399875342845917, "rewards/accuracy_reward": 0.5859375, "rewards/format_count_numbers": 1.484375, "rewards/format_reward": 0.9921875, "step": 86 }, { "completion_length": 67.015625, "epoch": 0.005583186266645275, "grad_norm": 6.764316503209916, "kl": 0.0382080078125, "learning_rate": 9.972083172891798e-07, "loss": 0.0015, "reward": 2.8671875, "reward_std": 0.3235751837491989, "rewards/accuracy_reward": 0.546875, "rewards/format_count_numbers": 1.3359375, "rewards/format_reward": 0.984375, "step": 87 }, { "completion_length": 77.1796875, "epoch": 0.005647360821434301, "grad_norm": 2.0348391619602713, "kl": 0.046142578125, "learning_rate": 9.971762289821588e-07, "loss": 0.0018, "reward": 3.046875, "reward_std": 0.26907191798090935, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.4375, "rewards/format_reward": 1.0, "step": 88 }, { "completion_length": 69.859375, "epoch": 0.005711535376223327, "grad_norm": 2.4510593826235993, "kl": 0.044677734375, "learning_rate": 9.97144140675138e-07, "loss": 0.0018, "reward": 3.40625, "reward_std": 0.3325708657503128, "rewards/accuracy_reward": 0.6953125, "rewards/format_count_numbers": 1.7109375, "rewards/format_reward": 1.0, "step": 89 }, { "completion_length": 69.421875, "epoch": 0.0057757099310123535, "grad_norm": 7.386556285996494, "kl": 0.0384521484375, "learning_rate": 9.97112052368117e-07, "loss": 0.0015, "reward": 3.45703125, "reward_std": 0.18665644526481628, "rewards/accuracy_reward": 0.609375, "rewards/format_count_numbers": 1.84765625, "rewards/format_reward": 1.0, "step": 90 }, { "completion_length": 76.546875, "epoch": 0.00583988448580138, "grad_norm": 10.07527340286235, "kl": 0.03741455078125, "learning_rate": 9.97079964061096e-07, "loss": 0.0015, "reward": 3.4765625, "reward_std": 0.17688900232315063, "rewards/accuracy_reward": 0.75, "rewards/format_count_numbers": 1.7265625, "rewards/format_reward": 1.0, "step": 91 }, { "completion_length": 67.1015625, "epoch": 0.005904059040590406, "grad_norm": 6.961725253066966, "kl": 0.0450439453125, "learning_rate": 9.970478757540752e-07, "loss": 0.0018, "reward": 3.15234375, "reward_std": 0.18702887743711472, "rewards/accuracy_reward": 0.8046875, "rewards/format_count_numbers": 1.34765625, "rewards/format_reward": 1.0, "step": 92 }, { "completion_length": 67.75, "epoch": 0.005968233595379432, "grad_norm": 4.665158893240586, "kl": 0.0504150390625, "learning_rate": 9.970157874470542e-07, "loss": 0.002, "reward": 3.32421875, "reward_std": 0.17975258082151413, "rewards/accuracy_reward": 0.71875, "rewards/format_count_numbers": 1.61328125, "rewards/format_reward": 0.9921875, "step": 93 }, { "completion_length": 62.765625, "epoch": 0.006032408150168458, "grad_norm": 6.052438787803174, "kl": 0.0577392578125, "learning_rate": 9.969836991400334e-07, "loss": 0.0023, "reward": 2.96484375, "reward_std": 0.17256294190883636, "rewards/accuracy_reward": 0.6171875, "rewards/format_count_numbers": 1.35546875, "rewards/format_reward": 0.9921875, "step": 94 }, { "completion_length": 67.546875, "epoch": 0.0060965827049574844, "grad_norm": 3.8543628664738248, "kl": 0.1171875, "learning_rate": 9.969516108330124e-07, "loss": 0.0047, "reward": 2.75, "reward_std": 0.25491149723529816, "rewards/accuracy_reward": 0.5390625, "rewards/format_count_numbers": 1.21875, "rewards/format_reward": 0.9921875, "step": 95 }, { "completion_length": 57.5234375, "epoch": 0.00616075725974651, "grad_norm": 10.068717750488322, "kl": 0.050048828125, "learning_rate": 9.969195225259914e-07, "loss": 0.002, "reward": 3.34765625, "reward_std": 0.15529648214578629, "rewards/accuracy_reward": 0.859375, "rewards/format_count_numbers": 1.49609375, "rewards/format_reward": 0.9921875, "step": 96 }, { "completion_length": 77.4140625, "epoch": 0.006224931814535537, "grad_norm": 76.26414395404969, "kl": 0.0400390625, "learning_rate": 9.968874342189706e-07, "loss": 0.0016, "reward": 3.3046875, "reward_std": 0.18849123269319534, "rewards/accuracy_reward": 0.6015625, "rewards/format_count_numbers": 1.7109375, "rewards/format_reward": 0.9921875, "step": 97 }, { "completion_length": 64.359375, "epoch": 0.006289106369324563, "grad_norm": 6.324698594548965, "kl": 0.05126953125, "learning_rate": 9.968553459119496e-07, "loss": 0.0021, "reward": 3.09765625, "reward_std": 0.16071559116244316, "rewards/accuracy_reward": 0.7578125, "rewards/format_count_numbers": 1.34765625, "rewards/format_reward": 0.9921875, "step": 98 }, { "completion_length": 69.578125, "epoch": 0.006353280924113589, "grad_norm": 2.6642177820540494, "kl": 0.05078125, "learning_rate": 9.968232576049286e-07, "loss": 0.002, "reward": 3.20703125, "reward_std": 0.19081907719373703, "rewards/accuracy_reward": 0.6328125, "rewards/format_count_numbers": 1.57421875, "rewards/format_reward": 1.0, "step": 99 }, { "completion_length": 69.6171875, "epoch": 0.006417455478902615, "grad_norm": 3.1800877639787006, "kl": 0.04925537109375, "learning_rate": 9.967911692979078e-07, "loss": 0.002, "reward": 3.15625, "reward_std": 0.25464994460344315, "rewards/accuracy_reward": 0.6875, "rewards/format_count_numbers": 1.46875, "rewards/format_reward": 1.0, "step": 100 } ], "logging_steps": 1.0, "max_steps": 31164, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }