{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994182664339732, "eval_steps": 500, "global_step": 859, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 489.01953125, "epoch": 0.0011634671320535194, "grad_norm": 0.4066821416431487, "kl": 0.0004706382751464844, "learning_rate": 3.846153846153846e-08, "loss": 0.0, "reward": 0.107421875, "reward_std": 0.14256841503083706, "rewards/correctness_reward_func": 0.1015625, "rewards/strict_format_reward_func": 0.005859375, "step": 1 }, { "completion_length": 491.90234375, "epoch": 0.002326934264107039, "grad_norm": 1.2204081989870978, "kl": 0.0008008480072021484, "learning_rate": 7.692307692307692e-08, "loss": 0.0, "reward": 0.12109375, "reward_std": 0.22897969186306, "rewards/correctness_reward_func": 0.1015625, "rewards/strict_format_reward_func": 0.01953125, "step": 2 }, { "completion_length": 489.51953125, "epoch": 0.0034904013961605585, "grad_norm": 9.972307004201902, "kl": 0.004039764404296875, "learning_rate": 1.1538461538461539e-07, "loss": 0.0002, "reward": 0.064453125, "reward_std": 0.125604297965765, "rewards/correctness_reward_func": 0.0546875, "rewards/strict_format_reward_func": 0.009765625, "step": 3 }, { "completion_length": 506.64453125, "epoch": 0.004653868528214078, "grad_norm": 1.442368000970251, "kl": 0.009157180786132812, "learning_rate": 1.5384615384615385e-07, "loss": 0.0004, "reward": 0.09765625, "reward_std": 0.17736226692795753, "rewards/correctness_reward_func": 0.0859375, "rewards/strict_format_reward_func": 0.01171875, "step": 4 }, { "completion_length": 494.796875, "epoch": 0.005817335660267597, "grad_norm": 2.215173709920235, "kl": 0.004576206207275391, "learning_rate": 1.9230769230769231e-07, "loss": 0.0002, "reward": 0.09375, "reward_std": 0.1610843911767006, "rewards/correctness_reward_func": 0.0859375, "rewards/strict_format_reward_func": 0.0078125, "step": 5 }, { "completion_length": 500.6875, "epoch": 0.006980802792321117, "grad_norm": 1.5436419526429799, "kl": 0.0007085800170898438, "learning_rate": 2.3076923076923078e-07, "loss": 0.0, "reward": 0.15234375, "reward_std": 0.24702188372612, "rewards/correctness_reward_func": 0.140625, "rewards/strict_format_reward_func": 0.01171875, "step": 6 }, { "completion_length": 502.09375, "epoch": 0.008144269924374637, "grad_norm": 16.600174739059614, "kl": 0.1050872802734375, "learning_rate": 2.692307692307692e-07, "loss": 0.0042, "reward": 0.126953125, "reward_std": 0.1962406411767006, "rewards/correctness_reward_func": 0.1171875, "rewards/strict_format_reward_func": 0.009765625, "step": 7 }, { "completion_length": 574.51171875, "epoch": 0.009307737056428156, "grad_norm": 151.48079010972526, "kl": 0.18497467041015625, "learning_rate": 3.076923076923077e-07, "loss": 0.0074, "reward": 0.115234375, "reward_std": 0.1714012213051319, "rewards/correctness_reward_func": 0.1015625, "rewards/strict_format_reward_func": 0.013671875, "step": 8 }, { "completion_length": 456.30859375, "epoch": 0.010471204188481676, "grad_norm": 3.068318566100937, "kl": 0.007595062255859375, "learning_rate": 3.461538461538461e-07, "loss": 0.0003, "reward": 0.12890625, "reward_std": 0.20751213282346725, "rewards/correctness_reward_func": 0.1171875, "rewards/strict_format_reward_func": 0.01171875, "step": 9 }, { "completion_length": 467.6171875, "epoch": 0.011634671320535195, "grad_norm": 5.54045688921292, "kl": 0.02019977569580078, "learning_rate": 3.8461538461538463e-07, "loss": 0.0008, "reward": 0.111328125, "reward_std": 0.2094484455883503, "rewards/correctness_reward_func": 0.1015625, "rewards/strict_format_reward_func": 0.009765625, "step": 10 }, { "completion_length": 522.4609375, "epoch": 0.012798138452588714, "grad_norm": 1.6672527087542282, "kl": 0.007293701171875, "learning_rate": 4.2307692307692304e-07, "loss": 0.0003, "reward": 0.09375, "reward_std": 0.14160171803086996, "rewards/correctness_reward_func": 0.078125, "rewards/strict_format_reward_func": 0.015625, "step": 11 }, { "completion_length": 554.41015625, "epoch": 0.013961605584642234, "grad_norm": 15.713162541614723, "kl": 0.022916793823242188, "learning_rate": 4.6153846153846156e-07, "loss": 0.0009, "reward": 0.05859375, "reward_std": 0.09407384321093559, "rewards/correctness_reward_func": 0.046875, "rewards/strict_format_reward_func": 0.01171875, "step": 12 }, { "completion_length": 472.33203125, "epoch": 0.015125072716695753, "grad_norm": 9.999083570023721, "kl": 0.00162506103515625, "learning_rate": 5e-07, "loss": 0.0001, "reward": 0.099609375, "reward_std": 0.16796875, "rewards/correctness_reward_func": 0.09375, "rewards/strict_format_reward_func": 0.005859375, "step": 13 }, { "completion_length": 471.0390625, "epoch": 0.016288539848749273, "grad_norm": 14.601550530650117, "kl": 0.018660545349121094, "learning_rate": 5.384615384615384e-07, "loss": 0.0007, "reward": 0.142578125, "reward_std": 0.2406984455883503, "rewards/correctness_reward_func": 0.1328125, "rewards/strict_format_reward_func": 0.009765625, "step": 14 }, { "completion_length": 513.76953125, "epoch": 0.017452006980802792, "grad_norm": 3.897589890240525, "kl": 0.0014133453369140625, "learning_rate": 5.769230769230768e-07, "loss": 0.0001, "reward": 0.078125, "reward_std": 0.13499781489372253, "rewards/correctness_reward_func": 0.0703125, "rewards/strict_format_reward_func": 0.0078125, "step": 15 }, { "completion_length": 461.609375, "epoch": 0.01861547411285631, "grad_norm": 0.2331700635519536, "kl": 0.0008556842803955078, "learning_rate": 6.153846153846154e-07, "loss": 0.0, "reward": 0.080078125, "reward_std": 0.13233871944248676, "rewards/correctness_reward_func": 0.0703125, "rewards/strict_format_reward_func": 0.009765625, "step": 16 }, { "completion_length": 493.96484375, "epoch": 0.01977894124490983, "grad_norm": 0.6332584588781173, "kl": 0.0008721351623535156, "learning_rate": 6.538461538461538e-07, "loss": 0.0, "reward": 0.109375, "reward_std": 0.19749781489372253, "rewards/correctness_reward_func": 0.09375, "rewards/strict_format_reward_func": 0.015625, "step": 17 }, { "completion_length": 499.87890625, "epoch": 0.020942408376963352, "grad_norm": 6.416190324649871, "kl": 0.0064716339111328125, "learning_rate": 6.923076923076922e-07, "loss": 0.0003, "reward": 0.0859375, "reward_std": 0.1454593911767006, "rewards/correctness_reward_func": 0.078125, "rewards/strict_format_reward_func": 0.0078125, "step": 18 }, { "completion_length": 560.52734375, "epoch": 0.02210587550901687, "grad_norm": 0.18792971982824808, "kl": 0.0007505416870117188, "learning_rate": 7.307692307692307e-07, "loss": 0.0, "reward": 0.083984375, "reward_std": 0.16796875, "rewards/correctness_reward_func": 0.078125, "rewards/strict_format_reward_func": 0.005859375, "step": 19 }, { "completion_length": 475.859375, "epoch": 0.02326934264107039, "grad_norm": 57.89397166781348, "kl": 0.42646121978759766, "learning_rate": 7.692307692307693e-07, "loss": 0.0171, "reward": 0.119140625, "reward_std": 0.17447129637002945, "rewards/correctness_reward_func": 0.1015625, "rewards/strict_format_reward_func": 0.017578125, "step": 20 }, { "completion_length": 490.015625, "epoch": 0.02443280977312391, "grad_norm": 8.434807442056037, "kl": 0.023256301879882812, "learning_rate": 8.076923076923077e-07, "loss": 0.0009, "reward": 0.1171875, "reward_std": 0.19881487637758255, "rewards/correctness_reward_func": 0.09375, "rewards/strict_format_reward_func": 0.0234375, "step": 21 }, { "completion_length": 506.33984375, "epoch": 0.025596276905177427, "grad_norm": 1.1653419639417089, "kl": 0.00223541259765625, "learning_rate": 8.461538461538461e-07, "loss": 0.0001, "reward": 0.10546875, "reward_std": 0.18824483826756477, "rewards/correctness_reward_func": 0.09375, "rewards/strict_format_reward_func": 0.01171875, "step": 22 }, { "completion_length": 511.375, "epoch": 0.02675974403723095, "grad_norm": 0.41128365761978974, "kl": 0.0008344650268554688, "learning_rate": 8.846153846153846e-07, "loss": 0.0, "reward": 0.150390625, "reward_std": 0.25167298316955566, "rewards/correctness_reward_func": 0.1328125, "rewards/strict_format_reward_func": 0.017578125, "step": 23 }, { "completion_length": 528.53125, "epoch": 0.027923211169284468, "grad_norm": 1.4081194117907943, "kl": 0.0036749839782714844, "learning_rate": 9.230769230769231e-07, "loss": 0.0001, "reward": 0.09765625, "reward_std": 0.16415445879101753, "rewards/correctness_reward_func": 0.09375, "rewards/strict_format_reward_func": 0.00390625, "step": 24 }, { "completion_length": 566.2265625, "epoch": 0.029086678301337987, "grad_norm": 1.1799989005370515, "kl": 0.009830474853515625, "learning_rate": 9.615384615384615e-07, "loss": 0.0004, "reward": 0.107421875, "reward_std": 0.16424159705638885, "rewards/correctness_reward_func": 0.09375, "rewards/strict_format_reward_func": 0.013671875, "step": 25 }, { "completion_length": 438.703125, "epoch": 0.030250145433391506, "grad_norm": 3.104360216151301, "kl": 0.03290557861328125, "learning_rate": 1e-06, "loss": 0.0013, "reward": 0.15234375, "reward_std": 0.20628703013062477, "rewards/correctness_reward_func": 0.140625, "rewards/strict_format_reward_func": 0.01171875, "step": 26 }, { "completion_length": 539.5859375, "epoch": 0.031413612565445025, "grad_norm": 0.5338598772393494, "kl": 0.005832672119140625, "learning_rate": 9.99996444102478e-07, "loss": 0.0002, "reward": 0.1015625, "reward_std": 0.17196696251630783, "rewards/correctness_reward_func": 0.0859375, "rewards/strict_format_reward_func": 0.015625, "step": 27 }, { "completion_length": 497.140625, "epoch": 0.03257707969749855, "grad_norm": 2.5666979824433844, "kl": 0.018198013305664062, "learning_rate": 9.999857764604895e-07, "loss": 0.0007, "reward": 0.140625, "reward_std": 0.21521097421646118, "rewards/correctness_reward_func": 0.125, "rewards/strict_format_reward_func": 0.015625, "step": 28 }, { "completion_length": 485.59375, "epoch": 0.03374054682955206, "grad_norm": 6.4345252573344816, "kl": 0.005001068115234375, "learning_rate": 9.999679972257667e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.2648322060704231, "rewards/correctness_reward_func": 0.1640625, "rewards/strict_format_reward_func": 0.0234375, "step": 29 }, { "completion_length": 416.53125, "epoch": 0.034904013961605584, "grad_norm": 2.602336527370174, "kl": 0.0277099609375, "learning_rate": 9.999431066511943e-07, "loss": 0.0011, "reward": 0.125, "reward_std": 0.21884196251630783, "rewards/correctness_reward_func": 0.109375, "rewards/strict_format_reward_func": 0.015625, "step": 30 }, { "completion_length": 485.08203125, "epoch": 0.03606748109365911, "grad_norm": 1.5342850964012629, "kl": 0.014495849609375, "learning_rate": 9.999111050908056e-07, "loss": 0.0006, "reward": 0.197265625, "reward_std": 0.3104500323534012, "rewards/correctness_reward_func": 0.1796875, "rewards/strict_format_reward_func": 0.017578125, "step": 31 }, { "completion_length": 531.640625, "epoch": 0.03723094822571262, "grad_norm": 20.359434755302512, "kl": 0.06761932373046875, "learning_rate": 9.998719929997773e-07, "loss": 0.0027, "reward": 0.142578125, "reward_std": 0.19669455289840698, "rewards/correctness_reward_func": 0.1171875, "rewards/strict_format_reward_func": 0.025390625, "step": 32 }, { "completion_length": 514.54296875, "epoch": 0.038394415357766144, "grad_norm": 0.44887804375426404, "kl": 0.003627777099609375, "learning_rate": 9.998257709344243e-07, "loss": 0.0001, "reward": 0.173828125, "reward_std": 0.24332467839121819, "rewards/correctness_reward_func": 0.171875, "rewards/strict_format_reward_func": 0.001953125, "step": 33 }, { "completion_length": 492.8125, "epoch": 0.03955788248981966, "grad_norm": 3.197350305028001, "kl": 0.007907867431640625, "learning_rate": 9.997724395521901e-07, "loss": 0.0003, "reward": 0.14453125, "reward_std": 0.24139471352100372, "rewards/correctness_reward_func": 0.1171875, "rewards/strict_format_reward_func": 0.02734375, "step": 34 }, { "completion_length": 407.99609375, "epoch": 0.04072134962187318, "grad_norm": 0.4960254389285239, "kl": 0.01204681396484375, "learning_rate": 9.997119996116382e-07, "loss": 0.0005, "reward": 0.240234375, "reward_std": 0.31166573986411095, "rewards/correctness_reward_func": 0.2109375, "rewards/strict_format_reward_func": 0.029296875, "step": 35 }, { "completion_length": 398.2421875, "epoch": 0.041884816753926704, "grad_norm": 9.032088364099023, "kl": 0.03583526611328125, "learning_rate": 9.996444519724418e-07, "loss": 0.0014, "reward": 0.15625, "reward_std": 0.23204976320266724, "rewards/correctness_reward_func": 0.140625, "rewards/strict_format_reward_func": 0.015625, "step": 36 }, { "completion_length": 434.36328125, "epoch": 0.04304828388598022, "grad_norm": 0.5171450751911104, "kl": 0.0065155029296875, "learning_rate": 9.995697975953707e-07, "loss": 0.0003, "reward": 0.193359375, "reward_std": 0.29260406643152237, "rewards/correctness_reward_func": 0.15625, "rewards/strict_format_reward_func": 0.037109375, "step": 37 }, { "completion_length": 488.390625, "epoch": 0.04421175101803374, "grad_norm": 0.40237380286765173, "kl": 0.00611114501953125, "learning_rate": 9.994880375422784e-07, "loss": 0.0002, "reward": 0.26171875, "reward_std": 0.3653857484459877, "rewards/correctness_reward_func": 0.203125, "rewards/strict_format_reward_func": 0.05859375, "step": 38 }, { "completion_length": 420.58984375, "epoch": 0.04537521815008726, "grad_norm": 5.104453757523126, "kl": 0.01647186279296875, "learning_rate": 9.99399172976086e-07, "loss": 0.0007, "reward": 0.29296875, "reward_std": 0.37514638155698776, "rewards/correctness_reward_func": 0.234375, "rewards/strict_format_reward_func": 0.05859375, "step": 39 }, { "completion_length": 394.3828125, "epoch": 0.04653868528214078, "grad_norm": 3.4950699311546694, "kl": 0.009918212890625, "learning_rate": 9.993032051607668e-07, "loss": 0.0004, "reward": 0.267578125, "reward_std": 0.31589680910110474, "rewards/correctness_reward_func": 0.2265625, "rewards/strict_format_reward_func": 0.041015625, "step": 40 }, { "completion_length": 413.97265625, "epoch": 0.0477021524141943, "grad_norm": 1.061976991134195, "kl": 0.0106964111328125, "learning_rate": 9.992001354613277e-07, "loss": 0.0004, "reward": 0.21484375, "reward_std": 0.2792557002976537, "rewards/correctness_reward_func": 0.171875, "rewards/strict_format_reward_func": 0.04296875, "step": 41 }, { "completion_length": 397.8515625, "epoch": 0.04886561954624782, "grad_norm": 0.765028003111525, "kl": 0.01513671875, "learning_rate": 9.990899653437901e-07, "loss": 0.0006, "reward": 0.388671875, "reward_std": 0.41248171031475067, "rewards/correctness_reward_func": 0.3125, "rewards/strict_format_reward_func": 0.076171875, "step": 42 }, { "completion_length": 397.07421875, "epoch": 0.05002908667830134, "grad_norm": 12.065798447865369, "kl": 0.06500244140625, "learning_rate": 9.989726963751682e-07, "loss": 0.0026, "reward": 0.3125, "reward_std": 0.4369198568165302, "rewards/correctness_reward_func": 0.234375, "rewards/strict_format_reward_func": 0.078125, "step": 43 }, { "completion_length": 384.0859375, "epoch": 0.051192553810354854, "grad_norm": 10.020233386348186, "kl": 0.1649932861328125, "learning_rate": 9.988483302234478e-07, "loss": 0.0066, "reward": 0.275390625, "reward_std": 0.3524422347545624, "rewards/correctness_reward_func": 0.1796875, "rewards/strict_format_reward_func": 0.095703125, "step": 44 }, { "completion_length": 424.24609375, "epoch": 0.05235602094240838, "grad_norm": 3809.223926800563, "kl": 24.387359619140625, "learning_rate": 9.987168686575623e-07, "loss": 0.9774, "reward": 0.26953125, "reward_std": 0.3424443453550339, "rewards/correctness_reward_func": 0.1796875, "rewards/strict_format_reward_func": 0.08984375, "step": 45 }, { "completion_length": 365.3359375, "epoch": 0.0535194880744619, "grad_norm": 4.815371512573417, "kl": 0.0827484130859375, "learning_rate": 9.98578313547367e-07, "loss": 0.0033, "reward": 0.39453125, "reward_std": 0.45618152618408203, "rewards/correctness_reward_func": 0.2890625, "rewards/strict_format_reward_func": 0.10546875, "step": 46 }, { "completion_length": 410.48828125, "epoch": 0.054682955206515414, "grad_norm": 3.1518114378957223, "kl": 0.03973388671875, "learning_rate": 9.98432666863613e-07, "loss": 0.0016, "reward": 0.365234375, "reward_std": 0.4011538214981556, "rewards/correctness_reward_func": 0.234375, "rewards/strict_format_reward_func": 0.130859375, "step": 47 }, { "completion_length": 409.66796875, "epoch": 0.055846422338568937, "grad_norm": 0.6202311602466106, "kl": 0.025848388671875, "learning_rate": 9.982799306779189e-07, "loss": 0.001, "reward": 0.591796875, "reward_std": 0.5869772136211395, "rewards/correctness_reward_func": 0.421875, "rewards/strict_format_reward_func": 0.169921875, "step": 48 }, { "completion_length": 375.16015625, "epoch": 0.05700988947062245, "grad_norm": 231.68110012914804, "kl": 0.854034423828125, "learning_rate": 9.98120107162742e-07, "loss": 0.0343, "reward": 0.42578125, "reward_std": 0.4635503552854061, "rewards/correctness_reward_func": 0.2421875, "rewards/strict_format_reward_func": 0.18359375, "step": 49 }, { "completion_length": 396.5, "epoch": 0.058173356602675974, "grad_norm": 8.054334866966464, "kl": 0.02935791015625, "learning_rate": 9.979531985913457e-07, "loss": 0.0012, "reward": 0.43359375, "reward_std": 0.5311293751001358, "rewards/correctness_reward_func": 0.2421875, "rewards/strict_format_reward_func": 0.19140625, "step": 50 }, { "completion_length": 341.55859375, "epoch": 0.059336823734729496, "grad_norm": 39.40136884647299, "kl": 0.515869140625, "learning_rate": 9.977792073377697e-07, "loss": 0.0206, "reward": 0.50390625, "reward_std": 0.5089018940925598, "rewards/correctness_reward_func": 0.3203125, "rewards/strict_format_reward_func": 0.18359375, "step": 51 }, { "completion_length": 375.640625, "epoch": 0.06050029086678301, "grad_norm": 20.902673745049487, "kl": 0.14080810546875, "learning_rate": 9.975981358767944e-07, "loss": 0.0056, "reward": 0.453125, "reward_std": 0.47038574516773224, "rewards/correctness_reward_func": 0.2578125, "rewards/strict_format_reward_func": 0.1953125, "step": 52 }, { "completion_length": 351.328125, "epoch": 0.061663757998836534, "grad_norm": 2.0754249823099595, "kl": 0.0267333984375, "learning_rate": 9.974099867839057e-07, "loss": 0.0011, "reward": 0.525390625, "reward_std": 0.46075718849897385, "rewards/correctness_reward_func": 0.2890625, "rewards/strict_format_reward_func": 0.236328125, "step": 53 }, { "completion_length": 385.4140625, "epoch": 0.06282722513089005, "grad_norm": 335.2509946112828, "kl": 1.28887939453125, "learning_rate": 9.972147627352593e-07, "loss": 0.0513, "reward": 0.533203125, "reward_std": 0.4158325716853142, "rewards/correctness_reward_func": 0.3125, "rewards/strict_format_reward_func": 0.220703125, "step": 54 }, { "completion_length": 315.51953125, "epoch": 0.06399069226294357, "grad_norm": 113.13047615199778, "kl": 0.79083251953125, "learning_rate": 9.970124665076417e-07, "loss": 0.0317, "reward": 0.5625, "reward_std": 0.5517344921827316, "rewards/correctness_reward_func": 0.3203125, "rewards/strict_format_reward_func": 0.2421875, "step": 55 }, { "completion_length": 313.81640625, "epoch": 0.0651541593949971, "grad_norm": 1.1795690608203528, "kl": 0.047210693359375, "learning_rate": 9.96803100978432e-07, "loss": 0.0019, "reward": 0.642578125, "reward_std": 0.47613072395324707, "rewards/correctness_reward_func": 0.34375, "rewards/strict_format_reward_func": 0.298828125, "step": 56 }, { "completion_length": 348.234375, "epoch": 0.06631762652705062, "grad_norm": 13.371406300782917, "kl": 0.11224365234375, "learning_rate": 9.965866691255597e-07, "loss": 0.0045, "reward": 0.66796875, "reward_std": 0.59251669049263, "rewards/correctness_reward_func": 0.390625, "rewards/strict_format_reward_func": 0.27734375, "step": 57 }, { "completion_length": 383.1796875, "epoch": 0.06748109365910412, "grad_norm": 1.6177251711954495, "kl": 0.030792236328125, "learning_rate": 9.963631740274622e-07, "loss": 0.0012, "reward": 0.5625, "reward_std": 0.47907302528619766, "rewards/correctness_reward_func": 0.2890625, "rewards/strict_format_reward_func": 0.2734375, "step": 58 }, { "completion_length": 355.9921875, "epoch": 0.06864456079115765, "grad_norm": 47.382528946856574, "kl": 0.235626220703125, "learning_rate": 9.961326188630425e-07, "loss": 0.0095, "reward": 0.607421875, "reward_std": 0.535815954208374, "rewards/correctness_reward_func": 0.34375, "rewards/strict_format_reward_func": 0.263671875, "step": 59 }, { "completion_length": 377.46875, "epoch": 0.06980802792321117, "grad_norm": 2.8906662162333294, "kl": 0.076416015625, "learning_rate": 9.95895006911623e-07, "loss": 0.0031, "reward": 0.6015625, "reward_std": 0.4677914008498192, "rewards/correctness_reward_func": 0.3046875, "rewards/strict_format_reward_func": 0.296875, "step": 60 }, { "completion_length": 307.0859375, "epoch": 0.07097149505526469, "grad_norm": 5.658248453671423, "kl": 0.06695556640625, "learning_rate": 9.956503415528982e-07, "loss": 0.0027, "reward": 0.60546875, "reward_std": 0.5141743049025536, "rewards/correctness_reward_func": 0.2734375, "rewards/strict_format_reward_func": 0.33203125, "step": 61 }, { "completion_length": 308.98046875, "epoch": 0.07213496218731821, "grad_norm": 4.0281675399539, "kl": 0.06524658203125, "learning_rate": 9.953986262668884e-07, "loss": 0.0026, "reward": 0.66015625, "reward_std": 0.4833526313304901, "rewards/correctness_reward_func": 0.28125, "rewards/strict_format_reward_func": 0.37890625, "step": 62 }, { "completion_length": 251.5625, "epoch": 0.07329842931937172, "grad_norm": 6.4708893162037455, "kl": 0.13079833984375, "learning_rate": 9.951398646338883e-07, "loss": 0.0052, "reward": 0.720703125, "reward_std": 0.4117320328950882, "rewards/correctness_reward_func": 0.3515625, "rewards/strict_format_reward_func": 0.369140625, "step": 63 }, { "completion_length": 299.58203125, "epoch": 0.07446189645142524, "grad_norm": 13.45087311669488, "kl": 0.15289306640625, "learning_rate": 9.948740603344172e-07, "loss": 0.0061, "reward": 0.81640625, "reward_std": 0.6002717912197113, "rewards/correctness_reward_func": 0.4765625, "rewards/strict_format_reward_func": 0.33984375, "step": 64 }, { "completion_length": 302.1953125, "epoch": 0.07562536358347877, "grad_norm": 18.560697276437615, "kl": 0.20904541015625, "learning_rate": 9.946012171491668e-07, "loss": 0.0083, "reward": 0.796875, "reward_std": 0.4715307354927063, "rewards/correctness_reward_func": 0.4140625, "rewards/strict_format_reward_func": 0.3828125, "step": 65 }, { "completion_length": 286.73828125, "epoch": 0.07678883071553229, "grad_norm": 14.086886383004378, "kl": 0.04949951171875, "learning_rate": 9.943213389589466e-07, "loss": 0.002, "reward": 0.82421875, "reward_std": 0.5994473099708557, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.37890625, "step": 66 }, { "completion_length": 323.93359375, "epoch": 0.07795229784758581, "grad_norm": 19.144971581802082, "kl": 0.1656494140625, "learning_rate": 9.940344297446292e-07, "loss": 0.0066, "reward": 0.73828125, "reward_std": 0.43859150260686874, "rewards/correctness_reward_func": 0.3671875, "rewards/strict_format_reward_func": 0.37109375, "step": 67 }, { "completion_length": 350.01171875, "epoch": 0.07911576497963932, "grad_norm": 77.24678279515392, "kl": 0.53643798828125, "learning_rate": 9.937404935870937e-07, "loss": 0.0215, "reward": 0.61328125, "reward_std": 0.4140402674674988, "rewards/correctness_reward_func": 0.21875, "rewards/strict_format_reward_func": 0.39453125, "step": 68 }, { "completion_length": 305.87109375, "epoch": 0.08027923211169284, "grad_norm": 12.883736811037531, "kl": 0.25457763671875, "learning_rate": 9.934395346671673e-07, "loss": 0.0102, "reward": 0.705078125, "reward_std": 0.48615749180316925, "rewards/correctness_reward_func": 0.3125, "rewards/strict_format_reward_func": 0.392578125, "step": 69 }, { "completion_length": 279.07421875, "epoch": 0.08144269924374636, "grad_norm": 4.8084774229944705, "kl": 0.158447265625, "learning_rate": 9.93131557265567e-07, "loss": 0.0063, "reward": 0.775390625, "reward_std": 0.43615715205669403, "rewards/correctness_reward_func": 0.390625, "rewards/strict_format_reward_func": 0.384765625, "step": 70 }, { "completion_length": 271.90625, "epoch": 0.08260616637579989, "grad_norm": 209.3718112774744, "kl": 1.3720703125, "learning_rate": 9.928165657628363e-07, "loss": 0.0552, "reward": 0.771484375, "reward_std": 0.4666217863559723, "rewards/correctness_reward_func": 0.375, "rewards/strict_format_reward_func": 0.396484375, "step": 71 }, { "completion_length": 280.234375, "epoch": 0.08376963350785341, "grad_norm": 8.675435114113355, "kl": 0.08306884765625, "learning_rate": 9.924945646392856e-07, "loss": 0.0033, "reward": 0.724609375, "reward_std": 0.4121067523956299, "rewards/correctness_reward_func": 0.328125, "rewards/strict_format_reward_func": 0.396484375, "step": 72 }, { "completion_length": 298.13671875, "epoch": 0.08493310063990692, "grad_norm": 2.261657798977972, "kl": 0.05487060546875, "learning_rate": 9.92165558474927e-07, "loss": 0.0022, "reward": 0.79296875, "reward_std": 0.4348938390612602, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.38671875, "step": 73 }, { "completion_length": 275.25390625, "epoch": 0.08609656777196044, "grad_norm": 306.8537482842207, "kl": 2.76251220703125, "learning_rate": 9.918295519494089e-07, "loss": 0.1104, "reward": 0.853515625, "reward_std": 0.4975513890385628, "rewards/correctness_reward_func": 0.4375, "rewards/strict_format_reward_func": 0.416015625, "step": 74 }, { "completion_length": 303.42578125, "epoch": 0.08726003490401396, "grad_norm": 11.231963005062614, "kl": 0.20123291015625, "learning_rate": 9.91486549841951e-07, "loss": 0.008, "reward": 0.720703125, "reward_std": 0.46410517394542694, "rewards/correctness_reward_func": 0.3359375, "rewards/strict_format_reward_func": 0.384765625, "step": 75 }, { "completion_length": 277.7578125, "epoch": 0.08842350203606748, "grad_norm": 160.47901765005733, "kl": 1.1749267578125, "learning_rate": 9.91136557031274e-07, "loss": 0.0472, "reward": 0.74609375, "reward_std": 0.4058147594332695, "rewards/correctness_reward_func": 0.3125, "rewards/strict_format_reward_func": 0.43359375, "step": 76 }, { "completion_length": 340.8515625, "epoch": 0.089586969168121, "grad_norm": 381.85614779047495, "kl": 1.22967529296875, "learning_rate": 9.907795784955326e-07, "loss": 0.0492, "reward": 0.765625, "reward_std": 0.42379553616046906, "rewards/correctness_reward_func": 0.3515625, "rewards/strict_format_reward_func": 0.4140625, "step": 77 }, { "completion_length": 274.44921875, "epoch": 0.09075043630017451, "grad_norm": 64.47137880223688, "kl": 0.67572021484375, "learning_rate": 9.904156193122431e-07, "loss": 0.027, "reward": 0.80078125, "reward_std": 0.4361772760748863, "rewards/correctness_reward_func": 0.3828125, "rewards/strict_format_reward_func": 0.41796875, "step": 78 }, { "completion_length": 265.8359375, "epoch": 0.09191390343222804, "grad_norm": 27.903843212195564, "kl": 0.46124267578125, "learning_rate": 9.900446846582119e-07, "loss": 0.0185, "reward": 0.92578125, "reward_std": 0.5412914976477623, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.41796875, "step": 79 }, { "completion_length": 320.8984375, "epoch": 0.09307737056428156, "grad_norm": 1.9761659190639995, "kl": 0.04864501953125, "learning_rate": 9.896667798094608e-07, "loss": 0.0019, "reward": 0.76953125, "reward_std": 0.4052440747618675, "rewards/correctness_reward_func": 0.375, "rewards/strict_format_reward_func": 0.39453125, "step": 80 }, { "completion_length": 312.84765625, "epoch": 0.09424083769633508, "grad_norm": 57.910687963151915, "kl": 0.12591552734375, "learning_rate": 9.892819101411543e-07, "loss": 0.005, "reward": 0.75, "reward_std": 0.49195902049541473, "rewards/correctness_reward_func": 0.3359375, "rewards/strict_format_reward_func": 0.4140625, "step": 81 }, { "completion_length": 271.83984375, "epoch": 0.0954043048283886, "grad_norm": 5.41836244806774, "kl": 0.08587646484375, "learning_rate": 9.888900811275203e-07, "loss": 0.0034, "reward": 0.71484375, "reward_std": 0.43381768465042114, "rewards/correctness_reward_func": 0.2890625, "rewards/strict_format_reward_func": 0.42578125, "step": 82 }, { "completion_length": 290.12109375, "epoch": 0.09656777196044211, "grad_norm": 0.8646498967374704, "kl": 0.0540771484375, "learning_rate": 9.884912983417743e-07, "loss": 0.0022, "reward": 0.892578125, "reward_std": 0.47797150164842606, "rewards/correctness_reward_func": 0.46875, "rewards/strict_format_reward_func": 0.423828125, "step": 83 }, { "completion_length": 224.84375, "epoch": 0.09773123909249563, "grad_norm": 2.249327867839398, "kl": 0.0767822265625, "learning_rate": 9.88085567456039e-07, "loss": 0.0031, "reward": 0.927734375, "reward_std": 0.5308554023504257, "rewards/correctness_reward_func": 0.4765625, "rewards/strict_format_reward_func": 0.451171875, "step": 84 }, { "completion_length": 268.13671875, "epoch": 0.09889470622454916, "grad_norm": 5.4143471770769445, "kl": 0.10394287109375, "learning_rate": 9.876728942412642e-07, "loss": 0.0042, "reward": 0.849609375, "reward_std": 0.44555214792490005, "rewards/correctness_reward_func": 0.4140625, "rewards/strict_format_reward_func": 0.435546875, "step": 85 }, { "completion_length": 310.32421875, "epoch": 0.10005817335660268, "grad_norm": 99.7419314506415, "kl": 0.43487548828125, "learning_rate": 9.872532845671449e-07, "loss": 0.0174, "reward": 0.619140625, "reward_std": 0.2979493774473667, "rewards/correctness_reward_func": 0.203125, "rewards/strict_format_reward_func": 0.416015625, "step": 86 }, { "completion_length": 336.62109375, "epoch": 0.1012216404886562, "grad_norm": 2.7658118091947093, "kl": 0.0557861328125, "learning_rate": 9.868267444020366e-07, "loss": 0.0022, "reward": 0.84375, "reward_std": 0.4424229711294174, "rewards/correctness_reward_func": 0.4140625, "rewards/strict_format_reward_func": 0.4296875, "step": 87 }, { "completion_length": 267.109375, "epoch": 0.10238510762070971, "grad_norm": 1.3296554071596351, "kl": 0.05279541015625, "learning_rate": 9.86393279812872e-07, "loss": 0.0021, "reward": 0.763671875, "reward_std": 0.40575000643730164, "rewards/correctness_reward_func": 0.3203125, "rewards/strict_format_reward_func": 0.443359375, "step": 88 }, { "completion_length": 286.4453125, "epoch": 0.10354857475276323, "grad_norm": 1.9800959128263669, "kl": 0.0650634765625, "learning_rate": 9.859528969650737e-07, "loss": 0.0026, "reward": 0.70703125, "reward_std": 0.36570068448781967, "rewards/correctness_reward_func": 0.2734375, "rewards/strict_format_reward_func": 0.43359375, "step": 89 }, { "completion_length": 315.234375, "epoch": 0.10471204188481675, "grad_norm": 0.9855921470718585, "kl": 0.057861328125, "learning_rate": 9.855056021224671e-07, "loss": 0.0023, "reward": 0.78125, "reward_std": 0.38219955191016197, "rewards/correctness_reward_func": 0.359375, "rewards/strict_format_reward_func": 0.421875, "step": 90 }, { "completion_length": 283.63671875, "epoch": 0.10587550901687028, "grad_norm": 2.2979566856097633, "kl": 0.05584716796875, "learning_rate": 9.850514016471902e-07, "loss": 0.0022, "reward": 0.814453125, "reward_std": 0.4690292477607727, "rewards/correctness_reward_func": 0.375, "rewards/strict_format_reward_func": 0.439453125, "step": 91 }, { "completion_length": 280.4921875, "epoch": 0.1070389761489238, "grad_norm": 54.88062525909732, "kl": 0.2271728515625, "learning_rate": 9.845903019996045e-07, "loss": 0.0091, "reward": 0.732421875, "reward_std": 0.3464353382587433, "rewards/correctness_reward_func": 0.296875, "rewards/strict_format_reward_func": 0.435546875, "step": 92 }, { "completion_length": 257.05078125, "epoch": 0.1082024432809773, "grad_norm": 2.4355438627389714, "kl": 0.16082763671875, "learning_rate": 9.841223097382027e-07, "loss": 0.0065, "reward": 0.921875, "reward_std": 0.5120889246463776, "rewards/correctness_reward_func": 0.484375, "rewards/strict_format_reward_func": 0.4375, "step": 93 }, { "completion_length": 330.67578125, "epoch": 0.10936591041303083, "grad_norm": 0.29946740274215317, "kl": 0.04864501953125, "learning_rate": 9.836474315195147e-07, "loss": 0.0019, "reward": 0.734375, "reward_std": 0.4280121922492981, "rewards/correctness_reward_func": 0.3125, "rewards/strict_format_reward_func": 0.421875, "step": 94 }, { "completion_length": 293.5234375, "epoch": 0.11052937754508435, "grad_norm": 0.3362629809809694, "kl": 0.045654296875, "learning_rate": 9.831656740980135e-07, "loss": 0.0018, "reward": 0.716796875, "reward_std": 0.4403715208172798, "rewards/correctness_reward_func": 0.296875, "rewards/strict_format_reward_func": 0.419921875, "step": 95 }, { "completion_length": 237.234375, "epoch": 0.11169284467713787, "grad_norm": 0.3082617927162893, "kl": 0.05224609375, "learning_rate": 9.826770443260193e-07, "loss": 0.0021, "reward": 0.818359375, "reward_std": 0.431897908449173, "rewards/correctness_reward_func": 0.3828125, "rewards/strict_format_reward_func": 0.435546875, "step": 96 }, { "completion_length": 272.08984375, "epoch": 0.1128563118091914, "grad_norm": 3.297879330447764, "kl": 0.06298828125, "learning_rate": 9.821815491536016e-07, "loss": 0.0025, "reward": 0.86328125, "reward_std": 0.4964246600866318, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.43359375, "step": 97 }, { "completion_length": 313.859375, "epoch": 0.1140197789412449, "grad_norm": 1.7917369827084497, "kl": 0.05108642578125, "learning_rate": 9.81679195628481e-07, "loss": 0.002, "reward": 0.69140625, "reward_std": 0.4303680807352066, "rewards/correctness_reward_func": 0.265625, "rewards/strict_format_reward_func": 0.42578125, "step": 98 }, { "completion_length": 234.34375, "epoch": 0.11518324607329843, "grad_norm": 2.373852143479517, "kl": 0.07373046875, "learning_rate": 9.811699908959275e-07, "loss": 0.0029, "reward": 0.927734375, "reward_std": 0.4275398887693882, "rewards/correctness_reward_func": 0.4765625, "rewards/strict_format_reward_func": 0.451171875, "step": 99 }, { "completion_length": 267.76953125, "epoch": 0.11634671320535195, "grad_norm": 1.5428375901598974, "kl": 0.05035400390625, "learning_rate": 9.806539421986608e-07, "loss": 0.002, "reward": 0.763671875, "reward_std": 0.45994649082422256, "rewards/correctness_reward_func": 0.3203125, "rewards/strict_format_reward_func": 0.443359375, "step": 100 }, { "completion_length": 245.25390625, "epoch": 0.11751018033740547, "grad_norm": 2.3658036123831083, "kl": 0.04681396484375, "learning_rate": 9.80131056876746e-07, "loss": 0.0019, "reward": 0.8828125, "reward_std": 0.43228569626808167, "rewards/correctness_reward_func": 0.4375, "rewards/strict_format_reward_func": 0.4453125, "step": 101 }, { "completion_length": 229.6953125, "epoch": 0.11867364746945899, "grad_norm": 7.873285555480436, "kl": 0.05926513671875, "learning_rate": 9.796013423674898e-07, "loss": 0.0024, "reward": 0.75, "reward_std": 0.3967752233147621, "rewards/correctness_reward_func": 0.328125, "rewards/strict_format_reward_func": 0.421875, "step": 102 }, { "completion_length": 344.07421875, "epoch": 0.1198371146015125, "grad_norm": 15.56401233276747, "kl": 0.1546630859375, "learning_rate": 9.79064806205334e-07, "loss": 0.0062, "reward": 0.810546875, "reward_std": 0.44128578901290894, "rewards/correctness_reward_func": 0.3828125, "rewards/strict_format_reward_func": 0.427734375, "step": 103 }, { "completion_length": 282.15625, "epoch": 0.12100058173356602, "grad_norm": 5.111153394673512, "kl": 0.0589599609375, "learning_rate": 9.78521456021749e-07, "loss": 0.0024, "reward": 0.80078125, "reward_std": 0.48010821640491486, "rewards/correctness_reward_func": 0.3671875, "rewards/strict_format_reward_func": 0.43359375, "step": 104 }, { "completion_length": 242.20703125, "epoch": 0.12216404886561955, "grad_norm": 6.003504446851319, "kl": 0.05206298828125, "learning_rate": 9.779712995451252e-07, "loss": 0.0021, "reward": 0.80859375, "reward_std": 0.443262055516243, "rewards/correctness_reward_func": 0.375, "rewards/strict_format_reward_func": 0.43359375, "step": 105 }, { "completion_length": 276.546875, "epoch": 0.12332751599767307, "grad_norm": 4.073486002796206, "kl": 0.054443359375, "learning_rate": 9.77414344600663e-07, "loss": 0.0022, "reward": 0.767578125, "reward_std": 0.46208247542381287, "rewards/correctness_reward_func": 0.3359375, "rewards/strict_format_reward_func": 0.431640625, "step": 106 }, { "completion_length": 257.03515625, "epoch": 0.12449098312972659, "grad_norm": 18.5627783152072, "kl": 0.087646484375, "learning_rate": 9.76850599110261e-07, "loss": 0.0035, "reward": 0.794921875, "reward_std": 0.4035182222723961, "rewards/correctness_reward_func": 0.3671875, "rewards/strict_format_reward_func": 0.427734375, "step": 107 }, { "completion_length": 312.40625, "epoch": 0.1256544502617801, "grad_norm": 6.982791090576697, "kl": 0.115966796875, "learning_rate": 9.762800710924038e-07, "loss": 0.0046, "reward": 0.783203125, "reward_std": 0.5103202238678932, "rewards/correctness_reward_func": 0.375, "rewards/strict_format_reward_func": 0.408203125, "step": 108 }, { "completion_length": 266.125, "epoch": 0.12681791739383363, "grad_norm": 1.474075928860927, "kl": 0.04534912109375, "learning_rate": 9.75702768662048e-07, "loss": 0.0018, "reward": 0.828125, "reward_std": 0.398033931851387, "rewards/correctness_reward_func": 0.390625, "rewards/strict_format_reward_func": 0.4375, "step": 109 }, { "completion_length": 291.98828125, "epoch": 0.12798138452588714, "grad_norm": 1082.0566595703972, "kl": 2.355712890625, "learning_rate": 9.751187000305074e-07, "loss": 0.0937, "reward": 0.890625, "reward_std": 0.4832841530442238, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.4296875, "step": 110 }, { "completion_length": 257.46484375, "epoch": 0.12914485165794065, "grad_norm": 8.260881771838365, "kl": 0.305908203125, "learning_rate": 9.745278735053343e-07, "loss": 0.0122, "reward": 0.791015625, "reward_std": 0.5271246433258057, "rewards/correctness_reward_func": 0.3671875, "rewards/strict_format_reward_func": 0.423828125, "step": 111 }, { "completion_length": 278.26171875, "epoch": 0.1303083187899942, "grad_norm": 7.127367987174519, "kl": 0.2362060546875, "learning_rate": 9.73930297490203e-07, "loss": 0.0094, "reward": 0.650390625, "reward_std": 0.3026326783001423, "rewards/correctness_reward_func": 0.2421875, "rewards/strict_format_reward_func": 0.408203125, "step": 112 }, { "completion_length": 264.4765625, "epoch": 0.1314717859220477, "grad_norm": 5.665522812893531, "kl": 0.190673828125, "learning_rate": 9.7332598048479e-07, "loss": 0.0076, "reward": 0.966796875, "reward_std": 0.4744589924812317, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.427734375, "step": 113 }, { "completion_length": 313.296875, "epoch": 0.13263525305410123, "grad_norm": 0.6167995629967288, "kl": 0.0430908203125, "learning_rate": 9.727149310846523e-07, "loss": 0.0017, "reward": 0.662109375, "reward_std": 0.4723682776093483, "rewards/correctness_reward_func": 0.25, "rewards/strict_format_reward_func": 0.412109375, "step": 114 }, { "completion_length": 238.8828125, "epoch": 0.13379872018615474, "grad_norm": 5932.568937857461, "kl": 31.500244140625, "learning_rate": 9.720971579811065e-07, "loss": 1.2647, "reward": 0.939453125, "reward_std": 0.4201255813241005, "rewards/correctness_reward_func": 0.4921875, "rewards/strict_format_reward_func": 0.447265625, "step": 115 }, { "completion_length": 270.9375, "epoch": 0.13496218731820825, "grad_norm": 2.0818128486642893, "kl": 0.11199951171875, "learning_rate": 9.714726699611037e-07, "loss": 0.0045, "reward": 0.6328125, "reward_std": 0.34041667729616165, "rewards/correctness_reward_func": 0.234375, "rewards/strict_format_reward_func": 0.3984375, "step": 116 }, { "completion_length": 244.61328125, "epoch": 0.13612565445026178, "grad_norm": 29.72218794430255, "kl": 0.26605224609375, "learning_rate": 9.708414759071057e-07, "loss": 0.0106, "reward": 0.853515625, "reward_std": 0.4693729430437088, "rewards/correctness_reward_func": 0.4140625, "rewards/strict_format_reward_func": 0.439453125, "step": 117 }, { "completion_length": 269.69140625, "epoch": 0.1372891215823153, "grad_norm": 9.459551717392767, "kl": 0.16796875, "learning_rate": 9.702035847969578e-07, "loss": 0.0067, "reward": 0.802734375, "reward_std": 0.45168111473321915, "rewards/correctness_reward_func": 0.3671875, "rewards/strict_format_reward_func": 0.435546875, "step": 118 }, { "completion_length": 263.65625, "epoch": 0.13845258871436883, "grad_norm": 3.6379354568837274, "kl": 0.19122314453125, "learning_rate": 9.695590057037618e-07, "loss": 0.0077, "reward": 0.71875, "reward_std": 0.4168992340564728, "rewards/correctness_reward_func": 0.2890625, "rewards/strict_format_reward_func": 0.4296875, "step": 119 }, { "completion_length": 255.9921875, "epoch": 0.13961605584642234, "grad_norm": 6.956294376892292, "kl": 0.4310302734375, "learning_rate": 9.689077477957468e-07, "loss": 0.0172, "reward": 0.85546875, "reward_std": 0.5010552629828453, "rewards/correctness_reward_func": 0.4375, "rewards/strict_format_reward_func": 0.41796875, "step": 120 }, { "completion_length": 302.72265625, "epoch": 0.14077952297847585, "grad_norm": 3.7055581750620203, "kl": 0.0943603515625, "learning_rate": 9.682498203361378e-07, "loss": 0.0038, "reward": 0.6796875, "reward_std": 0.431684710085392, "rewards/correctness_reward_func": 0.2734375, "rewards/strict_format_reward_func": 0.40625, "step": 121 }, { "completion_length": 287.06640625, "epoch": 0.14194299011052938, "grad_norm": 3.6994429011170293, "kl": 0.097412109375, "learning_rate": 9.675852326830254e-07, "loss": 0.0039, "reward": 0.599609375, "reward_std": 0.2685260437428951, "rewards/correctness_reward_func": 0.1796875, "rewards/strict_format_reward_func": 0.419921875, "step": 122 }, { "completion_length": 244.97265625, "epoch": 0.1431064572425829, "grad_norm": 5.330519684929318, "kl": 0.474609375, "learning_rate": 9.669139942892323e-07, "loss": 0.019, "reward": 0.904296875, "reward_std": 0.45676978677511215, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.443359375, "step": 123 }, { "completion_length": 244.953125, "epoch": 0.14426992437463643, "grad_norm": 7.439747574901674, "kl": 0.082763671875, "learning_rate": 9.66236114702178e-07, "loss": 0.0033, "reward": 0.830078125, "reward_std": 0.4560399353504181, "rewards/correctness_reward_func": 0.3828125, "rewards/strict_format_reward_func": 0.447265625, "step": 124 }, { "completion_length": 247.43359375, "epoch": 0.14543339150668994, "grad_norm": 8.444741434329583, "kl": 0.191650390625, "learning_rate": 9.655516035637436e-07, "loss": 0.0077, "reward": 0.7890625, "reward_std": 0.39387788623571396, "rewards/correctness_reward_func": 0.359375, "rewards/strict_format_reward_func": 0.4296875, "step": 125 }, { "completion_length": 286.15625, "epoch": 0.14659685863874344, "grad_norm": 1.7413985122485363, "kl": 0.1680908203125, "learning_rate": 9.648604706101354e-07, "loss": 0.0067, "reward": 0.775390625, "reward_std": 0.4273899048566818, "rewards/correctness_reward_func": 0.3515625, "rewards/strict_format_reward_func": 0.423828125, "step": 126 }, { "completion_length": 266.35546875, "epoch": 0.14776032577079698, "grad_norm": 23.48576289003154, "kl": 0.429443359375, "learning_rate": 9.641627256717452e-07, "loss": 0.0171, "reward": 0.904296875, "reward_std": 0.4860465005040169, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.443359375, "step": 127 }, { "completion_length": 280.5, "epoch": 0.1489237929028505, "grad_norm": 3.434249282079891, "kl": 0.12042236328125, "learning_rate": 9.634583786730108e-07, "loss": 0.0048, "reward": 0.912109375, "reward_std": 0.4505278691649437, "rewards/correctness_reward_func": 0.4765625, "rewards/strict_format_reward_func": 0.435546875, "step": 128 }, { "completion_length": 252.86328125, "epoch": 0.15008726003490402, "grad_norm": 0.509688518372827, "kl": 0.05780029296875, "learning_rate": 9.627474396322753e-07, "loss": 0.0023, "reward": 0.833984375, "reward_std": 0.3295654430985451, "rewards/correctness_reward_func": 0.390625, "rewards/strict_format_reward_func": 0.443359375, "step": 129 }, { "completion_length": 244.61328125, "epoch": 0.15125072716695753, "grad_norm": 322.8549051555672, "kl": 0.65301513671875, "learning_rate": 9.62029918661644e-07, "loss": 0.0261, "reward": 0.849609375, "reward_std": 0.40418654680252075, "rewards/correctness_reward_func": 0.3984375, "rewards/strict_format_reward_func": 0.451171875, "step": 130 }, { "completion_length": 274.83203125, "epoch": 0.15241419429901104, "grad_norm": 2.5471313815643635, "kl": 0.08868408203125, "learning_rate": 9.613058259668414e-07, "loss": 0.0035, "reward": 0.732421875, "reward_std": 0.3115438222885132, "rewards/correctness_reward_func": 0.28125, "rewards/strict_format_reward_func": 0.451171875, "step": 131 }, { "completion_length": 263.140625, "epoch": 0.15357766143106458, "grad_norm": 0.4917581912639935, "kl": 0.05181884765625, "learning_rate": 9.60575171847065e-07, "loss": 0.0021, "reward": 0.8828125, "reward_std": 0.3844335228204727, "rewards/correctness_reward_func": 0.421875, "rewards/strict_format_reward_func": 0.4609375, "step": 132 }, { "completion_length": 240.77734375, "epoch": 0.15474112856311809, "grad_norm": 1.7256071643625732, "kl": 0.05010986328125, "learning_rate": 9.598379666948393e-07, "loss": 0.002, "reward": 0.923828125, "reward_std": 0.44481250643730164, "rewards/correctness_reward_func": 0.46875, "rewards/strict_format_reward_func": 0.455078125, "step": 133 }, { "completion_length": 262.2890625, "epoch": 0.15590459569517162, "grad_norm": 1.803011519361335, "kl": 0.06396484375, "learning_rate": 9.590942209958686e-07, "loss": 0.0026, "reward": 0.951171875, "reward_std": 0.5317578241229057, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.443359375, "step": 134 }, { "completion_length": 264.09375, "epoch": 0.15706806282722513, "grad_norm": 2.320201832882074, "kl": 0.052978515625, "learning_rate": 9.583439453288864e-07, "loss": 0.0021, "reward": 0.7890625, "reward_std": 0.4296160414814949, "rewards/correctness_reward_func": 0.3515625, "rewards/strict_format_reward_func": 0.4375, "step": 135 }, { "completion_length": 265.18359375, "epoch": 0.15823152995927864, "grad_norm": 0.8648510710425397, "kl": 0.0433349609375, "learning_rate": 9.575871503655067e-07, "loss": 0.0017, "reward": 1.017578125, "reward_std": 0.5687045827507973, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.462890625, "step": 136 }, { "completion_length": 313.328125, "epoch": 0.15939499709133217, "grad_norm": 7.228345785913302, "kl": 0.09490966796875, "learning_rate": 9.568238468700705e-07, "loss": 0.0038, "reward": 0.625, "reward_std": 0.32062922045588493, "rewards/correctness_reward_func": 0.1875, "rewards/strict_format_reward_func": 0.4375, "step": 137 }, { "completion_length": 272.78125, "epoch": 0.16055846422338568, "grad_norm": 4.791778485714245, "kl": 0.06451416015625, "learning_rate": 9.560540456994939e-07, "loss": 0.0026, "reward": 0.73828125, "reward_std": 0.3905741199851036, "rewards/correctness_reward_func": 0.2734375, "rewards/strict_format_reward_func": 0.46484375, "step": 138 }, { "completion_length": 284.91015625, "epoch": 0.16172193135543922, "grad_norm": 1.0070605023468475, "kl": 0.0789794921875, "learning_rate": 9.552777578031133e-07, "loss": 0.0032, "reward": 0.86328125, "reward_std": 0.45894617587327957, "rewards/correctness_reward_func": 0.4140625, "rewards/strict_format_reward_func": 0.44921875, "step": 139 }, { "completion_length": 254.89453125, "epoch": 0.16288539848749273, "grad_norm": 0.26164597278669394, "kl": 0.0531005859375, "learning_rate": 9.544949942225295e-07, "loss": 0.0021, "reward": 0.888671875, "reward_std": 0.4409971535205841, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.443359375, "step": 140 }, { "completion_length": 297.02734375, "epoch": 0.16404886561954624, "grad_norm": 49.61137761643266, "kl": 0.19952392578125, "learning_rate": 9.537057660914508e-07, "loss": 0.008, "reward": 0.78515625, "reward_std": 0.2939911261200905, "rewards/correctness_reward_func": 0.3359375, "rewards/strict_format_reward_func": 0.44921875, "step": 141 }, { "completion_length": 228.41015625, "epoch": 0.16521233275159977, "grad_norm": 2.4563226616711726, "kl": 0.13043212890625, "learning_rate": 9.529100846355345e-07, "loss": 0.0052, "reward": 0.982421875, "reward_std": 0.36371277645230293, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.466796875, "step": 142 }, { "completion_length": 253.48828125, "epoch": 0.16637579988365328, "grad_norm": 1.7492293672269494, "kl": 0.06890869140625, "learning_rate": 9.521079611722276e-07, "loss": 0.0028, "reward": 0.892578125, "reward_std": 0.3547302335500717, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.462890625, "step": 143 }, { "completion_length": 270.91015625, "epoch": 0.16753926701570682, "grad_norm": 186.25321871346733, "kl": 2.019775390625, "learning_rate": 9.512994071106054e-07, "loss": 0.0808, "reward": 0.876953125, "reward_std": 0.35532546043395996, "rewards/correctness_reward_func": 0.421875, "rewards/strict_format_reward_func": 0.455078125, "step": 144 }, { "completion_length": 251.24609375, "epoch": 0.16870273414776032, "grad_norm": 20.433967338967815, "kl": 0.61883544921875, "learning_rate": 9.504844339512094e-07, "loss": 0.0247, "reward": 1.021484375, "reward_std": 0.4558466151356697, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.466796875, "step": 145 }, { "completion_length": 258.37109375, "epoch": 0.16986620127981383, "grad_norm": 1.2588068639439651, "kl": 0.07843017578125, "learning_rate": 9.49663053285884e-07, "loss": 0.0031, "reward": 0.8203125, "reward_std": 0.40197764337062836, "rewards/correctness_reward_func": 0.3515625, "rewards/strict_format_reward_func": 0.46875, "step": 146 }, { "completion_length": 249.65625, "epoch": 0.17102966841186737, "grad_norm": 26.616164942367558, "kl": 0.16827392578125, "learning_rate": 9.488352767976109e-07, "loss": 0.0067, "reward": 0.951171875, "reward_std": 0.43833911418914795, "rewards/correctness_reward_func": 0.484375, "rewards/strict_format_reward_func": 0.466796875, "step": 147 }, { "completion_length": 266.21484375, "epoch": 0.17219313554392088, "grad_norm": 16.56862163073421, "kl": 0.13897705078125, "learning_rate": 9.480011162603434e-07, "loss": 0.0056, "reward": 0.994140625, "reward_std": 0.41204124689102173, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.455078125, "step": 148 }, { "completion_length": 305.5078125, "epoch": 0.1733566026759744, "grad_norm": 0.6862061857964759, "kl": 0.0863037109375, "learning_rate": 9.471605835388392e-07, "loss": 0.0035, "reward": 0.7421875, "reward_std": 0.3801525831222534, "rewards/correctness_reward_func": 0.28125, "rewards/strict_format_reward_func": 0.4609375, "step": 149 }, { "completion_length": 257.46484375, "epoch": 0.17452006980802792, "grad_norm": 7.269994609894895, "kl": 0.1292724609375, "learning_rate": 9.463136905884912e-07, "loss": 0.0052, "reward": 0.96484375, "reward_std": 0.4492557644844055, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.44921875, "step": 150 }, { "completion_length": 269.76171875, "epoch": 0.17568353694008143, "grad_norm": 3.546788791129262, "kl": 0.2747802734375, "learning_rate": 9.454604494551577e-07, "loss": 0.011, "reward": 0.794921875, "reward_std": 0.49609148502349854, "rewards/correctness_reward_func": 0.34375, "rewards/strict_format_reward_func": 0.451171875, "step": 151 }, { "completion_length": 241.5078125, "epoch": 0.17684700407213497, "grad_norm": 19.898589503994543, "kl": 0.2518310546875, "learning_rate": 9.446008722749905e-07, "loss": 0.0101, "reward": 0.912109375, "reward_std": 0.3499249704182148, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.466796875, "step": 152 }, { "completion_length": 312.5625, "epoch": 0.17801047120418848, "grad_norm": 7.082944795431713, "kl": 0.22540283203125, "learning_rate": 9.437349712742634e-07, "loss": 0.009, "reward": 0.93359375, "reward_std": 0.38279156386852264, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.47265625, "step": 153 }, { "completion_length": 290.33203125, "epoch": 0.179173938336242, "grad_norm": 2.204011656968964, "kl": 0.066650390625, "learning_rate": 9.428627587691971e-07, "loss": 0.0027, "reward": 0.783203125, "reward_std": 0.37652380019426346, "rewards/correctness_reward_func": 0.3359375, "rewards/strict_format_reward_func": 0.447265625, "step": 154 }, { "completion_length": 291.9921875, "epoch": 0.18033740546829552, "grad_norm": 28.05008573486938, "kl": 0.36279296875, "learning_rate": 9.419842471657846e-07, "loss": 0.0145, "reward": 0.794921875, "reward_std": 0.45617077499628067, "rewards/correctness_reward_func": 0.3671875, "rewards/strict_format_reward_func": 0.427734375, "step": 155 }, { "completion_length": 286.06640625, "epoch": 0.18150087260034903, "grad_norm": 43.77812238882512, "kl": 2.6201171875, "learning_rate": 9.410994489596153e-07, "loss": 0.105, "reward": 0.904296875, "reward_std": 0.4255755990743637, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.451171875, "step": 156 }, { "completion_length": 241.26953125, "epoch": 0.18266433973240256, "grad_norm": 10.84079560138156, "kl": 1.619384765625, "learning_rate": 9.402083767356957e-07, "loss": 0.0646, "reward": 0.904296875, "reward_std": 0.3274926654994488, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.458984375, "step": 157 }, { "completion_length": 248.51171875, "epoch": 0.18382780686445607, "grad_norm": 65.24252743780556, "kl": 2.162109375, "learning_rate": 9.393110431682721e-07, "loss": 0.0867, "reward": 0.85546875, "reward_std": 0.3152204602956772, "rewards/correctness_reward_func": 0.390625, "rewards/strict_format_reward_func": 0.46484375, "step": 158 }, { "completion_length": 283.03125, "epoch": 0.1849912739965096, "grad_norm": 47.62423123432252, "kl": 0.57366943359375, "learning_rate": 9.384074610206493e-07, "loss": 0.023, "reward": 0.95703125, "reward_std": 0.458492249250412, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.45703125, "step": 159 }, { "completion_length": 286.9921875, "epoch": 0.18615474112856312, "grad_norm": 18.91448421977711, "kl": 1.2669677734375, "learning_rate": 9.374976431450094e-07, "loss": 0.0508, "reward": 0.943359375, "reward_std": 0.33836888894438744, "rewards/correctness_reward_func": 0.484375, "rewards/strict_format_reward_func": 0.458984375, "step": 160 }, { "completion_length": 279.0, "epoch": 0.18731820826061663, "grad_norm": 706.4286890306558, "kl": 2.40673828125, "learning_rate": 9.365816024822288e-07, "loss": 0.0961, "reward": 0.9609375, "reward_std": 0.39441975951194763, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.453125, "step": 161 }, { "completion_length": 292.0234375, "epoch": 0.18848167539267016, "grad_norm": 17.463248908977885, "kl": 0.15557861328125, "learning_rate": 9.356593520616946e-07, "loss": 0.0062, "reward": 0.943359375, "reward_std": 0.3664560765028, "rewards/correctness_reward_func": 0.4921875, "rewards/strict_format_reward_func": 0.451171875, "step": 162 }, { "completion_length": 331.6328125, "epoch": 0.18964514252472367, "grad_norm": 2.249132481438475, "kl": 0.0911865234375, "learning_rate": 9.347309050011186e-07, "loss": 0.0036, "reward": 0.765625, "reward_std": 0.37423864006996155, "rewards/correctness_reward_func": 0.3125, "rewards/strict_format_reward_func": 0.453125, "step": 163 }, { "completion_length": 297.85546875, "epoch": 0.1908086096567772, "grad_norm": 0.5787880880323766, "kl": 0.0926513671875, "learning_rate": 9.337962745063512e-07, "loss": 0.0037, "reward": 0.810546875, "reward_std": 0.49850574135780334, "rewards/correctness_reward_func": 0.359375, "rewards/strict_format_reward_func": 0.451171875, "step": 164 }, { "completion_length": 265.62109375, "epoch": 0.19197207678883071, "grad_norm": 2.6837086070654985, "kl": 0.05865478515625, "learning_rate": 9.328554738711935e-07, "loss": 0.0023, "reward": 0.921875, "reward_std": 0.513519324362278, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.4609375, "step": 165 }, { "completion_length": 246.55859375, "epoch": 0.19313554392088422, "grad_norm": 3.4134385495340216, "kl": 0.2457275390625, "learning_rate": 9.31908516477208e-07, "loss": 0.0098, "reward": 0.89453125, "reward_std": 0.4189570024609566, "rewards/correctness_reward_func": 0.4375, "rewards/strict_format_reward_func": 0.45703125, "step": 166 }, { "completion_length": 246.66015625, "epoch": 0.19429901105293776, "grad_norm": 0.9230642941742443, "kl": 0.11737060546875, "learning_rate": 9.309554157935286e-07, "loss": 0.0047, "reward": 0.96875, "reward_std": 0.4920966625213623, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.4609375, "step": 167 }, { "completion_length": 225.83984375, "epoch": 0.19546247818499127, "grad_norm": 2.8669767245130577, "kl": 0.05242919921875, "learning_rate": 9.299961853766689e-07, "loss": 0.0021, "reward": 0.978515625, "reward_std": 0.443370521068573, "rewards/correctness_reward_func": 0.5234375, "rewards/strict_format_reward_func": 0.455078125, "step": 168 }, { "completion_length": 273.70703125, "epoch": 0.1966259453170448, "grad_norm": 0.9798307739601261, "kl": 0.08355712890625, "learning_rate": 9.290308388703288e-07, "loss": 0.0033, "reward": 0.791015625, "reward_std": 0.32959311455488205, "rewards/correctness_reward_func": 0.328125, "rewards/strict_format_reward_func": 0.462890625, "step": 169 }, { "completion_length": 306.48828125, "epoch": 0.1977894124490983, "grad_norm": 16.018378185855333, "kl": 0.63409423828125, "learning_rate": 9.280593900052014e-07, "loss": 0.0254, "reward": 0.822265625, "reward_std": 0.4224512651562691, "rewards/correctness_reward_func": 0.375, "rewards/strict_format_reward_func": 0.447265625, "step": 170 }, { "completion_length": 307.9375, "epoch": 0.19895287958115182, "grad_norm": 8.264210864082758, "kl": 0.29107666015625, "learning_rate": 9.270818525987771e-07, "loss": 0.0117, "reward": 0.87890625, "reward_std": 0.4745420068502426, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.43359375, "step": 171 }, { "completion_length": 273.25, "epoch": 0.20011634671320536, "grad_norm": 3200.4657340851295, "kl": 2.255615234375, "learning_rate": 9.260982405551476e-07, "loss": 0.0899, "reward": 0.892578125, "reward_std": 0.49684761464595795, "rewards/correctness_reward_func": 0.4375, "rewards/strict_format_reward_func": 0.455078125, "step": 172 }, { "completion_length": 301.10546875, "epoch": 0.20127981384525886, "grad_norm": 0.4106069684572662, "kl": 0.06402587890625, "learning_rate": 9.251085678648071e-07, "loss": 0.0026, "reward": 0.775390625, "reward_std": 0.3673408329486847, "rewards/correctness_reward_func": 0.3359375, "rewards/strict_format_reward_func": 0.439453125, "step": 173 }, { "completion_length": 306.25, "epoch": 0.2024432809773124, "grad_norm": 2.4526371721107645, "kl": 0.0994873046875, "learning_rate": 9.241128486044542e-07, "loss": 0.004, "reward": 0.94140625, "reward_std": 0.5130884796380997, "rewards/correctness_reward_func": 0.4921875, "rewards/strict_format_reward_func": 0.44921875, "step": 174 }, { "completion_length": 294.6015625, "epoch": 0.2036067481093659, "grad_norm": 7.724202386944696, "kl": 0.11065673828125, "learning_rate": 9.231110969367918e-07, "loss": 0.0044, "reward": 0.8359375, "reward_std": 0.4581919386982918, "rewards/correctness_reward_func": 0.3984375, "rewards/strict_format_reward_func": 0.4375, "step": 175 }, { "completion_length": 254.26953125, "epoch": 0.20477021524141942, "grad_norm": 4.0122059528505085, "kl": 0.31842041015625, "learning_rate": 9.221033271103249e-07, "loss": 0.0127, "reward": 0.92578125, "reward_std": 0.4053529165685177, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.46484375, "step": 176 }, { "completion_length": 309.66796875, "epoch": 0.20593368237347295, "grad_norm": 2.06130360097307, "kl": 0.1171875, "learning_rate": 9.210895534591582e-07, "loss": 0.0047, "reward": 0.8984375, "reward_std": 0.4733218848705292, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.453125, "step": 177 }, { "completion_length": 323.63671875, "epoch": 0.20709714950552646, "grad_norm": 5.004188106979899, "kl": 0.0877685546875, "learning_rate": 9.200697904027927e-07, "loss": 0.0035, "reward": 0.783203125, "reward_std": 0.4214430972933769, "rewards/correctness_reward_func": 0.34375, "rewards/strict_format_reward_func": 0.439453125, "step": 178 }, { "completion_length": 305.921875, "epoch": 0.20826061663758, "grad_norm": 1.6668849296659807, "kl": 0.0859375, "learning_rate": 9.190440524459202e-07, "loss": 0.0034, "reward": 0.916015625, "reward_std": 0.48984475433826447, "rewards/correctness_reward_func": 0.46875, "rewards/strict_format_reward_func": 0.447265625, "step": 179 }, { "completion_length": 246.859375, "epoch": 0.2094240837696335, "grad_norm": 3.1773447652134212, "kl": 0.0968017578125, "learning_rate": 9.18012354178217e-07, "loss": 0.0039, "reward": 1.044921875, "reward_std": 0.49948541074991226, "rewards/correctness_reward_func": 0.5859375, "rewards/strict_format_reward_func": 0.458984375, "step": 180 }, { "completion_length": 270.046875, "epoch": 0.21058755090168702, "grad_norm": 12.240600235073, "kl": 0.146240234375, "learning_rate": 9.16974710274136e-07, "loss": 0.0059, "reward": 0.888671875, "reward_std": 0.4256081059575081, "rewards/correctness_reward_func": 0.4375, "rewards/strict_format_reward_func": 0.451171875, "step": 181 }, { "completion_length": 277.4921875, "epoch": 0.21175101803374055, "grad_norm": 41.948078009277715, "kl": 0.93603515625, "learning_rate": 9.159311354926989e-07, "loss": 0.0376, "reward": 0.994140625, "reward_std": 0.42895379662513733, "rewards/correctness_reward_func": 0.53125, "rewards/strict_format_reward_func": 0.462890625, "step": 182 }, { "completion_length": 270.48046875, "epoch": 0.21291448516579406, "grad_norm": 1.2973502608915635, "kl": 0.06439208984375, "learning_rate": 9.148816446772858e-07, "loss": 0.0026, "reward": 0.755859375, "reward_std": 0.29502545297145844, "rewards/correctness_reward_func": 0.3046875, "rewards/strict_format_reward_func": 0.451171875, "step": 183 }, { "completion_length": 297.19921875, "epoch": 0.2140779522978476, "grad_norm": 2.500567548574192, "kl": 0.09307861328125, "learning_rate": 9.138262527554237e-07, "loss": 0.0037, "reward": 0.91796875, "reward_std": 0.4057604745030403, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.45703125, "step": 184 }, { "completion_length": 226.44140625, "epoch": 0.2152414194299011, "grad_norm": 14.743103963897818, "kl": 0.131103515625, "learning_rate": 9.127649747385748e-07, "loss": 0.0052, "reward": 1.005859375, "reward_std": 0.48278648406267166, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.466796875, "step": 185 }, { "completion_length": 295.98046875, "epoch": 0.2164048865619546, "grad_norm": 2.599485686164368, "kl": 0.10174560546875, "learning_rate": 9.116978257219223e-07, "loss": 0.0041, "reward": 0.900390625, "reward_std": 0.4165002331137657, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.455078125, "step": 186 }, { "completion_length": 273.453125, "epoch": 0.21756835369400815, "grad_norm": 1.9090617402205379, "kl": 0.08551025390625, "learning_rate": 9.106248208841568e-07, "loss": 0.0034, "reward": 0.875, "reward_std": 0.3356376253068447, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.46875, "step": 187 }, { "completion_length": 262.65625, "epoch": 0.21873182082606166, "grad_norm": 144.5858579696907, "kl": 1.45574951171875, "learning_rate": 9.095459754872588e-07, "loss": 0.0584, "reward": 0.818359375, "reward_std": 0.437059473246336, "rewards/correctness_reward_func": 0.3515625, "rewards/strict_format_reward_func": 0.466796875, "step": 188 }, { "completion_length": 292.56640625, "epoch": 0.2198952879581152, "grad_norm": 1.0988973248113927, "kl": 0.07281494140625, "learning_rate": 9.084613048762833e-07, "loss": 0.0029, "reward": 0.90625, "reward_std": 0.5009823776781559, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.4453125, "step": 189 }, { "completion_length": 297.6640625, "epoch": 0.2210587550901687, "grad_norm": 0.7765766278689193, "kl": 0.0587158203125, "learning_rate": 9.073708244791405e-07, "loss": 0.0023, "reward": 0.9609375, "reward_std": 0.39268171787261963, "rewards/correctness_reward_func": 0.4921875, "rewards/strict_format_reward_func": 0.46875, "step": 190 }, { "completion_length": 259.953125, "epoch": 0.2222222222222222, "grad_norm": 0.7753362260289501, "kl": 0.05169677734375, "learning_rate": 9.062745498063764e-07, "loss": 0.0021, "reward": 0.775390625, "reward_std": 0.2924235537648201, "rewards/correctness_reward_func": 0.3125, "rewards/strict_format_reward_func": 0.462890625, "step": 191 }, { "completion_length": 347.28515625, "epoch": 0.22338568935427575, "grad_norm": 1.6977885523114584, "kl": 0.0623779296875, "learning_rate": 9.051724964509526e-07, "loss": 0.0025, "reward": 0.88671875, "reward_std": 0.3994247317314148, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.45703125, "step": 192 }, { "completion_length": 265.41015625, "epoch": 0.22454915648632925, "grad_norm": 18.19050532587102, "kl": 0.7923583984375, "learning_rate": 9.040646800880242e-07, "loss": 0.0317, "reward": 0.892578125, "reward_std": 0.5228888541460037, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.439453125, "step": 193 }, { "completion_length": 312.88671875, "epoch": 0.2257126236183828, "grad_norm": 0.8340243461295392, "kl": 0.0703125, "learning_rate": 9.029511164747175e-07, "loss": 0.0028, "reward": 0.83984375, "reward_std": 0.3894122242927551, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.43359375, "step": 194 }, { "completion_length": 286.765625, "epoch": 0.2268760907504363, "grad_norm": 1.200161776433764, "kl": 0.05950927734375, "learning_rate": 9.018318214499041e-07, "loss": 0.0024, "reward": 0.8203125, "reward_std": 0.3945379853248596, "rewards/correctness_reward_func": 0.3671875, "rewards/strict_format_reward_func": 0.453125, "step": 195 }, { "completion_length": 247.5234375, "epoch": 0.2280395578824898, "grad_norm": 0.3212206681274319, "kl": 0.06475830078125, "learning_rate": 9.007068109339783e-07, "loss": 0.0026, "reward": 0.94921875, "reward_std": 0.41352738067507744, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.44921875, "step": 196 }, { "completion_length": 242.36328125, "epoch": 0.22920302501454334, "grad_norm": 98.99732039672644, "kl": 0.5511474609375, "learning_rate": 8.995761009286282e-07, "loss": 0.0221, "reward": 0.99609375, "reward_std": 0.35693345218896866, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.44140625, "step": 197 }, { "completion_length": 251.578125, "epoch": 0.23036649214659685, "grad_norm": 1.7688888702316456, "kl": 0.063720703125, "learning_rate": 8.984397075166095e-07, "loss": 0.0026, "reward": 0.904296875, "reward_std": 0.500565767288208, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.458984375, "step": 198 }, { "completion_length": 289.9921875, "epoch": 0.2315299592786504, "grad_norm": 1.0097855862170513, "kl": 0.0572509765625, "learning_rate": 8.97297646861516e-07, "loss": 0.0023, "reward": 0.830078125, "reward_std": 0.39669080078601837, "rewards/correctness_reward_func": 0.3828125, "rewards/strict_format_reward_func": 0.447265625, "step": 199 }, { "completion_length": 254.05859375, "epoch": 0.2326934264107039, "grad_norm": 1.2860517043327864, "kl": 0.0604248046875, "learning_rate": 8.96149935207551e-07, "loss": 0.0024, "reward": 0.96875, "reward_std": 0.3961385078728199, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.4609375, "step": 200 }, { "completion_length": 267.1796875, "epoch": 0.2338568935427574, "grad_norm": 216.07020895733496, "kl": 3.01824951171875, "learning_rate": 8.94996588879294e-07, "loss": 0.1211, "reward": 0.84765625, "reward_std": 0.3079614117741585, "rewards/correctness_reward_func": 0.3828125, "rewards/strict_format_reward_func": 0.46484375, "step": 201 }, { "completion_length": 277.359375, "epoch": 0.23502036067481094, "grad_norm": 14.236316898563077, "kl": 0.18560791015625, "learning_rate": 8.938376242814705e-07, "loss": 0.0074, "reward": 0.892578125, "reward_std": 0.41803978383541107, "rewards/correctness_reward_func": 0.4375, "rewards/strict_format_reward_func": 0.455078125, "step": 202 }, { "completion_length": 245.83984375, "epoch": 0.23618382780686445, "grad_norm": 1.8628070673857409, "kl": 0.06121826171875, "learning_rate": 8.926730578987181e-07, "loss": 0.0025, "reward": 0.89453125, "reward_std": 0.4060286432504654, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.46484375, "step": 203 }, { "completion_length": 244.58203125, "epoch": 0.23734729493891799, "grad_norm": 3.390211545940989, "kl": 0.11639404296875, "learning_rate": 8.915029062953513e-07, "loss": 0.0047, "reward": 0.767578125, "reward_std": 0.4031776264309883, "rewards/correctness_reward_func": 0.3125, "rewards/strict_format_reward_func": 0.455078125, "step": 204 }, { "completion_length": 275.49609375, "epoch": 0.2385107620709715, "grad_norm": 75.48602943120761, "kl": 0.275634765625, "learning_rate": 8.903271861151271e-07, "loss": 0.011, "reward": 0.85546875, "reward_std": 0.4410543665289879, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.44921875, "step": 205 }, { "completion_length": 279.046875, "epoch": 0.239674229203025, "grad_norm": 3.069393351490843, "kl": 0.0888671875, "learning_rate": 8.891459140810074e-07, "loss": 0.0036, "reward": 0.87109375, "reward_std": 0.42865990847349167, "rewards/correctness_reward_func": 0.3984375, "rewards/strict_format_reward_func": 0.47265625, "step": 206 }, { "completion_length": 229.71875, "epoch": 0.24083769633507854, "grad_norm": 257.1110061312289, "kl": 0.39166259765625, "learning_rate": 8.879591069949214e-07, "loss": 0.0156, "reward": 1.087890625, "reward_std": 0.41851507127285004, "rewards/correctness_reward_func": 0.625, "rewards/strict_format_reward_func": 0.462890625, "step": 207 }, { "completion_length": 268.30859375, "epoch": 0.24200116346713205, "grad_norm": 5.18015788501216, "kl": 0.06884765625, "learning_rate": 8.867667817375265e-07, "loss": 0.0028, "reward": 1.0, "reward_std": 0.40127987414598465, "rewards/correctness_reward_func": 0.53125, "rewards/strict_format_reward_func": 0.46875, "step": 208 }, { "completion_length": 309.453125, "epoch": 0.24316463059918558, "grad_norm": 1.5247313005216812, "kl": 0.0855712890625, "learning_rate": 8.855689552679685e-07, "loss": 0.0034, "reward": 0.859375, "reward_std": 0.3662661015987396, "rewards/correctness_reward_func": 0.421875, "rewards/strict_format_reward_func": 0.4375, "step": 209 }, { "completion_length": 249.80859375, "epoch": 0.2443280977312391, "grad_norm": 39.736817702011244, "kl": 0.3192138671875, "learning_rate": 8.8436564462364e-07, "loss": 0.0128, "reward": 0.96484375, "reward_std": 0.4362768158316612, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.45703125, "step": 210 }, { "completion_length": 261.7265625, "epoch": 0.2454915648632926, "grad_norm": 15.893621049720416, "kl": 0.10369873046875, "learning_rate": 8.831568669199385e-07, "loss": 0.0041, "reward": 0.8671875, "reward_std": 0.4317528046667576, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.4609375, "step": 211 }, { "completion_length": 271.015625, "epoch": 0.24665503199534614, "grad_norm": 12.434936387080116, "kl": 0.25006103515625, "learning_rate": 8.819426393500225e-07, "loss": 0.01, "reward": 0.951171875, "reward_std": 0.37150320410728455, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.451171875, "step": 212 }, { "completion_length": 229.7265625, "epoch": 0.24781849912739964, "grad_norm": 0.24517256060093215, "kl": 0.06121826171875, "learning_rate": 8.807229791845671e-07, "loss": 0.0024, "reward": 1.048828125, "reward_std": 0.4287170581519604, "rewards/correctness_reward_func": 0.5703125, "rewards/strict_format_reward_func": 0.478515625, "step": 213 }, { "completion_length": 249.890625, "epoch": 0.24898196625945318, "grad_norm": 10.424118290551732, "kl": 0.10345458984375, "learning_rate": 8.794979037715189e-07, "loss": 0.0041, "reward": 0.923828125, "reward_std": 0.4381677731871605, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.462890625, "step": 214 }, { "completion_length": 235.921875, "epoch": 0.2501454333915067, "grad_norm": 0.17927072689070433, "kl": 0.050537109375, "learning_rate": 8.782674305358479e-07, "loss": 0.002, "reward": 1.0, "reward_std": 0.3274492807686329, "rewards/correctness_reward_func": 0.53125, "rewards/strict_format_reward_func": 0.46875, "step": 215 }, { "completion_length": 245.80859375, "epoch": 0.2513089005235602, "grad_norm": 0.9707131681039443, "kl": 0.09100341796875, "learning_rate": 8.770315769793014e-07, "loss": 0.0036, "reward": 1.052734375, "reward_std": 0.5058201253414154, "rewards/correctness_reward_func": 0.578125, "rewards/strict_format_reward_func": 0.474609375, "step": 216 }, { "completion_length": 271.625, "epoch": 0.2524723676556137, "grad_norm": 41.59256359151637, "kl": 0.64581298828125, "learning_rate": 8.757903606801535e-07, "loss": 0.0259, "reward": 0.8515625, "reward_std": 0.3625856637954712, "rewards/correctness_reward_func": 0.390625, "rewards/strict_format_reward_func": 0.4609375, "step": 217 }, { "completion_length": 262.85546875, "epoch": 0.25363583478766727, "grad_norm": 1.8738118800483519, "kl": 0.14483642578125, "learning_rate": 8.745437992929565e-07, "loss": 0.0058, "reward": 0.810546875, "reward_std": 0.3701170086860657, "rewards/correctness_reward_func": 0.375, "rewards/strict_format_reward_func": 0.435546875, "step": 218 }, { "completion_length": 257.62109375, "epoch": 0.2547993019197208, "grad_norm": 0.8914019290591263, "kl": 0.12396240234375, "learning_rate": 8.732919105482881e-07, "loss": 0.0049, "reward": 0.849609375, "reward_std": 0.4399448484182358, "rewards/correctness_reward_func": 0.3984375, "rewards/strict_format_reward_func": 0.451171875, "step": 219 }, { "completion_length": 277.71875, "epoch": 0.2559627690517743, "grad_norm": 2.969398572217837, "kl": 0.15771484375, "learning_rate": 8.720347122525009e-07, "loss": 0.0063, "reward": 1.0, "reward_std": 0.3545645773410797, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.4609375, "step": 220 }, { "completion_length": 279.62109375, "epoch": 0.2571262361838278, "grad_norm": 4.125833185783897, "kl": 0.09295654296875, "learning_rate": 8.707722222874682e-07, "loss": 0.0037, "reward": 0.880859375, "reward_std": 0.46208474040031433, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.451171875, "step": 221 }, { "completion_length": 259.5234375, "epoch": 0.2582897033158813, "grad_norm": 2.4114455395815115, "kl": 0.07281494140625, "learning_rate": 8.695044586103295e-07, "loss": 0.0029, "reward": 0.91796875, "reward_std": 0.5112674385309219, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.46484375, "step": 222 }, { "completion_length": 257.81640625, "epoch": 0.25945317044793487, "grad_norm": 57.81421224819658, "kl": 5.58099365234375, "learning_rate": 8.682314392532359e-07, "loss": 0.2241, "reward": 0.9609375, "reward_std": 0.44792595505714417, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.453125, "step": 223 }, { "completion_length": 251.2578125, "epoch": 0.2606166375799884, "grad_norm": 14.619825092697099, "kl": 0.201416015625, "learning_rate": 8.669531823230927e-07, "loss": 0.0081, "reward": 0.841796875, "reward_std": 0.44307927787303925, "rewards/correctness_reward_func": 0.3828125, "rewards/strict_format_reward_func": 0.458984375, "step": 224 }, { "completion_length": 276.9296875, "epoch": 0.2617801047120419, "grad_norm": 0.9309110503969785, "kl": 0.075439453125, "learning_rate": 8.656697060013027e-07, "loss": 0.003, "reward": 0.79296875, "reward_std": 0.3411271572113037, "rewards/correctness_reward_func": 0.3359375, "rewards/strict_format_reward_func": 0.45703125, "step": 225 }, { "completion_length": 257.5625, "epoch": 0.2629435718440954, "grad_norm": 2.2623431917976444, "kl": 0.0533447265625, "learning_rate": 8.643810285435068e-07, "loss": 0.0021, "reward": 1.12890625, "reward_std": 0.44606152176856995, "rewards/correctness_reward_func": 0.6640625, "rewards/strict_format_reward_func": 0.46484375, "step": 226 }, { "completion_length": 251.50390625, "epoch": 0.2641070389761489, "grad_norm": 0.5070392787028914, "kl": 0.05908203125, "learning_rate": 8.630871682793251e-07, "loss": 0.0024, "reward": 0.876953125, "reward_std": 0.490783266723156, "rewards/correctness_reward_func": 0.421875, "rewards/strict_format_reward_func": 0.455078125, "step": 227 }, { "completion_length": 246.296875, "epoch": 0.26527050610820246, "grad_norm": 4.2956236196994295, "kl": 0.11309814453125, "learning_rate": 8.617881436120955e-07, "loss": 0.0045, "reward": 0.978515625, "reward_std": 0.4138845205307007, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.470703125, "step": 228 }, { "completion_length": 277.875, "epoch": 0.266433973240256, "grad_norm": 5.216760237054544, "kl": 0.07476806640625, "learning_rate": 8.604839730186124e-07, "loss": 0.003, "reward": 0.9609375, "reward_std": 0.34066715836524963, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.4609375, "step": 229 }, { "completion_length": 235.68359375, "epoch": 0.2675974403723095, "grad_norm": 0.37867574437516244, "kl": 0.05914306640625, "learning_rate": 8.591746750488637e-07, "loss": 0.0024, "reward": 0.966796875, "reward_std": 0.3957118056714535, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.458984375, "step": 230 }, { "completion_length": 213.26953125, "epoch": 0.268760907504363, "grad_norm": 1.9972681020554086, "kl": 0.0601806640625, "learning_rate": 8.578602683257672e-07, "loss": 0.0024, "reward": 1.01953125, "reward_std": 0.41779596731066704, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.46484375, "step": 231 }, { "completion_length": 242.1015625, "epoch": 0.2699243746364165, "grad_norm": 11.973552072884946, "kl": 0.23095703125, "learning_rate": 8.565407715449053e-07, "loss": 0.0093, "reward": 1.095703125, "reward_std": 0.4449344128370285, "rewards/correctness_reward_func": 0.6328125, "rewards/strict_format_reward_func": 0.462890625, "step": 232 }, { "completion_length": 279.46484375, "epoch": 0.27108784176847006, "grad_norm": 1.1122488695552584, "kl": 0.0611572265625, "learning_rate": 8.55216203474259e-07, "loss": 0.0024, "reward": 0.96875, "reward_std": 0.3511844053864479, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.453125, "step": 233 }, { "completion_length": 296.06640625, "epoch": 0.27225130890052357, "grad_norm": 7.656994517477282, "kl": 0.205078125, "learning_rate": 8.538865829539419e-07, "loss": 0.0082, "reward": 0.923828125, "reward_std": 0.4097987338900566, "rewards/correctness_reward_func": 0.46875, "rewards/strict_format_reward_func": 0.455078125, "step": 234 }, { "completion_length": 307.84375, "epoch": 0.2734147760325771, "grad_norm": 3.4082599384877392, "kl": 0.062255859375, "learning_rate": 8.525519288959313e-07, "loss": 0.0025, "reward": 0.87890625, "reward_std": 0.42585042119026184, "rewards/correctness_reward_func": 0.4375, "rewards/strict_format_reward_func": 0.44140625, "step": 235 }, { "completion_length": 253.703125, "epoch": 0.2745782431646306, "grad_norm": 2.4114918782987775, "kl": 0.05999755859375, "learning_rate": 8.512122602837992e-07, "loss": 0.0024, "reward": 0.904296875, "reward_std": 0.4316757544875145, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.474609375, "step": 236 }, { "completion_length": 272.37109375, "epoch": 0.2757417102966841, "grad_norm": 9.644382127722391, "kl": 0.19000244140625, "learning_rate": 8.498675961724429e-07, "loss": 0.0076, "reward": 0.884765625, "reward_std": 0.43766825273633003, "rewards/correctness_reward_func": 0.4140625, "rewards/strict_format_reward_func": 0.470703125, "step": 237 }, { "completion_length": 261.38671875, "epoch": 0.27690517742873766, "grad_norm": 3.515642014039953, "kl": 0.08984375, "learning_rate": 8.485179556878138e-07, "loss": 0.0036, "reward": 0.875, "reward_std": 0.41589029133319855, "rewards/correctness_reward_func": 0.4140625, "rewards/strict_format_reward_func": 0.4609375, "step": 238 }, { "completion_length": 271.96875, "epoch": 0.27806864456079117, "grad_norm": 14.948644348843615, "kl": 0.6575927734375, "learning_rate": 8.471633580266441e-07, "loss": 0.0262, "reward": 0.904296875, "reward_std": 0.4234832860529423, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.458984375, "step": 239 }, { "completion_length": 293.19921875, "epoch": 0.2792321116928447, "grad_norm": 0.3016247153844768, "kl": 0.0540771484375, "learning_rate": 8.458038224561768e-07, "loss": 0.0022, "reward": 1.005859375, "reward_std": 0.42349937558174133, "rewards/correctness_reward_func": 0.53125, "rewards/strict_format_reward_func": 0.474609375, "step": 240 }, { "completion_length": 305.3515625, "epoch": 0.2803955788248982, "grad_norm": 5.910672503037428, "kl": 0.08807373046875, "learning_rate": 8.444393683138875e-07, "loss": 0.0035, "reward": 0.939453125, "reward_std": 0.4484020173549652, "rewards/correctness_reward_func": 0.4921875, "rewards/strict_format_reward_func": 0.447265625, "step": 241 }, { "completion_length": 261.95703125, "epoch": 0.2815590459569517, "grad_norm": 2.177794071987546, "kl": 0.0711669921875, "learning_rate": 8.430700150072134e-07, "loss": 0.0028, "reward": 0.91796875, "reward_std": 0.4672437012195587, "rewards/correctness_reward_func": 0.46875, "rewards/strict_format_reward_func": 0.44921875, "step": 242 }, { "completion_length": 241.3359375, "epoch": 0.28272251308900526, "grad_norm": 1.9734494196317345, "kl": 0.10162353515625, "learning_rate": 8.416957820132741e-07, "loss": 0.0041, "reward": 0.947265625, "reward_std": 0.4113655164837837, "rewards/correctness_reward_func": 0.4765625, "rewards/strict_format_reward_func": 0.470703125, "step": 243 }, { "completion_length": 243.72265625, "epoch": 0.28388598022105876, "grad_norm": 5.9172328940788805, "kl": 0.145751953125, "learning_rate": 8.40316688878597e-07, "loss": 0.0058, "reward": 0.83984375, "reward_std": 0.2976169250905514, "rewards/correctness_reward_func": 0.375, "rewards/strict_format_reward_func": 0.46484375, "step": 244 }, { "completion_length": 313.265625, "epoch": 0.2850494473531123, "grad_norm": 0.7045576031376994, "kl": 0.05126953125, "learning_rate": 8.389327552188367e-07, "loss": 0.0021, "reward": 0.76171875, "reward_std": 0.3399813622236252, "rewards/correctness_reward_func": 0.296875, "rewards/strict_format_reward_func": 0.46484375, "step": 245 }, { "completion_length": 278.32421875, "epoch": 0.2862129144851658, "grad_norm": 99.97606047197375, "kl": 1.1429443359375, "learning_rate": 8.375440007184991e-07, "loss": 0.0456, "reward": 0.818359375, "reward_std": 0.3586278632283211, "rewards/correctness_reward_func": 0.3515625, "rewards/strict_format_reward_func": 0.466796875, "step": 246 }, { "completion_length": 300.86328125, "epoch": 0.2873763816172193, "grad_norm": 1.5945868074054774, "kl": 0.06781005859375, "learning_rate": 8.361504451306584e-07, "loss": 0.0027, "reward": 0.85546875, "reward_std": 0.3837997391819954, "rewards/correctness_reward_func": 0.390625, "rewards/strict_format_reward_func": 0.46484375, "step": 247 }, { "completion_length": 258.60546875, "epoch": 0.28853984874927285, "grad_norm": 0.8197257017390418, "kl": 0.06689453125, "learning_rate": 8.347521082766782e-07, "loss": 0.0027, "reward": 0.97265625, "reward_std": 0.3784537613391876, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.46484375, "step": 248 }, { "completion_length": 292.2578125, "epoch": 0.28970331588132636, "grad_norm": 0.7309425742422451, "kl": 0.07720947265625, "learning_rate": 8.333490100459288e-07, "loss": 0.0031, "reward": 0.8046875, "reward_std": 0.35429640859365463, "rewards/correctness_reward_func": 0.34375, "rewards/strict_format_reward_func": 0.4609375, "step": 249 }, { "completion_length": 259.3515625, "epoch": 0.29086678301337987, "grad_norm": 4.322168877458447, "kl": 0.0703125, "learning_rate": 8.319411703955041e-07, "loss": 0.0028, "reward": 0.94921875, "reward_std": 0.4805869683623314, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.44921875, "step": 250 }, { "completion_length": 244.33984375, "epoch": 0.2920302501454334, "grad_norm": 5.481884722361933, "kl": 0.04974365234375, "learning_rate": 8.305286093499385e-07, "loss": 0.002, "reward": 0.90234375, "reward_std": 0.4274512976408005, "rewards/correctness_reward_func": 0.4375, "rewards/strict_format_reward_func": 0.46484375, "step": 251 }, { "completion_length": 261.0859375, "epoch": 0.2931937172774869, "grad_norm": 1.3985501287819047, "kl": 0.05426025390625, "learning_rate": 8.291113470009212e-07, "loss": 0.0022, "reward": 1.056640625, "reward_std": 0.45268743485212326, "rewards/correctness_reward_func": 0.6015625, "rewards/strict_format_reward_func": 0.455078125, "step": 252 }, { "completion_length": 289.73828125, "epoch": 0.29435718440954045, "grad_norm": 1.714135055634622, "kl": 0.05889892578125, "learning_rate": 8.276894035070105e-07, "loss": 0.0024, "reward": 0.796875, "reward_std": 0.4572765380144119, "rewards/correctness_reward_func": 0.359375, "rewards/strict_format_reward_func": 0.4375, "step": 253 }, { "completion_length": 287.546875, "epoch": 0.29552065154159396, "grad_norm": 2.480524422047597, "kl": 0.2305908203125, "learning_rate": 8.262627990933484e-07, "loss": 0.0092, "reward": 0.896484375, "reward_std": 0.4636228308081627, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.443359375, "step": 254 }, { "completion_length": 290.16015625, "epoch": 0.29668411867364747, "grad_norm": 1.2452478379651943, "kl": 0.0716552734375, "learning_rate": 8.248315540513707e-07, "loss": 0.0029, "reward": 0.798828125, "reward_std": 0.40846820175647736, "rewards/correctness_reward_func": 0.3671875, "rewards/strict_format_reward_func": 0.431640625, "step": 255 }, { "completion_length": 253.73828125, "epoch": 0.297847585805701, "grad_norm": 5.075664922927079, "kl": 0.0672607421875, "learning_rate": 8.233956887385207e-07, "loss": 0.0027, "reward": 0.83203125, "reward_std": 0.4403337463736534, "rewards/correctness_reward_func": 0.3671875, "rewards/strict_format_reward_func": 0.46484375, "step": 256 }, { "completion_length": 276.9609375, "epoch": 0.2990110529377545, "grad_norm": 0.8226937343365959, "kl": 0.0517578125, "learning_rate": 8.219552235779578e-07, "loss": 0.0021, "reward": 0.970703125, "reward_std": 0.4804186299443245, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.462890625, "step": 257 }, { "completion_length": 289.42578125, "epoch": 0.30017452006980805, "grad_norm": 6.122444900156605, "kl": 0.17572021484375, "learning_rate": 8.20510179058268e-07, "loss": 0.007, "reward": 0.96875, "reward_std": 0.4253893867135048, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.4609375, "step": 258 }, { "completion_length": 280.1796875, "epoch": 0.30133798720186156, "grad_norm": 18.107601314399282, "kl": 0.3717041015625, "learning_rate": 8.190605757331721e-07, "loss": 0.0149, "reward": 0.912109375, "reward_std": 0.3370118737220764, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.458984375, "step": 259 }, { "completion_length": 346.0078125, "epoch": 0.30250145433391507, "grad_norm": 0.8312861253568518, "kl": 0.0518798828125, "learning_rate": 8.176064342212338e-07, "loss": 0.0021, "reward": 0.76171875, "reward_std": 0.3621474578976631, "rewards/correctness_reward_func": 0.3046875, "rewards/strict_format_reward_func": 0.45703125, "step": 260 }, { "completion_length": 267.34765625, "epoch": 0.3036649214659686, "grad_norm": 0.7855951706430839, "kl": 0.06011962890625, "learning_rate": 8.161477752055659e-07, "loss": 0.0024, "reward": 0.982421875, "reward_std": 0.47281666100025177, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.466796875, "step": 261 }, { "completion_length": 288.26953125, "epoch": 0.3048283885980221, "grad_norm": 23.20227789560969, "kl": 0.1376953125, "learning_rate": 8.14684619433536e-07, "loss": 0.0055, "reward": 0.755859375, "reward_std": 0.4141215980052948, "rewards/correctness_reward_func": 0.3203125, "rewards/strict_format_reward_func": 0.435546875, "step": 262 }, { "completion_length": 287.57421875, "epoch": 0.30599185573007565, "grad_norm": 0.7923305469166532, "kl": 0.08819580078125, "learning_rate": 8.132169877164722e-07, "loss": 0.0035, "reward": 0.9453125, "reward_std": 0.36005185544490814, "rewards/correctness_reward_func": 0.484375, "rewards/strict_format_reward_func": 0.4609375, "step": 263 }, { "completion_length": 301.91015625, "epoch": 0.30715532286212915, "grad_norm": 8.734166868820484, "kl": 0.37481689453125, "learning_rate": 8.117449009293668e-07, "loss": 0.015, "reward": 0.80859375, "reward_std": 0.3769505098462105, "rewards/correctness_reward_func": 0.3359375, "rewards/strict_format_reward_func": 0.47265625, "step": 264 }, { "completion_length": 277.05078125, "epoch": 0.30831878999418266, "grad_norm": 124.18898467280192, "kl": 4.3785400390625, "learning_rate": 8.102683800105782e-07, "loss": 0.1749, "reward": 0.798828125, "reward_std": 0.32617636770009995, "rewards/correctness_reward_func": 0.3359375, "rewards/strict_format_reward_func": 0.462890625, "step": 265 }, { "completion_length": 260.38671875, "epoch": 0.30948225712623617, "grad_norm": 0.3211595075420707, "kl": 0.0693359375, "learning_rate": 8.087874459615353e-07, "loss": 0.0028, "reward": 0.90234375, "reward_std": 0.4565294161438942, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.45703125, "step": 266 }, { "completion_length": 273.12890625, "epoch": 0.3106457242582897, "grad_norm": 2.846858265053996, "kl": 0.1624755859375, "learning_rate": 8.073021198464365e-07, "loss": 0.0065, "reward": 0.87890625, "reward_std": 0.43481577187776566, "rewards/correctness_reward_func": 0.421875, "rewards/strict_format_reward_func": 0.45703125, "step": 267 }, { "completion_length": 338.13671875, "epoch": 0.31180919139034324, "grad_norm": 66.18157859066437, "kl": 0.1065673828125, "learning_rate": 8.058124227919518e-07, "loss": 0.0043, "reward": 0.814453125, "reward_std": 0.3689689412713051, "rewards/correctness_reward_func": 0.3515625, "rewards/strict_format_reward_func": 0.462890625, "step": 268 }, { "completion_length": 270.06640625, "epoch": 0.31297265852239675, "grad_norm": 29.932958374882606, "kl": 0.4578857421875, "learning_rate": 8.043183759869215e-07, "loss": 0.0183, "reward": 0.9453125, "reward_std": 0.32830599695444107, "rewards/correctness_reward_func": 0.4765625, "rewards/strict_format_reward_func": 0.46875, "step": 269 }, { "completion_length": 350.53515625, "epoch": 0.31413612565445026, "grad_norm": 1.2341233477369173, "kl": 0.16387939453125, "learning_rate": 8.02820000682055e-07, "loss": 0.0065, "reward": 0.767578125, "reward_std": 0.4540194571018219, "rewards/correctness_reward_func": 0.3125, "rewards/strict_format_reward_func": 0.455078125, "step": 270 }, { "completion_length": 298.56640625, "epoch": 0.31529959278650377, "grad_norm": 5.069296146615704, "kl": 0.2171630859375, "learning_rate": 8.013173181896282e-07, "loss": 0.0087, "reward": 0.802734375, "reward_std": 0.3336941972374916, "rewards/correctness_reward_func": 0.34375, "rewards/strict_format_reward_func": 0.458984375, "step": 271 }, { "completion_length": 278.13671875, "epoch": 0.3164630599185573, "grad_norm": 2.0297223324635882, "kl": 0.09344482421875, "learning_rate": 7.998103498831809e-07, "loss": 0.0037, "reward": 0.86328125, "reward_std": 0.44476018100976944, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.45703125, "step": 272 }, { "completion_length": 288.4609375, "epoch": 0.31762652705061084, "grad_norm": 14.673360254926452, "kl": 0.974609375, "learning_rate": 7.982991171972129e-07, "loss": 0.0389, "reward": 0.84375, "reward_std": 0.47124050557613373, "rewards/correctness_reward_func": 0.3984375, "rewards/strict_format_reward_func": 0.4453125, "step": 273 }, { "completion_length": 272.5703125, "epoch": 0.31878999418266435, "grad_norm": 191.1981437316452, "kl": 0.686767578125, "learning_rate": 7.967836416268783e-07, "loss": 0.0275, "reward": 0.9453125, "reward_std": 0.48988577723503113, "rewards/correctness_reward_func": 0.4921875, "rewards/strict_format_reward_func": 0.453125, "step": 274 }, { "completion_length": 285.22265625, "epoch": 0.31995346131471786, "grad_norm": 418.5149241283848, "kl": 21.90869140625, "learning_rate": 7.952639447276801e-07, "loss": 0.875, "reward": 0.85546875, "reward_std": 0.34744718298316, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.44921875, "step": 275 }, { "completion_length": 352.55859375, "epoch": 0.32111692844677137, "grad_norm": 5.517463810139425, "kl": 0.1632080078125, "learning_rate": 7.937400481151643e-07, "loss": 0.0066, "reward": 0.87890625, "reward_std": 0.41798410564661026, "rewards/correctness_reward_func": 0.4375, "rewards/strict_format_reward_func": 0.44140625, "step": 276 }, { "completion_length": 303.28515625, "epoch": 0.3222803955788249, "grad_norm": 1.4008525063705992, "kl": 0.05145263671875, "learning_rate": 7.922119734646119e-07, "loss": 0.0021, "reward": 0.953125, "reward_std": 0.4892066866159439, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.453125, "step": 277 }, { "completion_length": 299.9375, "epoch": 0.32344386271087844, "grad_norm": 58.19330025177244, "kl": 2.9036865234375, "learning_rate": 7.906797425107298e-07, "loss": 0.1162, "reward": 0.8125, "reward_std": 0.38126061856746674, "rewards/correctness_reward_func": 0.359375, "rewards/strict_format_reward_func": 0.453125, "step": 278 }, { "completion_length": 296.296875, "epoch": 0.32460732984293195, "grad_norm": 4.988414219498022, "kl": 0.13031005859375, "learning_rate": 7.891433770473435e-07, "loss": 0.0052, "reward": 1.033203125, "reward_std": 0.5251075401902199, "rewards/correctness_reward_func": 0.5859375, "rewards/strict_format_reward_func": 0.447265625, "step": 279 }, { "completion_length": 292.08984375, "epoch": 0.32577079697498545, "grad_norm": 20.782891094112664, "kl": 0.5985107421875, "learning_rate": 7.876028989270854e-07, "loss": 0.024, "reward": 0.87109375, "reward_std": 0.4835294857621193, "rewards/correctness_reward_func": 0.421875, "rewards/strict_format_reward_func": 0.44921875, "step": 280 }, { "completion_length": 283.0, "epoch": 0.32693426410703896, "grad_norm": 2.9478857276101578, "kl": 0.211669921875, "learning_rate": 7.860583300610847e-07, "loss": 0.0085, "reward": 0.755859375, "reward_std": 0.36347752064466476, "rewards/correctness_reward_func": 0.3046875, "rewards/strict_format_reward_func": 0.451171875, "step": 281 }, { "completion_length": 336.16015625, "epoch": 0.32809773123909247, "grad_norm": 28.642832260495947, "kl": 0.7401123046875, "learning_rate": 7.845096924186561e-07, "loss": 0.0296, "reward": 0.859375, "reward_std": 0.35883988067507744, "rewards/correctness_reward_func": 0.421875, "rewards/strict_format_reward_func": 0.4375, "step": 282 }, { "completion_length": 265.828125, "epoch": 0.32926119837114604, "grad_norm": 5.511291997886887, "kl": 0.37017822265625, "learning_rate": 7.829570080269863e-07, "loss": 0.0148, "reward": 0.982421875, "reward_std": 0.41490499675273895, "rewards/correctness_reward_func": 0.53125, "rewards/strict_format_reward_func": 0.451171875, "step": 283 }, { "completion_length": 271.140625, "epoch": 0.33042466550319954, "grad_norm": 2.2050689289934087, "kl": 0.21759033203125, "learning_rate": 7.814002989708218e-07, "loss": 0.0087, "reward": 1.03515625, "reward_std": 0.3657189831137657, "rewards/correctness_reward_func": 0.578125, "rewards/strict_format_reward_func": 0.45703125, "step": 284 }, { "completion_length": 318.1953125, "epoch": 0.33158813263525305, "grad_norm": 4.9603288763846605, "kl": 0.33056640625, "learning_rate": 7.798395873921541e-07, "loss": 0.0132, "reward": 0.90625, "reward_std": 0.4246979430317879, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.4453125, "step": 285 }, { "completion_length": 289.1015625, "epoch": 0.33275159976730656, "grad_norm": 1.1960041437968447, "kl": 0.19573974609375, "learning_rate": 7.782748954899049e-07, "loss": 0.0078, "reward": 0.853515625, "reward_std": 0.42991074174642563, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.447265625, "step": 286 }, { "completion_length": 323.0546875, "epoch": 0.33391506689936007, "grad_norm": 23.115982446058613, "kl": 0.27880859375, "learning_rate": 7.767062455196103e-07, "loss": 0.0111, "reward": 0.9140625, "reward_std": 0.5111727491021156, "rewards/correctness_reward_func": 0.484375, "rewards/strict_format_reward_func": 0.4296875, "step": 287 }, { "completion_length": 283.3125, "epoch": 0.33507853403141363, "grad_norm": 4.5311983331822505, "kl": 0.18994140625, "learning_rate": 7.751336597931048e-07, "loss": 0.0076, "reward": 0.984375, "reward_std": 0.491551011800766, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.4453125, "step": 288 }, { "completion_length": 282.125, "epoch": 0.33624200116346714, "grad_norm": 20.548421378281937, "kl": 0.68701171875, "learning_rate": 7.735571606782029e-07, "loss": 0.0275, "reward": 1.025390625, "reward_std": 0.4382254332304001, "rewards/correctness_reward_func": 0.5703125, "rewards/strict_format_reward_func": 0.455078125, "step": 289 }, { "completion_length": 249.36328125, "epoch": 0.33740546829552065, "grad_norm": 39.49202617743575, "kl": 0.749755859375, "learning_rate": 7.719767705983819e-07, "loss": 0.03, "reward": 1.001953125, "reward_std": 0.4606599807739258, "rewards/correctness_reward_func": 0.5625, "rewards/strict_format_reward_func": 0.439453125, "step": 290 }, { "completion_length": 253.99609375, "epoch": 0.33856893542757416, "grad_norm": 5.972226611756975, "kl": 0.326171875, "learning_rate": 7.703925120324622e-07, "loss": 0.0131, "reward": 0.724609375, "reward_std": 0.419029638171196, "rewards/correctness_reward_func": 0.265625, "rewards/strict_format_reward_func": 0.458984375, "step": 291 }, { "completion_length": 242.32421875, "epoch": 0.33973240255962767, "grad_norm": 0.21582060476978154, "kl": 0.0677490234375, "learning_rate": 7.688044075142887e-07, "loss": 0.0027, "reward": 0.927734375, "reward_std": 0.4823564738035202, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.466796875, "step": 292 }, { "completion_length": 275.3671875, "epoch": 0.34089586969168123, "grad_norm": 3.9678316248011645, "kl": 0.18115234375, "learning_rate": 7.672124796324087e-07, "loss": 0.0072, "reward": 0.95703125, "reward_std": 0.27582550793886185, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.45703125, "step": 293 }, { "completion_length": 292.6171875, "epoch": 0.34205933682373474, "grad_norm": 3.2851559622966175, "kl": 0.27032470703125, "learning_rate": 7.656167510297519e-07, "loss": 0.0108, "reward": 0.892578125, "reward_std": 0.42609231919050217, "rewards/correctness_reward_func": 0.4375, "rewards/strict_format_reward_func": 0.455078125, "step": 294 }, { "completion_length": 313.98828125, "epoch": 0.34322280395578825, "grad_norm": 156.29073698965226, "kl": 1.1085205078125, "learning_rate": 7.640172444033082e-07, "loss": 0.0442, "reward": 0.92578125, "reward_std": 0.47067203372716904, "rewards/correctness_reward_func": 0.484375, "rewards/strict_format_reward_func": 0.44140625, "step": 295 }, { "completion_length": 290.3515625, "epoch": 0.34438627108784176, "grad_norm": 0.595612718703969, "kl": 0.0572509765625, "learning_rate": 7.624139825038039e-07, "loss": 0.0023, "reward": 0.896484375, "reward_std": 0.4766088277101517, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.435546875, "step": 296 }, { "completion_length": 299.75, "epoch": 0.34554973821989526, "grad_norm": 18.309563551611284, "kl": 0.791259765625, "learning_rate": 7.608069881353788e-07, "loss": 0.0316, "reward": 0.9296875, "reward_std": 0.5468408614397049, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.4296875, "step": 297 }, { "completion_length": 286.49609375, "epoch": 0.3467132053519488, "grad_norm": 1.7724310543634978, "kl": 0.122314453125, "learning_rate": 7.591962841552626e-07, "loss": 0.0049, "reward": 1.064453125, "reward_std": 0.3686642274260521, "rewards/correctness_reward_func": 0.609375, "rewards/strict_format_reward_func": 0.455078125, "step": 298 }, { "completion_length": 255.08984375, "epoch": 0.34787667248400234, "grad_norm": 13.25910288165939, "kl": 0.46435546875, "learning_rate": 7.575818934734479e-07, "loss": 0.0186, "reward": 0.908203125, "reward_std": 0.29331541061401367, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.455078125, "step": 299 }, { "completion_length": 293.7578125, "epoch": 0.34904013961605584, "grad_norm": 5.444338562580366, "kl": 0.56414794921875, "learning_rate": 7.559638390523666e-07, "loss": 0.0225, "reward": 0.833984375, "reward_std": 0.3685857132077217, "rewards/correctness_reward_func": 0.375, "rewards/strict_format_reward_func": 0.458984375, "step": 300 }, { "completion_length": 292.828125, "epoch": 0.35020360674810935, "grad_norm": 6.100633991693037, "kl": 0.12646484375, "learning_rate": 7.543421439065612e-07, "loss": 0.0051, "reward": 0.89453125, "reward_std": 0.5025207027792931, "rewards/correctness_reward_func": 0.4375, "rewards/strict_format_reward_func": 0.45703125, "step": 301 }, { "completion_length": 268.34375, "epoch": 0.35136707388016286, "grad_norm": 1.8425490148011872, "kl": 0.10198974609375, "learning_rate": 7.527168311023587e-07, "loss": 0.0041, "reward": 0.958984375, "reward_std": 0.5489888489246368, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.451171875, "step": 302 }, { "completion_length": 293.58203125, "epoch": 0.3525305410122164, "grad_norm": 1.692360446617282, "kl": 0.14959716796875, "learning_rate": 7.510879237575422e-07, "loss": 0.006, "reward": 0.974609375, "reward_std": 0.4490493983030319, "rewards/correctness_reward_func": 0.5234375, "rewards/strict_format_reward_func": 0.451171875, "step": 303 }, { "completion_length": 285.34765625, "epoch": 0.35369400814426993, "grad_norm": 3.508069185952451, "kl": 0.054443359375, "learning_rate": 7.494554450410221e-07, "loss": 0.0022, "reward": 0.875, "reward_std": 0.3934580013155937, "rewards/correctness_reward_func": 0.421875, "rewards/strict_format_reward_func": 0.453125, "step": 304 }, { "completion_length": 269.8828125, "epoch": 0.35485747527632344, "grad_norm": 1.5358802427463545, "kl": 0.098388671875, "learning_rate": 7.478194181725066e-07, "loss": 0.0039, "reward": 0.87890625, "reward_std": 0.49072904884815216, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.44921875, "step": 305 }, { "completion_length": 297.12109375, "epoch": 0.35602094240837695, "grad_norm": 0.3863181294560715, "kl": 0.05889892578125, "learning_rate": 7.46179866422171e-07, "loss": 0.0024, "reward": 0.994140625, "reward_std": 0.39648278057575226, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.455078125, "step": 306 }, { "completion_length": 251.53125, "epoch": 0.35718440954043046, "grad_norm": 4.131246383822348, "kl": 0.059814453125, "learning_rate": 7.445368131103276e-07, "loss": 0.0024, "reward": 0.986328125, "reward_std": 0.3874545693397522, "rewards/correctness_reward_func": 0.5234375, "rewards/strict_format_reward_func": 0.462890625, "step": 307 }, { "completion_length": 293.72265625, "epoch": 0.358347876672484, "grad_norm": 5.357708593785944, "kl": 0.15216064453125, "learning_rate": 7.42890281607093e-07, "loss": 0.0061, "reward": 0.8046875, "reward_std": 0.4407268315553665, "rewards/correctness_reward_func": 0.359375, "rewards/strict_format_reward_func": 0.4453125, "step": 308 }, { "completion_length": 279.83984375, "epoch": 0.35951134380453753, "grad_norm": 7.6975082051259065, "kl": 0.204345703125, "learning_rate": 7.412402953320564e-07, "loss": 0.0082, "reward": 0.8828125, "reward_std": 0.43373846262693405, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.453125, "step": 309 }, { "completion_length": 272.859375, "epoch": 0.36067481093659104, "grad_norm": 0.336168313417199, "kl": 0.05364990234375, "learning_rate": 7.395868777539463e-07, "loss": 0.0021, "reward": 0.8515625, "reward_std": 0.46646444499492645, "rewards/correctness_reward_func": 0.3984375, "rewards/strict_format_reward_func": 0.453125, "step": 310 }, { "completion_length": 263.75, "epoch": 0.36183827806864455, "grad_norm": 1.2413743801727888, "kl": 0.084716796875, "learning_rate": 7.379300523902966e-07, "loss": 0.0034, "reward": 0.85546875, "reward_std": 0.2863091342151165, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.44921875, "step": 311 }, { "completion_length": 281.5234375, "epoch": 0.36300174520069806, "grad_norm": 0.3021891999645089, "kl": 0.101806640625, "learning_rate": 7.362698428071119e-07, "loss": 0.0041, "reward": 0.876953125, "reward_std": 0.4422207400202751, "rewards/correctness_reward_func": 0.421875, "rewards/strict_format_reward_func": 0.455078125, "step": 312 }, { "completion_length": 316.02734375, "epoch": 0.3641652123327516, "grad_norm": 2.655774036031787, "kl": 0.0721435546875, "learning_rate": 7.346062726185331e-07, "loss": 0.0029, "reward": 0.953125, "reward_std": 0.5705959796905518, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.4453125, "step": 313 }, { "completion_length": 267.1328125, "epoch": 0.36532867946480513, "grad_norm": 1.4298633836024373, "kl": 0.062255859375, "learning_rate": 7.329393654865003e-07, "loss": 0.0025, "reward": 0.83984375, "reward_std": 0.4409986585378647, "rewards/correctness_reward_func": 0.390625, "rewards/strict_format_reward_func": 0.44921875, "step": 314 }, { "completion_length": 302.203125, "epoch": 0.36649214659685864, "grad_norm": 1.6922942167009412, "kl": 0.0684814453125, "learning_rate": 7.312691451204177e-07, "loss": 0.0027, "reward": 0.888671875, "reward_std": 0.44465211778879166, "rewards/correctness_reward_func": 0.421875, "rewards/strict_format_reward_func": 0.466796875, "step": 315 }, { "completion_length": 328.37109375, "epoch": 0.36765561372891215, "grad_norm": 0.4767717665262143, "kl": 0.07391357421875, "learning_rate": 7.295956352768146e-07, "loss": 0.003, "reward": 0.8828125, "reward_std": 0.3359094634652138, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.453125, "step": 316 }, { "completion_length": 331.06640625, "epoch": 0.36881908086096565, "grad_norm": 0.6343187482121538, "kl": 0.0592041015625, "learning_rate": 7.279188597590096e-07, "loss": 0.0024, "reward": 0.8515625, "reward_std": 0.4217455983161926, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.4453125, "step": 317 }, { "completion_length": 250.39453125, "epoch": 0.3699825479930192, "grad_norm": 0.3529589969084002, "kl": 0.071044921875, "learning_rate": 7.262388424167696e-07, "loss": 0.0028, "reward": 0.8984375, "reward_std": 0.41545209661126137, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.46875, "step": 318 }, { "completion_length": 278.3515625, "epoch": 0.3711460151250727, "grad_norm": 2.472508242096947, "kl": 0.0982666015625, "learning_rate": 7.245556071459734e-07, "loss": 0.0039, "reward": 0.943359375, "reward_std": 0.3968650698661804, "rewards/correctness_reward_func": 0.4765625, "rewards/strict_format_reward_func": 0.466796875, "step": 319 }, { "completion_length": 270.5703125, "epoch": 0.37230948225712623, "grad_norm": 2.686926977747703, "kl": 0.2490234375, "learning_rate": 7.228691778882692e-07, "loss": 0.01, "reward": 0.931640625, "reward_std": 0.47404584288597107, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.470703125, "step": 320 }, { "completion_length": 255.34765625, "epoch": 0.37347294938917974, "grad_norm": 2.99307737449873, "kl": 0.05657958984375, "learning_rate": 7.211795786307352e-07, "loss": 0.0023, "reward": 1.064453125, "reward_std": 0.4582252502441406, "rewards/correctness_reward_func": 0.59375, "rewards/strict_format_reward_func": 0.470703125, "step": 321 }, { "completion_length": 328.37890625, "epoch": 0.37463641652123325, "grad_norm": 2.551215435224044, "kl": 0.1298828125, "learning_rate": 7.19486833405539e-07, "loss": 0.0052, "reward": 0.78515625, "reward_std": 0.39224421232938766, "rewards/correctness_reward_func": 0.3359375, "rewards/strict_format_reward_func": 0.44921875, "step": 322 }, { "completion_length": 287.5078125, "epoch": 0.3757998836532868, "grad_norm": 21.199099089340812, "kl": 0.093017578125, "learning_rate": 7.177909662895948e-07, "loss": 0.0037, "reward": 1.064453125, "reward_std": 0.5519165471196175, "rewards/correctness_reward_func": 0.59375, "rewards/strict_format_reward_func": 0.470703125, "step": 323 }, { "completion_length": 307.87890625, "epoch": 0.3769633507853403, "grad_norm": 0.36035536699172344, "kl": 0.0697021484375, "learning_rate": 7.160920014042211e-07, "loss": 0.0028, "reward": 0.849609375, "reward_std": 0.41664647683501244, "rewards/correctness_reward_func": 0.3984375, "rewards/strict_format_reward_func": 0.451171875, "step": 324 }, { "completion_length": 283.99609375, "epoch": 0.37812681791739383, "grad_norm": 1.207235159811352, "kl": 0.0706787109375, "learning_rate": 7.143899629147981e-07, "loss": 0.0028, "reward": 0.87890625, "reward_std": 0.3709994927048683, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.47265625, "step": 325 }, { "completion_length": 296.98828125, "epoch": 0.37929028504944734, "grad_norm": 0.9598405109051757, "kl": 0.07830810546875, "learning_rate": 7.126848750304237e-07, "loss": 0.0031, "reward": 0.935546875, "reward_std": 0.40895771980285645, "rewards/correctness_reward_func": 0.4765625, "rewards/strict_format_reward_func": 0.458984375, "step": 326 }, { "completion_length": 266.03515625, "epoch": 0.38045375218150085, "grad_norm": 1.3772341166194801, "kl": 0.097412109375, "learning_rate": 7.109767620035688e-07, "loss": 0.0039, "reward": 1.06640625, "reward_std": 0.4886847510933876, "rewards/correctness_reward_func": 0.6015625, "rewards/strict_format_reward_func": 0.46484375, "step": 327 }, { "completion_length": 266.921875, "epoch": 0.3816172193135544, "grad_norm": 0.4320806292365696, "kl": 0.1014404296875, "learning_rate": 7.092656481297331e-07, "loss": 0.0041, "reward": 1.0, "reward_std": 0.3651655316352844, "rewards/correctness_reward_func": 0.5234375, "rewards/strict_format_reward_func": 0.4765625, "step": 328 }, { "completion_length": 257.58984375, "epoch": 0.3827806864456079, "grad_norm": 4.068656740039866, "kl": 0.1064453125, "learning_rate": 7.07551557747099e-07, "loss": 0.0043, "reward": 0.93359375, "reward_std": 0.4508674889802933, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.48046875, "step": 329 }, { "completion_length": 291.984375, "epoch": 0.38394415357766143, "grad_norm": 1.2629650587217804, "kl": 0.0921630859375, "learning_rate": 7.058345152361851e-07, "loss": 0.0037, "reward": 0.84375, "reward_std": 0.415913924574852, "rewards/correctness_reward_func": 0.375, "rewards/strict_format_reward_func": 0.46875, "step": 330 }, { "completion_length": 253.37109375, "epoch": 0.38510762070971494, "grad_norm": 0.8995809103848483, "kl": 0.07159423828125, "learning_rate": 7.041145450195007e-07, "loss": 0.0029, "reward": 1.02734375, "reward_std": 0.5437674820423126, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.47265625, "step": 331 }, { "completion_length": 338.36328125, "epoch": 0.38627108784176845, "grad_norm": 0.8378342347040286, "kl": 0.0877685546875, "learning_rate": 7.023916715611968e-07, "loss": 0.0035, "reward": 0.74609375, "reward_std": 0.3441680669784546, "rewards/correctness_reward_func": 0.2734375, "rewards/strict_format_reward_func": 0.47265625, "step": 332 }, { "completion_length": 284.6015625, "epoch": 0.387434554973822, "grad_norm": 2.31360815468668, "kl": 0.0682373046875, "learning_rate": 7.006659193667195e-07, "loss": 0.0027, "reward": 0.96875, "reward_std": 0.5073177665472031, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.453125, "step": 333 }, { "completion_length": 292.765625, "epoch": 0.3885980221058755, "grad_norm": 4.3632220753584186, "kl": 0.052978515625, "learning_rate": 6.989373129824604e-07, "loss": 0.0021, "reward": 0.869140625, "reward_std": 0.33882325887680054, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.462890625, "step": 334 }, { "completion_length": 287.5234375, "epoch": 0.389761489237929, "grad_norm": 1.9923214109753067, "kl": 0.0863037109375, "learning_rate": 6.972058769954082e-07, "loss": 0.0035, "reward": 0.8984375, "reward_std": 0.3346223533153534, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.46875, "step": 335 }, { "completion_length": 285.7265625, "epoch": 0.39092495636998253, "grad_norm": 0.23461091590058605, "kl": 0.05474853515625, "learning_rate": 6.954716360327987e-07, "loss": 0.0022, "reward": 0.990234375, "reward_std": 0.3980119228363037, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.474609375, "step": 336 }, { "completion_length": 283.25390625, "epoch": 0.39208842350203604, "grad_norm": 4.617058862195678, "kl": 0.08746337890625, "learning_rate": 6.937346147617644e-07, "loss": 0.0035, "reward": 0.833984375, "reward_std": 0.3677009716629982, "rewards/correctness_reward_func": 0.359375, "rewards/strict_format_reward_func": 0.474609375, "step": 337 }, { "completion_length": 306.34765625, "epoch": 0.3932518906340896, "grad_norm": 0.6257497631604036, "kl": 0.08087158203125, "learning_rate": 6.919948378889838e-07, "loss": 0.0032, "reward": 1.01171875, "reward_std": 0.4332919120788574, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.45703125, "step": 338 }, { "completion_length": 296.1875, "epoch": 0.3944153577661431, "grad_norm": 4.139094610924436, "kl": 0.0716552734375, "learning_rate": 6.902523301603302e-07, "loss": 0.0029, "reward": 1.0546875, "reward_std": 0.5247118473052979, "rewards/correctness_reward_func": 0.5859375, "rewards/strict_format_reward_func": 0.46875, "step": 339 }, { "completion_length": 315.74609375, "epoch": 0.3955788248981966, "grad_norm": 1.4531866984187511, "kl": 0.1959228515625, "learning_rate": 6.885071163605189e-07, "loss": 0.0078, "reward": 0.916015625, "reward_std": 0.4672032818198204, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.455078125, "step": 340 }, { "completion_length": 267.40234375, "epoch": 0.39674229203025013, "grad_norm": 4.295801116482015, "kl": 0.255126953125, "learning_rate": 6.867592213127557e-07, "loss": 0.0102, "reward": 0.892578125, "reward_std": 0.4448869302868843, "rewards/correctness_reward_func": 0.4140625, "rewards/strict_format_reward_func": 0.478515625, "step": 341 }, { "completion_length": 300.69140625, "epoch": 0.39790575916230364, "grad_norm": 1.1895438599471349, "kl": 0.059814453125, "learning_rate": 6.850086698783835e-07, "loss": 0.0024, "reward": 0.958984375, "reward_std": 0.4693913161754608, "rewards/correctness_reward_func": 0.4921875, "rewards/strict_format_reward_func": 0.466796875, "step": 342 }, { "completion_length": 263.73046875, "epoch": 0.3990692262943572, "grad_norm": 73.91046284848984, "kl": 2.537109375, "learning_rate": 6.832554869565283e-07, "loss": 0.1012, "reward": 1.09765625, "reward_std": 0.5176768451929092, "rewards/correctness_reward_func": 0.6328125, "rewards/strict_format_reward_func": 0.46484375, "step": 343 }, { "completion_length": 336.515625, "epoch": 0.4002326934264107, "grad_norm": 4.680729319132339, "kl": 0.1307373046875, "learning_rate": 6.814996974837453e-07, "loss": 0.0052, "reward": 1.025390625, "reward_std": 0.526333324611187, "rewards/correctness_reward_func": 0.578125, "rewards/strict_format_reward_func": 0.447265625, "step": 344 }, { "completion_length": 295.015625, "epoch": 0.4013961605584642, "grad_norm": 6.036469995091413, "kl": 0.47979736328125, "learning_rate": 6.797413264336639e-07, "loss": 0.0192, "reward": 0.9921875, "reward_std": 0.5116188377141953, "rewards/correctness_reward_func": 0.53125, "rewards/strict_format_reward_func": 0.4609375, "step": 345 }, { "completion_length": 328.43359375, "epoch": 0.40255962769051773, "grad_norm": 486.30133585488943, "kl": 6.2724609375, "learning_rate": 6.779803988166336e-07, "loss": 0.2509, "reward": 0.87109375, "reward_std": 0.44400743395090103, "rewards/correctness_reward_func": 0.4140625, "rewards/strict_format_reward_func": 0.45703125, "step": 346 }, { "completion_length": 287.21484375, "epoch": 0.40372309482257124, "grad_norm": 154.33776176325156, "kl": 6.2060546875, "learning_rate": 6.762169396793669e-07, "loss": 0.2496, "reward": 0.87890625, "reward_std": 0.38374919444322586, "rewards/correctness_reward_func": 0.421875, "rewards/strict_format_reward_func": 0.45703125, "step": 347 }, { "completion_length": 383.50390625, "epoch": 0.4048865619546248, "grad_norm": 37.42405415072771, "kl": 2.22607421875, "learning_rate": 6.744509741045834e-07, "loss": 0.0892, "reward": 0.87890625, "reward_std": 0.4110454022884369, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.42578125, "step": 348 }, { "completion_length": 336.7109375, "epoch": 0.4060500290866783, "grad_norm": 108.03467717955267, "kl": 5.091064453125, "learning_rate": 6.726825272106538e-07, "loss": 0.2026, "reward": 0.923828125, "reward_std": 0.4757932648062706, "rewards/correctness_reward_func": 0.484375, "rewards/strict_format_reward_func": 0.439453125, "step": 349 }, { "completion_length": 258.1171875, "epoch": 0.4072134962187318, "grad_norm": 7.0192066805763575, "kl": 0.2354736328125, "learning_rate": 6.709116241512418e-07, "loss": 0.0094, "reward": 1.10546875, "reward_std": 0.47969014942646027, "rewards/correctness_reward_func": 0.640625, "rewards/strict_format_reward_func": 0.46484375, "step": 350 }, { "completion_length": 301.58203125, "epoch": 0.4083769633507853, "grad_norm": 29.956718885980813, "kl": 0.07318115234375, "learning_rate": 6.691382901149466e-07, "loss": 0.0029, "reward": 0.7890625, "reward_std": 0.31258631870150566, "rewards/correctness_reward_func": 0.3203125, "rewards/strict_format_reward_func": 0.46875, "step": 351 }, { "completion_length": 302.5, "epoch": 0.40954043048283884, "grad_norm": 0.6045757295720975, "kl": 0.078369140625, "learning_rate": 6.673625503249446e-07, "loss": 0.0031, "reward": 0.87890625, "reward_std": 0.34792211651802063, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.47265625, "step": 352 }, { "completion_length": 290.171875, "epoch": 0.4107038976148924, "grad_norm": 1.3494720327478775, "kl": 0.117431640625, "learning_rate": 6.655844300386307e-07, "loss": 0.0047, "reward": 0.96484375, "reward_std": 0.3973870575428009, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.45703125, "step": 353 }, { "completion_length": 294.5625, "epoch": 0.4118673647469459, "grad_norm": 3.027373038825209, "kl": 0.084228515625, "learning_rate": 6.638039545472589e-07, "loss": 0.0034, "reward": 0.962890625, "reward_std": 0.46778035163879395, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.462890625, "step": 354 }, { "completion_length": 318.7109375, "epoch": 0.4130308318789994, "grad_norm": 2.465870038521231, "kl": 0.1236572265625, "learning_rate": 6.62021149175583e-07, "loss": 0.0049, "reward": 1.013671875, "reward_std": 0.5298342257738113, "rewards/correctness_reward_func": 0.546875, "rewards/strict_format_reward_func": 0.466796875, "step": 355 }, { "completion_length": 340.37109375, "epoch": 0.4141942990110529, "grad_norm": 1.663606615690322, "kl": 0.07403564453125, "learning_rate": 6.602360392814954e-07, "loss": 0.003, "reward": 0.806640625, "reward_std": 0.3749229460954666, "rewards/correctness_reward_func": 0.3515625, "rewards/strict_format_reward_func": 0.455078125, "step": 356 }, { "completion_length": 323.73046875, "epoch": 0.41535776614310643, "grad_norm": 2.21234660035983, "kl": 0.228515625, "learning_rate": 6.584486502556679e-07, "loss": 0.0091, "reward": 0.85546875, "reward_std": 0.4007824957370758, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.44921875, "step": 357 }, { "completion_length": 304.79296875, "epoch": 0.41652123327516, "grad_norm": 1.4789340519359608, "kl": 0.0728759765625, "learning_rate": 6.56659007521189e-07, "loss": 0.0029, "reward": 0.9375, "reward_std": 0.36531662568449974, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.4765625, "step": 358 }, { "completion_length": 316.609375, "epoch": 0.4176847004072135, "grad_norm": 6.966041743056745, "kl": 0.25665283203125, "learning_rate": 6.548671365332036e-07, "loss": 0.0103, "reward": 0.80078125, "reward_std": 0.3164653554558754, "rewards/correctness_reward_func": 0.34375, "rewards/strict_format_reward_func": 0.45703125, "step": 359 }, { "completion_length": 356.16015625, "epoch": 0.418848167539267, "grad_norm": 401.69862466934006, "kl": 27.628662109375, "learning_rate": 6.530730627785499e-07, "loss": 1.1049, "reward": 0.865234375, "reward_std": 0.3799719959497452, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.458984375, "step": 360 }, { "completion_length": 298.40234375, "epoch": 0.4200116346713205, "grad_norm": 0.3008996194527992, "kl": 0.05902099609375, "learning_rate": 6.512768117753979e-07, "loss": 0.0024, "reward": 1.041015625, "reward_std": 0.530546247959137, "rewards/correctness_reward_func": 0.578125, "rewards/strict_format_reward_func": 0.462890625, "step": 361 }, { "completion_length": 242.09375, "epoch": 0.42117510180337403, "grad_norm": 2.203466870636346, "kl": 0.2286376953125, "learning_rate": 6.494784090728851e-07, "loss": 0.0092, "reward": 1.1171875, "reward_std": 0.4545508846640587, "rewards/correctness_reward_func": 0.640625, "rewards/strict_format_reward_func": 0.4765625, "step": 362 }, { "completion_length": 316.88671875, "epoch": 0.4223385689354276, "grad_norm": 0.40508537390077043, "kl": 0.09515380859375, "learning_rate": 6.476778802507549e-07, "loss": 0.0038, "reward": 0.85546875, "reward_std": 0.44702593982219696, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.44921875, "step": 363 }, { "completion_length": 308.7421875, "epoch": 0.4235020360674811, "grad_norm": 0.5807159725668048, "kl": 0.083740234375, "learning_rate": 6.458752509189908e-07, "loss": 0.0033, "reward": 1.060546875, "reward_std": 0.3982198238372803, "rewards/correctness_reward_func": 0.59375, "rewards/strict_format_reward_func": 0.466796875, "step": 364 }, { "completion_length": 294.26171875, "epoch": 0.4246655031995346, "grad_norm": 3.5228408503441244, "kl": 0.055419921875, "learning_rate": 6.440705467174536e-07, "loss": 0.0022, "reward": 0.9921875, "reward_std": 0.475242480635643, "rewards/correctness_reward_func": 0.546875, "rewards/strict_format_reward_func": 0.4453125, "step": 365 }, { "completion_length": 272.81640625, "epoch": 0.4258289703315881, "grad_norm": 2.617684986312585, "kl": 0.1051025390625, "learning_rate": 6.422637933155162e-07, "loss": 0.0042, "reward": 0.9140625, "reward_std": 0.5034167245030403, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.4609375, "step": 366 }, { "completion_length": 339.28515625, "epoch": 0.4269924374636416, "grad_norm": 0.264052304619519, "kl": 0.05450439453125, "learning_rate": 6.404550164116983e-07, "loss": 0.0022, "reward": 0.8828125, "reward_std": 0.5367200821638107, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.453125, "step": 367 }, { "completion_length": 297.54296875, "epoch": 0.4281559045956952, "grad_norm": 0.2540420723395575, "kl": 0.060791015625, "learning_rate": 6.386442417333011e-07, "loss": 0.0024, "reward": 0.875, "reward_std": 0.40020179748535156, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.4453125, "step": 368 }, { "completion_length": 313.375, "epoch": 0.4293193717277487, "grad_norm": 162.3259507292585, "kl": 2.98175048828125, "learning_rate": 6.368314950360415e-07, "loss": 0.119, "reward": 0.900390625, "reward_std": 0.4489917606115341, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.455078125, "step": 369 }, { "completion_length": 273.625, "epoch": 0.4304828388598022, "grad_norm": 0.4830388141059742, "kl": 0.06939697265625, "learning_rate": 6.350168021036852e-07, "loss": 0.0028, "reward": 1.1484375, "reward_std": 0.46489571034908295, "rewards/correctness_reward_func": 0.6796875, "rewards/strict_format_reward_func": 0.46875, "step": 370 }, { "completion_length": 321.390625, "epoch": 0.4316463059918557, "grad_norm": 8.52264845976738, "kl": 0.25244140625, "learning_rate": 6.33200188747681e-07, "loss": 0.0102, "reward": 0.9140625, "reward_std": 0.4011785313487053, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.4609375, "step": 371 }, { "completion_length": 359.1640625, "epoch": 0.4328097731239092, "grad_norm": 11.86514432387896, "kl": 0.363037109375, "learning_rate": 6.313816808067921e-07, "loss": 0.0146, "reward": 0.826171875, "reward_std": 0.46916285157203674, "rewards/correctness_reward_func": 0.375, "rewards/strict_format_reward_func": 0.451171875, "step": 372 }, { "completion_length": 301.23828125, "epoch": 0.4339732402559628, "grad_norm": 0.4691828608962919, "kl": 0.0726318359375, "learning_rate": 6.295613041467306e-07, "loss": 0.0029, "reward": 0.947265625, "reward_std": 0.44325540214776993, "rewards/correctness_reward_func": 0.4921875, "rewards/strict_format_reward_func": 0.455078125, "step": 373 }, { "completion_length": 303.23046875, "epoch": 0.4351367073880163, "grad_norm": 0.8583196693880277, "kl": 0.06494140625, "learning_rate": 6.277390846597875e-07, "loss": 0.0026, "reward": 0.8828125, "reward_std": 0.33091457933187485, "rewards/correctness_reward_func": 0.421875, "rewards/strict_format_reward_func": 0.4609375, "step": 374 }, { "completion_length": 360.6796875, "epoch": 0.4363001745200698, "grad_norm": 2.748050483908299, "kl": 0.18170166015625, "learning_rate": 6.25915048264466e-07, "loss": 0.0073, "reward": 0.927734375, "reward_std": 0.4525812417268753, "rewards/correctness_reward_func": 0.484375, "rewards/strict_format_reward_func": 0.443359375, "step": 375 }, { "completion_length": 309.7734375, "epoch": 0.4374636416521233, "grad_norm": 8.114908690781276, "kl": 0.2568359375, "learning_rate": 6.240892209051119e-07, "loss": 0.0103, "reward": 1.056640625, "reward_std": 0.45491015911102295, "rewards/correctness_reward_func": 0.609375, "rewards/strict_format_reward_func": 0.447265625, "step": 376 }, { "completion_length": 320.3125, "epoch": 0.4386271087841768, "grad_norm": 7.430330193815299, "kl": 1.0621337890625, "learning_rate": 6.222616285515455e-07, "loss": 0.0424, "reward": 1.0859375, "reward_std": 0.5184299722313881, "rewards/correctness_reward_func": 0.6328125, "rewards/strict_format_reward_func": 0.453125, "step": 377 }, { "completion_length": 310.64453125, "epoch": 0.4397905759162304, "grad_norm": 177.08020141837235, "kl": 0.979248046875, "learning_rate": 6.204322971986909e-07, "loss": 0.0391, "reward": 0.90625, "reward_std": 0.48444219678640366, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.453125, "step": 378 }, { "completion_length": 291.13671875, "epoch": 0.4409540430482839, "grad_norm": 1.5196913588642722, "kl": 0.1044921875, "learning_rate": 6.186012528662076e-07, "loss": 0.0042, "reward": 0.923828125, "reward_std": 0.42555277049541473, "rewards/correctness_reward_func": 0.4765625, "rewards/strict_format_reward_func": 0.447265625, "step": 379 }, { "completion_length": 301.5390625, "epoch": 0.4421175101803374, "grad_norm": 8.472221811733819, "kl": 0.68304443359375, "learning_rate": 6.167685215981198e-07, "loss": 0.0273, "reward": 0.99609375, "reward_std": 0.40058234333992004, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.45703125, "step": 380 }, { "completion_length": 291.40625, "epoch": 0.4432809773123909, "grad_norm": 1.6225883424891956, "kl": 0.18682861328125, "learning_rate": 6.149341294624455e-07, "loss": 0.0074, "reward": 0.935546875, "reward_std": 0.3684495612978935, "rewards/correctness_reward_func": 0.484375, "rewards/strict_format_reward_func": 0.451171875, "step": 381 }, { "completion_length": 353.41796875, "epoch": 0.4444444444444444, "grad_norm": 4.9438686730768655, "kl": 0.2646484375, "learning_rate": 6.130981025508265e-07, "loss": 0.0106, "reward": 0.83984375, "reward_std": 0.34920288622379303, "rewards/correctness_reward_func": 0.390625, "rewards/strict_format_reward_func": 0.44921875, "step": 382 }, { "completion_length": 294.0234375, "epoch": 0.445607911576498, "grad_norm": 9.725534621456799, "kl": 0.2392578125, "learning_rate": 6.112604669781572e-07, "loss": 0.0096, "reward": 0.998046875, "reward_std": 0.45386840403079987, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.458984375, "step": 383 }, { "completion_length": 250.5625, "epoch": 0.4467713787085515, "grad_norm": 1.229405341288895, "kl": 0.1507568359375, "learning_rate": 6.094212488822125e-07, "loss": 0.006, "reward": 1.07421875, "reward_std": 0.4449135288596153, "rewards/correctness_reward_func": 0.609375, "rewards/strict_format_reward_func": 0.46484375, "step": 384 }, { "completion_length": 236.640625, "epoch": 0.447934845840605, "grad_norm": 767.0365941199435, "kl": 15.11199951171875, "learning_rate": 6.075804744232769e-07, "loss": 0.6046, "reward": 1.0859375, "reward_std": 0.39552851021289825, "rewards/correctness_reward_func": 0.625, "rewards/strict_format_reward_func": 0.4609375, "step": 385 }, { "completion_length": 339.96875, "epoch": 0.4490983129726585, "grad_norm": 3.1586317511791666, "kl": 0.10491943359375, "learning_rate": 6.057381697837715e-07, "loss": 0.0042, "reward": 1.01953125, "reward_std": 0.43619707971811295, "rewards/correctness_reward_func": 0.578125, "rewards/strict_format_reward_func": 0.44140625, "step": 386 }, { "completion_length": 305.3203125, "epoch": 0.450261780104712, "grad_norm": 2.2174074170945963, "kl": 0.1192626953125, "learning_rate": 6.038943611678824e-07, "loss": 0.0048, "reward": 1.04296875, "reward_std": 0.4128425717353821, "rewards/correctness_reward_func": 0.59375, "rewards/strict_format_reward_func": 0.44921875, "step": 387 }, { "completion_length": 311.61328125, "epoch": 0.4514252472367656, "grad_norm": 2.1136172190869167, "kl": 0.0699462890625, "learning_rate": 6.020490748011877e-07, "loss": 0.0028, "reward": 0.84375, "reward_std": 0.4692077562212944, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.4375, "step": 388 }, { "completion_length": 310.796875, "epoch": 0.4525887143688191, "grad_norm": 52.72235107687105, "kl": 2.4122314453125, "learning_rate": 6.002023369302841e-07, "loss": 0.0963, "reward": 0.978515625, "reward_std": 0.47980162501335144, "rewards/correctness_reward_func": 0.5234375, "rewards/strict_format_reward_func": 0.455078125, "step": 389 }, { "completion_length": 282.51171875, "epoch": 0.4537521815008726, "grad_norm": 0.5813593535047655, "kl": 0.0821533203125, "learning_rate": 5.98354173822414e-07, "loss": 0.0033, "reward": 0.916015625, "reward_std": 0.45927415788173676, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.455078125, "step": 390 }, { "completion_length": 310.62890625, "epoch": 0.4549156486329261, "grad_norm": 0.564718122615681, "kl": 0.08038330078125, "learning_rate": 5.965046117650923e-07, "loss": 0.0032, "reward": 0.90234375, "reward_std": 0.4799669235944748, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.45703125, "step": 391 }, { "completion_length": 362.08984375, "epoch": 0.4560791157649796, "grad_norm": 1.32993502681409, "kl": 0.0748291015625, "learning_rate": 5.946536770657309e-07, "loss": 0.003, "reward": 0.87109375, "reward_std": 0.39080600440502167, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.44140625, "step": 392 }, { "completion_length": 404.21484375, "epoch": 0.4572425828970332, "grad_norm": 5.073118486772105, "kl": 0.09478759765625, "learning_rate": 5.928013960512668e-07, "loss": 0.0038, "reward": 0.90234375, "reward_std": 0.5122874453663826, "rewards/correctness_reward_func": 0.46875, "rewards/strict_format_reward_func": 0.43359375, "step": 393 }, { "completion_length": 333.53125, "epoch": 0.4584060500290867, "grad_norm": 0.6719568069575846, "kl": 0.05718994140625, "learning_rate": 5.909477950677856e-07, "loss": 0.0023, "reward": 0.978515625, "reward_std": 0.4164212718605995, "rewards/correctness_reward_func": 0.5234375, "rewards/strict_format_reward_func": 0.455078125, "step": 394 }, { "completion_length": 303.9765625, "epoch": 0.4595695171611402, "grad_norm": 13.406291564535044, "kl": 0.5201416015625, "learning_rate": 5.890929004801479e-07, "loss": 0.0208, "reward": 0.896484375, "reward_std": 0.3555869460105896, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.451171875, "step": 395 }, { "completion_length": 329.015625, "epoch": 0.4607329842931937, "grad_norm": 4.4148343440026485, "kl": 0.08428955078125, "learning_rate": 5.87236738671614e-07, "loss": 0.0034, "reward": 0.80859375, "reward_std": 0.49852630496025085, "rewards/correctness_reward_func": 0.3671875, "rewards/strict_format_reward_func": 0.44140625, "step": 396 }, { "completion_length": 324.203125, "epoch": 0.4618964514252472, "grad_norm": 4.648521114269383, "kl": 0.13623046875, "learning_rate": 5.853793360434687e-07, "loss": 0.0055, "reward": 0.98828125, "reward_std": 0.48889076709747314, "rewards/correctness_reward_func": 0.546875, "rewards/strict_format_reward_func": 0.44140625, "step": 397 }, { "completion_length": 346.81640625, "epoch": 0.4630599185573008, "grad_norm": 3.634456828522352, "kl": 0.283203125, "learning_rate": 5.835207190146456e-07, "loss": 0.0113, "reward": 0.880859375, "reward_std": 0.464506097137928, "rewards/correctness_reward_func": 0.421875, "rewards/strict_format_reward_func": 0.458984375, "step": 398 }, { "completion_length": 278.3046875, "epoch": 0.4642233856893543, "grad_norm": 0.3585571106029912, "kl": 0.06488037109375, "learning_rate": 5.816609140213512e-07, "loss": 0.0026, "reward": 0.849609375, "reward_std": 0.43717610090970993, "rewards/correctness_reward_func": 0.390625, "rewards/strict_format_reward_func": 0.458984375, "step": 399 }, { "completion_length": 363.80078125, "epoch": 0.4653868528214078, "grad_norm": 28.531558319570884, "kl": 0.533935546875, "learning_rate": 5.797999475166896e-07, "loss": 0.0214, "reward": 0.87890625, "reward_std": 0.4710187017917633, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.42578125, "step": 400 }, { "completion_length": 292.96484375, "epoch": 0.4665503199534613, "grad_norm": 2.6676429025884767, "kl": 0.15399169921875, "learning_rate": 5.779378459702855e-07, "loss": 0.0061, "reward": 0.869140625, "reward_std": 0.40309911221265793, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.462890625, "step": 401 }, { "completion_length": 362.390625, "epoch": 0.4677137870855148, "grad_norm": 0.49641621730261415, "kl": 0.069091796875, "learning_rate": 5.760746358679079e-07, "loss": 0.0028, "reward": 0.939453125, "reward_std": 0.3901231810450554, "rewards/correctness_reward_func": 0.4765625, "rewards/strict_format_reward_func": 0.462890625, "step": 402 }, { "completion_length": 255.140625, "epoch": 0.4688772542175684, "grad_norm": 0.8385768237990343, "kl": 0.0543212890625, "learning_rate": 5.742103437110937e-07, "loss": 0.0022, "reward": 1.033203125, "reward_std": 0.4182577580213547, "rewards/correctness_reward_func": 0.5703125, "rewards/strict_format_reward_func": 0.462890625, "step": 403 }, { "completion_length": 292.5703125, "epoch": 0.4700407213496219, "grad_norm": 1.6455144920059204, "kl": 0.06744384765625, "learning_rate": 5.723449960167703e-07, "loss": 0.0027, "reward": 0.955078125, "reward_std": 0.503605879843235, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.439453125, "step": 404 }, { "completion_length": 299.74609375, "epoch": 0.4712041884816754, "grad_norm": 7.276242829061657, "kl": 0.135009765625, "learning_rate": 5.704786193168784e-07, "loss": 0.0054, "reward": 1.015625, "reward_std": 0.35178013145923615, "rewards/correctness_reward_func": 0.5703125, "rewards/strict_format_reward_func": 0.4453125, "step": 405 }, { "completion_length": 310.5546875, "epoch": 0.4723676556137289, "grad_norm": 1.434406403970823, "kl": 0.1162109375, "learning_rate": 5.686112401579955e-07, "loss": 0.0047, "reward": 1.021484375, "reward_std": 0.4066329449415207, "rewards/correctness_reward_func": 0.578125, "rewards/strict_format_reward_func": 0.443359375, "step": 406 }, { "completion_length": 334.7421875, "epoch": 0.4735311227457824, "grad_norm": 1.3760348483259057, "kl": 0.0770263671875, "learning_rate": 5.667428851009572e-07, "loss": 0.0031, "reward": 0.9296875, "reward_std": 0.5173530131578445, "rewards/correctness_reward_func": 0.484375, "rewards/strict_format_reward_func": 0.4453125, "step": 407 }, { "completion_length": 322.8515625, "epoch": 0.47469458987783597, "grad_norm": 7.438689931460173, "kl": 0.1707763671875, "learning_rate": 5.648735807204799e-07, "loss": 0.0069, "reward": 0.89453125, "reward_std": 0.3707299157977104, "rewards/correctness_reward_func": 0.4375, "rewards/strict_format_reward_func": 0.45703125, "step": 408 }, { "completion_length": 254.0, "epoch": 0.4758580570098895, "grad_norm": 0.6429987727975838, "kl": 0.0975341796875, "learning_rate": 5.630033536047829e-07, "loss": 0.0039, "reward": 1.03125, "reward_std": 0.43288323283195496, "rewards/correctness_reward_func": 0.5625, "rewards/strict_format_reward_func": 0.46875, "step": 409 }, { "completion_length": 324.8359375, "epoch": 0.477021524141943, "grad_norm": 2.480657201851094, "kl": 0.1240234375, "learning_rate": 5.611322303552101e-07, "loss": 0.005, "reward": 0.841796875, "reward_std": 0.44687486439943314, "rewards/correctness_reward_func": 0.390625, "rewards/strict_format_reward_func": 0.451171875, "step": 410 }, { "completion_length": 332.86328125, "epoch": 0.4781849912739965, "grad_norm": 0.3444014768733152, "kl": 0.05511474609375, "learning_rate": 5.592602375858513e-07, "loss": 0.0022, "reward": 0.796875, "reward_std": 0.3020646646618843, "rewards/correctness_reward_func": 0.3515625, "rewards/strict_format_reward_func": 0.4453125, "step": 411 }, { "completion_length": 301.046875, "epoch": 0.47934845840605, "grad_norm": 1.140373545574664, "kl": 0.06671142578125, "learning_rate": 5.573874019231647e-07, "loss": 0.0027, "reward": 1.015625, "reward_std": 0.3652488887310028, "rewards/correctness_reward_func": 0.5625, "rewards/strict_format_reward_func": 0.453125, "step": 412 }, { "completion_length": 301.7890625, "epoch": 0.48051192553810357, "grad_norm": 1.638173172126681, "kl": 0.11962890625, "learning_rate": 5.555137500055971e-07, "loss": 0.0048, "reward": 1.033203125, "reward_std": 0.43288997933268547, "rewards/correctness_reward_func": 0.578125, "rewards/strict_format_reward_func": 0.455078125, "step": 413 }, { "completion_length": 319.25, "epoch": 0.4816753926701571, "grad_norm": 3.4913674145713824, "kl": 0.15203857421875, "learning_rate": 5.536393084832049e-07, "loss": 0.0061, "reward": 0.849609375, "reward_std": 0.3662107475101948, "rewards/correctness_reward_func": 0.3828125, "rewards/strict_format_reward_func": 0.466796875, "step": 414 }, { "completion_length": 311.15234375, "epoch": 0.4828388598022106, "grad_norm": 0.6463617936977485, "kl": 0.0892333984375, "learning_rate": 5.517641040172763e-07, "loss": 0.0036, "reward": 0.896484375, "reward_std": 0.39099887758493423, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.451171875, "step": 415 }, { "completion_length": 293.93359375, "epoch": 0.4840023269342641, "grad_norm": 0.3772566583562422, "kl": 0.05682373046875, "learning_rate": 5.49888163279951e-07, "loss": 0.0023, "reward": 0.9921875, "reward_std": 0.5062239915132523, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.453125, "step": 416 }, { "completion_length": 320.78125, "epoch": 0.4851657940663176, "grad_norm": 1.410787743709544, "kl": 0.0819091796875, "learning_rate": 5.480115129538409e-07, "loss": 0.0033, "reward": 0.978515625, "reward_std": 0.5293768420815468, "rewards/correctness_reward_func": 0.53125, "rewards/strict_format_reward_func": 0.447265625, "step": 417 }, { "completion_length": 267.73828125, "epoch": 0.48632926119837117, "grad_norm": 12.315853680412278, "kl": 0.160888671875, "learning_rate": 5.46134179731651e-07, "loss": 0.0064, "reward": 1.033203125, "reward_std": 0.4649043008685112, "rewards/correctness_reward_func": 0.5703125, "rewards/strict_format_reward_func": 0.462890625, "step": 418 }, { "completion_length": 290.37890625, "epoch": 0.4874927283304247, "grad_norm": 1.5998395006610135, "kl": 0.1121826171875, "learning_rate": 5.442561903157995e-07, "loss": 0.0045, "reward": 0.9140625, "reward_std": 0.36004550382494926, "rewards/correctness_reward_func": 0.46875, "rewards/strict_format_reward_func": 0.4453125, "step": 419 }, { "completion_length": 308.48828125, "epoch": 0.4886561954624782, "grad_norm": 0.5630839156691942, "kl": 0.1064453125, "learning_rate": 5.423775714180382e-07, "loss": 0.0043, "reward": 0.87890625, "reward_std": 0.4294242113828659, "rewards/correctness_reward_func": 0.421875, "rewards/strict_format_reward_func": 0.45703125, "step": 420 }, { "completion_length": 280.94140625, "epoch": 0.4898196625945317, "grad_norm": 3.9641698260470375, "kl": 0.0775146484375, "learning_rate": 5.404983497590721e-07, "loss": 0.0031, "reward": 0.884765625, "reward_std": 0.39371785894036293, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.455078125, "step": 421 }, { "completion_length": 294.078125, "epoch": 0.4909831297265852, "grad_norm": 9.043777921571046, "kl": 0.103515625, "learning_rate": 5.386185520681798e-07, "loss": 0.0041, "reward": 0.91796875, "reward_std": 0.3717608377337456, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.45703125, "step": 422 }, { "completion_length": 311.29296875, "epoch": 0.49214659685863876, "grad_norm": 0.27050064480184305, "kl": 0.06707763671875, "learning_rate": 5.367382050828329e-07, "loss": 0.0027, "reward": 1.015625, "reward_std": 0.5074534937739372, "rewards/correctness_reward_func": 0.5625, "rewards/strict_format_reward_func": 0.453125, "step": 423 }, { "completion_length": 339.5078125, "epoch": 0.49331006399069227, "grad_norm": 2.9038494674421216, "kl": 0.0753173828125, "learning_rate": 5.348573355483166e-07, "loss": 0.003, "reward": 0.990234375, "reward_std": 0.5217199623584747, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.451171875, "step": 424 }, { "completion_length": 363.34375, "epoch": 0.4944735311227458, "grad_norm": 2.8503473658455447, "kl": 0.10009765625, "learning_rate": 5.329759702173476e-07, "loss": 0.004, "reward": 1.001953125, "reward_std": 0.43435803055763245, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.447265625, "step": 425 }, { "completion_length": 294.34765625, "epoch": 0.4956369982547993, "grad_norm": 5.512156476465259, "kl": 0.189208984375, "learning_rate": 5.310941358496958e-07, "loss": 0.0076, "reward": 1.03125, "reward_std": 0.564673013985157, "rewards/correctness_reward_func": 0.5703125, "rewards/strict_format_reward_func": 0.4609375, "step": 426 }, { "completion_length": 303.1640625, "epoch": 0.4968004653868528, "grad_norm": 10.449790800159649, "kl": 0.21466064453125, "learning_rate": 5.292118592118012e-07, "loss": 0.0086, "reward": 0.9140625, "reward_std": 0.4374491199851036, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.4609375, "step": 427 }, { "completion_length": 312.53515625, "epoch": 0.49796393251890636, "grad_norm": 0.5620885568871857, "kl": 0.0740966796875, "learning_rate": 5.273291670763957e-07, "loss": 0.003, "reward": 1.03515625, "reward_std": 0.5012250021100044, "rewards/correctness_reward_func": 0.5703125, "rewards/strict_format_reward_func": 0.46484375, "step": 428 }, { "completion_length": 303.296875, "epoch": 0.49912739965095987, "grad_norm": 3.4388668931417596, "kl": 0.07611083984375, "learning_rate": 5.254460862221202e-07, "loss": 0.0031, "reward": 1.04296875, "reward_std": 0.45925281941890717, "rewards/correctness_reward_func": 0.5859375, "rewards/strict_format_reward_func": 0.45703125, "step": 429 }, { "completion_length": 273.765625, "epoch": 0.5002908667830134, "grad_norm": 0.21652473414282541, "kl": 0.07037353515625, "learning_rate": 5.23562643433145e-07, "loss": 0.0028, "reward": 1.0625, "reward_std": 0.48165448009967804, "rewards/correctness_reward_func": 0.609375, "rewards/strict_format_reward_func": 0.453125, "step": 430 }, { "completion_length": 264.4921875, "epoch": 0.5014543339150669, "grad_norm": 5.753162525298037, "kl": 0.165771484375, "learning_rate": 5.216788654987881e-07, "loss": 0.0066, "reward": 0.931640625, "reward_std": 0.4870760515332222, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.470703125, "step": 431 }, { "completion_length": 299.109375, "epoch": 0.5026178010471204, "grad_norm": 0.22642741914275885, "kl": 0.0606689453125, "learning_rate": 5.197947792131348e-07, "loss": 0.0024, "reward": 1.0390625, "reward_std": 0.4534728229045868, "rewards/correctness_reward_func": 0.5859375, "rewards/strict_format_reward_func": 0.453125, "step": 432 }, { "completion_length": 281.42578125, "epoch": 0.5037812681791739, "grad_norm": 0.641640588414818, "kl": 0.06524658203125, "learning_rate": 5.179104113746559e-07, "loss": 0.0026, "reward": 0.8984375, "reward_std": 0.3567136079072952, "rewards/correctness_reward_func": 0.4375, "rewards/strict_format_reward_func": 0.4609375, "step": 433 }, { "completion_length": 277.52734375, "epoch": 0.5049447353112274, "grad_norm": 0.8235859352296484, "kl": 0.1002197265625, "learning_rate": 5.160257887858277e-07, "loss": 0.004, "reward": 1.1015625, "reward_std": 0.5091729164123535, "rewards/correctness_reward_func": 0.6328125, "rewards/strict_format_reward_func": 0.46875, "step": 434 }, { "completion_length": 329.9375, "epoch": 0.506108202443281, "grad_norm": 2.147763272129784, "kl": 0.0938720703125, "learning_rate": 5.141409382527486e-07, "loss": 0.0038, "reward": 0.8828125, "reward_std": 0.42651040852069855, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.4375, "step": 435 }, { "completion_length": 329.62890625, "epoch": 0.5072716695753345, "grad_norm": 0.5438196282420968, "kl": 0.078857421875, "learning_rate": 5.122558865847606e-07, "loss": 0.0032, "reward": 0.91015625, "reward_std": 0.3513568602502346, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.45703125, "step": 436 }, { "completion_length": 282.29296875, "epoch": 0.508435136707388, "grad_norm": 3.0259543137829876, "kl": 0.148193359375, "learning_rate": 5.103706605940654e-07, "loss": 0.0059, "reward": 0.943359375, "reward_std": 0.4452047646045685, "rewards/correctness_reward_func": 0.484375, "rewards/strict_format_reward_func": 0.458984375, "step": 437 }, { "completion_length": 309.7734375, "epoch": 0.5095986038394416, "grad_norm": 1.9424934594165677, "kl": 0.10552978515625, "learning_rate": 5.084852870953452e-07, "loss": 0.0042, "reward": 1.0234375, "reward_std": 0.5030554793775082, "rewards/correctness_reward_func": 0.5859375, "rewards/strict_format_reward_func": 0.4375, "step": 438 }, { "completion_length": 350.74609375, "epoch": 0.5107620709714951, "grad_norm": 0.37379037433480744, "kl": 0.0599365234375, "learning_rate": 5.065997929053795e-07, "loss": 0.0024, "reward": 0.98828125, "reward_std": 0.5056476891040802, "rewards/correctness_reward_func": 0.5234375, "rewards/strict_format_reward_func": 0.46484375, "step": 439 }, { "completion_length": 264.26171875, "epoch": 0.5119255381035486, "grad_norm": 0.5558696968004029, "kl": 0.063720703125, "learning_rate": 5.047142048426648e-07, "loss": 0.0026, "reward": 0.916015625, "reward_std": 0.4817412793636322, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.462890625, "step": 440 }, { "completion_length": 271.04296875, "epoch": 0.5130890052356021, "grad_norm": 1.5654617677454246, "kl": 0.0906982421875, "learning_rate": 5.028285497270328e-07, "loss": 0.0036, "reward": 1.00390625, "reward_std": 0.46462948620319366, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.44921875, "step": 441 }, { "completion_length": 318.265625, "epoch": 0.5142524723676556, "grad_norm": 1386.5139722660963, "kl": 22.45867919921875, "learning_rate": 5.00942854379269e-07, "loss": 0.8975, "reward": 1.13671875, "reward_std": 0.6208238005638123, "rewards/correctness_reward_func": 0.6640625, "rewards/strict_format_reward_func": 0.47265625, "step": 442 }, { "completion_length": 320.33984375, "epoch": 0.5154159394997091, "grad_norm": 0.42811510504973976, "kl": 0.06072998046875, "learning_rate": 4.99057145620731e-07, "loss": 0.0024, "reward": 0.779296875, "reward_std": 0.3708675317466259, "rewards/correctness_reward_func": 0.34375, "rewards/strict_format_reward_func": 0.435546875, "step": 443 }, { "completion_length": 332.6484375, "epoch": 0.5165794066317626, "grad_norm": 5.767041875621239, "kl": 0.0621337890625, "learning_rate": 4.971714502729672e-07, "loss": 0.0025, "reward": 1.02734375, "reward_std": 0.491852805018425, "rewards/correctness_reward_func": 0.578125, "rewards/strict_format_reward_func": 0.44921875, "step": 444 }, { "completion_length": 306.2421875, "epoch": 0.5177428737638162, "grad_norm": 0.38326169048660724, "kl": 0.07586669921875, "learning_rate": 4.952857951573353e-07, "loss": 0.003, "reward": 0.93359375, "reward_std": 0.46532756835222244, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.43359375, "step": 445 }, { "completion_length": 288.8984375, "epoch": 0.5189063408958697, "grad_norm": 6.452710878023263, "kl": 0.143310546875, "learning_rate": 4.934002070946206e-07, "loss": 0.0057, "reward": 0.974609375, "reward_std": 0.41858839988708496, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.466796875, "step": 446 }, { "completion_length": 288.546875, "epoch": 0.5200698080279232, "grad_norm": 7.489115658836143, "kl": 0.08538818359375, "learning_rate": 4.915147129046548e-07, "loss": 0.0034, "reward": 0.93359375, "reward_std": 0.41533728688955307, "rewards/correctness_reward_func": 0.46875, "rewards/strict_format_reward_func": 0.46484375, "step": 447 }, { "completion_length": 358.33203125, "epoch": 0.5212332751599767, "grad_norm": 1.1162067247203227, "kl": 0.0811767578125, "learning_rate": 4.896293394059345e-07, "loss": 0.0033, "reward": 0.84375, "reward_std": 0.40488430112600327, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.4375, "step": 448 }, { "completion_length": 314.9921875, "epoch": 0.5223967422920303, "grad_norm": 22.97188896639372, "kl": 0.46563720703125, "learning_rate": 4.877441134152395e-07, "loss": 0.0186, "reward": 0.869140625, "reward_std": 0.397512748837471, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.439453125, "step": 449 }, { "completion_length": 291.59375, "epoch": 0.5235602094240838, "grad_norm": 0.5696022951369997, "kl": 0.06829833984375, "learning_rate": 4.858590617472514e-07, "loss": 0.0027, "reward": 1.017578125, "reward_std": 0.3987056389451027, "rewards/correctness_reward_func": 0.5703125, "rewards/strict_format_reward_func": 0.447265625, "step": 450 }, { "completion_length": 267.73828125, "epoch": 0.5247236765561373, "grad_norm": 0.6466191942144175, "kl": 0.06671142578125, "learning_rate": 4.839742112141724e-07, "loss": 0.0027, "reward": 0.96484375, "reward_std": 0.4528322294354439, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.46484375, "step": 451 }, { "completion_length": 341.46484375, "epoch": 0.5258871436881908, "grad_norm": 1.2401600639138002, "kl": 0.10247802734375, "learning_rate": 4.820895886253439e-07, "loss": 0.0041, "reward": 0.87890625, "reward_std": 0.3971017822623253, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.43359375, "step": 452 }, { "completion_length": 305.20703125, "epoch": 0.5270506108202443, "grad_norm": 10.208117044084325, "kl": 0.16357421875, "learning_rate": 4.802052207868654e-07, "loss": 0.0065, "reward": 1.01171875, "reward_std": 0.37910616397857666, "rewards/correctness_reward_func": 0.5625, "rewards/strict_format_reward_func": 0.44921875, "step": 453 }, { "completion_length": 300.2578125, "epoch": 0.5282140779522978, "grad_norm": 9.393290503672768, "kl": 0.065185546875, "learning_rate": 4.78321134501212e-07, "loss": 0.0026, "reward": 0.84375, "reward_std": 0.3766629546880722, "rewards/correctness_reward_func": 0.390625, "rewards/strict_format_reward_func": 0.453125, "step": 454 }, { "completion_length": 293.38671875, "epoch": 0.5293775450843514, "grad_norm": 3.8659686723703754, "kl": 0.0889892578125, "learning_rate": 4.764373565668551e-07, "loss": 0.0036, "reward": 1.01171875, "reward_std": 0.3682662658393383, "rewards/correctness_reward_func": 0.5625, "rewards/strict_format_reward_func": 0.44921875, "step": 455 }, { "completion_length": 279.65234375, "epoch": 0.5305410122164049, "grad_norm": 4.042111221873373, "kl": 0.2183837890625, "learning_rate": 4.745539137778797e-07, "loss": 0.0087, "reward": 1.1640625, "reward_std": 0.5641819983720779, "rewards/correctness_reward_func": 0.6953125, "rewards/strict_format_reward_func": 0.46875, "step": 456 }, { "completion_length": 314.37109375, "epoch": 0.5317044793484584, "grad_norm": 0.7557656203899156, "kl": 0.0653076171875, "learning_rate": 4.7267083292360437e-07, "loss": 0.0026, "reward": 0.880859375, "reward_std": 0.40571190416812897, "rewards/correctness_reward_func": 0.421875, "rewards/strict_format_reward_func": 0.458984375, "step": 457 }, { "completion_length": 306.7421875, "epoch": 0.532867946480512, "grad_norm": 1.3015212128767846, "kl": 0.0732421875, "learning_rate": 4.7078814078819875e-07, "loss": 0.0029, "reward": 0.830078125, "reward_std": 0.42451316118240356, "rewards/correctness_reward_func": 0.390625, "rewards/strict_format_reward_func": 0.439453125, "step": 458 }, { "completion_length": 319.578125, "epoch": 0.5340314136125655, "grad_norm": 493.9888153088309, "kl": 33.6861572265625, "learning_rate": 4.6890586415030426e-07, "loss": 1.3547, "reward": 0.890625, "reward_std": 0.41667793691158295, "rewards/correctness_reward_func": 0.4375, "rewards/strict_format_reward_func": 0.453125, "step": 459 }, { "completion_length": 300.41015625, "epoch": 0.535194880744619, "grad_norm": 9.353090953242775, "kl": 0.976806640625, "learning_rate": 4.6702402978265226e-07, "loss": 0.0391, "reward": 0.994140625, "reward_std": 0.48338544368743896, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.455078125, "step": 460 }, { "completion_length": 313.20703125, "epoch": 0.5363583478766725, "grad_norm": 2.4488174499127906, "kl": 0.07196044921875, "learning_rate": 4.6514266445168357e-07, "loss": 0.0029, "reward": 0.890625, "reward_std": 0.4273727908730507, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.4609375, "step": 461 }, { "completion_length": 308.078125, "epoch": 0.537521815008726, "grad_norm": 0.5827517301198424, "kl": 0.064697265625, "learning_rate": 4.632617949171671e-07, "loss": 0.0026, "reward": 0.76171875, "reward_std": 0.383041687309742, "rewards/correctness_reward_func": 0.3203125, "rewards/strict_format_reward_func": 0.44140625, "step": 462 }, { "completion_length": 301.8984375, "epoch": 0.5386852821407795, "grad_norm": 1.8639320626375049, "kl": 0.0982666015625, "learning_rate": 4.613814479318203e-07, "loss": 0.0039, "reward": 0.822265625, "reward_std": 0.406594380736351, "rewards/correctness_reward_func": 0.375, "rewards/strict_format_reward_func": 0.447265625, "step": 463 }, { "completion_length": 305.671875, "epoch": 0.539848749272833, "grad_norm": 2.939412233079058, "kl": 0.15728759765625, "learning_rate": 4.595016502409279e-07, "loss": 0.0063, "reward": 0.8984375, "reward_std": 0.41379060596227646, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.4453125, "step": 464 }, { "completion_length": 266.09375, "epoch": 0.5410122164048866, "grad_norm": 0.2583659683822398, "kl": 0.05816650390625, "learning_rate": 4.5762242858196194e-07, "loss": 0.0023, "reward": 1.15625, "reward_std": 0.5720360279083252, "rewards/correctness_reward_func": 0.6875, "rewards/strict_format_reward_func": 0.46875, "step": 465 }, { "completion_length": 352.6015625, "epoch": 0.5421756835369401, "grad_norm": 1.7537541998304222, "kl": 0.07427978515625, "learning_rate": 4.5574380968420053e-07, "loss": 0.003, "reward": 0.978515625, "reward_std": 0.390750952064991, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.439453125, "step": 466 }, { "completion_length": 310.8359375, "epoch": 0.5433391506689936, "grad_norm": 1.0831469617883793, "kl": 0.120849609375, "learning_rate": 4.5386582026834904e-07, "loss": 0.0048, "reward": 0.99609375, "reward_std": 0.3034508638083935, "rewards/correctness_reward_func": 0.5234375, "rewards/strict_format_reward_func": 0.47265625, "step": 467 }, { "completion_length": 269.56640625, "epoch": 0.5445026178010471, "grad_norm": 4.1180941799706385, "kl": 0.0921630859375, "learning_rate": 4.519884870461591e-07, "loss": 0.0037, "reward": 0.84375, "reward_std": 0.34880052506923676, "rewards/correctness_reward_func": 0.3828125, "rewards/strict_format_reward_func": 0.4609375, "step": 468 }, { "completion_length": 316.1796875, "epoch": 0.5456660849331006, "grad_norm": 41.12900001018758, "kl": 1.962646484375, "learning_rate": 4.5011183672004904e-07, "loss": 0.0784, "reward": 0.85546875, "reward_std": 0.3830237053334713, "rewards/correctness_reward_func": 0.4140625, "rewards/strict_format_reward_func": 0.44140625, "step": 469 }, { "completion_length": 274.5625, "epoch": 0.5468295520651542, "grad_norm": 4.930180526094055, "kl": 0.1124267578125, "learning_rate": 4.482358959827237e-07, "loss": 0.0045, "reward": 0.98828125, "reward_std": 0.46707436069846153, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.47265625, "step": 470 }, { "completion_length": 281.66796875, "epoch": 0.5479930191972077, "grad_norm": 0.3254372196970626, "kl": 0.0618896484375, "learning_rate": 4.4636069151679513e-07, "loss": 0.0025, "reward": 1.0234375, "reward_std": 0.417631059885025, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.46875, "step": 471 }, { "completion_length": 414.08203125, "epoch": 0.5491564863292612, "grad_norm": 3.8982742704371827, "kl": 0.22772216796875, "learning_rate": 4.44486249994403e-07, "loss": 0.0091, "reward": 0.845703125, "reward_std": 0.4051319286227226, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.439453125, "step": 472 }, { "completion_length": 283.47265625, "epoch": 0.5503199534613147, "grad_norm": 3.4655805773543356, "kl": 0.1307373046875, "learning_rate": 4.4261259807683533e-07, "loss": 0.0052, "reward": 0.876953125, "reward_std": 0.4953066222369671, "rewards/correctness_reward_func": 0.4140625, "rewards/strict_format_reward_func": 0.462890625, "step": 473 }, { "completion_length": 317.5078125, "epoch": 0.5514834205933682, "grad_norm": 0.48782584214039304, "kl": 0.091552734375, "learning_rate": 4.4073976241414866e-07, "loss": 0.0037, "reward": 0.873046875, "reward_std": 0.4310942143201828, "rewards/correctness_reward_func": 0.421875, "rewards/strict_format_reward_func": 0.451171875, "step": 474 }, { "completion_length": 293.953125, "epoch": 0.5526468877254218, "grad_norm": 0.23507295953109886, "kl": 0.085205078125, "learning_rate": 4.3886776964479e-07, "loss": 0.0034, "reward": 0.984375, "reward_std": 0.5014909356832504, "rewards/correctness_reward_func": 0.5234375, "rewards/strict_format_reward_func": 0.4609375, "step": 475 }, { "completion_length": 282.88671875, "epoch": 0.5538103548574753, "grad_norm": 0.673587616115569, "kl": 0.12457275390625, "learning_rate": 4.369966463952172e-07, "loss": 0.005, "reward": 0.96875, "reward_std": 0.36010831594467163, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.4609375, "step": 476 }, { "completion_length": 324.25, "epoch": 0.5549738219895288, "grad_norm": 8.678352355812883, "kl": 0.1812744140625, "learning_rate": 4.351264192795202e-07, "loss": 0.0073, "reward": 0.8046875, "reward_std": 0.39850369840860367, "rewards/correctness_reward_func": 0.34375, "rewards/strict_format_reward_func": 0.4609375, "step": 477 }, { "completion_length": 294.7890625, "epoch": 0.5561372891215823, "grad_norm": 1.5980672547523225, "kl": 0.0810546875, "learning_rate": 4.332571148990428e-07, "loss": 0.0032, "reward": 0.8984375, "reward_std": 0.32709740102291107, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.46875, "step": 478 }, { "completion_length": 316.92578125, "epoch": 0.5573007562536358, "grad_norm": 0.5977531236480902, "kl": 0.0809326171875, "learning_rate": 4.313887598420045e-07, "loss": 0.0032, "reward": 0.953125, "reward_std": 0.4281370937824249, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.453125, "step": 479 }, { "completion_length": 303.82421875, "epoch": 0.5584642233856894, "grad_norm": 19.039937012860378, "kl": 0.10833740234375, "learning_rate": 4.295213806831215e-07, "loss": 0.0043, "reward": 1.080078125, "reward_std": 0.4378567263484001, "rewards/correctness_reward_func": 0.6171875, "rewards/strict_format_reward_func": 0.462890625, "step": 480 }, { "completion_length": 262.73828125, "epoch": 0.5596276905177429, "grad_norm": 17.57583704393828, "kl": 0.127197265625, "learning_rate": 4.276550039832299e-07, "loss": 0.0051, "reward": 1.033203125, "reward_std": 0.38480594754219055, "rewards/correctness_reward_func": 0.5625, "rewards/strict_format_reward_func": 0.470703125, "step": 481 }, { "completion_length": 348.82421875, "epoch": 0.5607911576497964, "grad_norm": 94.13087063972335, "kl": 2.42138671875, "learning_rate": 4.257896562889064e-07, "loss": 0.097, "reward": 0.841796875, "reward_std": 0.46834196895360947, "rewards/correctness_reward_func": 0.390625, "rewards/strict_format_reward_func": 0.451171875, "step": 482 }, { "completion_length": 350.00390625, "epoch": 0.5619546247818499, "grad_norm": 2.3502967746623256, "kl": 0.0682373046875, "learning_rate": 4.2392536413209206e-07, "loss": 0.0027, "reward": 0.751953125, "reward_std": 0.306599460542202, "rewards/correctness_reward_func": 0.296875, "rewards/strict_format_reward_func": 0.455078125, "step": 483 }, { "completion_length": 350.57421875, "epoch": 0.5631180919139034, "grad_norm": 0.5542446030499187, "kl": 0.05633544921875, "learning_rate": 4.2206215402971443e-07, "loss": 0.0023, "reward": 0.9765625, "reward_std": 0.37827198952436447, "rewards/correctness_reward_func": 0.5234375, "rewards/strict_format_reward_func": 0.453125, "step": 484 }, { "completion_length": 345.12890625, "epoch": 0.564281559045957, "grad_norm": 0.7187296718689014, "kl": 0.09326171875, "learning_rate": 4.202000524833105e-07, "loss": 0.0037, "reward": 0.931640625, "reward_std": 0.43541865050792694, "rewards/correctness_reward_func": 0.4765625, "rewards/strict_format_reward_func": 0.455078125, "step": 485 }, { "completion_length": 321.4765625, "epoch": 0.5654450261780105, "grad_norm": 1.232074302331301, "kl": 0.0653076171875, "learning_rate": 4.183390859786489e-07, "loss": 0.0026, "reward": 0.998046875, "reward_std": 0.5729693621397018, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.443359375, "step": 486 }, { "completion_length": 327.33203125, "epoch": 0.566608493310064, "grad_norm": 10.654969280256518, "kl": 0.7200927734375, "learning_rate": 4.1647928098535445e-07, "loss": 0.0288, "reward": 0.79296875, "reward_std": 0.3501325398683548, "rewards/correctness_reward_func": 0.34375, "rewards/strict_format_reward_func": 0.44921875, "step": 487 }, { "completion_length": 339.00390625, "epoch": 0.5677719604421175, "grad_norm": 0.7427020109650712, "kl": 0.0703125, "learning_rate": 4.146206639565312e-07, "loss": 0.0028, "reward": 0.98828125, "reward_std": 0.47387974709272385, "rewards/correctness_reward_func": 0.5234375, "rewards/strict_format_reward_func": 0.46484375, "step": 488 }, { "completion_length": 333.625, "epoch": 0.568935427574171, "grad_norm": 3.2775088610928784, "kl": 0.10986328125, "learning_rate": 4.127632613283858e-07, "loss": 0.0044, "reward": 1.00390625, "reward_std": 0.38937077671289444, "rewards/correctness_reward_func": 0.546875, "rewards/strict_format_reward_func": 0.45703125, "step": 489 }, { "completion_length": 360.5625, "epoch": 0.5700988947062245, "grad_norm": 6.1719024943408485, "kl": 0.0828857421875, "learning_rate": 4.1090709951985215e-07, "loss": 0.0033, "reward": 0.880859375, "reward_std": 0.41572393476963043, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.451171875, "step": 490 }, { "completion_length": 323.0859375, "epoch": 0.571262361838278, "grad_norm": 0.7013029219174678, "kl": 0.06793212890625, "learning_rate": 4.090522049322144e-07, "loss": 0.0027, "reward": 0.99609375, "reward_std": 0.40485580265522003, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.45703125, "step": 491 }, { "completion_length": 314.95703125, "epoch": 0.5724258289703316, "grad_norm": 0.695642100398697, "kl": 0.06439208984375, "learning_rate": 4.071986039487332e-07, "loss": 0.0026, "reward": 0.9296875, "reward_std": 0.41494838893413544, "rewards/correctness_reward_func": 0.46875, "rewards/strict_format_reward_func": 0.4609375, "step": 492 }, { "completion_length": 301.078125, "epoch": 0.5735892961023851, "grad_norm": 0.7425077620435919, "kl": 0.07415771484375, "learning_rate": 4.0534632293426894e-07, "loss": 0.003, "reward": 0.912109375, "reward_std": 0.41000480204820633, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.451171875, "step": 493 }, { "completion_length": 313.7890625, "epoch": 0.5747527632344386, "grad_norm": 0.6386394485743819, "kl": 0.066162109375, "learning_rate": 4.0349538823490794e-07, "loss": 0.0026, "reward": 0.96875, "reward_std": 0.4953623190522194, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.453125, "step": 494 }, { "completion_length": 294.171875, "epoch": 0.5759162303664922, "grad_norm": 3.3816684023530232, "kl": 0.2059326171875, "learning_rate": 4.0164582617758596e-07, "loss": 0.0083, "reward": 1.005859375, "reward_std": 0.5383393317461014, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.466796875, "step": 495 }, { "completion_length": 307.96484375, "epoch": 0.5770796974985457, "grad_norm": 0.8310620537473502, "kl": 0.09033203125, "learning_rate": 3.9979766306971596e-07, "loss": 0.0036, "reward": 1.048828125, "reward_std": 0.4854975938796997, "rewards/correctness_reward_func": 0.578125, "rewards/strict_format_reward_func": 0.470703125, "step": 496 }, { "completion_length": 367.953125, "epoch": 0.5782431646305992, "grad_norm": 0.5463594099958543, "kl": 0.07421875, "learning_rate": 3.979509251988123e-07, "loss": 0.003, "reward": 0.990234375, "reward_std": 0.4779988303780556, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.435546875, "step": 497 }, { "completion_length": 340.80859375, "epoch": 0.5794066317626527, "grad_norm": 0.5689758833932441, "kl": 0.07464599609375, "learning_rate": 3.961056388321177e-07, "loss": 0.003, "reward": 0.84375, "reward_std": 0.4671352282166481, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.4375, "step": 498 }, { "completion_length": 300.88671875, "epoch": 0.5805700988947062, "grad_norm": 20.749971158538724, "kl": 0.29876708984375, "learning_rate": 3.942618302162286e-07, "loss": 0.012, "reward": 0.95703125, "reward_std": 0.44838595390319824, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.45703125, "step": 499 }, { "completion_length": 346.6328125, "epoch": 0.5817335660267597, "grad_norm": 1.7117302286733234, "kl": 0.18804931640625, "learning_rate": 3.924195255767232e-07, "loss": 0.0075, "reward": 0.779296875, "reward_std": 0.45758427679538727, "rewards/correctness_reward_func": 0.3359375, "rewards/strict_format_reward_func": 0.443359375, "step": 500 }, { "completion_length": 294.54296875, "epoch": 0.5828970331588132, "grad_norm": 0.7087334302436424, "kl": 0.0670166015625, "learning_rate": 3.905787511177874e-07, "loss": 0.0027, "reward": 0.8828125, "reward_std": 0.3799057751893997, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.453125, "step": 501 }, { "completion_length": 265.7421875, "epoch": 0.5840605002908668, "grad_norm": 0.774831778277949, "kl": 0.08935546875, "learning_rate": 3.8873953302184283e-07, "loss": 0.0036, "reward": 1.009765625, "reward_std": 0.38948260247707367, "rewards/correctness_reward_func": 0.546875, "rewards/strict_format_reward_func": 0.462890625, "step": 502 }, { "completion_length": 308.82421875, "epoch": 0.5852239674229203, "grad_norm": 3.3625973030061873, "kl": 0.0748291015625, "learning_rate": 3.8690189744917353e-07, "loss": 0.003, "reward": 0.982421875, "reward_std": 0.4775698333978653, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.443359375, "step": 503 }, { "completion_length": 314.015625, "epoch": 0.5863874345549738, "grad_norm": 0.2472382741844868, "kl": 0.06201171875, "learning_rate": 3.850658705375545e-07, "loss": 0.0025, "reward": 1.0546875, "reward_std": 0.4615103006362915, "rewards/correctness_reward_func": 0.59375, "rewards/strict_format_reward_func": 0.4609375, "step": 504 }, { "completion_length": 324.41015625, "epoch": 0.5875509016870274, "grad_norm": 2.100648502202178, "kl": 0.158447265625, "learning_rate": 3.832314784018801e-07, "loss": 0.0064, "reward": 0.998046875, "reward_std": 0.397512748837471, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.458984375, "step": 505 }, { "completion_length": 314.81640625, "epoch": 0.5887143688190809, "grad_norm": 0.4519748125959194, "kl": 0.08099365234375, "learning_rate": 3.8139874713379226e-07, "loss": 0.0032, "reward": 0.99609375, "reward_std": 0.5033816024661064, "rewards/correctness_reward_func": 0.546875, "rewards/strict_format_reward_func": 0.44921875, "step": 506 }, { "completion_length": 342.44921875, "epoch": 0.5898778359511344, "grad_norm": 4.352534156683732, "kl": 0.078857421875, "learning_rate": 3.7956770280130903e-07, "loss": 0.0032, "reward": 0.984375, "reward_std": 0.5326347276568413, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.4453125, "step": 507 }, { "completion_length": 304.33203125, "epoch": 0.5910413030831879, "grad_norm": 4.450942021565789, "kl": 0.0650634765625, "learning_rate": 3.777383714484545e-07, "loss": 0.0026, "reward": 0.8984375, "reward_std": 0.4589155316352844, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.4453125, "step": 508 }, { "completion_length": 356.65234375, "epoch": 0.5922047702152414, "grad_norm": 0.4935969843844182, "kl": 0.074462890625, "learning_rate": 3.7591077909488813e-07, "loss": 0.003, "reward": 0.8671875, "reward_std": 0.416704423725605, "rewards/correctness_reward_func": 0.421875, "rewards/strict_format_reward_func": 0.4453125, "step": 509 }, { "completion_length": 302.95703125, "epoch": 0.5933682373472949, "grad_norm": 0.2721177345894813, "kl": 0.081298828125, "learning_rate": 3.740849517355342e-07, "loss": 0.0033, "reward": 0.853515625, "reward_std": 0.41943125426769257, "rewards/correctness_reward_func": 0.3984375, "rewards/strict_format_reward_func": 0.455078125, "step": 510 }, { "completion_length": 344.1328125, "epoch": 0.5945317044793484, "grad_norm": 11.095469526018869, "kl": 0.13262939453125, "learning_rate": 3.722609153402126e-07, "loss": 0.0053, "reward": 0.876953125, "reward_std": 0.2991979420185089, "rewards/correctness_reward_func": 0.4375, "rewards/strict_format_reward_func": 0.439453125, "step": 511 }, { "completion_length": 362.63671875, "epoch": 0.595695171611402, "grad_norm": 1.0784599285876548, "kl": 0.0941162109375, "learning_rate": 3.704386958532695e-07, "loss": 0.0038, "reward": 0.87890625, "reward_std": 0.4891737848520279, "rewards/correctness_reward_func": 0.4375, "rewards/strict_format_reward_func": 0.44140625, "step": 512 }, { "completion_length": 301.9921875, "epoch": 0.5968586387434555, "grad_norm": 4.180625446712727, "kl": 0.07080078125, "learning_rate": 3.6861831919320776e-07, "loss": 0.0028, "reward": 1.087890625, "reward_std": 0.43876205012202263, "rewards/correctness_reward_func": 0.6171875, "rewards/strict_format_reward_func": 0.470703125, "step": 513 }, { "completion_length": 291.3828125, "epoch": 0.598022105875509, "grad_norm": 0.6248247810161895, "kl": 0.08477783203125, "learning_rate": 3.667998112523192e-07, "loss": 0.0034, "reward": 1.046875, "reward_std": 0.4108787551522255, "rewards/correctness_reward_func": 0.5859375, "rewards/strict_format_reward_func": 0.4609375, "step": 514 }, { "completion_length": 328.42578125, "epoch": 0.5991855730075626, "grad_norm": 1.1776803304982586, "kl": 0.0770263671875, "learning_rate": 3.6498319789631484e-07, "loss": 0.0031, "reward": 0.966796875, "reward_std": 0.4406397007405758, "rewards/correctness_reward_func": 0.5234375, "rewards/strict_format_reward_func": 0.443359375, "step": 515 }, { "completion_length": 311.4375, "epoch": 0.6003490401396161, "grad_norm": 1.516778319478155, "kl": 0.08447265625, "learning_rate": 3.6316850496395855e-07, "loss": 0.0034, "reward": 0.921875, "reward_std": 0.4371376186609268, "rewards/correctness_reward_func": 0.4765625, "rewards/strict_format_reward_func": 0.4453125, "step": 516 }, { "completion_length": 320.83984375, "epoch": 0.6015125072716696, "grad_norm": 1.1186753827867948, "kl": 0.0767822265625, "learning_rate": 3.613557582666988e-07, "loss": 0.0031, "reward": 0.951171875, "reward_std": 0.5421983748674393, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.435546875, "step": 517 }, { "completion_length": 295.8828125, "epoch": 0.6026759744037231, "grad_norm": 0.915188023270065, "kl": 0.0703125, "learning_rate": 3.595449835883018e-07, "loss": 0.0028, "reward": 1.1171875, "reward_std": 0.5337450057268143, "rewards/correctness_reward_func": 0.6640625, "rewards/strict_format_reward_func": 0.453125, "step": 518 }, { "completion_length": 307.59375, "epoch": 0.6038394415357766, "grad_norm": 7.1638585191796516, "kl": 0.0626220703125, "learning_rate": 3.577362066844838e-07, "loss": 0.0025, "reward": 1.07421875, "reward_std": 0.4509744197130203, "rewards/correctness_reward_func": 0.6171875, "rewards/strict_format_reward_func": 0.45703125, "step": 519 }, { "completion_length": 309.13671875, "epoch": 0.6050029086678301, "grad_norm": 0.30982308099915906, "kl": 0.0799560546875, "learning_rate": 3.5592945328254633e-07, "loss": 0.0032, "reward": 0.974609375, "reward_std": 0.44932789355516434, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.435546875, "step": 520 }, { "completion_length": 288.8828125, "epoch": 0.6061663757998836, "grad_norm": 0.345131241066477, "kl": 0.0758056640625, "learning_rate": 3.5412474908100914e-07, "loss": 0.003, "reward": 0.9765625, "reward_std": 0.43881209194660187, "rewards/correctness_reward_func": 0.53125, "rewards/strict_format_reward_func": 0.4453125, "step": 521 }, { "completion_length": 332.23046875, "epoch": 0.6073298429319371, "grad_norm": 0.44693914010656827, "kl": 0.08709716796875, "learning_rate": 3.523221197492452e-07, "loss": 0.0035, "reward": 0.97265625, "reward_std": 0.3985489085316658, "rewards/correctness_reward_func": 0.5234375, "rewards/strict_format_reward_func": 0.44921875, "step": 522 }, { "completion_length": 349.33203125, "epoch": 0.6084933100639907, "grad_norm": 21.06894554962686, "kl": 0.7742919921875, "learning_rate": 3.5052159092711487e-07, "loss": 0.031, "reward": 0.8515625, "reward_std": 0.4210582450032234, "rewards/correctness_reward_func": 0.4140625, "rewards/strict_format_reward_func": 0.4375, "step": 523 }, { "completion_length": 337.16796875, "epoch": 0.6096567771960442, "grad_norm": 1.2846969052715376, "kl": 0.06781005859375, "learning_rate": 3.4872318822460215e-07, "loss": 0.0027, "reward": 0.994140625, "reward_std": 0.4894775226712227, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.455078125, "step": 524 }, { "completion_length": 298.73046875, "epoch": 0.6108202443280978, "grad_norm": 8.980743899819853, "kl": 0.35552978515625, "learning_rate": 3.4692693722145004e-07, "loss": 0.0142, "reward": 0.9375, "reward_std": 0.4888709560036659, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.4765625, "step": 525 }, { "completion_length": 320.046875, "epoch": 0.6119837114601513, "grad_norm": 2.9878570918390435, "kl": 0.1015625, "learning_rate": 3.451328634667966e-07, "loss": 0.0041, "reward": 1.056640625, "reward_std": 0.5141864120960236, "rewards/correctness_reward_func": 0.609375, "rewards/strict_format_reward_func": 0.447265625, "step": 526 }, { "completion_length": 306.86328125, "epoch": 0.6131471785922048, "grad_norm": 1.977662731824247, "kl": 0.07440185546875, "learning_rate": 3.433409924788111e-07, "loss": 0.003, "reward": 0.970703125, "reward_std": 0.5523869842290878, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.455078125, "step": 527 }, { "completion_length": 295.8203125, "epoch": 0.6143106457242583, "grad_norm": 0.4792462358024875, "kl": 0.0755615234375, "learning_rate": 3.415513497443322e-07, "loss": 0.003, "reward": 1.173828125, "reward_std": 0.4759978875517845, "rewards/correctness_reward_func": 0.71875, "rewards/strict_format_reward_func": 0.455078125, "step": 528 }, { "completion_length": 310.734375, "epoch": 0.6154741128563118, "grad_norm": 2.9916352572710125, "kl": 0.098876953125, "learning_rate": 3.397639607185046e-07, "loss": 0.004, "reward": 0.916015625, "reward_std": 0.3694341778755188, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.462890625, "step": 529 }, { "completion_length": 298.1328125, "epoch": 0.6166375799883653, "grad_norm": 0.32219076225777804, "kl": 0.06103515625, "learning_rate": 3.3797885082441714e-07, "loss": 0.0024, "reward": 1.1171875, "reward_std": 0.49028225243091583, "rewards/correctness_reward_func": 0.65625, "rewards/strict_format_reward_func": 0.4609375, "step": 530 }, { "completion_length": 308.76953125, "epoch": 0.6178010471204188, "grad_norm": 4.631204530727076, "kl": 0.080322265625, "learning_rate": 3.361960454527411e-07, "loss": 0.0032, "reward": 0.890625, "reward_std": 0.35588153451681137, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.4375, "step": 531 }, { "completion_length": 309.89453125, "epoch": 0.6189645142524723, "grad_norm": 0.6324879369152546, "kl": 0.07562255859375, "learning_rate": 3.344155699613693e-07, "loss": 0.003, "reward": 0.873046875, "reward_std": 0.4058569148182869, "rewards/correctness_reward_func": 0.421875, "rewards/strict_format_reward_func": 0.451171875, "step": 532 }, { "completion_length": 307.296875, "epoch": 0.6201279813845259, "grad_norm": 0.9047691607236, "kl": 0.07232666015625, "learning_rate": 3.326374496750553e-07, "loss": 0.0029, "reward": 0.890625, "reward_std": 0.3971716836094856, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.4609375, "step": 533 }, { "completion_length": 346.5546875, "epoch": 0.6212914485165794, "grad_norm": 0.33659561889767203, "kl": 0.07318115234375, "learning_rate": 3.3086170988505334e-07, "loss": 0.0029, "reward": 0.7734375, "reward_std": 0.3312998227775097, "rewards/correctness_reward_func": 0.328125, "rewards/strict_format_reward_func": 0.4453125, "step": 534 }, { "completion_length": 344.375, "epoch": 0.622454915648633, "grad_norm": 1.9512738282830062, "kl": 0.0848388671875, "learning_rate": 3.2908837584875817e-07, "loss": 0.0034, "reward": 0.84765625, "reward_std": 0.3345860540866852, "rewards/correctness_reward_func": 0.390625, "rewards/strict_format_reward_func": 0.45703125, "step": 535 }, { "completion_length": 326.125, "epoch": 0.6236183827806865, "grad_norm": 1.4936170410636416, "kl": 0.1143798828125, "learning_rate": 3.2731747278934623e-07, "loss": 0.0046, "reward": 1.0625, "reward_std": 0.5833890065550804, "rewards/correctness_reward_func": 0.6171875, "rewards/strict_format_reward_func": 0.4453125, "step": 536 }, { "completion_length": 318.58984375, "epoch": 0.62478184991274, "grad_norm": 0.36827291739245316, "kl": 0.065185546875, "learning_rate": 3.2554902589541664e-07, "loss": 0.0026, "reward": 0.958984375, "reward_std": 0.4000133126974106, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.458984375, "step": 537 }, { "completion_length": 315.1015625, "epoch": 0.6259453170447935, "grad_norm": 0.27713934231804577, "kl": 0.07440185546875, "learning_rate": 3.237830603206333e-07, "loss": 0.003, "reward": 1.0234375, "reward_std": 0.4221845492720604, "rewards/correctness_reward_func": 0.5703125, "rewards/strict_format_reward_func": 0.453125, "step": 538 }, { "completion_length": 284.32421875, "epoch": 0.627108784176847, "grad_norm": 1.1203184397436559, "kl": 0.0704345703125, "learning_rate": 3.220196011833664e-07, "loss": 0.0028, "reward": 1.103515625, "reward_std": 0.48732276260852814, "rewards/correctness_reward_func": 0.640625, "rewards/strict_format_reward_func": 0.462890625, "step": 539 }, { "completion_length": 351.41796875, "epoch": 0.6282722513089005, "grad_norm": 2.174561751300081, "kl": 0.07220458984375, "learning_rate": 3.20258673566336e-07, "loss": 0.0029, "reward": 0.95703125, "reward_std": 0.429298534989357, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.44140625, "step": 540 }, { "completion_length": 323.80859375, "epoch": 0.629435718440954, "grad_norm": 0.6149387889029069, "kl": 0.096923828125, "learning_rate": 3.185003025162547e-07, "loss": 0.0039, "reward": 1.013671875, "reward_std": 0.5528299808502197, "rewards/correctness_reward_func": 0.5703125, "rewards/strict_format_reward_func": 0.443359375, "step": 541 }, { "completion_length": 305.84375, "epoch": 0.6305991855730075, "grad_norm": 2.8708590986391633, "kl": 0.072509765625, "learning_rate": 3.167445130434717e-07, "loss": 0.0029, "reward": 0.921875, "reward_std": 0.41802073270082474, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.4609375, "step": 542 }, { "completion_length": 290.9921875, "epoch": 0.631762652705061, "grad_norm": 0.6877811587242905, "kl": 0.09271240234375, "learning_rate": 3.149913301216164e-07, "loss": 0.0037, "reward": 0.994140625, "reward_std": 0.5313326120376587, "rewards/correctness_reward_func": 0.546875, "rewards/strict_format_reward_func": 0.447265625, "step": 543 }, { "completion_length": 305.8515625, "epoch": 0.6329261198371146, "grad_norm": 4.86213618653618, "kl": 0.1104736328125, "learning_rate": 3.132407786872442e-07, "loss": 0.0044, "reward": 0.9140625, "reward_std": 0.4709458351135254, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.453125, "step": 544 }, { "completion_length": 327.94140625, "epoch": 0.6340895869691682, "grad_norm": 0.7043505946465998, "kl": 0.0960693359375, "learning_rate": 3.114928836394811e-07, "loss": 0.0038, "reward": 1.0078125, "reward_std": 0.49380627274513245, "rewards/correctness_reward_func": 0.5625, "rewards/strict_format_reward_func": 0.4453125, "step": 545 }, { "completion_length": 299.7890625, "epoch": 0.6352530541012217, "grad_norm": 0.6108560198985266, "kl": 0.090087890625, "learning_rate": 3.0974766983967005e-07, "loss": 0.0036, "reward": 1.0625, "reward_std": 0.4104010909795761, "rewards/correctness_reward_func": 0.6171875, "rewards/strict_format_reward_func": 0.4453125, "step": 546 }, { "completion_length": 283.0, "epoch": 0.6364165212332752, "grad_norm": 0.49979820386065515, "kl": 0.07122802734375, "learning_rate": 3.0800516211101626e-07, "loss": 0.0029, "reward": 0.978515625, "reward_std": 0.3626069948077202, "rewards/correctness_reward_func": 0.53125, "rewards/strict_format_reward_func": 0.447265625, "step": 547 }, { "completion_length": 309.0234375, "epoch": 0.6375799883653287, "grad_norm": 0.2879837927415632, "kl": 0.07666015625, "learning_rate": 3.0626538523823563e-07, "loss": 0.0031, "reward": 0.919921875, "reward_std": 0.3396223969757557, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.458984375, "step": 548 }, { "completion_length": 266.01953125, "epoch": 0.6387434554973822, "grad_norm": 0.5936636886319869, "kl": 0.089111328125, "learning_rate": 3.045283639672013e-07, "loss": 0.0036, "reward": 1.078125, "reward_std": 0.4631276950240135, "rewards/correctness_reward_func": 0.609375, "rewards/strict_format_reward_func": 0.46875, "step": 549 }, { "completion_length": 304.1640625, "epoch": 0.6399069226294357, "grad_norm": 1.293489484183133, "kl": 0.0919189453125, "learning_rate": 3.027941230045918e-07, "loss": 0.0037, "reward": 0.94140625, "reward_std": 0.46988004446029663, "rewards/correctness_reward_func": 0.484375, "rewards/strict_format_reward_func": 0.45703125, "step": 550 }, { "completion_length": 342.16796875, "epoch": 0.6410703897614892, "grad_norm": 0.6553139894221404, "kl": 0.0721435546875, "learning_rate": 3.010626870175396e-07, "loss": 0.0029, "reward": 0.97265625, "reward_std": 0.5038033723831177, "rewards/correctness_reward_func": 0.5234375, "rewards/strict_format_reward_func": 0.44921875, "step": 551 }, { "completion_length": 318.28515625, "epoch": 0.6422338568935427, "grad_norm": 0.2564615408211416, "kl": 0.0838623046875, "learning_rate": 2.993340806332805e-07, "loss": 0.0033, "reward": 1.046875, "reward_std": 0.5574891120195389, "rewards/correctness_reward_func": 0.59375, "rewards/strict_format_reward_func": 0.453125, "step": 552 }, { "completion_length": 313.5703125, "epoch": 0.6433973240255962, "grad_norm": 3.0381331132888043, "kl": 0.091552734375, "learning_rate": 2.976083284388031e-07, "loss": 0.0037, "reward": 0.841796875, "reward_std": 0.449553444981575, "rewards/correctness_reward_func": 0.390625, "rewards/strict_format_reward_func": 0.451171875, "step": 553 }, { "completion_length": 314.65625, "epoch": 0.6445607911576497, "grad_norm": 7.441514145849095, "kl": 0.0833740234375, "learning_rate": 2.9588545498049936e-07, "loss": 0.0033, "reward": 1.07421875, "reward_std": 0.4466160088777542, "rewards/correctness_reward_func": 0.625, "rewards/strict_format_reward_func": 0.44921875, "step": 554 }, { "completion_length": 319.16015625, "epoch": 0.6457242582897034, "grad_norm": 0.5548725704006117, "kl": 0.1014404296875, "learning_rate": 2.9416548476381484e-07, "loss": 0.0041, "reward": 0.845703125, "reward_std": 0.331865556538105, "rewards/correctness_reward_func": 0.390625, "rewards/strict_format_reward_func": 0.455078125, "step": 555 }, { "completion_length": 372.64453125, "epoch": 0.6468877254217569, "grad_norm": 0.2542142409478772, "kl": 0.07373046875, "learning_rate": 2.924484422529011e-07, "loss": 0.0029, "reward": 0.921875, "reward_std": 0.5186318531632423, "rewards/correctness_reward_func": 0.484375, "rewards/strict_format_reward_func": 0.4375, "step": 556 }, { "completion_length": 282.87890625, "epoch": 0.6480511925538104, "grad_norm": 0.4672015030542821, "kl": 0.057373046875, "learning_rate": 2.907343518702668e-07, "loss": 0.0023, "reward": 0.935546875, "reward_std": 0.4991537630558014, "rewards/correctness_reward_func": 0.4765625, "rewards/strict_format_reward_func": 0.458984375, "step": 557 }, { "completion_length": 283.0, "epoch": 0.6492146596858639, "grad_norm": 0.34752141132348385, "kl": 0.07672119140625, "learning_rate": 2.8902323799643116e-07, "loss": 0.0031, "reward": 1.001953125, "reward_std": 0.4459878280758858, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.462890625, "step": 558 }, { "completion_length": 314.671875, "epoch": 0.6503781268179174, "grad_norm": 0.43853321393334627, "kl": 0.0743408203125, "learning_rate": 2.873151249695764e-07, "loss": 0.003, "reward": 1.029296875, "reward_std": 0.4193507581949234, "rewards/correctness_reward_func": 0.578125, "rewards/strict_format_reward_func": 0.451171875, "step": 559 }, { "completion_length": 296.14453125, "epoch": 0.6515415939499709, "grad_norm": 11.979628103142147, "kl": 0.094970703125, "learning_rate": 2.856100370852018e-07, "loss": 0.0038, "reward": 0.994140625, "reward_std": 0.38707733899354935, "rewards/correctness_reward_func": 0.53125, "rewards/strict_format_reward_func": 0.462890625, "step": 560 }, { "completion_length": 293.72265625, "epoch": 0.6527050610820244, "grad_norm": 0.35207186807499713, "kl": 0.06011962890625, "learning_rate": 2.8390799859577883e-07, "loss": 0.0024, "reward": 1.0703125, "reward_std": 0.47403161600232124, "rewards/correctness_reward_func": 0.609375, "rewards/strict_format_reward_func": 0.4609375, "step": 561 }, { "completion_length": 339.27734375, "epoch": 0.6538685282140779, "grad_norm": 0.8213153516683395, "kl": 0.1256103515625, "learning_rate": 2.822090337104053e-07, "loss": 0.005, "reward": 0.931640625, "reward_std": 0.4038376063108444, "rewards/correctness_reward_func": 0.4921875, "rewards/strict_format_reward_func": 0.439453125, "step": 562 }, { "completion_length": 331.34375, "epoch": 0.6550319953461314, "grad_norm": 5.666905197694292, "kl": 0.0833740234375, "learning_rate": 2.80513166594461e-07, "loss": 0.0033, "reward": 0.98828125, "reward_std": 0.4280727207660675, "rewards/correctness_reward_func": 0.546875, "rewards/strict_format_reward_func": 0.44140625, "step": 563 }, { "completion_length": 285.375, "epoch": 0.6561954624781849, "grad_norm": 1.5342238287051615, "kl": 0.0845947265625, "learning_rate": 2.788204213692647e-07, "loss": 0.0034, "reward": 1.095703125, "reward_std": 0.48028550297021866, "rewards/correctness_reward_func": 0.6328125, "rewards/strict_format_reward_func": 0.462890625, "step": 564 }, { "completion_length": 378.7890625, "epoch": 0.6573589296102386, "grad_norm": 0.3008827309317213, "kl": 0.05828857421875, "learning_rate": 2.771308221117309e-07, "loss": 0.0023, "reward": 0.95703125, "reward_std": 0.43702252209186554, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.44140625, "step": 565 }, { "completion_length": 328.265625, "epoch": 0.6585223967422921, "grad_norm": 1.3238975497804353, "kl": 0.0965576171875, "learning_rate": 2.7544439285402667e-07, "loss": 0.0039, "reward": 1.04296875, "reward_std": 0.3864259719848633, "rewards/correctness_reward_func": 0.5859375, "rewards/strict_format_reward_func": 0.45703125, "step": 566 }, { "completion_length": 305.67578125, "epoch": 0.6596858638743456, "grad_norm": 0.3004649431077894, "kl": 0.076416015625, "learning_rate": 2.7376115758323036e-07, "loss": 0.0031, "reward": 1.0625, "reward_std": 0.4624190628528595, "rewards/correctness_reward_func": 0.59375, "rewards/strict_format_reward_func": 0.46875, "step": 567 }, { "completion_length": 304.76171875, "epoch": 0.6608493310063991, "grad_norm": 0.2902731340852037, "kl": 0.0770263671875, "learning_rate": 2.720811402409905e-07, "loss": 0.0031, "reward": 0.98046875, "reward_std": 0.486492320895195, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.46484375, "step": 568 }, { "completion_length": 284.83203125, "epoch": 0.6620127981384526, "grad_norm": 11.585177450718325, "kl": 0.2117919921875, "learning_rate": 2.704043647231854e-07, "loss": 0.0085, "reward": 0.8984375, "reward_std": 0.4928325042128563, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.4453125, "step": 569 }, { "completion_length": 275.03515625, "epoch": 0.6631762652705061, "grad_norm": 4.395549566748196, "kl": 0.1033935546875, "learning_rate": 2.687308548795825e-07, "loss": 0.0041, "reward": 0.998046875, "reward_std": 0.2986785061657429, "rewards/correctness_reward_func": 0.53125, "rewards/strict_format_reward_func": 0.466796875, "step": 570 }, { "completion_length": 325.27734375, "epoch": 0.6643397324025596, "grad_norm": 1.3276165820688186, "kl": 0.072265625, "learning_rate": 2.670606345134997e-07, "loss": 0.0029, "reward": 0.962890625, "reward_std": 0.4722949117422104, "rewards/correctness_reward_func": 0.5234375, "rewards/strict_format_reward_func": 0.439453125, "step": 571 }, { "completion_length": 331.0625, "epoch": 0.6655031995346131, "grad_norm": 0.21518922775709115, "kl": 0.06396484375, "learning_rate": 2.6539372738146694e-07, "loss": 0.0026, "reward": 0.89453125, "reward_std": 0.3893834985792637, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.44140625, "step": 572 }, { "completion_length": 327.8203125, "epoch": 0.6666666666666666, "grad_norm": 1.0577062733318436, "kl": 0.083984375, "learning_rate": 2.6373015719288807e-07, "loss": 0.0034, "reward": 0.88671875, "reward_std": 0.5334196239709854, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.43359375, "step": 573 }, { "completion_length": 293.0625, "epoch": 0.6678301337987201, "grad_norm": 2.2150706193548926, "kl": 0.1007080078125, "learning_rate": 2.620699476097035e-07, "loss": 0.004, "reward": 0.923828125, "reward_std": 0.44804419577121735, "rewards/correctness_reward_func": 0.46875, "rewards/strict_format_reward_func": 0.455078125, "step": 574 }, { "completion_length": 331.30078125, "epoch": 0.6689936009307738, "grad_norm": 9.77990370462344, "kl": 0.6058349609375, "learning_rate": 2.6041312224605384e-07, "loss": 0.0242, "reward": 0.87109375, "reward_std": 0.5051840096712112, "rewards/correctness_reward_func": 0.421875, "rewards/strict_format_reward_func": 0.44921875, "step": 575 }, { "completion_length": 292.2109375, "epoch": 0.6701570680628273, "grad_norm": 2.353217406510171, "kl": 0.07110595703125, "learning_rate": 2.5875970466794356e-07, "loss": 0.0028, "reward": 0.861328125, "reward_std": 0.5226795524358749, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.455078125, "step": 576 }, { "completion_length": 335.7890625, "epoch": 0.6713205351948808, "grad_norm": 1.7824875412061192, "kl": 0.072021484375, "learning_rate": 2.5710971839290707e-07, "loss": 0.0029, "reward": 0.970703125, "reward_std": 0.35836298018693924, "rewards/correctness_reward_func": 0.53125, "rewards/strict_format_reward_func": 0.439453125, "step": 577 }, { "completion_length": 312.53515625, "epoch": 0.6724840023269343, "grad_norm": 0.552991076593908, "kl": 0.0872802734375, "learning_rate": 2.554631868896725e-07, "loss": 0.0035, "reward": 0.951171875, "reward_std": 0.4157777726650238, "rewards/correctness_reward_func": 0.4921875, "rewards/strict_format_reward_func": 0.458984375, "step": 578 }, { "completion_length": 308.00390625, "epoch": 0.6736474694589878, "grad_norm": 0.7798028086009522, "kl": 0.07568359375, "learning_rate": 2.5382013357782887e-07, "loss": 0.003, "reward": 0.9921875, "reward_std": 0.41130349412560463, "rewards/correctness_reward_func": 0.53125, "rewards/strict_format_reward_func": 0.4609375, "step": 579 }, { "completion_length": 357.01953125, "epoch": 0.6748109365910413, "grad_norm": 2.4769162243553886, "kl": 0.08746337890625, "learning_rate": 2.521805818274934e-07, "loss": 0.0035, "reward": 0.890625, "reward_std": 0.46277467906475067, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.4453125, "step": 580 }, { "completion_length": 315.48828125, "epoch": 0.6759744037230948, "grad_norm": 5.791571129747102, "kl": 0.1533203125, "learning_rate": 2.5054455495897785e-07, "loss": 0.0061, "reward": 0.958984375, "reward_std": 0.4360744208097458, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.451171875, "step": 581 }, { "completion_length": 315.5703125, "epoch": 0.6771378708551483, "grad_norm": 1.7983395825656483, "kl": 0.07568359375, "learning_rate": 2.489120762424577e-07, "loss": 0.003, "reward": 0.974609375, "reward_std": 0.4472910016775131, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.458984375, "step": 582 }, { "completion_length": 354.0703125, "epoch": 0.6783013379872018, "grad_norm": 6.247421444391173, "kl": 0.08502197265625, "learning_rate": 2.472831688976413e-07, "loss": 0.0034, "reward": 0.86328125, "reward_std": 0.46518294513225555, "rewards/correctness_reward_func": 0.4140625, "rewards/strict_format_reward_func": 0.44921875, "step": 583 }, { "completion_length": 341.01171875, "epoch": 0.6794648051192553, "grad_norm": 0.7651729029102948, "kl": 0.06817626953125, "learning_rate": 2.456578560934388e-07, "loss": 0.0027, "reward": 1.134765625, "reward_std": 0.47977209091186523, "rewards/correctness_reward_func": 0.6796875, "rewards/strict_format_reward_func": 0.455078125, "step": 584 }, { "completion_length": 293.0859375, "epoch": 0.680628272251309, "grad_norm": 1.336292231393871, "kl": 0.084716796875, "learning_rate": 2.4403616094763325e-07, "loss": 0.0034, "reward": 1.1171875, "reward_std": 0.41681064665317535, "rewards/correctness_reward_func": 0.6640625, "rewards/strict_format_reward_func": 0.453125, "step": 585 }, { "completion_length": 284.1171875, "epoch": 0.6817917393833625, "grad_norm": 0.47257643928088244, "kl": 0.0953369140625, "learning_rate": 2.424181065265519e-07, "loss": 0.0038, "reward": 1.11328125, "reward_std": 0.5111069530248642, "rewards/correctness_reward_func": 0.640625, "rewards/strict_format_reward_func": 0.47265625, "step": 586 }, { "completion_length": 320.359375, "epoch": 0.682955206515416, "grad_norm": 41.48720861376097, "kl": 1.0439453125, "learning_rate": 2.4080371584473745e-07, "loss": 0.0418, "reward": 0.896484375, "reward_std": 0.3856069967150688, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.443359375, "step": 587 }, { "completion_length": 349.2421875, "epoch": 0.6841186736474695, "grad_norm": 0.4976133401908696, "kl": 0.0723876953125, "learning_rate": 2.391930118646212e-07, "loss": 0.0029, "reward": 0.908203125, "reward_std": 0.46742136776447296, "rewards/correctness_reward_func": 0.46875, "rewards/strict_format_reward_func": 0.439453125, "step": 588 }, { "completion_length": 277.43359375, "epoch": 0.685282140779523, "grad_norm": 10.790822521412565, "kl": 0.16650390625, "learning_rate": 2.375860174961961e-07, "loss": 0.0066, "reward": 1.009765625, "reward_std": 0.2983868792653084, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.470703125, "step": 589 }, { "completion_length": 292.015625, "epoch": 0.6864456079115765, "grad_norm": 15.329072485418074, "kl": 0.2255859375, "learning_rate": 2.3598275559669174e-07, "loss": 0.009, "reward": 1.05859375, "reward_std": 0.4833955764770508, "rewards/correctness_reward_func": 0.6015625, "rewards/strict_format_reward_func": 0.45703125, "step": 590 }, { "completion_length": 257.1328125, "epoch": 0.68760907504363, "grad_norm": 2.1470880354803503, "kl": 0.0831298828125, "learning_rate": 2.343832489702479e-07, "loss": 0.0033, "reward": 1.2578125, "reward_std": 0.5756734162569046, "rewards/correctness_reward_func": 0.796875, "rewards/strict_format_reward_func": 0.4609375, "step": 591 }, { "completion_length": 355.25, "epoch": 0.6887725421756835, "grad_norm": 0.4048771867810421, "kl": 0.0533447265625, "learning_rate": 2.327875203675913e-07, "loss": 0.0021, "reward": 0.935546875, "reward_std": 0.4061272516846657, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.435546875, "step": 592 }, { "completion_length": 289.2421875, "epoch": 0.689936009307737, "grad_norm": 2.1689646615852185, "kl": 0.1282958984375, "learning_rate": 2.3119559248571125e-07, "loss": 0.0051, "reward": 1.05859375, "reward_std": 0.5009367018938065, "rewards/correctness_reward_func": 0.609375, "rewards/strict_format_reward_func": 0.44921875, "step": 593 }, { "completion_length": 305.31640625, "epoch": 0.6910994764397905, "grad_norm": 7.630915631829698, "kl": 0.07861328125, "learning_rate": 2.296074879675377e-07, "loss": 0.0031, "reward": 1.013671875, "reward_std": 0.4695633202791214, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.458984375, "step": 594 }, { "completion_length": 331.921875, "epoch": 0.6922629435718441, "grad_norm": 2.1013870969701625, "kl": 0.080322265625, "learning_rate": 2.2802322940161822e-07, "loss": 0.0032, "reward": 1.046875, "reward_std": 0.44919370114803314, "rewards/correctness_reward_func": 0.5703125, "rewards/strict_format_reward_func": 0.4765625, "step": 595 }, { "completion_length": 322.9296875, "epoch": 0.6934264107038977, "grad_norm": 6.983190036892845, "kl": 0.100830078125, "learning_rate": 2.264428393217972e-07, "loss": 0.004, "reward": 0.919921875, "reward_std": 0.5484766066074371, "rewards/correctness_reward_func": 0.4765625, "rewards/strict_format_reward_func": 0.443359375, "step": 596 }, { "completion_length": 282.8515625, "epoch": 0.6945898778359512, "grad_norm": 1.666606937745039, "kl": 0.0938720703125, "learning_rate": 2.2486634020689517e-07, "loss": 0.0037, "reward": 1.107421875, "reward_std": 0.4512006863951683, "rewards/correctness_reward_func": 0.640625, "rewards/strict_format_reward_func": 0.466796875, "step": 597 }, { "completion_length": 305.89453125, "epoch": 0.6957533449680047, "grad_norm": 0.4394703843593804, "kl": 0.06671142578125, "learning_rate": 2.2329375448038962e-07, "loss": 0.0027, "reward": 1.060546875, "reward_std": 0.537133663892746, "rewards/correctness_reward_func": 0.6015625, "rewards/strict_format_reward_func": 0.458984375, "step": 598 }, { "completion_length": 330.65234375, "epoch": 0.6969168121000582, "grad_norm": 2.6802507711498564, "kl": 0.142822265625, "learning_rate": 2.217251045100952e-07, "loss": 0.0057, "reward": 0.875, "reward_std": 0.4543527141213417, "rewards/correctness_reward_func": 0.4375, "rewards/strict_format_reward_func": 0.4375, "step": 599 }, { "completion_length": 299.72265625, "epoch": 0.6980802792321117, "grad_norm": 1.4012984415450411, "kl": 0.0960693359375, "learning_rate": 2.20160412607846e-07, "loss": 0.0038, "reward": 0.806640625, "reward_std": 0.3000826947391033, "rewards/correctness_reward_func": 0.3359375, "rewards/strict_format_reward_func": 0.470703125, "step": 600 }, { "completion_length": 298.74609375, "epoch": 0.6992437463641652, "grad_norm": 1.2013280643666493, "kl": 0.07940673828125, "learning_rate": 2.1859970102917813e-07, "loss": 0.0032, "reward": 1.015625, "reward_std": 0.43591149151325226, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.4765625, "step": 601 }, { "completion_length": 324.9375, "epoch": 0.7004072134962187, "grad_norm": 1.9895678070479064, "kl": 0.1412353515625, "learning_rate": 2.1704299197301368e-07, "loss": 0.0057, "reward": 1.005859375, "reward_std": 0.5580507963895798, "rewards/correctness_reward_func": 0.5703125, "rewards/strict_format_reward_func": 0.435546875, "step": 602 }, { "completion_length": 288.953125, "epoch": 0.7015706806282722, "grad_norm": 1.0590615468494062, "kl": 0.088134765625, "learning_rate": 2.1549030758134395e-07, "loss": 0.0035, "reward": 0.990234375, "reward_std": 0.4236598610877991, "rewards/correctness_reward_func": 0.53125, "rewards/strict_format_reward_func": 0.458984375, "step": 603 }, { "completion_length": 319.46484375, "epoch": 0.7027341477603257, "grad_norm": 0.4398605833070239, "kl": 0.09661865234375, "learning_rate": 2.1394166993891526e-07, "loss": 0.0039, "reward": 0.908203125, "reward_std": 0.3706042245030403, "rewards/correctness_reward_func": 0.46875, "rewards/strict_format_reward_func": 0.439453125, "step": 604 }, { "completion_length": 330.9765625, "epoch": 0.7038976148923793, "grad_norm": 0.1748540259907411, "kl": 0.06390380859375, "learning_rate": 2.1239710107291454e-07, "loss": 0.0026, "reward": 0.791015625, "reward_std": 0.4353453032672405, "rewards/correctness_reward_func": 0.3359375, "rewards/strict_format_reward_func": 0.455078125, "step": 605 }, { "completion_length": 307.27734375, "epoch": 0.7050610820244329, "grad_norm": 0.4450062570888811, "kl": 0.06939697265625, "learning_rate": 2.1085662295265645e-07, "loss": 0.0028, "reward": 1.0703125, "reward_std": 0.492043137550354, "rewards/correctness_reward_func": 0.6171875, "rewards/strict_format_reward_func": 0.453125, "step": 606 }, { "completion_length": 278.15234375, "epoch": 0.7062245491564864, "grad_norm": 1.161794210918606, "kl": 0.107666015625, "learning_rate": 2.0932025748927014e-07, "loss": 0.0043, "reward": 1.0703125, "reward_std": 0.46188992261886597, "rewards/correctness_reward_func": 0.609375, "rewards/strict_format_reward_func": 0.4609375, "step": 607 }, { "completion_length": 281.51171875, "epoch": 0.7073880162885399, "grad_norm": 1.0417371734275742, "kl": 0.1461181640625, "learning_rate": 2.0778802653538824e-07, "loss": 0.0058, "reward": 1.076171875, "reward_std": 0.3824049159884453, "rewards/correctness_reward_func": 0.6171875, "rewards/strict_format_reward_func": 0.458984375, "step": 608 }, { "completion_length": 330.5859375, "epoch": 0.7085514834205934, "grad_norm": 0.9214944576472075, "kl": 0.13623046875, "learning_rate": 2.0625995188483553e-07, "loss": 0.0055, "reward": 0.990234375, "reward_std": 0.46736860275268555, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.435546875, "step": 609 }, { "completion_length": 278.16796875, "epoch": 0.7097149505526469, "grad_norm": 0.17174339407342445, "kl": 0.06475830078125, "learning_rate": 2.047360552723199e-07, "loss": 0.0026, "reward": 0.869140625, "reward_std": 0.4365595430135727, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.462890625, "step": 610 }, { "completion_length": 314.0390625, "epoch": 0.7108784176847004, "grad_norm": 0.310841263990261, "kl": 0.075439453125, "learning_rate": 2.0321635837312184e-07, "loss": 0.003, "reward": 1.01171875, "reward_std": 0.4891914315521717, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.45703125, "step": 611 }, { "completion_length": 288.11328125, "epoch": 0.7120418848167539, "grad_norm": 4.070051357956161, "kl": 0.245849609375, "learning_rate": 2.0170088280278703e-07, "loss": 0.0098, "reward": 1.01171875, "reward_std": 0.4956648051738739, "rewards/correctness_reward_func": 0.5625, "rewards/strict_format_reward_func": 0.44921875, "step": 612 }, { "completion_length": 386.31640625, "epoch": 0.7132053519488074, "grad_norm": 2.860704910871931, "kl": 0.091796875, "learning_rate": 2.00189650116819e-07, "loss": 0.0037, "reward": 0.912109375, "reward_std": 0.46065816283226013, "rewards/correctness_reward_func": 0.4765625, "rewards/strict_format_reward_func": 0.435546875, "step": 613 }, { "completion_length": 341.2109375, "epoch": 0.7143688190808609, "grad_norm": 372.2192011233767, "kl": 0.24835205078125, "learning_rate": 1.9868268181037184e-07, "loss": 0.0099, "reward": 0.888671875, "reward_std": 0.46524087339639664, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.443359375, "step": 614 }, { "completion_length": 297.76953125, "epoch": 0.7155322862129145, "grad_norm": 1.088173527098763, "kl": 0.1053466796875, "learning_rate": 1.971799993179451e-07, "loss": 0.0042, "reward": 1.203125, "reward_std": 0.4539450705051422, "rewards/correctness_reward_func": 0.7421875, "rewards/strict_format_reward_func": 0.4609375, "step": 615 }, { "completion_length": 247.765625, "epoch": 0.716695753344968, "grad_norm": 1.1056929313577715, "kl": 0.093994140625, "learning_rate": 1.9568162401307835e-07, "loss": 0.0038, "reward": 1.03125, "reward_std": 0.5009337291121483, "rewards/correctness_reward_func": 0.578125, "rewards/strict_format_reward_func": 0.453125, "step": 616 }, { "completion_length": 235.36328125, "epoch": 0.7178592204770216, "grad_norm": 3.565588097626741, "kl": 0.0787353515625, "learning_rate": 1.9418757720804812e-07, "loss": 0.0031, "reward": 1.10546875, "reward_std": 0.42327461764216423, "rewards/correctness_reward_func": 0.640625, "rewards/strict_format_reward_func": 0.46484375, "step": 617 }, { "completion_length": 303.7890625, "epoch": 0.7190226876090751, "grad_norm": 1.2306759274391301, "kl": 0.083251953125, "learning_rate": 1.9269788015356335e-07, "loss": 0.0033, "reward": 0.923828125, "reward_std": 0.43111933022737503, "rewards/correctness_reward_func": 0.4765625, "rewards/strict_format_reward_func": 0.447265625, "step": 618 }, { "completion_length": 339.7578125, "epoch": 0.7201861547411286, "grad_norm": 1.3035675507963673, "kl": 0.076416015625, "learning_rate": 1.912125540384648e-07, "loss": 0.0031, "reward": 0.8515625, "reward_std": 0.44212011992931366, "rewards/correctness_reward_func": 0.3984375, "rewards/strict_format_reward_func": 0.453125, "step": 619 }, { "completion_length": 266.37109375, "epoch": 0.7213496218731821, "grad_norm": 0.3725160073747834, "kl": 0.0703125, "learning_rate": 1.8973161998942166e-07, "loss": 0.0028, "reward": 1.166015625, "reward_std": 0.4920080229640007, "rewards/correctness_reward_func": 0.71875, "rewards/strict_format_reward_func": 0.447265625, "step": 620 }, { "completion_length": 287.125, "epoch": 0.7225130890052356, "grad_norm": 0.6659052224959165, "kl": 0.0802001953125, "learning_rate": 1.8825509907063326e-07, "loss": 0.0032, "reward": 1.021484375, "reward_std": 0.474176362156868, "rewards/correctness_reward_func": 0.5625, "rewards/strict_format_reward_func": 0.458984375, "step": 621 }, { "completion_length": 275.88671875, "epoch": 0.7236765561372891, "grad_norm": 2.3215515730228455, "kl": 0.18328857421875, "learning_rate": 1.8678301228352755e-07, "loss": 0.0073, "reward": 1.048828125, "reward_std": 0.44828126579523087, "rewards/correctness_reward_func": 0.59375, "rewards/strict_format_reward_func": 0.455078125, "step": 622 }, { "completion_length": 305.8984375, "epoch": 0.7248400232693426, "grad_norm": 0.8693066594917055, "kl": 0.0928955078125, "learning_rate": 1.8531538056646413e-07, "loss": 0.0037, "reward": 0.849609375, "reward_std": 0.549469493329525, "rewards/correctness_reward_func": 0.3984375, "rewards/strict_format_reward_func": 0.451171875, "step": 623 }, { "completion_length": 294.6796875, "epoch": 0.7260034904013961, "grad_norm": 1.84925939534275, "kl": 0.1939697265625, "learning_rate": 1.8385222479443412e-07, "loss": 0.0077, "reward": 1.060546875, "reward_std": 0.5771784484386444, "rewards/correctness_reward_func": 0.609375, "rewards/strict_format_reward_func": 0.451171875, "step": 624 }, { "completion_length": 321.46484375, "epoch": 0.7271669575334497, "grad_norm": 4.1238263783828915, "kl": 0.10028076171875, "learning_rate": 1.823935657787662e-07, "loss": 0.004, "reward": 0.892578125, "reward_std": 0.4531635195016861, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.447265625, "step": 625 }, { "completion_length": 328.4765625, "epoch": 0.7283304246655032, "grad_norm": 1.9983677708998484, "kl": 0.0885009765625, "learning_rate": 1.8093942426682775e-07, "loss": 0.0035, "reward": 1.048828125, "reward_std": 0.41148973256349564, "rewards/correctness_reward_func": 0.6015625, "rewards/strict_format_reward_func": 0.447265625, "step": 626 }, { "completion_length": 296.1953125, "epoch": 0.7294938917975567, "grad_norm": 1.1820570605543372, "kl": 0.07415771484375, "learning_rate": 1.7948982094173214e-07, "loss": 0.003, "reward": 1.015625, "reward_std": 0.42093075066804886, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.4609375, "step": 627 }, { "completion_length": 300.0390625, "epoch": 0.7306573589296103, "grad_norm": 0.9276773590408368, "kl": 0.0843505859375, "learning_rate": 1.780447764220422e-07, "loss": 0.0034, "reward": 0.91796875, "reward_std": 0.5018439590930939, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.46484375, "step": 628 }, { "completion_length": 270.94140625, "epoch": 0.7318208260616638, "grad_norm": 0.3159271606627362, "kl": 0.0794677734375, "learning_rate": 1.7660431126147928e-07, "loss": 0.0032, "reward": 1.0625, "reward_std": 0.4512241557240486, "rewards/correctness_reward_func": 0.59375, "rewards/strict_format_reward_func": 0.46875, "step": 629 }, { "completion_length": 318.1796875, "epoch": 0.7329842931937173, "grad_norm": 0.30172776857707617, "kl": 0.067626953125, "learning_rate": 1.7516844594862912e-07, "loss": 0.0027, "reward": 1.03515625, "reward_std": 0.5974105447530746, "rewards/correctness_reward_func": 0.59375, "rewards/strict_format_reward_func": 0.44140625, "step": 630 }, { "completion_length": 305.43359375, "epoch": 0.7341477603257708, "grad_norm": 3.7065625525939963, "kl": 0.13525390625, "learning_rate": 1.7373720090665178e-07, "loss": 0.0054, "reward": 0.951171875, "reward_std": 0.39801375567913055, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.451171875, "step": 631 }, { "completion_length": 290.71875, "epoch": 0.7353112274578243, "grad_norm": 0.2505612225990488, "kl": 0.093994140625, "learning_rate": 1.7231059649298946e-07, "loss": 0.0038, "reward": 1.09375, "reward_std": 0.41467833518981934, "rewards/correctness_reward_func": 0.65625, "rewards/strict_format_reward_func": 0.4375, "step": 632 }, { "completion_length": 272.52734375, "epoch": 0.7364746945898778, "grad_norm": 3.213080378864827, "kl": 0.0745849609375, "learning_rate": 1.7088865299907896e-07, "loss": 0.003, "reward": 1.08203125, "reward_std": 0.3735494688153267, "rewards/correctness_reward_func": 0.6171875, "rewards/strict_format_reward_func": 0.46484375, "step": 633 }, { "completion_length": 327.921875, "epoch": 0.7376381617219313, "grad_norm": 0.22627194216819638, "kl": 0.09326171875, "learning_rate": 1.6947139065006137e-07, "loss": 0.0037, "reward": 0.771484375, "reward_std": 0.37508559226989746, "rewards/correctness_reward_func": 0.328125, "rewards/strict_format_reward_func": 0.443359375, "step": 634 }, { "completion_length": 370.4921875, "epoch": 0.7388016288539849, "grad_norm": 0.2901720866615809, "kl": 0.08935546875, "learning_rate": 1.6805882960449591e-07, "loss": 0.0036, "reward": 0.92578125, "reward_std": 0.34715963155031204, "rewards/correctness_reward_func": 0.4921875, "rewards/strict_format_reward_func": 0.43359375, "step": 635 }, { "completion_length": 283.31640625, "epoch": 0.7399650959860384, "grad_norm": 4.102572051898977, "kl": 0.0931396484375, "learning_rate": 1.6665098995407122e-07, "loss": 0.0037, "reward": 1.029296875, "reward_std": 0.42396605759859085, "rewards/correctness_reward_func": 0.578125, "rewards/strict_format_reward_func": 0.451171875, "step": 636 }, { "completion_length": 266.25390625, "epoch": 0.7411285631180919, "grad_norm": 14.745891989325418, "kl": 0.103759765625, "learning_rate": 1.6524789172332183e-07, "loss": 0.0042, "reward": 1.0625, "reward_std": 0.44412969052791595, "rewards/correctness_reward_func": 0.609375, "rewards/strict_format_reward_func": 0.453125, "step": 637 }, { "completion_length": 264.68359375, "epoch": 0.7422920302501455, "grad_norm": 0.7912540732469606, "kl": 0.11590576171875, "learning_rate": 1.6384955486934154e-07, "loss": 0.0046, "reward": 1.095703125, "reward_std": 0.43286868184804916, "rewards/correctness_reward_func": 0.6328125, "rewards/strict_format_reward_func": 0.462890625, "step": 638 }, { "completion_length": 318.66796875, "epoch": 0.743455497382199, "grad_norm": 0.7042290003173404, "kl": 0.07318115234375, "learning_rate": 1.624559992815009e-07, "loss": 0.0029, "reward": 1.099609375, "reward_std": 0.5046915486454964, "rewards/correctness_reward_func": 0.6484375, "rewards/strict_format_reward_func": 0.451171875, "step": 639 }, { "completion_length": 339.4921875, "epoch": 0.7446189645142525, "grad_norm": 1.7860632319946128, "kl": 0.11474609375, "learning_rate": 1.6106724478116317e-07, "loss": 0.0046, "reward": 0.767578125, "reward_std": 0.4360281303524971, "rewards/correctness_reward_func": 0.3203125, "rewards/strict_format_reward_func": 0.447265625, "step": 640 }, { "completion_length": 294.16796875, "epoch": 0.745782431646306, "grad_norm": 0.5060815898105797, "kl": 0.08966064453125, "learning_rate": 1.5968331112140315e-07, "loss": 0.0036, "reward": 1.173828125, "reward_std": 0.498068243265152, "rewards/correctness_reward_func": 0.71875, "rewards/strict_format_reward_func": 0.455078125, "step": 641 }, { "completion_length": 346.44140625, "epoch": 0.7469458987783595, "grad_norm": 0.9742006784046773, "kl": 0.1334228515625, "learning_rate": 1.5830421798672565e-07, "loss": 0.0053, "reward": 0.87109375, "reward_std": 0.39891765639185905, "rewards/correctness_reward_func": 0.4375, "rewards/strict_format_reward_func": 0.43359375, "step": 642 }, { "completion_length": 299.953125, "epoch": 0.748109365910413, "grad_norm": 0.560592389081217, "kl": 0.148193359375, "learning_rate": 1.5692998499278652e-07, "loss": 0.0059, "reward": 0.9453125, "reward_std": 0.3721243888139725, "rewards/correctness_reward_func": 0.46875, "rewards/strict_format_reward_func": 0.4765625, "step": 643 }, { "completion_length": 301.66015625, "epoch": 0.7492728330424665, "grad_norm": 0.43704512418253966, "kl": 0.0892333984375, "learning_rate": 1.555606316861124e-07, "loss": 0.0036, "reward": 1.060546875, "reward_std": 0.5021995007991791, "rewards/correctness_reward_func": 0.59375, "rewards/strict_format_reward_func": 0.466796875, "step": 644 }, { "completion_length": 280.95703125, "epoch": 0.7504363001745201, "grad_norm": 1.053480154093678, "kl": 0.107177734375, "learning_rate": 1.541961775438232e-07, "loss": 0.0043, "reward": 1.087890625, "reward_std": 0.4468654841184616, "rewards/correctness_reward_func": 0.6171875, "rewards/strict_format_reward_func": 0.470703125, "step": 645 }, { "completion_length": 336.94140625, "epoch": 0.7515997673065736, "grad_norm": 3.1055841682985514, "kl": 0.0780029296875, "learning_rate": 1.528366419733557e-07, "loss": 0.0031, "reward": 0.990234375, "reward_std": 0.5013555735349655, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.451171875, "step": 646 }, { "completion_length": 311.03515625, "epoch": 0.7527632344386271, "grad_norm": 3.2558612871110806, "kl": 0.0830078125, "learning_rate": 1.5148204431218636e-07, "loss": 0.0033, "reward": 0.927734375, "reward_std": 0.48823411017656326, "rewards/correctness_reward_func": 0.4765625, "rewards/strict_format_reward_func": 0.451171875, "step": 647 }, { "completion_length": 376.9453125, "epoch": 0.7539267015706806, "grad_norm": 4.1013093262531015, "kl": 0.0902099609375, "learning_rate": 1.501324038275571e-07, "loss": 0.0036, "reward": 0.94921875, "reward_std": 0.47036390006542206, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.43359375, "step": 648 }, { "completion_length": 340.015625, "epoch": 0.7550901687027342, "grad_norm": 8.906475110791467, "kl": 0.23681640625, "learning_rate": 1.4878773971620074e-07, "loss": 0.0095, "reward": 0.9453125, "reward_std": 0.49929626286029816, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.4453125, "step": 649 }, { "completion_length": 326.78125, "epoch": 0.7562536358347877, "grad_norm": 0.49502798143085436, "kl": 0.086181640625, "learning_rate": 1.474480711040687e-07, "loss": 0.0035, "reward": 1.046875, "reward_std": 0.4579222984611988, "rewards/correctness_reward_func": 0.5859375, "rewards/strict_format_reward_func": 0.4609375, "step": 650 }, { "completion_length": 328.9453125, "epoch": 0.7574171029668412, "grad_norm": 0.26055938653841965, "kl": 0.10498046875, "learning_rate": 1.4611341704605806e-07, "loss": 0.0042, "reward": 0.953125, "reward_std": 0.45946378260850906, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.453125, "step": 651 }, { "completion_length": 346.2890625, "epoch": 0.7585805700988947, "grad_norm": 3.5757009816685676, "kl": 0.14208984375, "learning_rate": 1.4478379652574108e-07, "loss": 0.0057, "reward": 0.833984375, "reward_std": 0.4789082631468773, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.427734375, "step": 652 }, { "completion_length": 364.3515625, "epoch": 0.7597440372309482, "grad_norm": 0.8753438593294685, "kl": 0.0909423828125, "learning_rate": 1.4345922845509473e-07, "loss": 0.0036, "reward": 1.001953125, "reward_std": 0.5316587388515472, "rewards/correctness_reward_func": 0.5625, "rewards/strict_format_reward_func": 0.439453125, "step": 653 }, { "completion_length": 332.546875, "epoch": 0.7609075043630017, "grad_norm": 1.8651879300770104, "kl": 0.082763671875, "learning_rate": 1.4213973167423278e-07, "loss": 0.0033, "reward": 0.921875, "reward_std": 0.3874565213918686, "rewards/correctness_reward_func": 0.4765625, "rewards/strict_format_reward_func": 0.4453125, "step": 654 }, { "completion_length": 271.69921875, "epoch": 0.7620709714950553, "grad_norm": 16.513312726463408, "kl": 0.13525390625, "learning_rate": 1.4082532495113624e-07, "loss": 0.0054, "reward": 1.212890625, "reward_std": 0.4803393520414829, "rewards/correctness_reward_func": 0.7578125, "rewards/strict_format_reward_func": 0.455078125, "step": 655 }, { "completion_length": 339.74609375, "epoch": 0.7632344386271088, "grad_norm": 5.027507764171796, "kl": 0.15966796875, "learning_rate": 1.395160269813877e-07, "loss": 0.0064, "reward": 0.869140625, "reward_std": 0.4287051036953926, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.439453125, "step": 656 }, { "completion_length": 335.28515625, "epoch": 0.7643979057591623, "grad_norm": 0.7368916006265884, "kl": 0.1114501953125, "learning_rate": 1.382118563879045e-07, "loss": 0.0045, "reward": 0.9140625, "reward_std": 0.4082162156701088, "rewards/correctness_reward_func": 0.46875, "rewards/strict_format_reward_func": 0.4453125, "step": 657 }, { "completion_length": 346.140625, "epoch": 0.7655613728912158, "grad_norm": 0.6263280408903034, "kl": 0.08221435546875, "learning_rate": 1.3691283172067493e-07, "loss": 0.0033, "reward": 0.955078125, "reward_std": 0.4840616434812546, "rewards/correctness_reward_func": 0.4921875, "rewards/strict_format_reward_func": 0.462890625, "step": 658 }, { "completion_length": 371.3515625, "epoch": 0.7667248400232693, "grad_norm": 0.7052583229225715, "kl": 0.1112060546875, "learning_rate": 1.3561897145649326e-07, "loss": 0.0045, "reward": 1.0390625, "reward_std": 0.4716608002781868, "rewards/correctness_reward_func": 0.609375, "rewards/strict_format_reward_func": 0.4296875, "step": 659 }, { "completion_length": 283.73828125, "epoch": 0.7678883071553229, "grad_norm": 0.4211524990286586, "kl": 0.0955810546875, "learning_rate": 1.3433029399869744e-07, "loss": 0.0038, "reward": 1.01171875, "reward_std": 0.4480849653482437, "rewards/correctness_reward_func": 0.5625, "rewards/strict_format_reward_func": 0.44921875, "step": 660 }, { "completion_length": 320.2421875, "epoch": 0.7690517742873764, "grad_norm": 1.4705374961776827, "kl": 0.096923828125, "learning_rate": 1.330468176769073e-07, "loss": 0.0039, "reward": 1.130859375, "reward_std": 0.5977482199668884, "rewards/correctness_reward_func": 0.6796875, "rewards/strict_format_reward_func": 0.451171875, "step": 661 }, { "completion_length": 275.89453125, "epoch": 0.7702152414194299, "grad_norm": 0.36663180819712987, "kl": 0.1063232421875, "learning_rate": 1.317685607467641e-07, "loss": 0.0043, "reward": 0.966796875, "reward_std": 0.3606426864862442, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.451171875, "step": 662 }, { "completion_length": 276.1015625, "epoch": 0.7713787085514834, "grad_norm": 0.3262882405920641, "kl": 0.0828857421875, "learning_rate": 1.304955413896705e-07, "loss": 0.0033, "reward": 0.998046875, "reward_std": 0.4411414861679077, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.443359375, "step": 663 }, { "completion_length": 307.92578125, "epoch": 0.7725421756835369, "grad_norm": 0.9835467078681713, "kl": 0.142822265625, "learning_rate": 1.292277777125319e-07, "loss": 0.0057, "reward": 0.859375, "reward_std": 0.41398511081933975, "rewards/correctness_reward_func": 0.4140625, "rewards/strict_format_reward_func": 0.4453125, "step": 664 }, { "completion_length": 362.9609375, "epoch": 0.7737056428155905, "grad_norm": 1.3375306215287002, "kl": 0.0924072265625, "learning_rate": 1.2796528774749898e-07, "loss": 0.0037, "reward": 0.990234375, "reward_std": 0.5508518218994141, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.435546875, "step": 665 }, { "completion_length": 266.6328125, "epoch": 0.774869109947644, "grad_norm": 2.9632310775237563, "kl": 0.106689453125, "learning_rate": 1.2670808945171185e-07, "loss": 0.0043, "reward": 1.10546875, "reward_std": 0.4171430915594101, "rewards/correctness_reward_func": 0.6484375, "rewards/strict_format_reward_func": 0.45703125, "step": 666 }, { "completion_length": 316.953125, "epoch": 0.7760325770796975, "grad_norm": 1.765688200213334, "kl": 0.110107421875, "learning_rate": 1.2545620070704354e-07, "loss": 0.0044, "reward": 0.904296875, "reward_std": 0.46835917979478836, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.451171875, "step": 667 }, { "completion_length": 290.91796875, "epoch": 0.777196044211751, "grad_norm": 2.398036603527806, "kl": 0.148193359375, "learning_rate": 1.2420963931984646e-07, "loss": 0.0059, "reward": 0.8046875, "reward_std": 0.30883756279945374, "rewards/correctness_reward_func": 0.34375, "rewards/strict_format_reward_func": 0.4609375, "step": 668 }, { "completion_length": 288.6796875, "epoch": 0.7783595113438045, "grad_norm": 1.0873235215529649, "kl": 0.099853515625, "learning_rate": 1.229684230206986e-07, "loss": 0.004, "reward": 1.029296875, "reward_std": 0.535124808549881, "rewards/correctness_reward_func": 0.5859375, "rewards/strict_format_reward_func": 0.443359375, "step": 669 }, { "completion_length": 279.6796875, "epoch": 0.779522978475858, "grad_norm": 1.2291158759137777, "kl": 0.07257080078125, "learning_rate": 1.217325694641521e-07, "loss": 0.0029, "reward": 0.94921875, "reward_std": 0.5025981962680817, "rewards/correctness_reward_func": 0.4921875, "rewards/strict_format_reward_func": 0.45703125, "step": 670 }, { "completion_length": 326.296875, "epoch": 0.7806864456079116, "grad_norm": 0.42250730499464445, "kl": 0.11083984375, "learning_rate": 1.2050209622848124e-07, "loss": 0.0044, "reward": 0.927734375, "reward_std": 0.47140881419181824, "rewards/correctness_reward_func": 0.4921875, "rewards/strict_format_reward_func": 0.435546875, "step": 671 }, { "completion_length": 344.53515625, "epoch": 0.7818499127399651, "grad_norm": 3.2929552230390624, "kl": 0.1175537109375, "learning_rate": 1.1927702081543278e-07, "loss": 0.0047, "reward": 0.97265625, "reward_std": 0.41038355976343155, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.45703125, "step": 672 }, { "completion_length": 294.3515625, "epoch": 0.7830133798720186, "grad_norm": 0.7148490027327697, "kl": 0.087890625, "learning_rate": 1.1805736064997746e-07, "loss": 0.0035, "reward": 0.94921875, "reward_std": 0.3301275111734867, "rewards/correctness_reward_func": 0.4765625, "rewards/strict_format_reward_func": 0.47265625, "step": 673 }, { "completion_length": 293.6640625, "epoch": 0.7841768470040721, "grad_norm": 3.1501685964126978, "kl": 0.1124267578125, "learning_rate": 1.1684313308006149e-07, "loss": 0.0045, "reward": 1.048828125, "reward_std": 0.433653861284256, "rewards/correctness_reward_func": 0.6015625, "rewards/strict_format_reward_func": 0.447265625, "step": 674 }, { "completion_length": 345.2890625, "epoch": 0.7853403141361257, "grad_norm": 0.7129799978477431, "kl": 0.1160888671875, "learning_rate": 1.1563435537635985e-07, "loss": 0.0046, "reward": 0.888671875, "reward_std": 0.40694061666727066, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.458984375, "step": 675 }, { "completion_length": 345.94140625, "epoch": 0.7865037812681792, "grad_norm": 1.8720998025074826, "kl": 0.14013671875, "learning_rate": 1.1443104473203147e-07, "loss": 0.0056, "reward": 0.9609375, "reward_std": 0.4482816830277443, "rewards/correctness_reward_func": 0.5234375, "rewards/strict_format_reward_func": 0.4375, "step": 676 }, { "completion_length": 338.8125, "epoch": 0.7876672484002327, "grad_norm": 1.0582175769547681, "kl": 0.1094970703125, "learning_rate": 1.1323321826247345e-07, "loss": 0.0044, "reward": 0.9296875, "reward_std": 0.4532797820866108, "rewards/correctness_reward_func": 0.4921875, "rewards/strict_format_reward_func": 0.4375, "step": 677 }, { "completion_length": 360.37890625, "epoch": 0.7888307155322862, "grad_norm": 7.77578895641955, "kl": 0.0755615234375, "learning_rate": 1.1204089300507846e-07, "loss": 0.003, "reward": 0.99609375, "reward_std": 0.49950528144836426, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.44140625, "step": 678 }, { "completion_length": 281.94921875, "epoch": 0.7899941826643397, "grad_norm": 0.697625893502953, "kl": 0.0919189453125, "learning_rate": 1.1085408591899248e-07, "loss": 0.0037, "reward": 0.9609375, "reward_std": 0.4070282056927681, "rewards/correctness_reward_func": 0.4921875, "rewards/strict_format_reward_func": 0.46875, "step": 679 }, { "completion_length": 333.80859375, "epoch": 0.7911576497963932, "grad_norm": 1.0947145502327276, "kl": 0.088134765625, "learning_rate": 1.0967281388487282e-07, "loss": 0.0035, "reward": 0.85546875, "reward_std": 0.4202311486005783, "rewards/correctness_reward_func": 0.40625, "rewards/strict_format_reward_func": 0.44921875, "step": 680 }, { "completion_length": 289.47265625, "epoch": 0.7923211169284468, "grad_norm": 0.8093704656931527, "kl": 0.109375, "learning_rate": 1.0849709370464871e-07, "loss": 0.0044, "reward": 1.0546875, "reward_std": 0.5030719637870789, "rewards/correctness_reward_func": 0.59375, "rewards/strict_format_reward_func": 0.4609375, "step": 681 }, { "completion_length": 301.5546875, "epoch": 0.7934845840605003, "grad_norm": 2.6595767829337893, "kl": 0.1297607421875, "learning_rate": 1.0732694210128191e-07, "loss": 0.0052, "reward": 0.998046875, "reward_std": 0.3529244214296341, "rewards/correctness_reward_func": 0.546875, "rewards/strict_format_reward_func": 0.451171875, "step": 682 }, { "completion_length": 299.05078125, "epoch": 0.7946480511925538, "grad_norm": 1.249228653006239, "kl": 0.0750732421875, "learning_rate": 1.0616237571852948e-07, "loss": 0.003, "reward": 1.068359375, "reward_std": 0.41234347969293594, "rewards/correctness_reward_func": 0.6171875, "rewards/strict_format_reward_func": 0.451171875, "step": 683 }, { "completion_length": 315.6953125, "epoch": 0.7958115183246073, "grad_norm": 0.25579611518952816, "kl": 0.0870361328125, "learning_rate": 1.0500341112070605e-07, "loss": 0.0035, "reward": 0.978515625, "reward_std": 0.4635913744568825, "rewards/correctness_reward_func": 0.5234375, "rewards/strict_format_reward_func": 0.455078125, "step": 684 }, { "completion_length": 325.08203125, "epoch": 0.7969749854566609, "grad_norm": 1.8638410587415064, "kl": 0.11279296875, "learning_rate": 1.0385006479244906e-07, "loss": 0.0045, "reward": 1.0625, "reward_std": 0.46428094804286957, "rewards/correctness_reward_func": 0.609375, "rewards/strict_format_reward_func": 0.453125, "step": 685 }, { "completion_length": 325.625, "epoch": 0.7981384525887144, "grad_norm": 2.703722660636293, "kl": 0.1903076171875, "learning_rate": 1.0270235313848374e-07, "loss": 0.0076, "reward": 1.12109375, "reward_std": 0.5834380388259888, "rewards/correctness_reward_func": 0.671875, "rewards/strict_format_reward_func": 0.44921875, "step": 686 }, { "completion_length": 266.0625, "epoch": 0.7993019197207679, "grad_norm": 0.41222787168089287, "kl": 0.090087890625, "learning_rate": 1.0156029248339054e-07, "loss": 0.0036, "reward": 1.015625, "reward_std": 0.4687533974647522, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.4609375, "step": 687 }, { "completion_length": 303.82421875, "epoch": 0.8004653868528214, "grad_norm": 3.717406938572235, "kl": 0.1275634765625, "learning_rate": 1.0042389907137183e-07, "loss": 0.0051, "reward": 1.02734375, "reward_std": 0.45849812030792236, "rewards/correctness_reward_func": 0.5859375, "rewards/strict_format_reward_func": 0.44140625, "step": 688 }, { "completion_length": 304.12890625, "epoch": 0.8016288539848749, "grad_norm": 0.6105175819115547, "kl": 0.068359375, "learning_rate": 9.929318906602174e-08, "loss": 0.0027, "reward": 1.076171875, "reward_std": 0.5080963373184204, "rewards/correctness_reward_func": 0.6171875, "rewards/strict_format_reward_func": 0.458984375, "step": 689 }, { "completion_length": 322.04296875, "epoch": 0.8027923211169284, "grad_norm": 0.9867521821593127, "kl": 0.1583251953125, "learning_rate": 9.816817855009573e-08, "loss": 0.0063, "reward": 0.869140625, "reward_std": 0.46228519082069397, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.439453125, "step": 690 }, { "completion_length": 331.29296875, "epoch": 0.803955788248982, "grad_norm": 1.6423234798489614, "kl": 0.08563232421875, "learning_rate": 9.704888352528257e-08, "loss": 0.0034, "reward": 1.080078125, "reward_std": 0.47453684359788895, "rewards/correctness_reward_func": 0.625, "rewards/strict_format_reward_func": 0.455078125, "step": 691 }, { "completion_length": 314.515625, "epoch": 0.8051192553810355, "grad_norm": 0.6384717410851777, "kl": 0.096923828125, "learning_rate": 9.593531991197568e-08, "loss": 0.0039, "reward": 1.052734375, "reward_std": 0.3256206400692463, "rewards/correctness_reward_func": 0.6015625, "rewards/strict_format_reward_func": 0.451171875, "step": 692 }, { "completion_length": 308.859375, "epoch": 0.806282722513089, "grad_norm": 2.840434685791333, "kl": 0.080078125, "learning_rate": 9.48275035490474e-08, "loss": 0.0032, "reward": 0.935546875, "reward_std": 0.42858435958623886, "rewards/correctness_reward_func": 0.484375, "rewards/strict_format_reward_func": 0.451171875, "step": 693 }, { "completion_length": 321.90234375, "epoch": 0.8074461896451425, "grad_norm": 5.392381755335814, "kl": 0.1326904296875, "learning_rate": 9.372545019362354e-08, "loss": 0.0053, "reward": 0.84765625, "reward_std": 0.42798008024692535, "rewards/correctness_reward_func": 0.3984375, "rewards/strict_format_reward_func": 0.44921875, "step": 694 }, { "completion_length": 342.15234375, "epoch": 0.8086096567771961, "grad_norm": 3.370405321318935, "kl": 0.107421875, "learning_rate": 9.262917552085947e-08, "loss": 0.0043, "reward": 0.9921875, "reward_std": 0.5245395228266716, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.4375, "step": 695 }, { "completion_length": 281.55859375, "epoch": 0.8097731239092496, "grad_norm": 0.327853147613894, "kl": 0.08380126953125, "learning_rate": 9.153869512371659e-08, "loss": 0.0033, "reward": 0.9140625, "reward_std": 0.4808673858642578, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.4609375, "step": 696 }, { "completion_length": 310.0234375, "epoch": 0.8109365910413031, "grad_norm": 2.1208171302961, "kl": 0.126708984375, "learning_rate": 9.04540245127412e-08, "loss": 0.0051, "reward": 1.111328125, "reward_std": 0.5462355315685272, "rewards/correctness_reward_func": 0.6484375, "rewards/strict_format_reward_func": 0.462890625, "step": 697 }, { "completion_length": 288.91015625, "epoch": 0.8121000581733566, "grad_norm": 1.9691836951593666, "kl": 0.16064453125, "learning_rate": 8.93751791158432e-08, "loss": 0.0064, "reward": 1.181640625, "reward_std": 0.48082733154296875, "rewards/correctness_reward_func": 0.7421875, "rewards/strict_format_reward_func": 0.439453125, "step": 698 }, { "completion_length": 324.34375, "epoch": 0.8132635253054101, "grad_norm": 0.45645499483372826, "kl": 0.080078125, "learning_rate": 8.830217427807762e-08, "loss": 0.0032, "reward": 0.99609375, "reward_std": 0.48218217492103577, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.45703125, "step": 699 }, { "completion_length": 280.734375, "epoch": 0.8144269924374636, "grad_norm": 0.4266688180649354, "kl": 0.0758056640625, "learning_rate": 8.723502526142523e-08, "loss": 0.003, "reward": 1.177734375, "reward_std": 0.4082382470369339, "rewards/correctness_reward_func": 0.7109375, "rewards/strict_format_reward_func": 0.466796875, "step": 700 }, { "completion_length": 257.08203125, "epoch": 0.8155904595695171, "grad_norm": 28.30214499447323, "kl": 0.3514404296875, "learning_rate": 8.617374724457627e-08, "loss": 0.0141, "reward": 1.158203125, "reward_std": 0.4557175636291504, "rewards/correctness_reward_func": 0.703125, "rewards/strict_format_reward_func": 0.455078125, "step": 701 }, { "completion_length": 303.35546875, "epoch": 0.8167539267015707, "grad_norm": 0.26557595770950765, "kl": 0.080810546875, "learning_rate": 8.511835532271411e-08, "loss": 0.0032, "reward": 1.013671875, "reward_std": 0.49593332409858704, "rewards/correctness_reward_func": 0.578125, "rewards/strict_format_reward_func": 0.435546875, "step": 702 }, { "completion_length": 315.0234375, "epoch": 0.8179173938336242, "grad_norm": 1.7400650028406235, "kl": 0.1025390625, "learning_rate": 8.4068864507301e-08, "loss": 0.0041, "reward": 0.91796875, "reward_std": 0.5626689791679382, "rewards/correctness_reward_func": 0.4765625, "rewards/strict_format_reward_func": 0.44140625, "step": 703 }, { "completion_length": 336.87109375, "epoch": 0.8190808609656777, "grad_norm": 0.3043329384080291, "kl": 0.1103515625, "learning_rate": 8.302528972586403e-08, "loss": 0.0044, "reward": 0.921875, "reward_std": 0.4533003941178322, "rewards/correctness_reward_func": 0.4765625, "rewards/strict_format_reward_func": 0.4453125, "step": 704 }, { "completion_length": 297.72265625, "epoch": 0.8202443280977313, "grad_norm": 0.8065211880785543, "kl": 0.0953369140625, "learning_rate": 8.198764582178303e-08, "loss": 0.0038, "reward": 0.919921875, "reward_std": 0.46915577352046967, "rewards/correctness_reward_func": 0.46875, "rewards/strict_format_reward_func": 0.451171875, "step": 705 }, { "completion_length": 323.21484375, "epoch": 0.8214077952297848, "grad_norm": 0.5905536885006575, "kl": 0.139404296875, "learning_rate": 8.09559475540797e-08, "loss": 0.0056, "reward": 0.935546875, "reward_std": 0.4087262898683548, "rewards/correctness_reward_func": 0.4921875, "rewards/strict_format_reward_func": 0.443359375, "step": 706 }, { "completion_length": 275.08203125, "epoch": 0.8225712623618383, "grad_norm": 3.8260015916045975, "kl": 0.1484375, "learning_rate": 7.99302095972072e-08, "loss": 0.0059, "reward": 1.0703125, "reward_std": 0.4963390380144119, "rewards/correctness_reward_func": 0.609375, "rewards/strict_format_reward_func": 0.4609375, "step": 707 }, { "completion_length": 320.15625, "epoch": 0.8237347294938918, "grad_norm": 0.6303959664470524, "kl": 0.11181640625, "learning_rate": 7.891044654084184e-08, "loss": 0.0045, "reward": 0.89453125, "reward_std": 0.45234227925539017, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.44921875, "step": 708 }, { "completion_length": 339.921875, "epoch": 0.8248981966259453, "grad_norm": 0.40495546146093053, "kl": 0.109375, "learning_rate": 7.789667288967511e-08, "loss": 0.0044, "reward": 0.99609375, "reward_std": 0.3786897286772728, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.44140625, "step": 709 }, { "completion_length": 374.1171875, "epoch": 0.8260616637579988, "grad_norm": 0.6441697806331564, "kl": 0.107666015625, "learning_rate": 7.688890306320817e-08, "loss": 0.0043, "reward": 1.04296875, "reward_std": 0.5016506500542164, "rewards/correctness_reward_func": 0.5859375, "rewards/strict_format_reward_func": 0.45703125, "step": 710 }, { "completion_length": 326.54296875, "epoch": 0.8272251308900523, "grad_norm": 0.4415553226023885, "kl": 0.09619140625, "learning_rate": 7.588715139554563e-08, "loss": 0.0038, "reward": 1.095703125, "reward_std": 0.5693574547767639, "rewards/correctness_reward_func": 0.640625, "rewards/strict_format_reward_func": 0.455078125, "step": 711 }, { "completion_length": 280.68359375, "epoch": 0.8283885980221058, "grad_norm": 8.127486496362426, "kl": 0.242431640625, "learning_rate": 7.4891432135193e-08, "loss": 0.0097, "reward": 1.103515625, "reward_std": 0.40982720255851746, "rewards/correctness_reward_func": 0.6484375, "rewards/strict_format_reward_func": 0.455078125, "step": 712 }, { "completion_length": 290.21875, "epoch": 0.8295520651541594, "grad_norm": 2.950862619907557, "kl": 0.1104736328125, "learning_rate": 7.390175944485238e-08, "loss": 0.0044, "reward": 0.94140625, "reward_std": 0.4587113782763481, "rewards/correctness_reward_func": 0.484375, "rewards/strict_format_reward_func": 0.45703125, "step": 713 }, { "completion_length": 323.05078125, "epoch": 0.8307155322862129, "grad_norm": 1.5632777457746123, "kl": 0.080810546875, "learning_rate": 7.291814740122281e-08, "loss": 0.0032, "reward": 0.89453125, "reward_std": 0.4608103632926941, "rewards/correctness_reward_func": 0.4375, "rewards/strict_format_reward_func": 0.45703125, "step": 714 }, { "completion_length": 332.94140625, "epoch": 0.8318789994182665, "grad_norm": 3.320664470856037, "kl": 0.0863037109375, "learning_rate": 7.194060999479857e-08, "loss": 0.0035, "reward": 1.099609375, "reward_std": 0.511575885117054, "rewards/correctness_reward_func": 0.640625, "rewards/strict_format_reward_func": 0.458984375, "step": 715 }, { "completion_length": 311.31640625, "epoch": 0.83304246655032, "grad_norm": 5.293111837879179, "kl": 0.15234375, "learning_rate": 7.096916112967133e-08, "loss": 0.0061, "reward": 1.025390625, "reward_std": 0.361906960606575, "rewards/correctness_reward_func": 0.5625, "rewards/strict_format_reward_func": 0.462890625, "step": 716 }, { "completion_length": 280.71484375, "epoch": 0.8342059336823735, "grad_norm": 1.0360968560766928, "kl": 0.0972900390625, "learning_rate": 7.00038146233311e-08, "loss": 0.0039, "reward": 1.07421875, "reward_std": 0.480039793998003, "rewards/correctness_reward_func": 0.609375, "rewards/strict_format_reward_func": 0.46484375, "step": 717 }, { "completion_length": 332.3125, "epoch": 0.835369400814427, "grad_norm": 0.5844366475064715, "kl": 0.08807373046875, "learning_rate": 6.90445842064713e-08, "loss": 0.0035, "reward": 0.9609375, "reward_std": 0.44934359192848206, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.4609375, "step": 718 }, { "completion_length": 289.05078125, "epoch": 0.8365328679464805, "grad_norm": 0.34857186915056404, "kl": 0.1053466796875, "learning_rate": 6.809148352279182e-08, "loss": 0.0042, "reward": 1.166015625, "reward_std": 0.49298807978630066, "rewards/correctness_reward_func": 0.71875, "rewards/strict_format_reward_func": 0.447265625, "step": 719 }, { "completion_length": 313.421875, "epoch": 0.837696335078534, "grad_norm": 0.6315366400903474, "kl": 0.0899658203125, "learning_rate": 6.71445261288065e-08, "loss": 0.0036, "reward": 0.8984375, "reward_std": 0.4385814629495144, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.453125, "step": 720 }, { "completion_length": 353.96875, "epoch": 0.8388598022105875, "grad_norm": 0.2580632232203998, "kl": 0.0948486328125, "learning_rate": 6.620372549364872e-08, "loss": 0.0038, "reward": 0.990234375, "reward_std": 0.3392200358211994, "rewards/correctness_reward_func": 0.546875, "rewards/strict_format_reward_func": 0.443359375, "step": 721 }, { "completion_length": 313.75, "epoch": 0.840023269342641, "grad_norm": 5.066910457326342, "kl": 0.1021728515625, "learning_rate": 6.526909499888139e-08, "loss": 0.0041, "reward": 1.16796875, "reward_std": 0.4771529585123062, "rewards/correctness_reward_func": 0.7109375, "rewards/strict_format_reward_func": 0.45703125, "step": 722 }, { "completion_length": 295.453125, "epoch": 0.8411867364746946, "grad_norm": 6.463956398500609, "kl": 0.1112060546875, "learning_rate": 6.43406479383053e-08, "loss": 0.0044, "reward": 0.9609375, "reward_std": 0.5526233166456223, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.453125, "step": 723 }, { "completion_length": 345.03515625, "epoch": 0.8423502036067481, "grad_norm": 0.35396694718783134, "kl": 0.0799560546875, "learning_rate": 6.341839751777117e-08, "loss": 0.0032, "reward": 0.9765625, "reward_std": 0.4868561327457428, "rewards/correctness_reward_func": 0.5234375, "rewards/strict_format_reward_func": 0.453125, "step": 724 }, { "completion_length": 253.2421875, "epoch": 0.8435136707388017, "grad_norm": 1.720914659266595, "kl": 0.08892822265625, "learning_rate": 6.250235685499062e-08, "loss": 0.0036, "reward": 1.10546875, "reward_std": 0.4650595337152481, "rewards/correctness_reward_func": 0.640625, "rewards/strict_format_reward_func": 0.46484375, "step": 725 }, { "completion_length": 280.65625, "epoch": 0.8446771378708552, "grad_norm": 0.8040310601487729, "kl": 0.0792236328125, "learning_rate": 6.159253897935068e-08, "loss": 0.0032, "reward": 1.076171875, "reward_std": 0.5561566427350044, "rewards/correctness_reward_func": 0.6171875, "rewards/strict_format_reward_func": 0.458984375, "step": 726 }, { "completion_length": 306.63671875, "epoch": 0.8458406050029087, "grad_norm": 3.3460268017897192, "kl": 0.2318115234375, "learning_rate": 6.068895683172786e-08, "loss": 0.0092, "reward": 0.962890625, "reward_std": 0.571891263127327, "rewards/correctness_reward_func": 0.53125, "rewards/strict_format_reward_func": 0.431640625, "step": 727 }, { "completion_length": 305.9609375, "epoch": 0.8470040721349622, "grad_norm": 1.0979061053244448, "kl": 0.06585693359375, "learning_rate": 5.979162326430437e-08, "loss": 0.0026, "reward": 0.958984375, "reward_std": 0.44130680710077286, "rewards/correctness_reward_func": 0.484375, "rewards/strict_format_reward_func": 0.474609375, "step": 728 }, { "completion_length": 336.41796875, "epoch": 0.8481675392670157, "grad_norm": 2.452848271998794, "kl": 0.0953369140625, "learning_rate": 5.890055104038477e-08, "loss": 0.0038, "reward": 0.98046875, "reward_std": 0.3961675837635994, "rewards/correctness_reward_func": 0.53125, "rewards/strict_format_reward_func": 0.44921875, "step": 729 }, { "completion_length": 310.54296875, "epoch": 0.8493310063990692, "grad_norm": 1.1891902352655686, "kl": 0.0860595703125, "learning_rate": 5.8015752834215336e-08, "loss": 0.0034, "reward": 1.107421875, "reward_std": 0.49162471294403076, "rewards/correctness_reward_func": 0.65625, "rewards/strict_format_reward_func": 0.451171875, "step": 730 }, { "completion_length": 282.9296875, "epoch": 0.8504944735311227, "grad_norm": 0.4261619617797313, "kl": 0.102783203125, "learning_rate": 5.713724123080294e-08, "loss": 0.0041, "reward": 1.056640625, "reward_std": 0.5883161574602127, "rewards/correctness_reward_func": 0.609375, "rewards/strict_format_reward_func": 0.447265625, "step": 731 }, { "completion_length": 341.33203125, "epoch": 0.8516579406631762, "grad_norm": 0.7932993323440082, "kl": 0.0875244140625, "learning_rate": 5.626502872573663e-08, "loss": 0.0035, "reward": 0.80859375, "reward_std": 0.3800119683146477, "rewards/correctness_reward_func": 0.3515625, "rewards/strict_format_reward_func": 0.45703125, "step": 732 }, { "completion_length": 305.37890625, "epoch": 0.8528214077952297, "grad_norm": 1.1124994725324575, "kl": 0.0927734375, "learning_rate": 5.539912772500943e-08, "loss": 0.0037, "reward": 1.017578125, "reward_std": 0.48575039207935333, "rewards/correctness_reward_func": 0.5703125, "rewards/strict_format_reward_func": 0.447265625, "step": 733 }, { "completion_length": 298.5703125, "epoch": 0.8539848749272833, "grad_norm": 0.25231637412930785, "kl": 0.0965576171875, "learning_rate": 5.453955054484238e-08, "loss": 0.0039, "reward": 0.8671875, "reward_std": 0.45256930589675903, "rewards/correctness_reward_func": 0.4140625, "rewards/strict_format_reward_func": 0.453125, "step": 734 }, { "completion_length": 343.88671875, "epoch": 0.8551483420593369, "grad_norm": 0.6528676517852243, "kl": 0.0869140625, "learning_rate": 5.36863094115087e-08, "loss": 0.0035, "reward": 1.06640625, "reward_std": 0.39060064405202866, "rewards/correctness_reward_func": 0.6171875, "rewards/strict_format_reward_func": 0.44921875, "step": 735 }, { "completion_length": 297.2578125, "epoch": 0.8563118091913904, "grad_norm": 2.078124720721266, "kl": 0.11865234375, "learning_rate": 5.283941646116074e-08, "loss": 0.0047, "reward": 1.02734375, "reward_std": 0.4644080102443695, "rewards/correctness_reward_func": 0.5859375, "rewards/strict_format_reward_func": 0.44140625, "step": 736 }, { "completion_length": 322.49609375, "epoch": 0.8574752763234439, "grad_norm": 11.893826587265929, "kl": 0.116455078125, "learning_rate": 5.1998883739656606e-08, "loss": 0.0047, "reward": 0.916015625, "reward_std": 0.4371761083602905, "rewards/correctness_reward_func": 0.46875, "rewards/strict_format_reward_func": 0.447265625, "step": 737 }, { "completion_length": 335.6484375, "epoch": 0.8586387434554974, "grad_norm": 2.4751856892074144, "kl": 0.10418701171875, "learning_rate": 5.116472320238907e-08, "loss": 0.0042, "reward": 0.970703125, "reward_std": 0.493367999792099, "rewards/correctness_reward_func": 0.53125, "rewards/strict_format_reward_func": 0.439453125, "step": 738 }, { "completion_length": 313.74609375, "epoch": 0.8598022105875509, "grad_norm": 1.350400557147364, "kl": 0.075439453125, "learning_rate": 5.033694671411593e-08, "loss": 0.003, "reward": 0.9609375, "reward_std": 0.5704307109117508, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.453125, "step": 739 }, { "completion_length": 314.50390625, "epoch": 0.8609656777196044, "grad_norm": 1.3991219788106766, "kl": 0.1253662109375, "learning_rate": 4.951556604879048e-08, "loss": 0.005, "reward": 0.859375, "reward_std": 0.47352631390094757, "rewards/correctness_reward_func": 0.4375, "rewards/strict_format_reward_func": 0.421875, "step": 740 }, { "completion_length": 335.8046875, "epoch": 0.8621291448516579, "grad_norm": 2.1410051843625295, "kl": 0.0999755859375, "learning_rate": 4.8700592889394596e-08, "loss": 0.004, "reward": 0.99609375, "reward_std": 0.5642721131443977, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.44140625, "step": 741 }, { "completion_length": 320.02734375, "epoch": 0.8632926119837114, "grad_norm": 2.6626885962056748, "kl": 0.212158203125, "learning_rate": 4.789203882777237e-08, "loss": 0.0085, "reward": 1.103515625, "reward_std": 0.405231025069952, "rewards/correctness_reward_func": 0.6328125, "rewards/strict_format_reward_func": 0.470703125, "step": 742 }, { "completion_length": 300.75390625, "epoch": 0.8644560791157649, "grad_norm": 1.6019712862162383, "kl": 0.12744140625, "learning_rate": 4.708991536446555e-08, "loss": 0.0051, "reward": 0.89453125, "reward_std": 0.49361371994018555, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.44140625, "step": 743 }, { "completion_length": 333.66015625, "epoch": 0.8656195462478184, "grad_norm": 11.05434298813199, "kl": 0.1661376953125, "learning_rate": 4.6294233908549306e-08, "loss": 0.0066, "reward": 0.9296875, "reward_std": 0.444530226290226, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.46875, "step": 744 }, { "completion_length": 314.4609375, "epoch": 0.8667830133798721, "grad_norm": 0.200254903360761, "kl": 0.0771484375, "learning_rate": 4.550500577747057e-08, "loss": 0.0031, "reward": 1.02734375, "reward_std": 0.4406590908765793, "rewards/correctness_reward_func": 0.5703125, "rewards/strict_format_reward_func": 0.45703125, "step": 745 }, { "completion_length": 349.234375, "epoch": 0.8679464805119256, "grad_norm": 0.6722179013466333, "kl": 0.1138916015625, "learning_rate": 4.4722242196886693e-08, "loss": 0.0046, "reward": 0.861328125, "reward_std": 0.42282892763614655, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.431640625, "step": 746 }, { "completion_length": 357.9765625, "epoch": 0.8691099476439791, "grad_norm": 0.5534603772985556, "kl": 0.1192626953125, "learning_rate": 4.394595430050613e-08, "loss": 0.0048, "reward": 1.0703125, "reward_std": 0.517595648765564, "rewards/correctness_reward_func": 0.625, "rewards/strict_format_reward_func": 0.4453125, "step": 747 }, { "completion_length": 301.49609375, "epoch": 0.8702734147760326, "grad_norm": 0.40750935389574305, "kl": 0.0892333984375, "learning_rate": 4.3176153129929615e-08, "loss": 0.0036, "reward": 0.833984375, "reward_std": 0.4179387725889683, "rewards/correctness_reward_func": 0.375, "rewards/strict_format_reward_func": 0.458984375, "step": 748 }, { "completion_length": 355.2890625, "epoch": 0.8714368819080861, "grad_norm": 5.405343726453393, "kl": 0.1156005859375, "learning_rate": 4.241284963449343e-08, "loss": 0.0046, "reward": 0.986328125, "reward_std": 0.5475742071866989, "rewards/correctness_reward_func": 0.546875, "rewards/strict_format_reward_func": 0.439453125, "step": 749 }, { "completion_length": 354.4453125, "epoch": 0.8726003490401396, "grad_norm": 1.6773686896097542, "kl": 0.09234619140625, "learning_rate": 4.165605467111355e-08, "loss": 0.0037, "reward": 0.951171875, "reward_std": 0.4112582951784134, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.451171875, "step": 750 }, { "completion_length": 299.546875, "epoch": 0.8737638161721931, "grad_norm": 7.307747916354028, "kl": 0.1666259765625, "learning_rate": 4.0905779004131504e-08, "loss": 0.0066, "reward": 1.11328125, "reward_std": 0.473948173224926, "rewards/correctness_reward_func": 0.671875, "rewards/strict_format_reward_func": 0.44140625, "step": 751 }, { "completion_length": 330.28515625, "epoch": 0.8749272833042466, "grad_norm": 0.374855722255496, "kl": 0.0888671875, "learning_rate": 4.016203330516071e-08, "loss": 0.0036, "reward": 0.88671875, "reward_std": 0.4228622391819954, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.44140625, "step": 752 }, { "completion_length": 312.19921875, "epoch": 0.8760907504363001, "grad_norm": 0.8344907172041108, "kl": 0.0986328125, "learning_rate": 3.942482815293513e-08, "loss": 0.0039, "reward": 1.12109375, "reward_std": 0.42237984389066696, "rewards/correctness_reward_func": 0.6484375, "rewards/strict_format_reward_func": 0.47265625, "step": 753 }, { "completion_length": 280.203125, "epoch": 0.8772542175683536, "grad_norm": 9.206175773458755, "kl": 0.0928955078125, "learning_rate": 3.869417403315855e-08, "loss": 0.0037, "reward": 1.248046875, "reward_std": 0.5450355410575867, "rewards/correctness_reward_func": 0.78125, "rewards/strict_format_reward_func": 0.466796875, "step": 754 }, { "completion_length": 294.87890625, "epoch": 0.8784176847004073, "grad_norm": 21.046343650979672, "kl": 0.1541748046875, "learning_rate": 3.7970081338355885e-08, "loss": 0.0062, "reward": 0.95703125, "reward_std": 0.5401248037815094, "rewards/correctness_reward_func": 0.4921875, "rewards/strict_format_reward_func": 0.46484375, "step": 755 }, { "completion_length": 367.2734375, "epoch": 0.8795811518324608, "grad_norm": 2.577021600096217, "kl": 0.1171875, "learning_rate": 3.725256036772473e-08, "loss": 0.0047, "reward": 0.9140625, "reward_std": 0.49222858995199203, "rewards/correctness_reward_func": 0.46875, "rewards/strict_format_reward_func": 0.4453125, "step": 756 }, { "completion_length": 329.3984375, "epoch": 0.8807446189645143, "grad_norm": 6.0082271370174425, "kl": 0.137939453125, "learning_rate": 3.654162132698918e-08, "loss": 0.0055, "reward": 1.001953125, "reward_std": 0.4484364651143551, "rewards/correctness_reward_func": 0.546875, "rewards/strict_format_reward_func": 0.455078125, "step": 757 }, { "completion_length": 308.328125, "epoch": 0.8819080860965678, "grad_norm": 1.612917780165326, "kl": 0.08984375, "learning_rate": 3.583727432825473e-08, "loss": 0.0036, "reward": 1.076171875, "reward_std": 0.5463234260678291, "rewards/correctness_reward_func": 0.625, "rewards/strict_format_reward_func": 0.451171875, "step": 758 }, { "completion_length": 297.26171875, "epoch": 0.8830715532286213, "grad_norm": 3.1391761751104337, "kl": 0.1534423828125, "learning_rate": 3.513952938986448e-08, "loss": 0.0061, "reward": 1.025390625, "reward_std": 0.5894178003072739, "rewards/correctness_reward_func": 0.5703125, "rewards/strict_format_reward_func": 0.455078125, "step": 759 }, { "completion_length": 302.56640625, "epoch": 0.8842350203606748, "grad_norm": 44.33221085913618, "kl": 2.6070556640625, "learning_rate": 3.444839643625636e-08, "loss": 0.1041, "reward": 1.046875, "reward_std": 0.4296695441007614, "rewards/correctness_reward_func": 0.578125, "rewards/strict_format_reward_func": 0.46875, "step": 760 }, { "completion_length": 321.64453125, "epoch": 0.8853984874927283, "grad_norm": 1.7418227645501663, "kl": 0.11474609375, "learning_rate": 3.376388529782215e-08, "loss": 0.0046, "reward": 1.01953125, "reward_std": 0.4638404920697212, "rewards/correctness_reward_func": 0.578125, "rewards/strict_format_reward_func": 0.44140625, "step": 761 }, { "completion_length": 317.93359375, "epoch": 0.8865619546247818, "grad_norm": 6.1131335301365315, "kl": 0.352783203125, "learning_rate": 3.3086005710767694e-08, "loss": 0.0142, "reward": 0.953125, "reward_std": 0.5007273182272911, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.453125, "step": 762 }, { "completion_length": 356.39453125, "epoch": 0.8877254217568353, "grad_norm": 2.546735935381879, "kl": 0.1004638671875, "learning_rate": 3.241476731697451e-08, "loss": 0.004, "reward": 0.904296875, "reward_std": 0.40658577159047127, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.451171875, "step": 763 }, { "completion_length": 308.6171875, "epoch": 0.8888888888888888, "grad_norm": 0.3950286828423433, "kl": 0.10546875, "learning_rate": 3.175017966386223e-08, "loss": 0.0042, "reward": 0.98046875, "reward_std": 0.4716772064566612, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.44140625, "step": 764 }, { "completion_length": 319.17578125, "epoch": 0.8900523560209425, "grad_norm": 1.3450664806279955, "kl": 0.0870361328125, "learning_rate": 3.1092252204253153e-08, "loss": 0.0035, "reward": 0.951171875, "reward_std": 0.5447573810815811, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.451171875, "step": 765 }, { "completion_length": 314.3828125, "epoch": 0.891215823152996, "grad_norm": 1.5076674508753278, "kl": 0.0819091796875, "learning_rate": 3.0440994296237974e-08, "loss": 0.0033, "reward": 0.939453125, "reward_std": 0.43087102472782135, "rewards/correctness_reward_func": 0.484375, "rewards/strict_format_reward_func": 0.455078125, "step": 766 }, { "completion_length": 309.87109375, "epoch": 0.8923792902850495, "grad_norm": 0.9817020569434545, "kl": 0.1048583984375, "learning_rate": 2.9796415203042103e-08, "loss": 0.0042, "reward": 0.98046875, "reward_std": 0.47955894470214844, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.42578125, "step": 767 }, { "completion_length": 281.40234375, "epoch": 0.893542757417103, "grad_norm": 1.6227598787655342, "kl": 0.06982421875, "learning_rate": 2.915852409289421e-08, "loss": 0.0028, "reward": 1.0078125, "reward_std": 0.5688776820898056, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.453125, "step": 768 }, { "completion_length": 375.80078125, "epoch": 0.8947062245491565, "grad_norm": 3.8992303684541776, "kl": 0.14208984375, "learning_rate": 2.8527330038896237e-08, "loss": 0.0057, "reward": 0.98046875, "reward_std": 0.3626428246498108, "rewards/correctness_reward_func": 0.546875, "rewards/strict_format_reward_func": 0.43359375, "step": 769 }, { "completion_length": 296.828125, "epoch": 0.89586969168121, "grad_norm": 3.633956322953812, "kl": 0.139404296875, "learning_rate": 2.7902842018893534e-08, "loss": 0.0056, "reward": 1.0, "reward_std": 0.4490908682346344, "rewards/correctness_reward_func": 0.5625, "rewards/strict_format_reward_func": 0.4375, "step": 770 }, { "completion_length": 316.265625, "epoch": 0.8970331588132635, "grad_norm": 1.039970664086816, "kl": 0.1064453125, "learning_rate": 2.7285068915347598e-08, "loss": 0.0043, "reward": 0.966796875, "reward_std": 0.43056702613830566, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.451171875, "step": 771 }, { "completion_length": 294.74609375, "epoch": 0.898196625945317, "grad_norm": 0.30870362255404016, "kl": 0.104736328125, "learning_rate": 2.6674019515210034e-08, "loss": 0.0042, "reward": 0.912109375, "reward_std": 0.5002991110086441, "rewards/correctness_reward_func": 0.453125, "rewards/strict_format_reward_func": 0.458984375, "step": 772 }, { "completion_length": 289.14453125, "epoch": 0.8993600930773705, "grad_norm": 0.44174311901198593, "kl": 0.0859375, "learning_rate": 2.6069702509796952e-08, "loss": 0.0034, "reward": 1.25390625, "reward_std": 0.47460825741291046, "rewards/correctness_reward_func": 0.7890625, "rewards/strict_format_reward_func": 0.46484375, "step": 773 }, { "completion_length": 294.04296875, "epoch": 0.900523560209424, "grad_norm": 0.5327479277931182, "kl": 0.1046142578125, "learning_rate": 2.547212649466568e-08, "loss": 0.0042, "reward": 1.00390625, "reward_std": 0.4712277874350548, "rewards/correctness_reward_func": 0.5625, "rewards/strict_format_reward_func": 0.44140625, "step": 774 }, { "completion_length": 333.42578125, "epoch": 0.9016870273414777, "grad_norm": 0.7613531932432054, "kl": 0.083984375, "learning_rate": 2.488129996949251e-08, "loss": 0.0034, "reward": 0.935546875, "reward_std": 0.4523793235421181, "rewards/correctness_reward_func": 0.4921875, "rewards/strict_format_reward_func": 0.443359375, "step": 775 }, { "completion_length": 342.44921875, "epoch": 0.9028504944735312, "grad_norm": 0.3680278201046508, "kl": 0.123291015625, "learning_rate": 2.429723133795175e-08, "loss": 0.0049, "reward": 0.857421875, "reward_std": 0.3546883091330528, "rewards/correctness_reward_func": 0.4140625, "rewards/strict_format_reward_func": 0.443359375, "step": 776 }, { "completion_length": 325.8203125, "epoch": 0.9040139616055847, "grad_norm": 0.5706269535044037, "kl": 0.085205078125, "learning_rate": 2.3719928907596276e-08, "loss": 0.0034, "reward": 1.154296875, "reward_std": 0.44932079315185547, "rewards/correctness_reward_func": 0.703125, "rewards/strict_format_reward_func": 0.451171875, "step": 777 }, { "completion_length": 322.69140625, "epoch": 0.9051774287376382, "grad_norm": 1.610796337798366, "kl": 0.085205078125, "learning_rate": 2.3149400889739157e-08, "loss": 0.0034, "reward": 0.939453125, "reward_std": 0.44330472499132156, "rewards/correctness_reward_func": 0.484375, "rewards/strict_format_reward_func": 0.455078125, "step": 778 }, { "completion_length": 274.9375, "epoch": 0.9063408958696917, "grad_norm": 0.5466868124416662, "kl": 0.100341796875, "learning_rate": 2.258565539933699e-08, "loss": 0.004, "reward": 0.931640625, "reward_std": 0.4815060645341873, "rewards/correctness_reward_func": 0.4921875, "rewards/strict_format_reward_func": 0.439453125, "step": 779 }, { "completion_length": 353.9453125, "epoch": 0.9075043630017452, "grad_norm": 1.4054566776146915, "kl": 0.11865234375, "learning_rate": 2.2028700454874648e-08, "loss": 0.0047, "reward": 0.974609375, "reward_std": 0.4319823980331421, "rewards/correctness_reward_func": 0.53125, "rewards/strict_format_reward_func": 0.443359375, "step": 780 }, { "completion_length": 308.98828125, "epoch": 0.9086678301337987, "grad_norm": 4.097130619615469, "kl": 0.10107421875, "learning_rate": 2.147854397825094e-08, "loss": 0.004, "reward": 0.96875, "reward_std": 0.49286097288131714, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.453125, "step": 781 }, { "completion_length": 332.69921875, "epoch": 0.9098312972658522, "grad_norm": 0.7811118999895404, "kl": 0.10870361328125, "learning_rate": 2.0935193794666016e-08, "loss": 0.0044, "reward": 0.896484375, "reward_std": 0.38460446894168854, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.451171875, "step": 782 }, { "completion_length": 299.0390625, "epoch": 0.9109947643979057, "grad_norm": 7.329773165641476, "kl": 0.152099609375, "learning_rate": 2.039865763251014e-08, "loss": 0.0061, "reward": 1.056640625, "reward_std": 0.541579507291317, "rewards/correctness_reward_func": 0.6015625, "rewards/strict_format_reward_func": 0.455078125, "step": 783 }, { "completion_length": 312.19140625, "epoch": 0.9121582315299592, "grad_norm": 1.189209135177591, "kl": 0.0908203125, "learning_rate": 1.9868943123253878e-08, "loss": 0.0036, "reward": 1.001953125, "reward_std": 0.41858238726854324, "rewards/correctness_reward_func": 0.546875, "rewards/strict_format_reward_func": 0.455078125, "step": 784 }, { "completion_length": 343.62890625, "epoch": 0.9133216986620128, "grad_norm": 0.32503030285304224, "kl": 0.07855224609375, "learning_rate": 1.9346057801339187e-08, "loss": 0.0031, "reward": 1.076171875, "reward_std": 0.4651110917329788, "rewards/correctness_reward_func": 0.625, "rewards/strict_format_reward_func": 0.451171875, "step": 785 }, { "completion_length": 322.51953125, "epoch": 0.9144851657940664, "grad_norm": 94.53499508658099, "kl": 0.373779296875, "learning_rate": 1.8830009104072563e-08, "loss": 0.015, "reward": 0.8203125, "reward_std": 0.3773195669054985, "rewards/correctness_reward_func": 0.375, "rewards/strict_format_reward_func": 0.4453125, "step": 786 }, { "completion_length": 316.1640625, "epoch": 0.9156486329261199, "grad_norm": 0.20449377133176783, "kl": 0.0986328125, "learning_rate": 1.8320804371519116e-08, "loss": 0.004, "reward": 1.060546875, "reward_std": 0.47561784088611603, "rewards/correctness_reward_func": 0.6015625, "rewards/strict_format_reward_func": 0.458984375, "step": 787 }, { "completion_length": 321.12109375, "epoch": 0.9168121000581734, "grad_norm": 3.727879450572197, "kl": 0.134033203125, "learning_rate": 1.7818450846398327e-08, "loss": 0.0054, "reward": 0.9453125, "reward_std": 0.4038724824786186, "rewards/correctness_reward_func": 0.4921875, "rewards/strict_format_reward_func": 0.453125, "step": 788 }, { "completion_length": 326.94921875, "epoch": 0.9179755671902269, "grad_norm": 1.5893641904788973, "kl": 0.103515625, "learning_rate": 1.7322955673980676e-08, "loss": 0.0041, "reward": 1.015625, "reward_std": 0.46848295256495476, "rewards/correctness_reward_func": 0.5625, "rewards/strict_format_reward_func": 0.453125, "step": 789 }, { "completion_length": 312.203125, "epoch": 0.9191390343222804, "grad_norm": 17.644077602529478, "kl": 0.32666015625, "learning_rate": 1.6834325901986524e-08, "loss": 0.0131, "reward": 0.943359375, "reward_std": 0.5718882977962494, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.443359375, "step": 790 }, { "completion_length": 291.92578125, "epoch": 0.9203025014543339, "grad_norm": 0.5015237641165674, "kl": 0.10546875, "learning_rate": 1.6352568480485275e-08, "loss": 0.0042, "reward": 1.1171875, "reward_std": 0.5393026173114777, "rewards/correctness_reward_func": 0.65625, "rewards/strict_format_reward_func": 0.4609375, "step": 791 }, { "completion_length": 349.2421875, "epoch": 0.9214659685863874, "grad_norm": 10.061149571424313, "kl": 0.1209716796875, "learning_rate": 1.587769026179725e-08, "loss": 0.0048, "reward": 0.939453125, "reward_std": 0.4830440953373909, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.439453125, "step": 792 }, { "completion_length": 312.56640625, "epoch": 0.9226294357184409, "grad_norm": 3.089511787294171, "kl": 0.0810546875, "learning_rate": 1.5409698000395376e-08, "loss": 0.0032, "reward": 1.037109375, "reward_std": 0.43310272693634033, "rewards/correctness_reward_func": 0.5859375, "rewards/strict_format_reward_func": 0.451171875, "step": 793 }, { "completion_length": 319.2265625, "epoch": 0.9237929028504944, "grad_norm": 66.36461632851767, "kl": 0.8316650390625, "learning_rate": 1.494859835280987e-08, "loss": 0.0334, "reward": 0.890625, "reward_std": 0.3705134242773056, "rewards/correctness_reward_func": 0.4296875, "rewards/strict_format_reward_func": 0.4609375, "step": 794 }, { "completion_length": 355.99609375, "epoch": 0.924956369982548, "grad_norm": 0.5948963937101532, "kl": 0.0926513671875, "learning_rate": 1.4494397877532982e-08, "loss": 0.0037, "reward": 1.048828125, "reward_std": 0.4456535130739212, "rewards/correctness_reward_func": 0.6015625, "rewards/strict_format_reward_func": 0.447265625, "step": 795 }, { "completion_length": 346.75390625, "epoch": 0.9261198371146016, "grad_norm": 0.51242722399878, "kl": 0.0994873046875, "learning_rate": 1.4047103034926177e-08, "loss": 0.004, "reward": 1.052734375, "reward_std": 0.5369174778461456, "rewards/correctness_reward_func": 0.6015625, "rewards/strict_format_reward_func": 0.451171875, "step": 796 }, { "completion_length": 275.75, "epoch": 0.9272833042466551, "grad_norm": 2.8170705524045068, "kl": 0.1031494140625, "learning_rate": 1.3606720187127995e-08, "loss": 0.0041, "reward": 0.955078125, "reward_std": 0.42147450894117355, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.447265625, "step": 797 }, { "completion_length": 296.89453125, "epoch": 0.9284467713787086, "grad_norm": 1.022594243931321, "kl": 0.0909423828125, "learning_rate": 1.3173255597963396e-08, "loss": 0.0036, "reward": 0.95703125, "reward_std": 0.46783339232206345, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.44921875, "step": 798 }, { "completion_length": 337.546875, "epoch": 0.9296102385107621, "grad_norm": 1.0724052376516462, "kl": 0.1689453125, "learning_rate": 1.274671543285516e-08, "loss": 0.0067, "reward": 0.98046875, "reward_std": 0.44384726881980896, "rewards/correctness_reward_func": 0.5546875, "rewards/strict_format_reward_func": 0.42578125, "step": 799 }, { "completion_length": 331.92578125, "epoch": 0.9307737056428156, "grad_norm": 0.42480828510847096, "kl": 0.1151123046875, "learning_rate": 1.2327105758735689e-08, "loss": 0.0046, "reward": 1.052734375, "reward_std": 0.45338931679725647, "rewards/correctness_reward_func": 0.609375, "rewards/strict_format_reward_func": 0.443359375, "step": 800 }, { "completion_length": 356.15625, "epoch": 0.9319371727748691, "grad_norm": 3.271278175585602, "kl": 0.115966796875, "learning_rate": 1.1914432543961017e-08, "loss": 0.0046, "reward": 0.8515625, "reward_std": 0.38915014266967773, "rewards/correctness_reward_func": 0.421875, "rewards/strict_format_reward_func": 0.4296875, "step": 801 }, { "completion_length": 305.97265625, "epoch": 0.9331006399069226, "grad_norm": 1.6198963519504508, "kl": 0.0892333984375, "learning_rate": 1.1508701658225706e-08, "loss": 0.0036, "reward": 1.0546875, "reward_std": 0.3567136153578758, "rewards/correctness_reward_func": 0.59375, "rewards/strict_format_reward_func": 0.4609375, "step": 802 }, { "completion_length": 338.84375, "epoch": 0.9342641070389761, "grad_norm": 0.9102164740859863, "kl": 0.1387939453125, "learning_rate": 1.110991887247964e-08, "loss": 0.0056, "reward": 1.08203125, "reward_std": 0.4604636877775192, "rewards/correctness_reward_func": 0.6328125, "rewards/strict_format_reward_func": 0.44921875, "step": 803 }, { "completion_length": 349.2578125, "epoch": 0.9354275741710296, "grad_norm": 0.7148740763136451, "kl": 0.11474609375, "learning_rate": 1.0718089858845592e-08, "loss": 0.0046, "reward": 1.064453125, "reward_std": 0.44394874572753906, "rewards/correctness_reward_func": 0.625, "rewards/strict_format_reward_func": 0.439453125, "step": 804 }, { "completion_length": 376.60546875, "epoch": 0.9365910413030832, "grad_norm": 3.657170143455964, "kl": 0.1202392578125, "learning_rate": 1.033322019053906e-08, "loss": 0.0048, "reward": 0.890625, "reward_std": 0.48049451410770416, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.4296875, "step": 805 }, { "completion_length": 352.61328125, "epoch": 0.9377545084351367, "grad_norm": 0.6944334620186885, "kl": 0.0963134765625, "learning_rate": 9.955315341788228e-09, "loss": 0.0039, "reward": 0.900390625, "reward_std": 0.4807383790612221, "rewards/correctness_reward_func": 0.46875, "rewards/strict_format_reward_func": 0.431640625, "step": 806 }, { "completion_length": 344.56640625, "epoch": 0.9389179755671903, "grad_norm": 5.452061983557803, "kl": 0.0859375, "learning_rate": 9.5843806877568e-09, "loss": 0.0034, "reward": 0.873046875, "reward_std": 0.399037629365921, "rewards/correctness_reward_func": 0.421875, "rewards/strict_format_reward_func": 0.451171875, "step": 807 }, { "completion_length": 293.71484375, "epoch": 0.9400814426992438, "grad_norm": 4.200974113081239, "kl": 0.0762939453125, "learning_rate": 9.22042150446728e-09, "loss": 0.0031, "reward": 1.087890625, "reward_std": 0.40926627814769745, "rewards/correctness_reward_func": 0.625, "rewards/strict_format_reward_func": 0.462890625, "step": 808 }, { "completion_length": 313.515625, "epoch": 0.9412449098312973, "grad_norm": 16.74515544148267, "kl": 0.1064453125, "learning_rate": 8.863442968725987e-09, "loss": 0.0043, "reward": 1.078125, "reward_std": 0.5370510369539261, "rewards/correctness_reward_func": 0.625, "rewards/strict_format_reward_func": 0.453125, "step": 809 }, { "completion_length": 332.73046875, "epoch": 0.9424083769633508, "grad_norm": 2.14909621684026, "kl": 0.0994873046875, "learning_rate": 8.513450158049106e-09, "loss": 0.004, "reward": 0.90625, "reward_std": 0.4492138475179672, "rewards/correctness_reward_func": 0.4609375, "rewards/strict_format_reward_func": 0.4453125, "step": 810 }, { "completion_length": 276.33984375, "epoch": 0.9435718440954043, "grad_norm": 4.446359842149603, "kl": 0.1064453125, "learning_rate": 8.170448050591028e-09, "loss": 0.0043, "reward": 0.96875, "reward_std": 0.42111363261938095, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.453125, "step": 811 }, { "completion_length": 309.35546875, "epoch": 0.9447353112274578, "grad_norm": 0.7068831668668034, "kl": 0.102294921875, "learning_rate": 7.834441525073066e-09, "loss": 0.0041, "reward": 1.00390625, "reward_std": 0.3807905316352844, "rewards/correctness_reward_func": 0.546875, "rewards/strict_format_reward_func": 0.45703125, "step": 812 }, { "completion_length": 284.41015625, "epoch": 0.9458987783595113, "grad_norm": 0.5865120603463342, "kl": 0.125, "learning_rate": 7.505435360714407e-09, "loss": 0.005, "reward": 1.16015625, "reward_std": 0.42700520157814026, "rewards/correctness_reward_func": 0.6953125, "rewards/strict_format_reward_func": 0.46484375, "step": 813 }, { "completion_length": 299.578125, "epoch": 0.9470622454915648, "grad_norm": 0.9394163129924192, "kl": 0.1270751953125, "learning_rate": 7.183434237163721e-09, "loss": 0.0051, "reward": 0.98828125, "reward_std": 0.3410123586654663, "rewards/correctness_reward_func": 0.53125, "rewards/strict_format_reward_func": 0.45703125, "step": 814 }, { "completion_length": 344.43359375, "epoch": 0.9482257126236184, "grad_norm": 2.12684418332858, "kl": 0.146728515625, "learning_rate": 6.8684427344331e-09, "loss": 0.0059, "reward": 1.068359375, "reward_std": 0.5005639493465424, "rewards/correctness_reward_func": 0.6171875, "rewards/strict_format_reward_func": 0.451171875, "step": 815 }, { "completion_length": 305.5546875, "epoch": 0.9493891797556719, "grad_norm": 1.8330996975833365, "kl": 0.1220703125, "learning_rate": 6.560465332832499e-09, "loss": 0.0049, "reward": 1.09765625, "reward_std": 0.43683435022830963, "rewards/correctness_reward_func": 0.640625, "rewards/strict_format_reward_func": 0.45703125, "step": 816 }, { "completion_length": 312.50390625, "epoch": 0.9505526468877254, "grad_norm": 0.45058690158985737, "kl": 0.1502685546875, "learning_rate": 6.259506412906402e-09, "loss": 0.006, "reward": 0.974609375, "reward_std": 0.4659872204065323, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.435546875, "step": 817 }, { "completion_length": 266.9296875, "epoch": 0.951716114019779, "grad_norm": 2.942864783233079, "kl": 0.0980224609375, "learning_rate": 5.965570255370866e-09, "loss": 0.0039, "reward": 1.1875, "reward_std": 0.6134917289018631, "rewards/correctness_reward_func": 0.71875, "rewards/strict_format_reward_func": 0.46875, "step": 818 }, { "completion_length": 329.80859375, "epoch": 0.9528795811518325, "grad_norm": 2.510740235065877, "kl": 0.0916748046875, "learning_rate": 5.678661041053467e-09, "loss": 0.0037, "reward": 1.052734375, "reward_std": 0.42922230437397957, "rewards/correctness_reward_func": 0.609375, "rewards/strict_format_reward_func": 0.443359375, "step": 819 }, { "completion_length": 321.67578125, "epoch": 0.954043048283886, "grad_norm": 4.188125941802571, "kl": 0.0823974609375, "learning_rate": 5.398782850833172e-09, "loss": 0.0033, "reward": 1.09765625, "reward_std": 0.4758744537830353, "rewards/correctness_reward_func": 0.6484375, "rewards/strict_format_reward_func": 0.44921875, "step": 820 }, { "completion_length": 304.56640625, "epoch": 0.9552065154159395, "grad_norm": 0.25251854122574197, "kl": 0.08551025390625, "learning_rate": 5.125939665582779e-09, "loss": 0.0034, "reward": 1.0, "reward_std": 0.3148980922996998, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.4609375, "step": 821 }, { "completion_length": 320.71484375, "epoch": 0.956369982547993, "grad_norm": 1.1047255040871093, "kl": 0.0821533203125, "learning_rate": 4.860135366111739e-09, "loss": 0.0033, "reward": 1.14453125, "reward_std": 0.4656406491994858, "rewards/correctness_reward_func": 0.703125, "rewards/strict_format_reward_func": 0.44140625, "step": 822 }, { "completion_length": 309.2578125, "epoch": 0.9575334496800465, "grad_norm": 7.075478514378517, "kl": 0.101318359375, "learning_rate": 4.601373733111591e-09, "loss": 0.0041, "reward": 0.9921875, "reward_std": 0.4492216631770134, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.453125, "step": 823 }, { "completion_length": 340.265625, "epoch": 0.9586969168121, "grad_norm": 0.6662651478739545, "kl": 0.13232421875, "learning_rate": 4.349658447101612e-09, "loss": 0.0053, "reward": 0.958984375, "reward_std": 0.3985661454498768, "rewards/correctness_reward_func": 0.5, "rewards/strict_format_reward_func": 0.458984375, "step": 824 }, { "completion_length": 292.55859375, "epoch": 0.9598603839441536, "grad_norm": 0.6626311347525116, "kl": 0.0849609375, "learning_rate": 4.104993088376974e-09, "loss": 0.0034, "reward": 1.126953125, "reward_std": 0.5581348538398743, "rewards/correctness_reward_func": 0.671875, "rewards/strict_format_reward_func": 0.455078125, "step": 825 }, { "completion_length": 310.69140625, "epoch": 0.9610238510762071, "grad_norm": 2.1151640267223772, "kl": 0.0831298828125, "learning_rate": 3.867381136957337e-09, "loss": 0.0033, "reward": 1.009765625, "reward_std": 0.4991043657064438, "rewards/correctness_reward_func": 0.5703125, "rewards/strict_format_reward_func": 0.439453125, "step": 826 }, { "completion_length": 364.48828125, "epoch": 0.9621873182082606, "grad_norm": 1.8783975265679693, "kl": 0.1162109375, "learning_rate": 3.6368259725377824e-09, "loss": 0.0046, "reward": 0.9296875, "reward_std": 0.4377868250012398, "rewards/correctness_reward_func": 0.4765625, "rewards/strict_format_reward_func": 0.453125, "step": 827 }, { "completion_length": 309.43359375, "epoch": 0.9633507853403142, "grad_norm": 1.0689569310315563, "kl": 0.1153564453125, "learning_rate": 3.413330874440401e-09, "loss": 0.0046, "reward": 1.037109375, "reward_std": 0.45697519183158875, "rewards/correctness_reward_func": 0.5859375, "rewards/strict_format_reward_func": 0.451171875, "step": 828 }, { "completion_length": 391.2578125, "epoch": 0.9645142524723677, "grad_norm": 5.037674834424245, "kl": 0.1258544921875, "learning_rate": 3.196899021567889e-09, "loss": 0.005, "reward": 0.9375, "reward_std": 0.4380328506231308, "rewards/correctness_reward_func": 0.5078125, "rewards/strict_format_reward_func": 0.4296875, "step": 829 }, { "completion_length": 339.34765625, "epoch": 0.9656777196044212, "grad_norm": 2.8244574833731826, "kl": 0.1007080078125, "learning_rate": 2.9875334923581963e-09, "loss": 0.004, "reward": 0.994140625, "reward_std": 0.5543932020664215, "rewards/correctness_reward_func": 0.546875, "rewards/strict_format_reward_func": 0.447265625, "step": 830 }, { "completion_length": 296.828125, "epoch": 0.9668411867364747, "grad_norm": 3.7392044030636233, "kl": 0.091064453125, "learning_rate": 2.7852372647407805e-09, "loss": 0.0036, "reward": 0.982421875, "reward_std": 0.39052554219961166, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.466796875, "step": 831 }, { "completion_length": 295.640625, "epoch": 0.9680046538685282, "grad_norm": 0.44160858117254287, "kl": 0.085693359375, "learning_rate": 2.590013216094311e-09, "loss": 0.0034, "reward": 1.08203125, "reward_std": 0.45800653100013733, "rewards/correctness_reward_func": 0.625, "rewards/strict_format_reward_func": 0.45703125, "step": 832 }, { "completion_length": 321.96875, "epoch": 0.9691681210005817, "grad_norm": 4.092960647514768, "kl": 0.13525390625, "learning_rate": 2.4018641232055325e-09, "loss": 0.0054, "reward": 1.068359375, "reward_std": 0.39710813760757446, "rewards/correctness_reward_func": 0.625, "rewards/strict_format_reward_func": 0.443359375, "step": 833 }, { "completion_length": 332.01953125, "epoch": 0.9703315881326352, "grad_norm": 1.1768535630658568, "kl": 0.0938720703125, "learning_rate": 2.220792662230131e-09, "loss": 0.0038, "reward": 1.064453125, "reward_std": 0.4868198335170746, "rewards/correctness_reward_func": 0.6171875, "rewards/strict_format_reward_func": 0.447265625, "step": 834 }, { "completion_length": 353.94140625, "epoch": 0.9714950552646888, "grad_norm": 0.28947105033325005, "kl": 0.0819091796875, "learning_rate": 2.0468014086542085e-09, "loss": 0.0033, "reward": 1.064453125, "reward_std": 0.38710395246744156, "rewards/correctness_reward_func": 0.609375, "rewards/strict_format_reward_func": 0.455078125, "step": 835 }, { "completion_length": 340.3828125, "epoch": 0.9726585223967423, "grad_norm": 4.444961702929655, "kl": 0.14990234375, "learning_rate": 1.8798928372581457e-09, "loss": 0.006, "reward": 0.986328125, "reward_std": 0.5000115260481834, "rewards/correctness_reward_func": 0.53125, "rewards/strict_format_reward_func": 0.455078125, "step": 836 }, { "completion_length": 331.98828125, "epoch": 0.9738219895287958, "grad_norm": 0.9879592177991162, "kl": 0.1031494140625, "learning_rate": 1.7200693220810192e-09, "loss": 0.0041, "reward": 1.033203125, "reward_std": 0.4458390101790428, "rewards/correctness_reward_func": 0.5859375, "rewards/strict_format_reward_func": 0.447265625, "step": 837 }, { "completion_length": 324.5234375, "epoch": 0.9749854566608493, "grad_norm": 1.2800082364946195, "kl": 0.1270751953125, "learning_rate": 1.5673331363870169e-09, "loss": 0.0051, "reward": 1.048828125, "reward_std": 0.4616438299417496, "rewards/correctness_reward_func": 0.59375, "rewards/strict_format_reward_func": 0.455078125, "step": 838 }, { "completion_length": 303.63671875, "epoch": 0.9761489237929029, "grad_norm": 8.92557826095573, "kl": 0.0931396484375, "learning_rate": 1.4216864526329643e-09, "loss": 0.0037, "reward": 1.1015625, "reward_std": 0.49787144735455513, "rewards/correctness_reward_func": 0.6484375, "rewards/strict_format_reward_func": 0.453125, "step": 839 }, { "completion_length": 316.296875, "epoch": 0.9773123909249564, "grad_norm": 2.351310175804669, "kl": 0.1256103515625, "learning_rate": 1.2831313424376267e-09, "loss": 0.005, "reward": 0.978515625, "reward_std": 0.5443897843360901, "rewards/correctness_reward_func": 0.5234375, "rewards/strict_format_reward_func": 0.455078125, "step": 840 }, { "completion_length": 359.578125, "epoch": 0.9784758580570099, "grad_norm": 0.2981996943597667, "kl": 0.0865478515625, "learning_rate": 1.1516697765521223e-09, "loss": 0.0035, "reward": 0.9765625, "reward_std": 0.3761020302772522, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.4609375, "step": 841 }, { "completion_length": 285.67578125, "epoch": 0.9796393251890634, "grad_norm": 2.5601262528297877, "kl": 0.1019287109375, "learning_rate": 1.0273036248318324e-09, "loss": 0.0041, "reward": 1.080078125, "reward_std": 0.5333575308322906, "rewards/correctness_reward_func": 0.6171875, "rewards/strict_format_reward_func": 0.462890625, "step": 842 }, { "completion_length": 328.875, "epoch": 0.9808027923211169, "grad_norm": 4.866206326858221, "kl": 0.108154296875, "learning_rate": 9.100346562099237e-10, "loss": 0.0043, "reward": 0.935546875, "reward_std": 0.457660511136055, "rewards/correctness_reward_func": 0.4921875, "rewards/strict_format_reward_func": 0.443359375, "step": 843 }, { "completion_length": 293.51171875, "epoch": 0.9819662594531704, "grad_norm": 6.8749105019935906, "kl": 0.0985107421875, "learning_rate": 7.998645386722013e-10, "loss": 0.0039, "reward": 1.091796875, "reward_std": 0.48302504420280457, "rewards/correctness_reward_func": 0.625, "rewards/strict_format_reward_func": 0.466796875, "step": 844 }, { "completion_length": 347.90625, "epoch": 0.983129726585224, "grad_norm": 66.0917893718306, "kl": 0.1990966796875, "learning_rate": 6.967948392331835e-10, "loss": 0.0079, "reward": 0.970703125, "reward_std": 0.5465194135904312, "rewards/correctness_reward_func": 0.5234375, "rewards/strict_format_reward_func": 0.447265625, "step": 845 }, { "completion_length": 324.49609375, "epoch": 0.9842931937172775, "grad_norm": 1.8764937901178471, "kl": 0.111572265625, "learning_rate": 6.008270239140634e-10, "loss": 0.0045, "reward": 1.078125, "reward_std": 0.4222945496439934, "rewards/correctness_reward_func": 0.625, "rewards/strict_format_reward_func": 0.453125, "step": 846 }, { "completion_length": 289.765625, "epoch": 0.985456660849331, "grad_norm": 8.065742818569761, "kl": 0.1278076171875, "learning_rate": 5.119624577216708e-10, "loss": 0.0051, "reward": 0.94921875, "reward_std": 0.40326476842164993, "rewards/correctness_reward_func": 0.484375, "rewards/strict_format_reward_func": 0.46484375, "step": 847 }, { "completion_length": 354.26953125, "epoch": 0.9866201279813845, "grad_norm": 0.6430303781894549, "kl": 0.091064453125, "learning_rate": 4.3020240462920966e-10, "loss": 0.0036, "reward": 1.126953125, "reward_std": 0.4590747430920601, "rewards/correctness_reward_func": 0.703125, "rewards/strict_format_reward_func": 0.423828125, "step": 848 }, { "completion_length": 365.34375, "epoch": 0.987783595113438, "grad_norm": 1.110330309793843, "kl": 0.10498046875, "learning_rate": 3.5554802755816127e-10, "loss": 0.0042, "reward": 0.87109375, "reward_std": 0.5011670812964439, "rewards/correctness_reward_func": 0.4375, "rewards/strict_format_reward_func": 0.43359375, "step": 849 }, { "completion_length": 292.77734375, "epoch": 0.9889470622454916, "grad_norm": 37.52322242043326, "kl": 0.5809326171875, "learning_rate": 2.8800038836174214e-10, "loss": 0.0232, "reward": 1.15234375, "reward_std": 0.4703781232237816, "rewards/correctness_reward_func": 0.6953125, "rewards/strict_format_reward_func": 0.45703125, "step": 850 }, { "completion_length": 351.9140625, "epoch": 0.9901105293775451, "grad_norm": 1.2119764501997334, "kl": 0.0904541015625, "learning_rate": 2.275604478099158e-10, "loss": 0.0036, "reward": 0.96484375, "reward_std": 0.38621649146080017, "rewards/correctness_reward_func": 0.515625, "rewards/strict_format_reward_func": 0.44921875, "step": 851 }, { "completion_length": 276.19140625, "epoch": 0.9912739965095986, "grad_norm": 2.765841388264792, "kl": 0.0982666015625, "learning_rate": 1.742290655755707e-10, "loss": 0.0039, "reward": 1.052734375, "reward_std": 0.38674271479249, "rewards/correctness_reward_func": 0.5859375, "rewards/strict_format_reward_func": 0.466796875, "step": 852 }, { "completion_length": 318.44921875, "epoch": 0.9924374636416521, "grad_norm": 1.396492140892674, "kl": 0.085693359375, "learning_rate": 1.2800700022252974e-10, "loss": 0.0034, "reward": 0.916015625, "reward_std": 0.5383987724781036, "rewards/correctness_reward_func": 0.4453125, "rewards/strict_format_reward_func": 0.470703125, "step": 853 }, { "completion_length": 294.07421875, "epoch": 0.9936009307737056, "grad_norm": 0.8896869595367466, "kl": 0.07562255859375, "learning_rate": 8.889490919439246e-11, "loss": 0.003, "reward": 0.982421875, "reward_std": 0.468200221657753, "rewards/correctness_reward_func": 0.5390625, "rewards/strict_format_reward_func": 0.443359375, "step": 854 }, { "completion_length": 346.2578125, "epoch": 0.9947643979057592, "grad_norm": 14.238381007499067, "kl": 0.0941162109375, "learning_rate": 5.689334880559782e-11, "loss": 0.0038, "reward": 0.98828125, "reward_std": 0.38533400744199753, "rewards/correctness_reward_func": 0.546875, "rewards/strict_format_reward_func": 0.44140625, "step": 855 }, { "completion_length": 283.6640625, "epoch": 0.9959278650378127, "grad_norm": 0.7657944957059711, "kl": 0.110107421875, "learning_rate": 3.200277423320852e-11, "loss": 0.0044, "reward": 1.12109375, "reward_std": 0.4498387277126312, "rewards/correctness_reward_func": 0.6640625, "rewards/strict_format_reward_func": 0.45703125, "step": 856 }, { "completion_length": 291.46875, "epoch": 0.9970913321698662, "grad_norm": 1.711184019223977, "kl": 0.116455078125, "learning_rate": 1.4223539510471727e-11, "loss": 0.0047, "reward": 0.87109375, "reward_std": 0.47508911043405533, "rewards/correctness_reward_func": 0.4140625, "rewards/strict_format_reward_func": 0.45703125, "step": 857 }, { "completion_length": 337.90234375, "epoch": 0.9982547993019197, "grad_norm": 0.7688926809682137, "kl": 0.0902099609375, "learning_rate": 3.5558975220451037e-12, "loss": 0.0036, "reward": 1.0390625, "reward_std": 0.5588367879390717, "rewards/correctness_reward_func": 0.6015625, "rewards/strict_format_reward_func": 0.4375, "step": 858 }, { "completion_length": 320.6328125, "epoch": 0.9994182664339732, "grad_norm": 1.239317094242089, "kl": 0.1253662109375, "learning_rate": 0.0, "loss": 0.005, "reward": 1.111328125, "reward_std": 0.3847053796052933, "rewards/correctness_reward_func": 0.6640625, "rewards/strict_format_reward_func": 0.447265625, "step": 859 } ], "logging_steps": 1, "max_steps": 859, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }