{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994666666666666, "eval_steps": 1000, "global_step": 937, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 687.15625, "epoch": 0.0010666666666666667, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.191489361702128e-08, "loss": -0.0142, "reward": 0.4375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1 }, { "completion_length": 770.46875, "epoch": 0.0021333333333333334, "grad_norm": 0.11273776739835739, "kl": 0.0, "learning_rate": 6.382978723404255e-08, "loss": 0.0433, "reward": 0.28125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 2 }, { "completion_length": 694.84375, "epoch": 0.0032, "grad_norm": 0.2474975287914276, "kl": 4.477798938751221e-05, "learning_rate": 9.574468085106382e-08, "loss": 0.2026, "reward": 0.34375, "reward_std": 0.47617512941360474, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 3 }, { "completion_length": 746.90625, "epoch": 0.004266666666666667, "grad_norm": 0.20613761246204376, "kl": 2.1457672119140625e-05, "learning_rate": 1.276595744680851e-07, "loss": 0.1229, "reward": 0.375, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 4 }, { "completion_length": 787.46875, "epoch": 0.005333333333333333, "grad_norm": 0.15468862652778625, "kl": 4.1157007217407227e-05, "learning_rate": 1.5957446808510638e-07, "loss": 0.157, "reward": 0.21875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.0, "step": 5 }, { "completion_length": 752.75, "epoch": 0.0064, "grad_norm": 0.0005501502892002463, "kl": 2.8360635042190552e-05, "learning_rate": 1.9148936170212765e-07, "loss": 0.0008, "reward": 0.1875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 6 }, { "completion_length": 656.6875, "epoch": 0.007466666666666667, "grad_norm": 0.2663656175136566, "kl": 2.4611130356788635e-05, "learning_rate": 2.2340425531914894e-07, "loss": -0.0625, "reward": 0.46875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 7 }, { "completion_length": 749.15625, "epoch": 0.008533333333333334, "grad_norm": 0.16067425906658173, "kl": 4.4444575905799866e-05, "learning_rate": 2.553191489361702e-07, "loss": -0.0353, "reward": 0.5, "reward_std": 0.4471687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 8 }, { "completion_length": 624.90625, "epoch": 0.0096, "grad_norm": 0.21902143955230713, "kl": 5.883350968360901e-05, "learning_rate": 2.872340425531915e-07, "loss": -0.0187, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 9 }, { "completion_length": 662.0625, "epoch": 0.010666666666666666, "grad_norm": 0.1778605580329895, "kl": 9.663961827754974e-05, "learning_rate": 3.1914893617021275e-07, "loss": 0.0428, "reward": 0.21875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.0, "step": 10 }, { "completion_length": 691.8125, "epoch": 0.011733333333333333, "grad_norm": 0.0006564147188328207, "kl": 4.3779611587524414e-05, "learning_rate": 3.5106382978723405e-07, "loss": 0.0774, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 11 }, { "completion_length": 650.28125, "epoch": 0.0128, "grad_norm": 0.06465266644954681, "kl": 6.56023621559143e-05, "learning_rate": 3.829787234042553e-07, "loss": -0.0831, "reward": 0.4375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 12 }, { "completion_length": 560.0625, "epoch": 0.013866666666666666, "grad_norm": 0.2300390750169754, "kl": 3.215670585632324e-05, "learning_rate": 4.1489361702127664e-07, "loss": -0.1054, "reward": 0.40625, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 13 }, { "completion_length": 693.5, "epoch": 0.014933333333333333, "grad_norm": 0.13349862396717072, "kl": 4.921481013298035e-05, "learning_rate": 4.468085106382979e-07, "loss": 0.0447, "reward": 0.3125, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 14 }, { "completion_length": 733.0, "epoch": 0.016, "grad_norm": 0.0008494535577483475, "kl": 8.445978164672852e-05, "learning_rate": 4.787234042553192e-07, "loss": 0.1437, "reward": 0.28125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 15 }, { "completion_length": 646.125, "epoch": 0.017066666666666667, "grad_norm": 0.002213010098785162, "kl": 0.00014260411262512207, "learning_rate": 5.106382978723404e-07, "loss": 0.0574, "reward": 0.4375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 16 }, { "completion_length": 780.875, "epoch": 0.018133333333333335, "grad_norm": 0.1918860226869583, "kl": 4.2907893657684326e-05, "learning_rate": 5.425531914893618e-07, "loss": 0.1155, "reward": 0.34375, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 17 }, { "completion_length": 720.3125, "epoch": 0.0192, "grad_norm": 0.3138134479522705, "kl": 7.924437522888184e-05, "learning_rate": 5.74468085106383e-07, "loss": -0.031, "reward": 0.21875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.0, "step": 18 }, { "completion_length": 696.90625, "epoch": 0.020266666666666665, "grad_norm": 0.0012869596248492599, "kl": 0.00012006238102912903, "learning_rate": 6.063829787234043e-07, "loss": 0.0058, "reward": 0.375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 19 }, { "completion_length": 712.65625, "epoch": 0.021333333333333333, "grad_norm": 0.11523472517728806, "kl": 3.528222441673279e-05, "learning_rate": 6.382978723404255e-07, "loss": 0.1425, "reward": 0.21875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.0, "step": 20 }, { "completion_length": 777.65625, "epoch": 0.0224, "grad_norm": 0.15781642496585846, "kl": 4.273653030395508e-05, "learning_rate": 6.702127659574468e-07, "loss": 0.0212, "reward": 0.28125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 21 }, { "completion_length": 640.28125, "epoch": 0.023466666666666667, "grad_norm": 0.21065963804721832, "kl": 0.00011780858039855957, "learning_rate": 7.021276595744681e-07, "loss": 0.0621, "reward": 0.40625, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 22 }, { "completion_length": 754.3125, "epoch": 0.024533333333333334, "grad_norm": 0.12740880250930786, "kl": 7.056631147861481e-05, "learning_rate": 7.340425531914893e-07, "loss": 0.058, "reward": 0.375, "reward_std": 0.46650634706020355, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 23 }, { "completion_length": 642.40625, "epoch": 0.0256, "grad_norm": 0.1383991241455078, "kl": 0.00010097585618495941, "learning_rate": 7.659574468085106e-07, "loss": 0.0622, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 24 }, { "completion_length": 625.4375, "epoch": 0.02666666666666667, "grad_norm": 0.29339179396629333, "kl": 5.9373676776885986e-05, "learning_rate": 7.978723404255319e-07, "loss": -0.0694, "reward": 0.5, "reward_std": 0.25, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 25 }, { "completion_length": 658.78125, "epoch": 0.027733333333333332, "grad_norm": 0.0005929844919592142, "kl": 8.183717727661133e-05, "learning_rate": 8.297872340425533e-07, "loss": 0.0083, "reward": 0.21875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.0, "step": 26 }, { "completion_length": 696.15625, "epoch": 0.0288, "grad_norm": 0.1070026159286499, "kl": 6.272271275520325e-05, "learning_rate": 8.617021276595745e-07, "loss": 0.0303, "reward": 0.40625, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 27 }, { "completion_length": 696.84375, "epoch": 0.029866666666666666, "grad_norm": 0.26169028878211975, "kl": 0.00011010654270648956, "learning_rate": 8.936170212765958e-07, "loss": -0.0293, "reward": 0.28125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 28 }, { "completion_length": 678.15625, "epoch": 0.030933333333333334, "grad_norm": 0.00044485993566922843, "kl": 6.862729787826538e-05, "learning_rate": 9.25531914893617e-07, "loss": 0.0312, "reward": 0.46875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 29 }, { "completion_length": 697.96875, "epoch": 0.032, "grad_norm": 0.000516842701472342, "kl": 5.586445331573486e-05, "learning_rate": 9.574468085106384e-07, "loss": 0.0589, "reward": 0.40625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 30 }, { "completion_length": 652.125, "epoch": 0.03306666666666667, "grad_norm": 0.10492980480194092, "kl": 0.00010887160897254944, "learning_rate": 9.893617021276595e-07, "loss": -0.0501, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 31 }, { "completion_length": 676.28125, "epoch": 0.034133333333333335, "grad_norm": 0.0013744005700573325, "kl": 5.1118433475494385e-05, "learning_rate": 1.0212765957446809e-06, "loss": 0.0275, "reward": 0.1875, "reward_std": 0.125, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 32 }, { "completion_length": 750.59375, "epoch": 0.0352, "grad_norm": 0.08381912112236023, "kl": 6.349384784698486e-05, "learning_rate": 1.0531914893617022e-06, "loss": 0.1103, "reward": 0.28125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 33 }, { "completion_length": 805.3125, "epoch": 0.03626666666666667, "grad_norm": 0.11046674847602844, "kl": 5.961954593658447e-05, "learning_rate": 1.0851063829787236e-06, "loss": 0.1025, "reward": 0.28125, "reward_std": 0.35117512941360474, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 34 }, { "completion_length": 677.625, "epoch": 0.037333333333333336, "grad_norm": 0.20119024813175201, "kl": 0.00017164647579193115, "learning_rate": 1.1170212765957447e-06, "loss": 0.076, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 35 }, { "completion_length": 679.65625, "epoch": 0.0384, "grad_norm": 0.3048509359359741, "kl": 0.00010529160499572754, "learning_rate": 1.148936170212766e-06, "loss": 0.0993, "reward": 0.375, "reward_std": 0.46650634706020355, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 36 }, { "completion_length": 693.4375, "epoch": 0.039466666666666664, "grad_norm": 0.08889836072921753, "kl": 0.0001583397388458252, "learning_rate": 1.1808510638297874e-06, "loss": -0.0244, "reward": 0.125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 37 }, { "completion_length": 801.375, "epoch": 0.04053333333333333, "grad_norm": 0.14582732319831848, "kl": 8.66129994392395e-05, "learning_rate": 1.2127659574468085e-06, "loss": 0.0872, "reward": 0.40625, "reward_std": 0.4375, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 38 }, { "completion_length": 750.90625, "epoch": 0.0416, "grad_norm": 0.257066011428833, "kl": 0.00014001131057739258, "learning_rate": 1.2446808510638299e-06, "loss": -0.0024, "reward": 0.34375, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 39 }, { "completion_length": 697.0, "epoch": 0.042666666666666665, "grad_norm": 0.2912440001964569, "kl": 0.000148773193359375, "learning_rate": 1.276595744680851e-06, "loss": 0.1046, "reward": 0.34375, "reward_std": 0.5096687823534012, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 40 }, { "completion_length": 642.8125, "epoch": 0.04373333333333333, "grad_norm": 0.2337706983089447, "kl": 0.00018884241580963135, "learning_rate": 1.3085106382978724e-06, "loss": 0.0943, "reward": 0.34375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 41 }, { "completion_length": 676.6875, "epoch": 0.0448, "grad_norm": 0.33140188455581665, "kl": 0.0002124309539794922, "learning_rate": 1.3404255319148935e-06, "loss": 0.0881, "reward": 0.46875, "reward_std": 0.47617512941360474, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 42 }, { "completion_length": 667.78125, "epoch": 0.04586666666666667, "grad_norm": 0.12390223145484924, "kl": 0.0001453012228012085, "learning_rate": 1.3723404255319149e-06, "loss": 0.0446, "reward": 0.5, "reward_std": 0.25, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 43 }, { "completion_length": 782.9375, "epoch": 0.046933333333333334, "grad_norm": 0.0025114910677075386, "kl": 0.00016131997108459473, "learning_rate": 1.4042553191489362e-06, "loss": 0.022, "reward": 0.1875, "reward_std": 0.125, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 44 }, { "completion_length": 674.03125, "epoch": 0.048, "grad_norm": 0.13197164237499237, "kl": 0.00039150193333625793, "learning_rate": 1.4361702127659576e-06, "loss": 0.1133, "reward": 0.3125, "reward_std": 0.375, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 45 }, { "completion_length": 644.65625, "epoch": 0.04906666666666667, "grad_norm": 0.1735800951719284, "kl": 0.00023800134658813477, "learning_rate": 1.4680851063829787e-06, "loss": -0.045, "reward": 0.46875, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 46 }, { "completion_length": 753.5, "epoch": 0.050133333333333335, "grad_norm": 0.17605522274971008, "kl": 0.00011587515473365784, "learning_rate": 1.5e-06, "loss": 0.0388, "reward": 0.34375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 47 }, { "completion_length": 617.5625, "epoch": 0.0512, "grad_norm": 0.2180137187242508, "kl": 0.0005861446261405945, "learning_rate": 1.5319148936170212e-06, "loss": 0.0297, "reward": 0.53125, "reward_std": 0.4375, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 48 }, { "completion_length": 721.0625, "epoch": 0.05226666666666667, "grad_norm": 0.0011795010650530457, "kl": 0.00018236041069030762, "learning_rate": 1.5638297872340427e-06, "loss": 0.0358, "reward": 0.28125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 49 }, { "completion_length": 771.0, "epoch": 0.05333333333333334, "grad_norm": 0.21834860742092133, "kl": 0.00017663836479187012, "learning_rate": 1.5957446808510639e-06, "loss": 0.0555, "reward": 0.28125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 50 }, { "completion_length": 751.46875, "epoch": 0.0544, "grad_norm": 0.11882063001394272, "kl": 0.0001669377088546753, "learning_rate": 1.627659574468085e-06, "loss": 0.0812, "reward": 0.34375, "reward_std": 0.47617512941360474, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 51 }, { "completion_length": 699.90625, "epoch": 0.055466666666666664, "grad_norm": 0.08146371692419052, "kl": 0.0004940032958984375, "learning_rate": 1.6595744680851066e-06, "loss": 0.0305, "reward": 0.28125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 52 }, { "completion_length": 674.75, "epoch": 0.05653333333333333, "grad_norm": 0.30303362011909485, "kl": 0.0007016658782958984, "learning_rate": 1.6914893617021277e-06, "loss": 0.0822, "reward": 0.5, "reward_std": 0.4471687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 53 }, { "completion_length": 626.96875, "epoch": 0.0576, "grad_norm": 0.17934177815914154, "kl": 0.0005464553833007812, "learning_rate": 1.723404255319149e-06, "loss": 0.173, "reward": 0.4375, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 54 }, { "completion_length": 752.34375, "epoch": 0.058666666666666666, "grad_norm": 0.15815655887126923, "kl": 0.0002664923667907715, "learning_rate": 1.7553191489361702e-06, "loss": 0.0958, "reward": 0.375, "reward_std": 0.4858439117670059, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 55 }, { "completion_length": 719.0, "epoch": 0.05973333333333333, "grad_norm": 0.2513146996498108, "kl": 0.0003504753112792969, "learning_rate": 1.7872340425531915e-06, "loss": 0.1256, "reward": 0.375, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 56 }, { "completion_length": 713.03125, "epoch": 0.0608, "grad_norm": 0.14674153923988342, "kl": 0.00043064355850219727, "learning_rate": 1.819148936170213e-06, "loss": 0.1261, "reward": 0.53125, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 57 }, { "completion_length": 702.375, "epoch": 0.06186666666666667, "grad_norm": 0.19132496416568756, "kl": 0.001273036003112793, "learning_rate": 1.851063829787234e-06, "loss": 0.0243, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 58 }, { "completion_length": 796.65625, "epoch": 0.06293333333333333, "grad_norm": 0.23306652903556824, "kl": 0.0002745389938354492, "learning_rate": 1.8829787234042552e-06, "loss": 0.054, "reward": 0.34375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 59 }, { "completion_length": 670.3125, "epoch": 0.064, "grad_norm": 0.18134832382202148, "kl": 0.0023223161697387695, "learning_rate": 1.9148936170212767e-06, "loss": 0.0617, "reward": 0.4375, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 60 }, { "completion_length": 632.21875, "epoch": 0.06506666666666666, "grad_norm": 0.35671180486679077, "kl": 0.0012333393096923828, "learning_rate": 1.946808510638298e-06, "loss": 0.1688, "reward": 0.5, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 61 }, { "completion_length": 695.28125, "epoch": 0.06613333333333334, "grad_norm": 0.020816734060645103, "kl": 0.0013549327850341797, "learning_rate": 1.978723404255319e-06, "loss": 0.0126, "reward": 0.125, "reward_std": 0.125, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 62 }, { "completion_length": 739.6875, "epoch": 0.0672, "grad_norm": 0.07162964344024658, "kl": 0.0009806156158447266, "learning_rate": 2.0106382978723404e-06, "loss": 0.1747, "reward": 0.40625, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 63 }, { "completion_length": 604.6875, "epoch": 0.06826666666666667, "grad_norm": 0.25481051206588745, "kl": 0.001232445240020752, "learning_rate": 2.0425531914893617e-06, "loss": 0.0292, "reward": 0.4375, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 64 }, { "completion_length": 642.21875, "epoch": 0.06933333333333333, "grad_norm": 0.13624215126037598, "kl": 0.0012699980288743973, "learning_rate": 2.074468085106383e-06, "loss": 0.0388, "reward": 0.3125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 65 }, { "completion_length": 648.5625, "epoch": 0.0704, "grad_norm": 0.1194443330168724, "kl": 0.0045427680015563965, "learning_rate": 2.1063829787234044e-06, "loss": -0.0807, "reward": 0.40625, "reward_std": 0.3125, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 66 }, { "completion_length": 708.21875, "epoch": 0.07146666666666666, "grad_norm": 0.06530560553073883, "kl": 0.0011246204376220703, "learning_rate": 2.1382978723404253e-06, "loss": 0.0977, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 67 }, { "completion_length": 660.65625, "epoch": 0.07253333333333334, "grad_norm": 0.008264207281172276, "kl": 0.0017971992492675781, "learning_rate": 2.170212765957447e-06, "loss": 0.0099, "reward": 0.375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 68 }, { "completion_length": 722.84375, "epoch": 0.0736, "grad_norm": 0.18353091180324554, "kl": 0.001039743423461914, "learning_rate": 2.202127659574468e-06, "loss": 0.0213, "reward": 0.46875, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 69 }, { "completion_length": 606.125, "epoch": 0.07466666666666667, "grad_norm": 0.11882440745830536, "kl": 0.0016934871673583984, "learning_rate": 2.2340425531914894e-06, "loss": 0.0016, "reward": 0.28125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 70 }, { "completion_length": 641.15625, "epoch": 0.07573333333333333, "grad_norm": 0.0026877911295741796, "kl": 0.002603292465209961, "learning_rate": 2.2659574468085107e-06, "loss": -0.0004, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 71 }, { "completion_length": 519.6875, "epoch": 0.0768, "grad_norm": 0.26699167490005493, "kl": 0.004136085510253906, "learning_rate": 2.297872340425532e-06, "loss": 0.0009, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 72 }, { "completion_length": 600.9375, "epoch": 0.07786666666666667, "grad_norm": 0.012988269329071045, "kl": 0.00302886962890625, "learning_rate": 2.329787234042553e-06, "loss": 0.0401, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 73 }, { "completion_length": 699.90625, "epoch": 0.07893333333333333, "grad_norm": 0.1021258756518364, "kl": 0.0014214515686035156, "learning_rate": 2.3617021276595748e-06, "loss": 0.0102, "reward": 0.40625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 74 }, { "completion_length": 618.65625, "epoch": 0.08, "grad_norm": 0.09216001629829407, "kl": 0.0061414241790771484, "learning_rate": 2.3936170212765957e-06, "loss": -0.0366, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 75 }, { "completion_length": 622.8125, "epoch": 0.08106666666666666, "grad_norm": 0.26960837841033936, "kl": 0.0044820308685302734, "learning_rate": 2.425531914893617e-06, "loss": 0.0663, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 76 }, { "completion_length": 631.875, "epoch": 0.08213333333333334, "grad_norm": 0.098408542573452, "kl": 0.0008417963981628418, "learning_rate": 2.4574468085106384e-06, "loss": -0.0296, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 77 }, { "completion_length": 580.09375, "epoch": 0.0832, "grad_norm": 0.011505941860377789, "kl": 0.0030368566513061523, "learning_rate": 2.4893617021276598e-06, "loss": 0.0395, "reward": 0.78125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 78 }, { "completion_length": 558.5, "epoch": 0.08426666666666667, "grad_norm": 0.37153348326683044, "kl": 0.005629301071166992, "learning_rate": 2.521276595744681e-06, "loss": 0.1911, "reward": 0.5, "reward_std": 0.46650634706020355, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 79 }, { "completion_length": 738.875, "epoch": 0.08533333333333333, "grad_norm": 0.10705796629190445, "kl": 0.0039980411529541016, "learning_rate": 2.553191489361702e-06, "loss": 0.0578, "reward": 0.59375, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 80 }, { "completion_length": 692.5625, "epoch": 0.0864, "grad_norm": 0.11198261380195618, "kl": 0.003612518310546875, "learning_rate": 2.5851063829787234e-06, "loss": 0.1227, "reward": 0.5, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 81 }, { "completion_length": 577.03125, "epoch": 0.08746666666666666, "grad_norm": 0.19315125048160553, "kl": 0.003607034683227539, "learning_rate": 2.6170212765957447e-06, "loss": 0.0626, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 82 }, { "completion_length": 582.40625, "epoch": 0.08853333333333334, "grad_norm": 0.1290639340877533, "kl": 0.0031890869140625, "learning_rate": 2.648936170212766e-06, "loss": -0.0576, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 83 }, { "completion_length": 584.8125, "epoch": 0.0896, "grad_norm": 0.15652064979076385, "kl": 0.007125377655029297, "learning_rate": 2.680851063829787e-06, "loss": 0.1474, "reward": 0.65625, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 84 }, { "completion_length": 630.28125, "epoch": 0.09066666666666667, "grad_norm": 0.0011411955347284675, "kl": 0.005236268043518066, "learning_rate": 2.7127659574468088e-06, "loss": -0.0027, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 85 }, { "completion_length": 651.875, "epoch": 0.09173333333333333, "grad_norm": 0.010444349609315395, "kl": 0.003153085708618164, "learning_rate": 2.7446808510638297e-06, "loss": 0.0387, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 86 }, { "completion_length": 483.15625, "epoch": 0.0928, "grad_norm": 0.2527560889720917, "kl": 0.0072345733642578125, "learning_rate": 2.776595744680851e-06, "loss": 0.0455, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 87 }, { "completion_length": 647.28125, "epoch": 0.09386666666666667, "grad_norm": 0.1300310492515564, "kl": 0.011152505874633789, "learning_rate": 2.8085106382978724e-06, "loss": 0.0209, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 88 }, { "completion_length": 619.4375, "epoch": 0.09493333333333333, "grad_norm": 0.10402600467205048, "kl": 0.003293275833129883, "learning_rate": 2.8404255319148938e-06, "loss": 0.042, "reward": 0.34375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 89 }, { "completion_length": 528.84375, "epoch": 0.096, "grad_norm": 0.14990234375, "kl": 0.006053924560546875, "learning_rate": 2.872340425531915e-06, "loss": 0.1625, "reward": 0.71875, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 90 }, { "completion_length": 622.6875, "epoch": 0.09706666666666666, "grad_norm": 0.0036519516725093126, "kl": 0.006417036056518555, "learning_rate": 2.9042553191489365e-06, "loss": 0.1014, "reward": 0.40625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 91 }, { "completion_length": 604.71875, "epoch": 0.09813333333333334, "grad_norm": 0.05928485840559006, "kl": 0.005362987518310547, "learning_rate": 2.9361702127659574e-06, "loss": 0.0784, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 92 }, { "completion_length": 615.96875, "epoch": 0.0992, "grad_norm": 0.0010915520833805203, "kl": 0.027950763702392578, "learning_rate": 2.9680851063829787e-06, "loss": 0.0432, "reward": 0.59375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 93 }, { "completion_length": 681.34375, "epoch": 0.10026666666666667, "grad_norm": 0.1801004558801651, "kl": 0.005485057830810547, "learning_rate": 3e-06, "loss": 0.023, "reward": 0.59375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 94 }, { "completion_length": 546.9375, "epoch": 0.10133333333333333, "grad_norm": 0.11417558044195175, "kl": 0.002462148666381836, "learning_rate": 2.9999895838948146e-06, "loss": 0.0494, "reward": 0.84375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 95 }, { "completion_length": 629.78125, "epoch": 0.1024, "grad_norm": 0.1597270667552948, "kl": 0.005305886268615723, "learning_rate": 2.999958335723919e-06, "loss": 0.0141, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 96 }, { "completion_length": 654.09375, "epoch": 0.10346666666666667, "grad_norm": 0.20953911542892456, "kl": 0.0035321712493896484, "learning_rate": 2.9999062559212913e-06, "loss": 0.0933, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 97 }, { "completion_length": 645.96875, "epoch": 0.10453333333333334, "grad_norm": 0.3854478895664215, "kl": 0.0037326812744140625, "learning_rate": 2.9998333452102236e-06, "loss": 0.106, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 98 }, { "completion_length": 528.4375, "epoch": 0.1056, "grad_norm": 0.10113077610731125, "kl": 0.0036869049072265625, "learning_rate": 2.999739604603311e-06, "loss": 0.0863, "reward": 0.84375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 99 }, { "completion_length": 600.59375, "epoch": 0.10666666666666667, "grad_norm": 0.07654489576816559, "kl": 0.007133960723876953, "learning_rate": 2.9996250354024346e-06, "loss": 0.116, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 100 }, { "completion_length": 694.875, "epoch": 0.10773333333333333, "grad_norm": 0.1058415099978447, "kl": 0.0015850067138671875, "learning_rate": 2.9994896391987487e-06, "loss": 0.0409, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 101 }, { "completion_length": 519.40625, "epoch": 0.1088, "grad_norm": 0.32401278614997864, "kl": 0.015497922897338867, "learning_rate": 2.9993334178726546e-06, "loss": 0.1151, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 102 }, { "completion_length": 735.65625, "epoch": 0.10986666666666667, "grad_norm": 0.05726666375994682, "kl": 0.00200653076171875, "learning_rate": 2.9991563735937752e-06, "loss": 0.0686, "reward": 0.5625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 103 }, { "completion_length": 486.4375, "epoch": 0.11093333333333333, "grad_norm": 0.1388460099697113, "kl": 0.009634971618652344, "learning_rate": 2.9989585088209272e-06, "loss": 0.0103, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 104 }, { "completion_length": 529.46875, "epoch": 0.112, "grad_norm": 0.004236644599586725, "kl": 0.011536598205566406, "learning_rate": 2.9987398263020837e-06, "loss": 0.0225, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 105 }, { "completion_length": 698.78125, "epoch": 0.11306666666666666, "grad_norm": 0.08099101483821869, "kl": 0.0015056133270263672, "learning_rate": 2.998500329074338e-06, "loss": 0.0323, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 106 }, { "completion_length": 763.8125, "epoch": 0.11413333333333334, "grad_norm": 0.14171065390110016, "kl": 0.00232696533203125, "learning_rate": 2.9982400204638626e-06, "loss": 0.0627, "reward": 0.625, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 107 }, { "completion_length": 562.65625, "epoch": 0.1152, "grad_norm": 0.13639767467975616, "kl": 0.0029783248901367188, "learning_rate": 2.9979589040858586e-06, "loss": 0.037, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 108 }, { "completion_length": 731.25, "epoch": 0.11626666666666667, "grad_norm": 0.1746598780155182, "kl": 0.002180814743041992, "learning_rate": 2.9976569838445097e-06, "loss": 0.0767, "reward": 0.5, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 109 }, { "completion_length": 564.0, "epoch": 0.11733333333333333, "grad_norm": 0.08821848779916763, "kl": 0.01019287109375, "learning_rate": 2.997334263932927e-06, "loss": -0.0145, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 110 }, { "completion_length": 634.8125, "epoch": 0.1184, "grad_norm": 0.0024070264771580696, "kl": 0.005717277526855469, "learning_rate": 2.9969907488330905e-06, "loss": 0.0716, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 111 }, { "completion_length": 568.125, "epoch": 0.11946666666666667, "grad_norm": 0.15547393262386322, "kl": 0.002860546112060547, "learning_rate": 2.996626443315785e-06, "loss": 0.0622, "reward": 0.40625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 112 }, { "completion_length": 590.0625, "epoch": 0.12053333333333334, "grad_norm": 0.15792787075042725, "kl": 0.00521397590637207, "learning_rate": 2.996241352440537e-06, "loss": 0.0067, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 113 }, { "completion_length": 588.46875, "epoch": 0.1216, "grad_norm": 0.002936169970780611, "kl": 0.2660036087036133, "learning_rate": 2.9958354815555427e-06, "loss": -0.0274, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 114 }, { "completion_length": 724.8125, "epoch": 0.12266666666666666, "grad_norm": 0.13336944580078125, "kl": 0.005784511566162109, "learning_rate": 2.9954088362975936e-06, "loss": 0.0225, "reward": 0.375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 115 }, { "completion_length": 539.8125, "epoch": 0.12373333333333333, "grad_norm": 0.0030870395712554455, "kl": 0.005696773529052734, "learning_rate": 2.994961422591999e-06, "loss": 0.0668, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 116 }, { "completion_length": 490.46875, "epoch": 0.1248, "grad_norm": 0.07974762469530106, "kl": 0.0051708221435546875, "learning_rate": 2.994493246652504e-06, "loss": 0.0417, "reward": 0.65625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 117 }, { "completion_length": 746.3125, "epoch": 0.12586666666666665, "grad_norm": 0.16573244333267212, "kl": 0.0012717247009277344, "learning_rate": 2.9940043149812002e-06, "loss": 0.0738, "reward": 0.40625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 118 }, { "completion_length": 538.8125, "epoch": 0.12693333333333334, "grad_norm": 0.10567408800125122, "kl": 0.013469696044921875, "learning_rate": 2.9934946343684403e-06, "loss": -0.0122, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 119 }, { "completion_length": 540.78125, "epoch": 0.128, "grad_norm": 0.05424753576517105, "kl": 0.010761260986328125, "learning_rate": 2.99296421189274e-06, "loss": 0.0458, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 120 }, { "completion_length": 729.96875, "epoch": 0.12906666666666666, "grad_norm": 0.07586592435836792, "kl": 0.0016102790832519531, "learning_rate": 2.9924130549206804e-06, "loss": 0.0251, "reward": 0.40625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 121 }, { "completion_length": 611.53125, "epoch": 0.13013333333333332, "grad_norm": 0.09894312173128128, "kl": 0.004480838775634766, "learning_rate": 2.9918411711068073e-06, "loss": 0.0608, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 122 }, { "completion_length": 484.125, "epoch": 0.1312, "grad_norm": 0.08404692262411118, "kl": 0.005300998687744141, "learning_rate": 2.991248568393524e-06, "loss": 0.0183, "reward": 0.6875, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 123 }, { "completion_length": 670.625, "epoch": 0.13226666666666667, "grad_norm": 0.0011450131423771381, "kl": 0.010477542877197266, "learning_rate": 2.9906352550109787e-06, "loss": 0.0232, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 124 }, { "completion_length": 629.84375, "epoch": 0.13333333333333333, "grad_norm": 0.05011319741606712, "kl": 0.0061261653900146484, "learning_rate": 2.9900012394769546e-06, "loss": 0.0818, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 125 }, { "completion_length": 577.6875, "epoch": 0.1344, "grad_norm": 0.10489111393690109, "kl": 0.003220081329345703, "learning_rate": 2.989346530596748e-06, "loss": 0.0469, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 126 }, { "completion_length": 721.625, "epoch": 0.13546666666666668, "grad_norm": 0.06591711193323135, "kl": 0.005730867385864258, "learning_rate": 2.988671137463048e-06, "loss": 0.0084, "reward": 0.46875, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 127 }, { "completion_length": 720.34375, "epoch": 0.13653333333333334, "grad_norm": 0.019064543768763542, "kl": 0.009561538696289062, "learning_rate": 2.987975069455809e-06, "loss": 0.0578, "reward": 0.3125, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 128 }, { "completion_length": 504.125, "epoch": 0.1376, "grad_norm": 0.10247477144002914, "kl": 0.0068416595458984375, "learning_rate": 2.9872583362421204e-06, "loss": 0.0461, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 129 }, { "completion_length": 548.9375, "epoch": 0.13866666666666666, "grad_norm": 0.15603476762771606, "kl": 0.006947994232177734, "learning_rate": 2.986520947776075e-06, "loss": 0.1152, "reward": 0.75, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 130 }, { "completion_length": 507.09375, "epoch": 0.13973333333333332, "grad_norm": 0.2372616082429886, "kl": 0.006831169128417969, "learning_rate": 2.985762914298626e-06, "loss": 0.0682, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 131 }, { "completion_length": 578.84375, "epoch": 0.1408, "grad_norm": 0.0018290270818397403, "kl": 0.009470939636230469, "learning_rate": 2.984984246337449e-06, "loss": 0.0107, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 132 }, { "completion_length": 425.40625, "epoch": 0.14186666666666667, "grad_norm": 0.06826941668987274, "kl": 0.017541885375976562, "learning_rate": 2.9841849547067944e-06, "loss": 0.0104, "reward": 0.84375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 133 }, { "completion_length": 645.25, "epoch": 0.14293333333333333, "grad_norm": 0.0038887031842023134, "kl": 0.0046100616455078125, "learning_rate": 2.983365050507336e-06, "loss": 0.0654, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 134 }, { "completion_length": 661.15625, "epoch": 0.144, "grad_norm": 0.09491920471191406, "kl": 0.0027337074279785156, "learning_rate": 2.982524545126018e-06, "loss": -0.017, "reward": 0.3125, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 135 }, { "completion_length": 502.9375, "epoch": 0.14506666666666668, "grad_norm": 0.18258221447467804, "kl": 0.005644798278808594, "learning_rate": 2.9816634502358974e-06, "loss": 0.0469, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 136 }, { "completion_length": 567.84375, "epoch": 0.14613333333333334, "grad_norm": 0.0877852812409401, "kl": 0.0034530162811279297, "learning_rate": 2.980781777795981e-06, "loss": -0.0279, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 137 }, { "completion_length": 479.625, "epoch": 0.1472, "grad_norm": 0.0036170221865177155, "kl": 0.006432056427001953, "learning_rate": 2.979879540051059e-06, "loss": 0.0386, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 138 }, { "completion_length": 630.46875, "epoch": 0.14826666666666666, "grad_norm": 0.0020105745643377304, "kl": 0.002666473388671875, "learning_rate": 2.978956749531536e-06, "loss": -0.0125, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 139 }, { "completion_length": 418.28125, "epoch": 0.14933333333333335, "grad_norm": 0.001964523224160075, "kl": 0.008090019226074219, "learning_rate": 2.9780134190532553e-06, "loss": -0.0047, "reward": 0.90625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 140 }, { "completion_length": 591.375, "epoch": 0.1504, "grad_norm": 0.0030624843202531338, "kl": 0.0066242218017578125, "learning_rate": 2.977049561717324e-06, "loss": 0.0046, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 141 }, { "completion_length": 678.0625, "epoch": 0.15146666666666667, "grad_norm": 0.139149472117424, "kl": 0.0038056373596191406, "learning_rate": 2.976065190909927e-06, "loss": 0.0601, "reward": 0.4375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 142 }, { "completion_length": 754.59375, "epoch": 0.15253333333333333, "grad_norm": 0.06360428035259247, "kl": 0.008878469467163086, "learning_rate": 2.975060320302145e-06, "loss": 0.024, "reward": 0.40625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 143 }, { "completion_length": 569.28125, "epoch": 0.1536, "grad_norm": 0.16685007512569427, "kl": 0.008013725280761719, "learning_rate": 2.9740349638497614e-06, "loss": 0.0144, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 144 }, { "completion_length": 543.40625, "epoch": 0.15466666666666667, "grad_norm": 0.08345290273427963, "kl": 0.0031375885009765625, "learning_rate": 2.972989135793071e-06, "loss": 0.0231, "reward": 0.53125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 145 }, { "completion_length": 525.375, "epoch": 0.15573333333333333, "grad_norm": 0.0029050582088530064, "kl": 0.006566047668457031, "learning_rate": 2.971922850656679e-06, "loss": 0.0378, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 146 }, { "completion_length": 506.25, "epoch": 0.1568, "grad_norm": 0.0039052043575793505, "kl": 0.004822731018066406, "learning_rate": 2.970836123249305e-06, "loss": 0.0381, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 147 }, { "completion_length": 698.9375, "epoch": 0.15786666666666666, "grad_norm": 0.12920227646827698, "kl": 0.002834320068359375, "learning_rate": 2.9697289686635704e-06, "loss": 0.0501, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 148 }, { "completion_length": 614.0625, "epoch": 0.15893333333333334, "grad_norm": 0.0018531116656959057, "kl": 0.006834506988525391, "learning_rate": 2.9686014022757936e-06, "loss": -0.0001, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 149 }, { "completion_length": 572.90625, "epoch": 0.16, "grad_norm": 0.09835998713970184, "kl": 0.004143714904785156, "learning_rate": 2.967453439745775e-06, "loss": 0.1111, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 150 }, { "completion_length": 625.5, "epoch": 0.16106666666666666, "grad_norm": 0.1304803192615509, "kl": 0.0048160552978515625, "learning_rate": 2.9662850970165785e-06, "loss": -0.0357, "reward": 0.4375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 151 }, { "completion_length": 599.34375, "epoch": 0.16213333333333332, "grad_norm": 0.07034818083047867, "kl": 0.005354404449462891, "learning_rate": 2.9650963903143124e-06, "loss": 0.0744, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 152 }, { "completion_length": 553.3125, "epoch": 0.1632, "grad_norm": 0.10030053555965424, "kl": 0.0057353973388671875, "learning_rate": 2.9638873361479016e-06, "loss": 0.0123, "reward": 0.625, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 153 }, { "completion_length": 700.78125, "epoch": 0.16426666666666667, "grad_norm": 0.06664158403873444, "kl": 0.0026879310607910156, "learning_rate": 2.9626579513088605e-06, "loss": 0.0247, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 154 }, { "completion_length": 561.40625, "epoch": 0.16533333333333333, "grad_norm": 0.0017615946708247066, "kl": 0.0030694007873535156, "learning_rate": 2.961408252871058e-06, "loss": 0.0756, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 155 }, { "completion_length": 533.25, "epoch": 0.1664, "grad_norm": 0.0999365895986557, "kl": 0.007275581359863281, "learning_rate": 2.9601382581904815e-06, "loss": -0.013, "reward": 0.65625, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 156 }, { "completion_length": 639.65625, "epoch": 0.16746666666666668, "grad_norm": 0.10413453727960587, "kl": 0.004925727844238281, "learning_rate": 2.958847984904994e-06, "loss": 0.0381, "reward": 0.5, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 157 }, { "completion_length": 475.03125, "epoch": 0.16853333333333334, "grad_norm": 0.14536528289318085, "kl": 0.005907535552978516, "learning_rate": 2.9575374509340937e-06, "loss": -0.0522, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 158 }, { "completion_length": 666.1875, "epoch": 0.1696, "grad_norm": 0.0015069888904690742, "kl": 0.004468441009521484, "learning_rate": 2.9562066744786588e-06, "loss": 0.0073, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 159 }, { "completion_length": 629.3125, "epoch": 0.17066666666666666, "grad_norm": 0.11236796528100967, "kl": 0.004606723785400391, "learning_rate": 2.9548556740207e-06, "loss": 0.13, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 160 }, { "completion_length": 586.4375, "epoch": 0.17173333333333332, "grad_norm": 0.14352257549762726, "kl": 0.002552032470703125, "learning_rate": 2.9534844683231005e-06, "loss": 0.0915, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 161 }, { "completion_length": 631.1875, "epoch": 0.1728, "grad_norm": 0.06777487695217133, "kl": 0.004531383514404297, "learning_rate": 2.9520930764293584e-06, "loss": -0.0037, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 162 }, { "completion_length": 461.875, "epoch": 0.17386666666666667, "grad_norm": 0.16525107622146606, "kl": 0.005507469177246094, "learning_rate": 2.9506815176633184e-06, "loss": 0.0071, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 163 }, { "completion_length": 436.03125, "epoch": 0.17493333333333333, "grad_norm": 0.2134162187576294, "kl": 0.0047397613525390625, "learning_rate": 2.949249811628907e-06, "loss": 0.0259, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 164 }, { "completion_length": 684.0625, "epoch": 0.176, "grad_norm": 0.13471823930740356, "kl": 0.002166271209716797, "learning_rate": 2.9477979782098592e-06, "loss": -0.0069, "reward": 0.40625, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 165 }, { "completion_length": 459.84375, "epoch": 0.17706666666666668, "grad_norm": 0.011194470338523388, "kl": 0.011411666870117188, "learning_rate": 2.94632603756944e-06, "loss": 0.065, "reward": 0.78125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 166 }, { "completion_length": 618.125, "epoch": 0.17813333333333334, "grad_norm": 0.13892412185668945, "kl": 0.004021644592285156, "learning_rate": 2.9448340101501676e-06, "loss": 0.02, "reward": 0.53125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 167 }, { "completion_length": 705.21875, "epoch": 0.1792, "grad_norm": 0.00197379058226943, "kl": 0.004115104675292969, "learning_rate": 2.9433219166735286e-06, "loss": 0.02, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 168 }, { "completion_length": 648.0, "epoch": 0.18026666666666666, "grad_norm": 0.07838324457406998, "kl": 0.004750251770019531, "learning_rate": 2.9417897781396884e-06, "loss": 0.0825, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 169 }, { "completion_length": 721.875, "epoch": 0.18133333333333335, "grad_norm": 0.0021305799018591642, "kl": 0.004076480865478516, "learning_rate": 2.9402376158272022e-06, "loss": 0.0045, "reward": 0.46875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 170 }, { "completion_length": 658.40625, "epoch": 0.1824, "grad_norm": 0.0018165241926908493, "kl": 0.005099773406982422, "learning_rate": 2.938665451292719e-06, "loss": 0.0479, "reward": 0.65625, "reward_std": 0.3125, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 171 }, { "completion_length": 574.0, "epoch": 0.18346666666666667, "grad_norm": 0.10827412456274033, "kl": 0.014039993286132812, "learning_rate": 2.937073306370679e-06, "loss": 0.0214, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 172 }, { "completion_length": 717.90625, "epoch": 0.18453333333333333, "grad_norm": 0.00154064258094877, "kl": 0.0022192001342773438, "learning_rate": 2.9354612031730146e-06, "loss": 0.0271, "reward": 0.65625, "reward_std": 0.3125, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 173 }, { "completion_length": 536.71875, "epoch": 0.1856, "grad_norm": 0.0032084956765174866, "kl": 0.0038423538208007812, "learning_rate": 2.933829164088841e-06, "loss": 0.0036, "reward": 0.6875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 174 }, { "completion_length": 554.5, "epoch": 0.18666666666666668, "grad_norm": 0.1838374137878418, "kl": 0.0036420822143554688, "learning_rate": 2.9321772117841463e-06, "loss": 0.0725, "reward": 0.6875, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 175 }, { "completion_length": 691.75, "epoch": 0.18773333333333334, "grad_norm": 0.10326944291591644, "kl": 0.0031805038452148438, "learning_rate": 2.9305053692014753e-06, "loss": -0.0362, "reward": 0.5, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 176 }, { "completion_length": 607.59375, "epoch": 0.1888, "grad_norm": 0.12695921957492828, "kl": 0.004214286804199219, "learning_rate": 2.928813659559612e-06, "loss": 0.0286, "reward": 0.8125, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 177 }, { "completion_length": 557.59375, "epoch": 0.18986666666666666, "grad_norm": 0.0013459983747452497, "kl": 0.008190155029296875, "learning_rate": 2.9271021063532586e-06, "loss": -0.0094, "reward": 0.78125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 178 }, { "completion_length": 503.6875, "epoch": 0.19093333333333334, "grad_norm": 0.22505255043506622, "kl": 0.005129814147949219, "learning_rate": 2.925370733352704e-06, "loss": 0.1088, "reward": 0.8125, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 179 }, { "completion_length": 652.03125, "epoch": 0.192, "grad_norm": 0.154541477560997, "kl": 0.003907203674316406, "learning_rate": 2.923619564603501e-06, "loss": 0.1182, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 180 }, { "completion_length": 488.0, "epoch": 0.19306666666666666, "grad_norm": 0.1373697966337204, "kl": 0.01980304718017578, "learning_rate": 2.921848624426126e-06, "loss": 0.0447, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 181 }, { "completion_length": 639.28125, "epoch": 0.19413333333333332, "grad_norm": 0.12526844441890717, "kl": 0.005747795104980469, "learning_rate": 2.9200579374156446e-06, "loss": 0.0151, "reward": 0.34375, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 182 }, { "completion_length": 611.6875, "epoch": 0.1952, "grad_norm": 0.11296387016773224, "kl": 0.006075859069824219, "learning_rate": 2.918247528441369e-06, "loss": 0.0344, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 183 }, { "completion_length": 480.71875, "epoch": 0.19626666666666667, "grad_norm": 0.18043623864650726, "kl": 0.0038557052612304688, "learning_rate": 2.9164174226465136e-06, "loss": 0.1185, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 184 }, { "completion_length": 530.8125, "epoch": 0.19733333333333333, "grad_norm": 0.05809389427304268, "kl": 0.007172584533691406, "learning_rate": 2.9145676454478435e-06, "loss": -0.0388, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 185 }, { "completion_length": 597.625, "epoch": 0.1984, "grad_norm": 0.1875530332326889, "kl": 0.010850191116333008, "learning_rate": 2.912698222535324e-06, "loss": 0.1738, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 186 }, { "completion_length": 461.40625, "epoch": 0.19946666666666665, "grad_norm": 0.006133346818387508, "kl": 0.006644248962402344, "learning_rate": 2.9108091798717634e-06, "loss": -0.0304, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 187 }, { "completion_length": 626.78125, "epoch": 0.20053333333333334, "grad_norm": 0.0056674424558877945, "kl": 0.004472255706787109, "learning_rate": 2.9089005436924505e-06, "loss": 0.0138, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 188 }, { "completion_length": 621.84375, "epoch": 0.2016, "grad_norm": 0.1068946048617363, "kl": 0.00412750244140625, "learning_rate": 2.9069723405047926e-06, "loss": 0.098, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 189 }, { "completion_length": 462.34375, "epoch": 0.20266666666666666, "grad_norm": 0.11013924330472946, "kl": 0.008099555969238281, "learning_rate": 2.9050245970879456e-06, "loss": -0.0547, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 190 }, { "completion_length": 523.8125, "epoch": 0.20373333333333332, "grad_norm": 0.10962624102830887, "kl": 0.00847625732421875, "learning_rate": 2.903057340492444e-06, "loss": 0.1092, "reward": 0.78125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 191 }, { "completion_length": 505.6875, "epoch": 0.2048, "grad_norm": 0.07207497209310532, "kl": 0.00647735595703125, "learning_rate": 2.901070598039822e-06, "loss": 0.0652, "reward": 0.9375, "reward_std": 0.125, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 192 }, { "completion_length": 567.28125, "epoch": 0.20586666666666667, "grad_norm": 0.15118834376335144, "kl": 0.0045413970947265625, "learning_rate": 2.8990643973222383e-06, "loss": 0.069, "reward": 0.8125, "reward_std": 0.375, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 193 }, { "completion_length": 635.9375, "epoch": 0.20693333333333333, "grad_norm": 0.1937543749809265, "kl": 0.00350189208984375, "learning_rate": 2.89703876620209e-06, "loss": 0.0139, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 194 }, { "completion_length": 603.21875, "epoch": 0.208, "grad_norm": 0.122174471616745, "kl": 0.003726959228515625, "learning_rate": 2.8949937328116252e-06, "loss": 0.0403, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 195 }, { "completion_length": 624.625, "epoch": 0.20906666666666668, "grad_norm": 0.004428999498486519, "kl": 0.008460044860839844, "learning_rate": 2.8929293255525563e-06, "loss": -0.0415, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 196 }, { "completion_length": 599.3125, "epoch": 0.21013333333333334, "grad_norm": 0.07916853576898575, "kl": 0.00289154052734375, "learning_rate": 2.8908455730956588e-06, "loss": 0.0586, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 197 }, { "completion_length": 616.59375, "epoch": 0.2112, "grad_norm": 0.07020086795091629, "kl": 0.0027709007263183594, "learning_rate": 2.88874250438038e-06, "loss": 0.1484, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 198 }, { "completion_length": 439.40625, "epoch": 0.21226666666666666, "grad_norm": 0.0017586946487426758, "kl": 0.005031585693359375, "learning_rate": 2.8866201486144333e-06, "loss": 0.0544, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 199 }, { "completion_length": 460.90625, "epoch": 0.21333333333333335, "grad_norm": 0.09752950072288513, "kl": 0.046166419982910156, "learning_rate": 2.884478535273393e-06, "loss": -0.0106, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 200 }, { "completion_length": 561.78125, "epoch": 0.2144, "grad_norm": 0.0021027277689427137, "kl": 0.004765510559082031, "learning_rate": 2.8823176941002853e-06, "loss": 0.0569, "reward": 0.59375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 201 }, { "completion_length": 575.3125, "epoch": 0.21546666666666667, "grad_norm": 0.06180017068982124, "kl": 0.004106044769287109, "learning_rate": 2.880137655105176e-06, "loss": 0.0125, "reward": 0.65625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 202 }, { "completion_length": 561.6875, "epoch": 0.21653333333333333, "grad_norm": 0.08899540454149246, "kl": 0.012461662292480469, "learning_rate": 2.877938448564752e-06, "loss": -0.0105, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 203 }, { "completion_length": 543.21875, "epoch": 0.2176, "grad_norm": 0.07216367870569229, "kl": 0.007869720458984375, "learning_rate": 2.875720105021903e-06, "loss": -0.0282, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 204 }, { "completion_length": 474.21875, "epoch": 0.21866666666666668, "grad_norm": 0.16203008592128754, "kl": 0.005039215087890625, "learning_rate": 2.8734826552852934e-06, "loss": -0.0154, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 205 }, { "completion_length": 617.25, "epoch": 0.21973333333333334, "grad_norm": 0.08787223696708679, "kl": 0.009008407592773438, "learning_rate": 2.8712261304289407e-06, "loss": 0.0384, "reward": 0.78125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 206 }, { "completion_length": 632.25, "epoch": 0.2208, "grad_norm": 0.0019053228897973895, "kl": 0.004804134368896484, "learning_rate": 2.868950561791778e-06, "loss": -0.0296, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 207 }, { "completion_length": 602.9375, "epoch": 0.22186666666666666, "grad_norm": 0.12758515775203705, "kl": 0.004595756530761719, "learning_rate": 2.8666559809772215e-06, "loss": -0.026, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 208 }, { "completion_length": 574.84375, "epoch": 0.22293333333333334, "grad_norm": 0.5817623138427734, "kl": 0.03203868865966797, "learning_rate": 2.8643424198527314e-06, "loss": 0.0355, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 209 }, { "completion_length": 440.71875, "epoch": 0.224, "grad_norm": 0.17380958795547485, "kl": 0.005917549133300781, "learning_rate": 2.86200991054937e-06, "loss": -0.0175, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 210 }, { "completion_length": 567.4375, "epoch": 0.22506666666666666, "grad_norm": 0.12089383602142334, "kl": 0.0047588348388671875, "learning_rate": 2.8596584854613513e-06, "loss": 0.0339, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 211 }, { "completion_length": 563.9375, "epoch": 0.22613333333333333, "grad_norm": 0.10379793494939804, "kl": 0.008982658386230469, "learning_rate": 2.8572881772455993e-06, "loss": -0.0024, "reward": 0.84375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 212 }, { "completion_length": 536.15625, "epoch": 0.2272, "grad_norm": 0.13498139381408691, "kl": 0.0033588409423828125, "learning_rate": 2.8548990188212853e-06, "loss": 0.0437, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 213 }, { "completion_length": 613.65625, "epoch": 0.22826666666666667, "grad_norm": 0.1013210266828537, "kl": 0.010851860046386719, "learning_rate": 2.852491043369377e-06, "loss": 0.0671, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 214 }, { "completion_length": 644.75, "epoch": 0.22933333333333333, "grad_norm": 0.11970150470733643, "kl": 0.004528045654296875, "learning_rate": 2.850064284332176e-06, "loss": -0.0098, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 215 }, { "completion_length": 618.4375, "epoch": 0.2304, "grad_norm": 0.22354348003864288, "kl": 0.004315376281738281, "learning_rate": 2.847618775412851e-06, "loss": 0.1289, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 216 }, { "completion_length": 422.625, "epoch": 0.23146666666666665, "grad_norm": 0.591202974319458, "kl": 0.027876853942871094, "learning_rate": 2.845154550574973e-06, "loss": 0.0098, "reward": 0.875, "reward_std": 0.125, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 217 }, { "completion_length": 621.1875, "epoch": 0.23253333333333334, "grad_norm": 0.1075991615653038, "kl": 0.0060787200927734375, "learning_rate": 2.842671644042043e-06, "loss": 0.1004, "reward": 0.5, "reward_std": 0.375, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 218 }, { "completion_length": 598.78125, "epoch": 0.2336, "grad_norm": 0.19930097460746765, "kl": 0.004418373107910156, "learning_rate": 2.840170090297014e-06, "loss": -0.0109, "reward": 0.59375, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 219 }, { "completion_length": 591.46875, "epoch": 0.23466666666666666, "grad_norm": 0.16848641633987427, "kl": 0.0037813186645507812, "learning_rate": 2.8376499240818166e-06, "loss": -0.0302, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 220 }, { "completion_length": 599.0, "epoch": 0.23573333333333332, "grad_norm": 0.10026425868272781, "kl": 0.015532493591308594, "learning_rate": 2.8351111803968714e-06, "loss": -0.0194, "reward": 0.65625, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 221 }, { "completion_length": 625.3125, "epoch": 0.2368, "grad_norm": 0.07597433030605316, "kl": 0.0037522315979003906, "learning_rate": 2.8325538945006067e-06, "loss": -0.0504, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 222 }, { "completion_length": 761.625, "epoch": 0.23786666666666667, "grad_norm": 0.00153819948900491, "kl": 0.0028448104858398438, "learning_rate": 2.829978101908969e-06, "loss": 0.0202, "reward": 0.53125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 223 }, { "completion_length": 579.6875, "epoch": 0.23893333333333333, "grad_norm": 0.17116552591323853, "kl": 0.0045032501220703125, "learning_rate": 2.827383838394926e-06, "loss": 0.042, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 224 }, { "completion_length": 477.15625, "epoch": 0.24, "grad_norm": 0.014179944060742855, "kl": 0.008158683776855469, "learning_rate": 2.8247711399879734e-06, "loss": 0.0285, "reward": 0.5625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 225 }, { "completion_length": 623.625, "epoch": 0.24106666666666668, "grad_norm": 0.2298433631658554, "kl": 0.006934165954589844, "learning_rate": 2.8221400429736333e-06, "loss": -0.0896, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 226 }, { "completion_length": 512.4375, "epoch": 0.24213333333333334, "grad_norm": 0.041340421885252, "kl": 0.011095046997070312, "learning_rate": 2.81949058389295e-06, "loss": 0.0359, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 227 }, { "completion_length": 501.6875, "epoch": 0.2432, "grad_norm": 0.11379531770944595, "kl": 0.006816864013671875, "learning_rate": 2.8168227995419826e-06, "loss": 0.0128, "reward": 0.65625, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 228 }, { "completion_length": 535.28125, "epoch": 0.24426666666666666, "grad_norm": 0.11542847752571106, "kl": 0.006154060363769531, "learning_rate": 2.8141367269712943e-06, "loss": 0.0604, "reward": 0.53125, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 229 }, { "completion_length": 613.3125, "epoch": 0.24533333333333332, "grad_norm": 0.0016841450706124306, "kl": 0.0031375885009765625, "learning_rate": 2.8114324034854378e-06, "loss": 0.0489, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 230 }, { "completion_length": 582.34375, "epoch": 0.2464, "grad_norm": 0.06516903638839722, "kl": 0.006449699401855469, "learning_rate": 2.808709866642437e-06, "loss": -0.0082, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 231 }, { "completion_length": 456.15625, "epoch": 0.24746666666666667, "grad_norm": 0.1051865741610527, "kl": 0.005875587463378906, "learning_rate": 2.8059691542532654e-06, "loss": -0.0343, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 232 }, { "completion_length": 563.25, "epoch": 0.24853333333333333, "grad_norm": 0.1445797234773636, "kl": 0.004630088806152344, "learning_rate": 2.8032103043813213e-06, "loss": 0.1038, "reward": 0.65625, "reward_std": 0.3125, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 233 }, { "completion_length": 480.84375, "epoch": 0.2496, "grad_norm": 0.10349083691835403, "kl": 0.0063934326171875, "learning_rate": 2.800433355341898e-06, "loss": -0.0441, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 234 }, { "completion_length": 501.03125, "epoch": 0.25066666666666665, "grad_norm": 0.2040635049343109, "kl": 0.009173393249511719, "learning_rate": 2.7976383457016535e-06, "loss": -0.0513, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 235 }, { "completion_length": 492.84375, "epoch": 0.2517333333333333, "grad_norm": 0.0037540006451308727, "kl": 0.009112358093261719, "learning_rate": 2.7948253142780738e-06, "loss": -0.009, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 236 }, { "completion_length": 614.09375, "epoch": 0.2528, "grad_norm": 0.037114452570676804, "kl": 0.01329803466796875, "learning_rate": 2.791994300138934e-06, "loss": -0.0135, "reward": 0.375, "reward_std": 0.125, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 237 }, { "completion_length": 514.71875, "epoch": 0.2538666666666667, "grad_norm": 0.01824725978076458, "kl": 0.0073871612548828125, "learning_rate": 2.789145342601755e-06, "loss": -0.0542, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 238 }, { "completion_length": 673.6875, "epoch": 0.25493333333333335, "grad_norm": 0.12610283493995667, "kl": 0.006556510925292969, "learning_rate": 2.786278481233259e-06, "loss": 0.0654, "reward": 0.59375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 239 }, { "completion_length": 734.375, "epoch": 0.256, "grad_norm": 0.002139654476195574, "kl": 0.00655364990234375, "learning_rate": 2.7833937558488187e-06, "loss": -0.0351, "reward": 0.3125, "reward_std": 0.125, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 240 }, { "completion_length": 444.40625, "epoch": 0.25706666666666667, "grad_norm": 0.008942971006035805, "kl": 0.007976531982421875, "learning_rate": 2.7804912065119048e-06, "loss": -0.0115, "reward": 0.78125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 241 }, { "completion_length": 490.0, "epoch": 0.2581333333333333, "grad_norm": 0.002351433737203479, "kl": 0.008146286010742188, "learning_rate": 2.777570873533529e-06, "loss": -0.0426, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 242 }, { "completion_length": 547.4375, "epoch": 0.2592, "grad_norm": 0.1274539828300476, "kl": 0.006899833679199219, "learning_rate": 2.7746327974716863e-06, "loss": -0.0159, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 243 }, { "completion_length": 524.4375, "epoch": 0.26026666666666665, "grad_norm": 0.11112137138843536, "kl": 0.012063026428222656, "learning_rate": 2.7716770191307885e-06, "loss": 0.082, "reward": 0.65625, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 244 }, { "completion_length": 525.8125, "epoch": 0.2613333333333333, "grad_norm": 0.003108512843027711, "kl": 0.005977630615234375, "learning_rate": 2.7687035795611003e-06, "loss": 0.1002, "reward": 0.84375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 245 }, { "completion_length": 533.5625, "epoch": 0.2624, "grad_norm": 0.2792525291442871, "kl": 0.010778427124023438, "learning_rate": 2.7657125200581663e-06, "loss": 0.0834, "reward": 0.5, "reward_std": 0.375, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 246 }, { "completion_length": 630.34375, "epoch": 0.2634666666666667, "grad_norm": 0.14573414623737335, "kl": 0.009063720703125, "learning_rate": 2.7627038821622417e-06, "loss": 0.0779, "reward": 0.53125, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 247 }, { "completion_length": 631.21875, "epoch": 0.26453333333333334, "grad_norm": 0.09727001935243607, "kl": 0.004515647888183594, "learning_rate": 2.7596777076577106e-06, "loss": 0.0726, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 248 }, { "completion_length": 514.90625, "epoch": 0.2656, "grad_norm": 0.15672162175178528, "kl": 0.0076141357421875, "learning_rate": 2.7566340385725087e-06, "loss": -0.0403, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 249 }, { "completion_length": 576.875, "epoch": 0.26666666666666666, "grad_norm": 0.10046457499265671, "kl": 0.0033979415893554688, "learning_rate": 2.7535729171775408e-06, "loss": 0.0049, "reward": 0.4375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 250 }, { "completion_length": 533.875, "epoch": 0.2677333333333333, "grad_norm": 0.07560133188962936, "kl": 0.0047931671142578125, "learning_rate": 2.7504943859860883e-06, "loss": 0.0152, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 251 }, { "completion_length": 598.3125, "epoch": 0.2688, "grad_norm": 0.0016732144868001342, "kl": 0.1126089096069336, "learning_rate": 2.7473984877532248e-06, "loss": -0.0312, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 252 }, { "completion_length": 563.9375, "epoch": 0.26986666666666664, "grad_norm": 0.06384222209453583, "kl": 0.011692047119140625, "learning_rate": 2.7442852654752197e-06, "loss": 0.1401, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 253 }, { "completion_length": 525.4375, "epoch": 0.27093333333333336, "grad_norm": 0.0029080319218337536, "kl": 0.0073699951171875, "learning_rate": 2.74115476238894e-06, "loss": -0.0117, "reward": 0.90625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 254 }, { "completion_length": 545.875, "epoch": 0.272, "grad_norm": 0.08067440241575241, "kl": 0.0034770965576171875, "learning_rate": 2.7380070219712514e-06, "loss": 0.035, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 255 }, { "completion_length": 523.03125, "epoch": 0.2730666666666667, "grad_norm": 0.23288778960704803, "kl": 0.007964134216308594, "learning_rate": 2.734842087938415e-06, "loss": 0.044, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 256 }, { "completion_length": 440.40625, "epoch": 0.27413333333333334, "grad_norm": 0.002510709222406149, "kl": 0.007568359375, "learning_rate": 2.731660004245478e-06, "loss": -0.0648, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 257 }, { "completion_length": 515.3125, "epoch": 0.2752, "grad_norm": 0.19114403426647186, "kl": 0.007274627685546875, "learning_rate": 2.728460815085665e-06, "loss": -0.0286, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 258 }, { "completion_length": 730.3125, "epoch": 0.27626666666666666, "grad_norm": 0.06618313491344452, "kl": 0.008176803588867188, "learning_rate": 2.725244564889764e-06, "loss": 0.0367, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 259 }, { "completion_length": 611.5, "epoch": 0.2773333333333333, "grad_norm": 0.065017931163311, "kl": 0.0039215087890625, "learning_rate": 2.722011298325509e-06, "loss": 0.0041, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 260 }, { "completion_length": 444.40625, "epoch": 0.2784, "grad_norm": 0.1477704495191574, "kl": 0.007457733154296875, "learning_rate": 2.7187610602969586e-06, "loss": 0.0031, "reward": 0.84375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 261 }, { "completion_length": 706.90625, "epoch": 0.27946666666666664, "grad_norm": 0.0019224324496462941, "kl": 0.004256248474121094, "learning_rate": 2.7154938959438756e-06, "loss": 0.0092, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 262 }, { "completion_length": 577.90625, "epoch": 0.28053333333333336, "grad_norm": 0.126279816031456, "kl": 0.006718635559082031, "learning_rate": 2.7122098506410955e-06, "loss": 0.0676, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 263 }, { "completion_length": 618.78125, "epoch": 0.2816, "grad_norm": 0.1233547031879425, "kl": 0.005222320556640625, "learning_rate": 2.7089089699979008e-06, "loss": 0.0525, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 264 }, { "completion_length": 502.90625, "epoch": 0.2826666666666667, "grad_norm": 0.003951345570385456, "kl": 0.0037746429443359375, "learning_rate": 2.705591299857385e-06, "loss": 0.0091, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 265 }, { "completion_length": 586.28125, "epoch": 0.28373333333333334, "grad_norm": 0.10236603021621704, "kl": 0.007382392883300781, "learning_rate": 2.7022568862958153e-06, "loss": 0.0194, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 266 }, { "completion_length": 745.71875, "epoch": 0.2848, "grad_norm": 0.004417740274220705, "kl": 0.0034036636352539062, "learning_rate": 2.6989057756219958e-06, "loss": 0.0207, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 267 }, { "completion_length": 464.65625, "epoch": 0.28586666666666666, "grad_norm": 0.005591324530541897, "kl": 0.0063934326171875, "learning_rate": 2.6955380143766217e-06, "loss": 0.0537, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 268 }, { "completion_length": 609.375, "epoch": 0.2869333333333333, "grad_norm": 0.002050150418654084, "kl": 0.004497528076171875, "learning_rate": 2.6921536493316326e-06, "loss": 0.0115, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 269 }, { "completion_length": 585.6875, "epoch": 0.288, "grad_norm": 0.06371308118104935, "kl": 0.005126953125, "learning_rate": 2.6887527274895657e-06, "loss": 0.058, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 270 }, { "completion_length": 440.5, "epoch": 0.2890666666666667, "grad_norm": 0.0054902005940675735, "kl": 0.0067691802978515625, "learning_rate": 2.6853352960829e-06, "loss": -0.0075, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 271 }, { "completion_length": 462.0625, "epoch": 0.29013333333333335, "grad_norm": 0.057916730642318726, "kl": 0.006755828857421875, "learning_rate": 2.6819014025734022e-06, "loss": -0.0314, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 272 }, { "completion_length": 668.84375, "epoch": 0.2912, "grad_norm": 0.053568918257951736, "kl": 0.005808830261230469, "learning_rate": 2.678451094651467e-06, "loss": 0.021, "reward": 0.78125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 273 }, { "completion_length": 658.65625, "epoch": 0.2922666666666667, "grad_norm": 0.08184046298265457, "kl": 0.005383491516113281, "learning_rate": 2.6749844202354553e-06, "loss": 0.0068, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 274 }, { "completion_length": 470.25, "epoch": 0.29333333333333333, "grad_norm": 0.00530163012444973, "kl": 0.006487846374511719, "learning_rate": 2.6715014274710265e-06, "loss": 0.0169, "reward": 0.8125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 275 }, { "completion_length": 480.5, "epoch": 0.2944, "grad_norm": 0.004920989740639925, "kl": 0.013940811157226562, "learning_rate": 2.6680021647304735e-06, "loss": -0.0437, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 276 }, { "completion_length": 667.34375, "epoch": 0.29546666666666666, "grad_norm": 0.08716274797916412, "kl": 0.006110191345214844, "learning_rate": 2.6644866806120474e-06, "loss": 0.0186, "reward": 0.40625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 277 }, { "completion_length": 546.21875, "epoch": 0.2965333333333333, "grad_norm": 0.11957955360412598, "kl": 0.00678253173828125, "learning_rate": 2.6609550239392854e-06, "loss": 0.0193, "reward": 0.8125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 278 }, { "completion_length": 566.34375, "epoch": 0.2976, "grad_norm": 0.1861569881439209, "kl": 0.007904052734375, "learning_rate": 2.65740724376033e-06, "loss": -0.0528, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 279 }, { "completion_length": 649.15625, "epoch": 0.2986666666666667, "grad_norm": 0.002395899035036564, "kl": 0.005535125732421875, "learning_rate": 2.65384338934725e-06, "loss": 0.0591, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 280 }, { "completion_length": 633.15625, "epoch": 0.29973333333333335, "grad_norm": 0.0021419432014226913, "kl": 0.006155967712402344, "learning_rate": 2.6502635101953553e-06, "loss": 0.0448, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 281 }, { "completion_length": 694.65625, "epoch": 0.3008, "grad_norm": 0.1338863968849182, "kl": 0.0050792694091796875, "learning_rate": 2.64666765602251e-06, "loss": -0.0026, "reward": 0.6875, "reward_std": 0.4471687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 282 }, { "completion_length": 621.625, "epoch": 0.30186666666666667, "grad_norm": 0.07547681033611298, "kl": 0.004855155944824219, "learning_rate": 2.6430558767684408e-06, "loss": 0.0393, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 283 }, { "completion_length": 486.21875, "epoch": 0.30293333333333333, "grad_norm": 0.2223091870546341, "kl": 0.009084701538085938, "learning_rate": 2.6394282225940447e-06, "loss": 0.0436, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 284 }, { "completion_length": 571.375, "epoch": 0.304, "grad_norm": 0.14504198729991913, "kl": 0.003875732421875, "learning_rate": 2.6357847438806916e-06, "loss": 0.1292, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 285 }, { "completion_length": 531.59375, "epoch": 0.30506666666666665, "grad_norm": 0.002932249801233411, "kl": 0.0062656402587890625, "learning_rate": 2.6321254912295243e-06, "loss": 0.0251, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 286 }, { "completion_length": 614.9375, "epoch": 0.3061333333333333, "grad_norm": 0.09579221904277802, "kl": 0.003204345703125, "learning_rate": 2.628450515460758e-06, "loss": -0.0091, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 287 }, { "completion_length": 664.5, "epoch": 0.3072, "grad_norm": 0.0020599663257598877, "kl": 0.004374504089355469, "learning_rate": 2.624759867612971e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 288 }, { "completion_length": 590.75, "epoch": 0.3082666666666667, "grad_norm": 0.38902053236961365, "kl": 0.040485382080078125, "learning_rate": 2.621053598942398e-06, "loss": 0.069, "reward": 0.75, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 289 }, { "completion_length": 689.1875, "epoch": 0.30933333333333335, "grad_norm": 0.0032320290338248014, "kl": 0.005346775054931641, "learning_rate": 2.617331760922218e-06, "loss": 0.0218, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 290 }, { "completion_length": 515.65625, "epoch": 0.3104, "grad_norm": 0.00312280235812068, "kl": 0.008257865905761719, "learning_rate": 2.61359440524184e-06, "loss": 0.0003, "reward": 0.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 291 }, { "completion_length": 658.40625, "epoch": 0.31146666666666667, "grad_norm": 0.06871040165424347, "kl": 0.005600929260253906, "learning_rate": 2.6098415838061832e-06, "loss": 0.0912, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 292 }, { "completion_length": 647.1875, "epoch": 0.31253333333333333, "grad_norm": 0.05467379465699196, "kl": 0.003387451171875, "learning_rate": 2.6060733487349584e-06, "loss": -0.0198, "reward": 0.5625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 293 }, { "completion_length": 628.4375, "epoch": 0.3136, "grad_norm": 0.1108846440911293, "kl": 0.00905609130859375, "learning_rate": 2.6022897523619424e-06, "loss": 0.0461, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 294 }, { "completion_length": 628.71875, "epoch": 0.31466666666666665, "grad_norm": 0.021847274154424667, "kl": 0.009679794311523438, "learning_rate": 2.598490847234253e-06, "loss": -0.009, "reward": 0.59375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 295 }, { "completion_length": 559.09375, "epoch": 0.3157333333333333, "grad_norm": 0.003011970315128565, "kl": 0.005032539367675781, "learning_rate": 2.5946766861116167e-06, "loss": 0.014, "reward": 0.46875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 296 }, { "completion_length": 403.90625, "epoch": 0.3168, "grad_norm": 0.002400035038590431, "kl": 0.008458137512207031, "learning_rate": 2.5908473219656386e-06, "loss": -0.0361, "reward": 0.65625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 297 }, { "completion_length": 582.21875, "epoch": 0.3178666666666667, "grad_norm": 0.11959020048379898, "kl": 0.004269599914550781, "learning_rate": 2.5870028079790647e-06, "loss": 0.036, "reward": 0.75, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 298 }, { "completion_length": 602.875, "epoch": 0.31893333333333335, "grad_norm": 0.07725955545902252, "kl": 0.0049076080322265625, "learning_rate": 2.583143197545044e-06, "loss": 0.0616, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 299 }, { "completion_length": 478.59375, "epoch": 0.32, "grad_norm": 0.15109021961688995, "kl": 0.010883331298828125, "learning_rate": 2.5792685442663883e-06, "loss": 0.0111, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 300 }, { "completion_length": 575.5625, "epoch": 0.32106666666666667, "grad_norm": 0.11069013178348541, "kl": 0.0047607421875, "learning_rate": 2.5753789019548255e-06, "loss": 0.0736, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 301 }, { "completion_length": 569.84375, "epoch": 0.3221333333333333, "grad_norm": 0.09868685156106949, "kl": 0.006812095642089844, "learning_rate": 2.571474324630253e-06, "loss": 0.0198, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 302 }, { "completion_length": 607.375, "epoch": 0.3232, "grad_norm": 0.006142762955278158, "kl": 0.005413055419921875, "learning_rate": 2.567554866519989e-06, "loss": 0.0045, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 303 }, { "completion_length": 415.0, "epoch": 0.32426666666666665, "grad_norm": 0.4760373830795288, "kl": 0.09434318542480469, "learning_rate": 2.5636205820580173e-06, "loss": -0.0886, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 304 }, { "completion_length": 525.6875, "epoch": 0.3253333333333333, "grad_norm": 0.0036274490412324667, "kl": 0.009111404418945312, "learning_rate": 2.559671525884232e-06, "loss": 0.0691, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 305 }, { "completion_length": 607.09375, "epoch": 0.3264, "grad_norm": 0.14186467230319977, "kl": 0.008086204528808594, "learning_rate": 2.5557077528436792e-06, "loss": 0.0404, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 306 }, { "completion_length": 568.4375, "epoch": 0.3274666666666667, "grad_norm": 0.087283656001091, "kl": 0.011846542358398438, "learning_rate": 2.551729317985795e-06, "loss": 0.1138, "reward": 0.59375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 307 }, { "completion_length": 723.4375, "epoch": 0.32853333333333334, "grad_norm": 0.11794599145650864, "kl": 0.0055599212646484375, "learning_rate": 2.5477362765636408e-06, "loss": 0.046, "reward": 0.5, "reward_std": 0.4858439117670059, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 308 }, { "completion_length": 487.71875, "epoch": 0.3296, "grad_norm": 0.07036077231168747, "kl": 0.016681671142578125, "learning_rate": 2.5437286840331353e-06, "loss": 0.0732, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 309 }, { "completion_length": 392.8125, "epoch": 0.33066666666666666, "grad_norm": 0.27282875776290894, "kl": 0.019098281860351562, "learning_rate": 2.539706596052286e-06, "loss": -0.0451, "reward": 0.6875, "reward_std": 0.46650634706020355, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 310 }, { "completion_length": 521.71875, "epoch": 0.3317333333333333, "grad_norm": 0.1318587064743042, "kl": 0.015063285827636719, "learning_rate": 2.535670068480414e-06, "loss": 0.0727, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 311 }, { "completion_length": 561.5625, "epoch": 0.3328, "grad_norm": 0.16648578643798828, "kl": 0.012239456176757812, "learning_rate": 2.531619157377382e-06, "loss": 0.094, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 312 }, { "completion_length": 452.5625, "epoch": 0.33386666666666664, "grad_norm": 0.15342846512794495, "kl": 0.023651123046875, "learning_rate": 2.5275539190028104e-06, "loss": 0.1255, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 313 }, { "completion_length": 360.90625, "epoch": 0.33493333333333336, "grad_norm": 0.3805198669433594, "kl": 0.03606414794921875, "learning_rate": 2.5234744098153e-06, "loss": 0.1086, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 314 }, { "completion_length": 572.78125, "epoch": 0.336, "grad_norm": 0.3817315995693207, "kl": 0.021503448486328125, "learning_rate": 2.5193806864716466e-06, "loss": 0.0575, "reward": 0.46875, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 315 }, { "completion_length": 582.125, "epoch": 0.3370666666666667, "grad_norm": 0.07985702902078629, "kl": 0.015697479248046875, "learning_rate": 2.5152728058260543e-06, "loss": -0.0041, "reward": 0.375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 316 }, { "completion_length": 493.59375, "epoch": 0.33813333333333334, "grad_norm": 0.011806112714111805, "kl": 0.02543163299560547, "learning_rate": 2.5111508249293456e-06, "loss": 0.0143, "reward": 0.59375, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 317 }, { "completion_length": 445.8125, "epoch": 0.3392, "grad_norm": 0.39576229453086853, "kl": 0.03014373779296875, "learning_rate": 2.507014801028169e-06, "loss": 0.0047, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 318 }, { "completion_length": 636.34375, "epoch": 0.34026666666666666, "grad_norm": 0.10015080869197845, "kl": 0.014139175415039062, "learning_rate": 2.502864791564205e-06, "loss": 0.0451, "reward": 0.28125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 319 }, { "completion_length": 510.0625, "epoch": 0.3413333333333333, "grad_norm": 0.17959214746952057, "kl": 0.04010772705078125, "learning_rate": 2.4987008541733663e-06, "loss": -0.0146, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 320 }, { "completion_length": 597.59375, "epoch": 0.3424, "grad_norm": 0.1384740173816681, "kl": 0.01268768310546875, "learning_rate": 2.494523046685e-06, "loss": 0.1127, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 321 }, { "completion_length": 572.6875, "epoch": 0.34346666666666664, "grad_norm": 0.1773405224084854, "kl": 0.01352691650390625, "learning_rate": 2.4903314271210824e-06, "loss": -0.0218, "reward": 0.5, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 322 }, { "completion_length": 556.59375, "epoch": 0.34453333333333336, "grad_norm": 0.23387420177459717, "kl": 0.0207061767578125, "learning_rate": 2.486126053695414e-06, "loss": 0.1029, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 323 }, { "completion_length": 701.34375, "epoch": 0.3456, "grad_norm": 0.2453470677137375, "kl": 0.0170745849609375, "learning_rate": 2.48190698481281e-06, "loss": 0.0591, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 324 }, { "completion_length": 715.09375, "epoch": 0.3466666666666667, "grad_norm": 0.02320495806634426, "kl": 0.01576995849609375, "learning_rate": 2.477674279068291e-06, "loss": 0.1626, "reward": 0.375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 325 }, { "completion_length": 593.375, "epoch": 0.34773333333333334, "grad_norm": 0.011513919569551945, "kl": 0.014385223388671875, "learning_rate": 2.473427995246269e-06, "loss": 0.0712, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 326 }, { "completion_length": 530.6875, "epoch": 0.3488, "grad_norm": 0.0663112998008728, "kl": 0.01848602294921875, "learning_rate": 2.4691681923197277e-06, "loss": 0.0207, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 327 }, { "completion_length": 509.8125, "epoch": 0.34986666666666666, "grad_norm": 0.12827788293361664, "kl": 0.012079238891601562, "learning_rate": 2.464894929449408e-06, "loss": -0.0153, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 328 }, { "completion_length": 663.40625, "epoch": 0.3509333333333333, "grad_norm": 0.10839039832353592, "kl": 0.010301589965820312, "learning_rate": 2.460608265982985e-06, "loss": 0.0501, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 329 }, { "completion_length": 496.21875, "epoch": 0.352, "grad_norm": 0.013008415699005127, "kl": 0.015905380249023438, "learning_rate": 2.4563082614542412e-06, "loss": -0.013, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 330 }, { "completion_length": 639.34375, "epoch": 0.35306666666666664, "grad_norm": 0.11372006684541702, "kl": 0.014196395874023438, "learning_rate": 2.4519949755822433e-06, "loss": 0.0884, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 331 }, { "completion_length": 669.8125, "epoch": 0.35413333333333336, "grad_norm": 0.13563892245292664, "kl": 0.012117385864257812, "learning_rate": 2.447668468270509e-06, "loss": 0.0446, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 332 }, { "completion_length": 547.84375, "epoch": 0.3552, "grad_norm": 0.24175286293029785, "kl": 0.01762866973876953, "learning_rate": 2.44332879960618e-06, "loss": 0.0104, "reward": 0.53125, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 333 }, { "completion_length": 483.5625, "epoch": 0.3562666666666667, "grad_norm": 0.25908443331718445, "kl": 0.022211074829101562, "learning_rate": 2.4389760298591824e-06, "loss": 0.0061, "reward": 0.375, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 334 }, { "completion_length": 526.96875, "epoch": 0.35733333333333334, "grad_norm": 0.07015577703714371, "kl": 0.0168609619140625, "learning_rate": 2.4346102194813937e-06, "loss": -0.004, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 335 }, { "completion_length": 492.65625, "epoch": 0.3584, "grad_norm": 0.15577548742294312, "kl": 0.0168609619140625, "learning_rate": 2.4302314291058004e-06, "loss": 0.0592, "reward": 0.875, "reward_std": 0.125, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 336 }, { "completion_length": 488.3125, "epoch": 0.35946666666666666, "grad_norm": 0.1509963721036911, "kl": 0.03153228759765625, "learning_rate": 2.4258397195456573e-06, "loss": 0.0075, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 337 }, { "completion_length": 527.25, "epoch": 0.3605333333333333, "grad_norm": 0.005878519266843796, "kl": 0.020477294921875, "learning_rate": 2.4214351517936423e-06, "loss": -0.056, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 338 }, { "completion_length": 442.5, "epoch": 0.3616, "grad_norm": 0.2335798740386963, "kl": 0.023876190185546875, "learning_rate": 2.4170177870210112e-06, "loss": -0.0401, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 339 }, { "completion_length": 608.96875, "epoch": 0.3626666666666667, "grad_norm": 0.22310422360897064, "kl": 0.0204620361328125, "learning_rate": 2.4125876865767443e-06, "loss": -0.0082, "reward": 0.46875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 340 }, { "completion_length": 604.03125, "epoch": 0.36373333333333335, "grad_norm": 0.26351848244667053, "kl": 0.030942916870117188, "learning_rate": 2.4081449119866983e-06, "loss": -0.027, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 341 }, { "completion_length": 522.90625, "epoch": 0.3648, "grad_norm": 0.21228285133838654, "kl": 0.012547492980957031, "learning_rate": 2.40368952495275e-06, "loss": 0.0308, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 342 }, { "completion_length": 477.875, "epoch": 0.3658666666666667, "grad_norm": 0.1858755350112915, "kl": 0.017408370971679688, "learning_rate": 2.399221587351939e-06, "loss": 0.0703, "reward": 0.8125, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 343 }, { "completion_length": 559.5, "epoch": 0.36693333333333333, "grad_norm": 0.15599089860916138, "kl": 0.0170440673828125, "learning_rate": 2.3947411612356092e-06, "loss": 0.0421, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 344 }, { "completion_length": 696.8125, "epoch": 0.368, "grad_norm": 0.08617735654115677, "kl": 0.0055294036865234375, "learning_rate": 2.390248308828548e-06, "loss": 0.0709, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 345 }, { "completion_length": 638.90625, "epoch": 0.36906666666666665, "grad_norm": 0.11911772936582565, "kl": 0.005496978759765625, "learning_rate": 2.3857430925281186e-06, "loss": 0.0778, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 346 }, { "completion_length": 516.375, "epoch": 0.3701333333333333, "grad_norm": 0.20256225764751434, "kl": 0.0075511932373046875, "learning_rate": 2.3812255749033975e-06, "loss": 0.0241, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 347 }, { "completion_length": 498.34375, "epoch": 0.3712, "grad_norm": 0.0027922920417040586, "kl": 0.005218505859375, "learning_rate": 2.3766958186943022e-06, "loss": 0.0157, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 348 }, { "completion_length": 583.6875, "epoch": 0.3722666666666667, "grad_norm": 0.08654491603374481, "kl": 0.008768081665039062, "learning_rate": 2.3721538868107225e-06, "loss": 0.0083, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 349 }, { "completion_length": 595.71875, "epoch": 0.37333333333333335, "grad_norm": 0.145588681101799, "kl": 0.009061813354492188, "learning_rate": 2.367599842331646e-06, "loss": 0.0421, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 350 }, { "completion_length": 605.46875, "epoch": 0.3744, "grad_norm": 0.1675889790058136, "kl": 0.010802268981933594, "learning_rate": 2.3630337485042807e-06, "loss": 0.0638, "reward": 0.59375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 351 }, { "completion_length": 619.09375, "epoch": 0.37546666666666667, "grad_norm": 0.13851353526115417, "kl": 0.011074066162109375, "learning_rate": 2.3584556687431787e-06, "loss": 0.0103, "reward": 0.40625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 352 }, { "completion_length": 569.0625, "epoch": 0.37653333333333333, "grad_norm": 0.14720208942890167, "kl": 0.00936126708984375, "learning_rate": 2.3538656666293525e-06, "loss": 0.0379, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 353 }, { "completion_length": 643.84375, "epoch": 0.3776, "grad_norm": 0.00280022993683815, "kl": 0.0054645538330078125, "learning_rate": 2.3492638059093957e-06, "loss": 0.0673, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 354 }, { "completion_length": 582.21875, "epoch": 0.37866666666666665, "grad_norm": 0.12901706993579865, "kl": 0.007495880126953125, "learning_rate": 2.344650150494596e-06, "loss": 0.0349, "reward": 0.5, "reward_std": 0.25, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 355 }, { "completion_length": 597.78125, "epoch": 0.3797333333333333, "grad_norm": 0.004854352679103613, "kl": 0.008602142333984375, "learning_rate": 2.340024764460046e-06, "loss": 0.0109, "reward": 0.84375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 356 }, { "completion_length": 558.8125, "epoch": 0.3808, "grad_norm": 0.0043733324855566025, "kl": 0.01284027099609375, "learning_rate": 2.3353877120437565e-06, "loss": 0.0789, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 357 }, { "completion_length": 657.1875, "epoch": 0.3818666666666667, "grad_norm": 0.0034839059226214886, "kl": 0.0056858062744140625, "learning_rate": 2.330739057645761e-06, "loss": -0.0219, "reward": 0.4375, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 358 }, { "completion_length": 504.40625, "epoch": 0.38293333333333335, "grad_norm": 0.10250770300626755, "kl": 0.020122528076171875, "learning_rate": 2.3260788658272246e-06, "loss": 0.0939, "reward": 0.71875, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 359 }, { "completion_length": 622.21875, "epoch": 0.384, "grad_norm": 0.2749554216861725, "kl": 0.020009994506835938, "learning_rate": 2.3214072013095436e-06, "loss": 0.0007, "reward": 0.5625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 360 }, { "completion_length": 602.84375, "epoch": 0.38506666666666667, "grad_norm": 0.09728457033634186, "kl": 0.024440765380859375, "learning_rate": 2.3167241289734514e-06, "loss": -0.0195, "reward": 0.59375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 361 }, { "completion_length": 504.375, "epoch": 0.38613333333333333, "grad_norm": 0.22115233540534973, "kl": 0.010540008544921875, "learning_rate": 2.312029713858112e-06, "loss": 0.0031, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 362 }, { "completion_length": 579.59375, "epoch": 0.3872, "grad_norm": 0.0760805755853653, "kl": 0.010890960693359375, "learning_rate": 2.307324021160222e-06, "loss": -0.0112, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 363 }, { "completion_length": 565.46875, "epoch": 0.38826666666666665, "grad_norm": 0.08238299190998077, "kl": 0.01108551025390625, "learning_rate": 2.302607116233101e-06, "loss": 0.0248, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 364 }, { "completion_length": 535.0625, "epoch": 0.3893333333333333, "grad_norm": 0.09151773899793625, "kl": 0.009332656860351562, "learning_rate": 2.2978790645857867e-06, "loss": -0.016, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 365 }, { "completion_length": 517.8125, "epoch": 0.3904, "grad_norm": 0.00466684065759182, "kl": 0.013158798217773438, "learning_rate": 2.293139931882123e-06, "loss": -0.0499, "reward": 0.6875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 366 }, { "completion_length": 674.03125, "epoch": 0.3914666666666667, "grad_norm": 0.0048573315143585205, "kl": 0.007649421691894531, "learning_rate": 2.28838978393985e-06, "loss": 0.0011, "reward": 0.4375, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 367 }, { "completion_length": 498.40625, "epoch": 0.39253333333333335, "grad_norm": 0.12294510751962662, "kl": 0.00807952880859375, "learning_rate": 2.2836286867296872e-06, "loss": 0.0056, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 368 }, { "completion_length": 609.71875, "epoch": 0.3936, "grad_norm": 0.14896993339061737, "kl": 0.005950927734375, "learning_rate": 2.278856706374422e-06, "loss": 0.0264, "reward": 0.53125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 369 }, { "completion_length": 588.375, "epoch": 0.39466666666666667, "grad_norm": 0.10624019801616669, "kl": 0.007305145263671875, "learning_rate": 2.274073909147986e-06, "loss": 0.0192, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 370 }, { "completion_length": 556.0, "epoch": 0.3957333333333333, "grad_norm": 0.12167878448963165, "kl": 0.004550933837890625, "learning_rate": 2.2692803614745386e-06, "loss": -0.0334, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 371 }, { "completion_length": 595.65625, "epoch": 0.3968, "grad_norm": 0.003087603487074375, "kl": 0.008172988891601562, "learning_rate": 2.264476129927541e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 372 }, { "completion_length": 520.5, "epoch": 0.39786666666666665, "grad_norm": 0.0024129520170390606, "kl": 0.0071258544921875, "learning_rate": 2.259661281228836e-06, "loss": -0.0354, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 373 }, { "completion_length": 696.15625, "epoch": 0.3989333333333333, "grad_norm": 0.0065423608757555485, "kl": 0.008729934692382812, "learning_rate": 2.254835882247716e-06, "loss": 0.0261, "reward": 0.46875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 374 }, { "completion_length": 662.8125, "epoch": 0.4, "grad_norm": 0.12349364161491394, "kl": 0.0049076080322265625, "learning_rate": 2.25e-06, "loss": 0.0212, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 375 }, { "completion_length": 601.5625, "epoch": 0.4010666666666667, "grad_norm": 0.0025172161404043436, "kl": 0.0045318603515625, "learning_rate": 2.245153701647099e-06, "loss": 0.0105, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 376 }, { "completion_length": 460.84375, "epoch": 0.40213333333333334, "grad_norm": 0.15622177720069885, "kl": 0.010751724243164062, "learning_rate": 2.2402970544950836e-06, "loss": 0.0036, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 377 }, { "completion_length": 570.59375, "epoch": 0.4032, "grad_norm": 0.0028687575832009315, "kl": 0.0076141357421875, "learning_rate": 2.23543012599375e-06, "loss": 0.098, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 378 }, { "completion_length": 620.90625, "epoch": 0.40426666666666666, "grad_norm": 0.0027069286443293095, "kl": 0.005157470703125, "learning_rate": 2.230552983735686e-06, "loss": 0.0448, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 379 }, { "completion_length": 641.125, "epoch": 0.4053333333333333, "grad_norm": 0.001952870050445199, "kl": 0.005702972412109375, "learning_rate": 2.225665695455325e-06, "loss": 0.0454, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 380 }, { "completion_length": 503.53125, "epoch": 0.4064, "grad_norm": 0.0017114793881773949, "kl": 0.006337165832519531, "learning_rate": 2.220768329028013e-06, "loss": -0.0012, "reward": 0.90625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 381 }, { "completion_length": 453.375, "epoch": 0.40746666666666664, "grad_norm": 0.1319994032382965, "kl": 0.011007308959960938, "learning_rate": 2.2158609524690615e-06, "loss": -0.0085, "reward": 0.875, "reward_std": 0.125, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 382 }, { "completion_length": 668.46875, "epoch": 0.40853333333333336, "grad_norm": 0.07925617694854736, "kl": 0.008403778076171875, "learning_rate": 2.210943633932805e-06, "loss": 0.0551, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 383 }, { "completion_length": 525.03125, "epoch": 0.4096, "grad_norm": 0.19502685964107513, "kl": 0.009962081909179688, "learning_rate": 2.206016441711652e-06, "loss": 0.0487, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 384 }, { "completion_length": 429.59375, "epoch": 0.4106666666666667, "grad_norm": 0.1692591905593872, "kl": 0.017255783081054688, "learning_rate": 2.20107944423514e-06, "loss": 0.0006, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 385 }, { "completion_length": 471.65625, "epoch": 0.41173333333333334, "grad_norm": 0.17000345885753632, "kl": 0.00647735595703125, "learning_rate": 2.1961327100689823e-06, "loss": 0.1112, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 386 }, { "completion_length": 550.34375, "epoch": 0.4128, "grad_norm": 0.11449652165174484, "kl": 0.006465911865234375, "learning_rate": 2.1911763079141163e-06, "loss": 0.0945, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 387 }, { "completion_length": 528.1875, "epoch": 0.41386666666666666, "grad_norm": 0.12032641470432281, "kl": 0.0054569244384765625, "learning_rate": 2.1862103066057508e-06, "loss": -0.0085, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 388 }, { "completion_length": 634.3125, "epoch": 0.4149333333333333, "grad_norm": 0.10439102351665497, "kl": 0.010499954223632812, "learning_rate": 2.1812347751124072e-06, "loss": 0.0049, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 389 }, { "completion_length": 655.6875, "epoch": 0.416, "grad_norm": 0.0611797459423542, "kl": 0.005924224853515625, "learning_rate": 2.1762497825349665e-06, "loss": 0.01, "reward": 0.53125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 390 }, { "completion_length": 723.0, "epoch": 0.41706666666666664, "grad_norm": 0.0021899801213294268, "kl": 0.003955841064453125, "learning_rate": 2.171255398105703e-06, "loss": -0.0106, "reward": 0.4375, "reward_std": 0.25, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 391 }, { "completion_length": 655.71875, "epoch": 0.41813333333333336, "grad_norm": 0.16050589084625244, "kl": 0.013135910034179688, "learning_rate": 2.166251691187329e-06, "loss": 0.0202, "reward": 0.3125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 392 }, { "completion_length": 602.03125, "epoch": 0.4192, "grad_norm": 0.09667491912841797, "kl": 0.0069122314453125, "learning_rate": 2.1612387312720286e-06, "loss": 0.0593, "reward": 0.75, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 393 }, { "completion_length": 573.625, "epoch": 0.4202666666666667, "grad_norm": 0.10193964093923569, "kl": 0.014604568481445312, "learning_rate": 2.156216587980491e-06, "loss": 0.0217, "reward": 0.40625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 394 }, { "completion_length": 585.46875, "epoch": 0.42133333333333334, "grad_norm": 0.005829717498272657, "kl": 0.0072841644287109375, "learning_rate": 2.1511853310609467e-06, "loss": 0.009, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 395 }, { "completion_length": 520.0625, "epoch": 0.4224, "grad_norm": 0.1293623149394989, "kl": 0.0106048583984375, "learning_rate": 2.146145030388198e-06, "loss": 0.0285, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 396 }, { "completion_length": 620.28125, "epoch": 0.42346666666666666, "grad_norm": 0.09326758235692978, "kl": 0.00821685791015625, "learning_rate": 2.141095755962647e-06, "loss": 0.0275, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 397 }, { "completion_length": 559.28125, "epoch": 0.4245333333333333, "grad_norm": 0.25968942046165466, "kl": 0.008396148681640625, "learning_rate": 2.1360375779093257e-06, "loss": 0.239, "reward": 0.71875, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 398 }, { "completion_length": 718.03125, "epoch": 0.4256, "grad_norm": 0.06754752993583679, "kl": 0.005382537841796875, "learning_rate": 2.1309705664769195e-06, "loss": 0.0275, "reward": 0.40625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 399 }, { "completion_length": 776.03125, "epoch": 0.4266666666666667, "grad_norm": 0.10162820667028427, "kl": 0.0022287368774414062, "learning_rate": 2.1258947920367943e-06, "loss": 0.0848, "reward": 0.46875, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 400 }, { "completion_length": 434.78125, "epoch": 0.42773333333333335, "grad_norm": 0.07655008882284164, "kl": 0.018537521362304688, "learning_rate": 2.120810325082017e-06, "loss": 0.0035, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 401 }, { "completion_length": 467.1875, "epoch": 0.4288, "grad_norm": 0.10330621898174286, "kl": 0.011241912841796875, "learning_rate": 2.1157172362263782e-06, "loss": 0.0205, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 402 }, { "completion_length": 610.53125, "epoch": 0.4298666666666667, "grad_norm": 0.1548849642276764, "kl": 0.005213737487792969, "learning_rate": 2.1106155962034103e-06, "loss": 0.0375, "reward": 0.40625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 403 }, { "completion_length": 667.46875, "epoch": 0.43093333333333333, "grad_norm": 0.10087564587593079, "kl": 0.007503986358642578, "learning_rate": 2.1055054758654056e-06, "loss": -0.0087, "reward": 0.4375, "reward_std": 0.25, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 404 }, { "completion_length": 542.375, "epoch": 0.432, "grad_norm": 0.003903063479810953, "kl": 0.0064868927001953125, "learning_rate": 2.100386946182431e-06, "loss": 0.0153, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 405 }, { "completion_length": 687.375, "epoch": 0.43306666666666666, "grad_norm": 0.0038603979628533125, "kl": 0.007550239562988281, "learning_rate": 2.0952600782413454e-06, "loss": 0.0424, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 406 }, { "completion_length": 629.8125, "epoch": 0.4341333333333333, "grad_norm": 0.006375161465257406, "kl": 0.006636619567871094, "learning_rate": 2.090124943244809e-06, "loss": 0.0081, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 407 }, { "completion_length": 525.15625, "epoch": 0.4352, "grad_norm": 0.08055587112903595, "kl": 0.0062808990478515625, "learning_rate": 2.084981612510298e-06, "loss": -0.0298, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 408 }, { "completion_length": 509.4375, "epoch": 0.4362666666666667, "grad_norm": 0.004336795769631863, "kl": 0.005438804626464844, "learning_rate": 2.0798301574691106e-06, "loss": 0.0063, "reward": 0.875, "reward_std": 0.125, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 409 }, { "completion_length": 571.0, "epoch": 0.43733333333333335, "grad_norm": 0.003664974123239517, "kl": 0.005261421203613281, "learning_rate": 2.0746706496653765e-06, "loss": 0.0321, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 410 }, { "completion_length": 517.9375, "epoch": 0.4384, "grad_norm": 0.00707610510289669, "kl": 0.008172988891601562, "learning_rate": 2.069503160755064e-06, "loss": 0.0698, "reward": 0.75, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 411 }, { "completion_length": 618.8125, "epoch": 0.43946666666666667, "grad_norm": 0.00851115956902504, "kl": 0.006613731384277344, "learning_rate": 2.0643277625049832e-06, "loss": 0.0021, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 412 }, { "completion_length": 720.375, "epoch": 0.44053333333333333, "grad_norm": 0.11048571765422821, "kl": 0.0030117034912109375, "learning_rate": 2.0591445267917923e-06, "loss": 0.028, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 413 }, { "completion_length": 664.75, "epoch": 0.4416, "grad_norm": 0.0036583468317985535, "kl": 0.004578590393066406, "learning_rate": 2.053953525600994e-06, "loss": 0.0234, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 414 }, { "completion_length": 665.4375, "epoch": 0.44266666666666665, "grad_norm": 0.08534727245569229, "kl": 0.0051326751708984375, "learning_rate": 2.048754831025942e-06, "loss": 0.0154, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 415 }, { "completion_length": 637.6875, "epoch": 0.4437333333333333, "grad_norm": 0.10230205208063126, "kl": 0.0046176910400390625, "learning_rate": 2.0435485152668356e-06, "loss": 0.0826, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 416 }, { "completion_length": 593.75, "epoch": 0.4448, "grad_norm": 0.11220455169677734, "kl": 0.004834175109863281, "learning_rate": 2.038334650629718e-06, "loss": -0.0126, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 417 }, { "completion_length": 698.53125, "epoch": 0.4458666666666667, "grad_norm": 0.07458607852458954, "kl": 0.004955291748046875, "learning_rate": 2.033113309525472e-06, "loss": 0.0033, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 418 }, { "completion_length": 575.9375, "epoch": 0.44693333333333335, "grad_norm": 0.06988722085952759, "kl": 0.00600433349609375, "learning_rate": 2.027884564468816e-06, "loss": 0.0143, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 419 }, { "completion_length": 601.25, "epoch": 0.448, "grad_norm": 0.24207736551761627, "kl": 0.005578041076660156, "learning_rate": 2.0226484880772943e-06, "loss": 0.134, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 420 }, { "completion_length": 711.625, "epoch": 0.44906666666666667, "grad_norm": 0.003921502269804478, "kl": 0.0052852630615234375, "learning_rate": 2.01740515307027e-06, "loss": -0.0178, "reward": 0.40625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 421 }, { "completion_length": 684.65625, "epoch": 0.45013333333333333, "grad_norm": 0.05625630542635918, "kl": 0.0025005340576171875, "learning_rate": 2.012154632267915e-06, "loss": 0.0222, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 422 }, { "completion_length": 569.46875, "epoch": 0.4512, "grad_norm": 0.06189500913023949, "kl": 0.005116462707519531, "learning_rate": 2.0068969985901996e-06, "loss": 0.0054, "reward": 0.875, "reward_std": 0.125, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 423 }, { "completion_length": 573.6875, "epoch": 0.45226666666666665, "grad_norm": 0.08203983306884766, "kl": 0.009289741516113281, "learning_rate": 2.0016323250558765e-06, "loss": 0.114, "reward": 0.78125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 424 }, { "completion_length": 621.65625, "epoch": 0.4533333333333333, "grad_norm": 0.006475863512605429, "kl": 0.00775146484375, "learning_rate": 1.9963606847814702e-06, "loss": 0.0583, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 425 }, { "completion_length": 673.5625, "epoch": 0.4544, "grad_norm": 0.17741386592388153, "kl": 0.0059375762939453125, "learning_rate": 1.991082150980261e-06, "loss": 0.0192, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 426 }, { "completion_length": 562.03125, "epoch": 0.4554666666666667, "grad_norm": 0.1195107102394104, "kl": 0.006922721862792969, "learning_rate": 1.9857967969612654e-06, "loss": -0.0122, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 427 }, { "completion_length": 495.53125, "epoch": 0.45653333333333335, "grad_norm": 0.10323131084442139, "kl": 0.016477584838867188, "learning_rate": 1.9805046961282226e-06, "loss": 0.0673, "reward": 0.75, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 428 }, { "completion_length": 474.25, "epoch": 0.4576, "grad_norm": 0.16920287907123566, "kl": 0.012701034545898438, "learning_rate": 1.9752059219785703e-06, "loss": 0.001, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 429 }, { "completion_length": 557.3125, "epoch": 0.45866666666666667, "grad_norm": 0.003834841074422002, "kl": 0.0042324066162109375, "learning_rate": 1.9699005481024273e-06, "loss": 0.0098, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 430 }, { "completion_length": 557.9375, "epoch": 0.4597333333333333, "grad_norm": 0.2694531977176666, "kl": 0.006443023681640625, "learning_rate": 1.96458864818157e-06, "loss": -0.001, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 431 }, { "completion_length": 730.0, "epoch": 0.4608, "grad_norm": 0.05773269012570381, "kl": 0.004549980163574219, "learning_rate": 1.9592702959884095e-06, "loss": 0.0983, "reward": 0.40625, "reward_std": 0.3125, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 432 }, { "completion_length": 620.4375, "epoch": 0.46186666666666665, "grad_norm": 0.12372300028800964, "kl": 0.004482269287109375, "learning_rate": 1.953945565384967e-06, "loss": 0.0056, "reward": 0.40625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 433 }, { "completion_length": 612.59375, "epoch": 0.4629333333333333, "grad_norm": 0.07266740500926971, "kl": 0.005016326904296875, "learning_rate": 1.948614530321848e-06, "loss": 0.0309, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 434 }, { "completion_length": 556.375, "epoch": 0.464, "grad_norm": 0.18170151114463806, "kl": 0.008874893188476562, "learning_rate": 1.943277264837214e-06, "loss": 0.0954, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 435 }, { "completion_length": 507.15625, "epoch": 0.4650666666666667, "grad_norm": 0.20402514934539795, "kl": 0.012416839599609375, "learning_rate": 1.9379338430557582e-06, "loss": 0.0919, "reward": 0.84375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 436 }, { "completion_length": 576.8125, "epoch": 0.46613333333333334, "grad_norm": 0.0762326791882515, "kl": 0.008317947387695312, "learning_rate": 1.932584339187671e-06, "loss": 0.0426, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 437 }, { "completion_length": 643.8125, "epoch": 0.4672, "grad_norm": 0.16461005806922913, "kl": 0.0063323974609375, "learning_rate": 1.927228827527612e-06, "loss": 0.004, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 438 }, { "completion_length": 630.4375, "epoch": 0.46826666666666666, "grad_norm": 0.1025485023856163, "kl": 0.0050373077392578125, "learning_rate": 1.921867382453679e-06, "loss": 0.1175, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 439 }, { "completion_length": 585.0, "epoch": 0.4693333333333333, "grad_norm": 0.002687159925699234, "kl": 0.009491920471191406, "learning_rate": 1.9165000784263734e-06, "loss": 0.0001, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 440 }, { "completion_length": 629.5625, "epoch": 0.4704, "grad_norm": 0.19775213301181793, "kl": 0.005036354064941406, "learning_rate": 1.911126989987565e-06, "loss": 0.008, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 441 }, { "completion_length": 512.21875, "epoch": 0.47146666666666665, "grad_norm": 0.14588458836078644, "kl": 0.006011962890625, "learning_rate": 1.9057481917594604e-06, "loss": -0.0297, "reward": 0.90625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 442 }, { "completion_length": 613.25, "epoch": 0.47253333333333336, "grad_norm": 0.05581381544470787, "kl": 0.009325981140136719, "learning_rate": 1.9003637584435633e-06, "loss": 0.0262, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 443 }, { "completion_length": 483.4375, "epoch": 0.4736, "grad_norm": 0.007016545161604881, "kl": 0.0057125091552734375, "learning_rate": 1.8949737648196395e-06, "loss": 0.011, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 444 }, { "completion_length": 629.40625, "epoch": 0.4746666666666667, "grad_norm": 0.07299446314573288, "kl": 0.006772041320800781, "learning_rate": 1.8895782857446754e-06, "loss": 0.007, "reward": 0.6875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 445 }, { "completion_length": 587.4375, "epoch": 0.47573333333333334, "grad_norm": 0.14287716150283813, "kl": 0.006839752197265625, "learning_rate": 1.8841773961518417e-06, "loss": -0.0181, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 446 }, { "completion_length": 600.78125, "epoch": 0.4768, "grad_norm": 0.0015630965353921056, "kl": 0.00396728515625, "learning_rate": 1.8787711710494509e-06, "loss": -0.0068, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 447 }, { "completion_length": 611.625, "epoch": 0.47786666666666666, "grad_norm": 0.0015123070916160941, "kl": 0.0040454864501953125, "learning_rate": 1.8733596855199147e-06, "loss": 0.031, "reward": 0.6875, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 448 }, { "completion_length": 691.59375, "epoch": 0.4789333333333333, "grad_norm": 0.0021239013876765966, "kl": 0.007465362548828125, "learning_rate": 1.8679430147187031e-06, "loss": -0.0153, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 449 }, { "completion_length": 611.90625, "epoch": 0.48, "grad_norm": 0.08804386109113693, "kl": 0.004834175109863281, "learning_rate": 1.8625212338733005e-06, "loss": -0.0243, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 450 }, { "completion_length": 702.75, "epoch": 0.48106666666666664, "grad_norm": 0.17041483521461487, "kl": 0.004458427429199219, "learning_rate": 1.8570944182821588e-06, "loss": 0.0541, "reward": 0.4375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 451 }, { "completion_length": 503.6875, "epoch": 0.48213333333333336, "grad_norm": 0.16905687749385834, "kl": 0.0075225830078125, "learning_rate": 1.8516626433136547e-06, "loss": 0.0078, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 452 }, { "completion_length": 588.125, "epoch": 0.4832, "grad_norm": 0.1031089648604393, "kl": 0.005222320556640625, "learning_rate": 1.8462259844050408e-06, "loss": 0.0522, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 453 }, { "completion_length": 639.34375, "epoch": 0.4842666666666667, "grad_norm": 0.16771924495697021, "kl": 0.0064220428466796875, "learning_rate": 1.840784517061398e-06, "loss": 0.0397, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 454 }, { "completion_length": 592.8125, "epoch": 0.48533333333333334, "grad_norm": 0.0644133985042572, "kl": 0.005615234375, "learning_rate": 1.835338316854588e-06, "loss": 0.0035, "reward": 0.90625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 455 }, { "completion_length": 611.125, "epoch": 0.4864, "grad_norm": 0.16060209274291992, "kl": 0.005829811096191406, "learning_rate": 1.8298874594222035e-06, "loss": -0.055, "reward": 0.8125, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 456 }, { "completion_length": 611.0, "epoch": 0.48746666666666666, "grad_norm": 0.07224726676940918, "kl": 0.0064411163330078125, "learning_rate": 1.824432020466517e-06, "loss": 0.0379, "reward": 0.5, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 457 }, { "completion_length": 612.21875, "epoch": 0.4885333333333333, "grad_norm": 0.319087952375412, "kl": 0.007579803466796875, "learning_rate": 1.8189720757534291e-06, "loss": 0.0821, "reward": 0.65625, "reward_std": 0.3125, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 458 }, { "completion_length": 620.78125, "epoch": 0.4896, "grad_norm": 0.08905518800020218, "kl": 0.0057964324951171875, "learning_rate": 1.8135077011114185e-06, "loss": 0.0162, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 459 }, { "completion_length": 630.59375, "epoch": 0.49066666666666664, "grad_norm": 0.10902316123247147, "kl": 0.0071239471435546875, "learning_rate": 1.8080389724304863e-06, "loss": -0.0111, "reward": 0.59375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 460 }, { "completion_length": 583.25, "epoch": 0.49173333333333336, "grad_norm": 0.001768914982676506, "kl": 0.0075359344482421875, "learning_rate": 1.8025659656611033e-06, "loss": 0.113, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 461 }, { "completion_length": 677.84375, "epoch": 0.4928, "grad_norm": 0.1765199452638626, "kl": 0.003570556640625, "learning_rate": 1.797088756813155e-06, "loss": 0.1125, "reward": 0.5, "reward_std": 0.25, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 462 }, { "completion_length": 526.8125, "epoch": 0.4938666666666667, "grad_norm": 0.011959308758378029, "kl": 0.0066890716552734375, "learning_rate": 1.7916074219548866e-06, "loss": 0.0093, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 463 }, { "completion_length": 613.125, "epoch": 0.49493333333333334, "grad_norm": 0.00265871686860919, "kl": 0.011141777038574219, "learning_rate": 1.7861220372118446e-06, "loss": -0.0029, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 464 }, { "completion_length": 588.53125, "epoch": 0.496, "grad_norm": 0.0018544851336628199, "kl": 0.006420135498046875, "learning_rate": 1.7806326787658219e-06, "loss": 0.0344, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 465 }, { "completion_length": 602.625, "epoch": 0.49706666666666666, "grad_norm": 0.04875039681792259, "kl": 0.005063056945800781, "learning_rate": 1.7751394228537989e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 466 }, { "completion_length": 612.65625, "epoch": 0.4981333333333333, "grad_norm": 0.16960999369621277, "kl": 0.0050487518310546875, "learning_rate": 1.7696423457668832e-06, "loss": 0.0451, "reward": 0.34375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 467 }, { "completion_length": 655.84375, "epoch": 0.4992, "grad_norm": 0.1425209790468216, "kl": 0.0040760040283203125, "learning_rate": 1.7641415238492536e-06, "loss": 0.11, "reward": 0.78125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 468 }, { "completion_length": 559.34375, "epoch": 0.5002666666666666, "grad_norm": 0.0027845792938023806, "kl": 0.00868988037109375, "learning_rate": 1.7586370334970954e-06, "loss": 0.0452, "reward": 0.84375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 469 }, { "completion_length": 579.03125, "epoch": 0.5013333333333333, "grad_norm": 0.003170463489368558, "kl": 0.006160736083984375, "learning_rate": 1.7531289511575427e-06, "loss": -0.0432, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 470 }, { "completion_length": 696.5625, "epoch": 0.5024, "grad_norm": 0.06645096838474274, "kl": 0.0050048828125, "learning_rate": 1.747617353327616e-06, "loss": -0.028, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 471 }, { "completion_length": 784.0625, "epoch": 0.5034666666666666, "grad_norm": 0.0016921662027016282, "kl": 0.004120826721191406, "learning_rate": 1.7421023165531584e-06, "loss": 0.0013, "reward": 0.46875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 472 }, { "completion_length": 668.4375, "epoch": 0.5045333333333333, "grad_norm": 0.18203701078891754, "kl": 0.0037221908569335938, "learning_rate": 1.7365839174277743e-06, "loss": 0.0515, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 473 }, { "completion_length": 577.875, "epoch": 0.5056, "grad_norm": 0.11658408492803574, "kl": 0.005161285400390625, "learning_rate": 1.7310622325917648e-06, "loss": -0.0279, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 474 }, { "completion_length": 621.1875, "epoch": 0.5066666666666667, "grad_norm": 0.005495255347341299, "kl": 0.004784584045410156, "learning_rate": 1.7255373387310633e-06, "loss": 0.0049, "reward": 0.6875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 475 }, { "completion_length": 500.875, "epoch": 0.5077333333333334, "grad_norm": 0.0037739481776952744, "kl": 0.00531768798828125, "learning_rate": 1.7200093125761706e-06, "loss": -0.0051, "reward": 0.59375, "reward_std": 0.35117512941360474, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 476 }, { "completion_length": 711.59375, "epoch": 0.5088, "grad_norm": 0.1046418845653534, "kl": 0.00394439697265625, "learning_rate": 1.714478230901089e-06, "loss": 0.0108, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 477 }, { "completion_length": 600.71875, "epoch": 0.5098666666666667, "grad_norm": 0.1058778315782547, "kl": 0.008573532104492188, "learning_rate": 1.7089441705222568e-06, "loss": 0.049, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 478 }, { "completion_length": 545.5, "epoch": 0.5109333333333334, "grad_norm": 0.10143876820802689, "kl": 0.010120391845703125, "learning_rate": 1.7034072082974805e-06, "loss": -0.0194, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 479 }, { "completion_length": 636.5625, "epoch": 0.512, "grad_norm": 0.003569734515622258, "kl": 0.005367279052734375, "learning_rate": 1.6978674211248676e-06, "loss": 0.0147, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 480 }, { "completion_length": 591.3125, "epoch": 0.5130666666666667, "grad_norm": 0.0029964460991322994, "kl": 0.0060024261474609375, "learning_rate": 1.69232488594176e-06, "loss": 0.0039, "reward": 0.75, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 481 }, { "completion_length": 607.96875, "epoch": 0.5141333333333333, "grad_norm": 0.006466308142989874, "kl": 0.0077419281005859375, "learning_rate": 1.6867796797236638e-06, "loss": -0.0106, "reward": 0.46875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 482 }, { "completion_length": 459.46875, "epoch": 0.5152, "grad_norm": 0.003561146557331085, "kl": 0.0049228668212890625, "learning_rate": 1.6812318794831804e-06, "loss": 0.0161, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 483 }, { "completion_length": 720.6875, "epoch": 0.5162666666666667, "grad_norm": 0.25042906403541565, "kl": 0.00395965576171875, "learning_rate": 1.6756815622689371e-06, "loss": 0.0726, "reward": 0.28125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 484 }, { "completion_length": 489.9375, "epoch": 0.5173333333333333, "grad_norm": 0.00831480324268341, "kl": 0.008487701416015625, "learning_rate": 1.6701288051645182e-06, "loss": -0.0175, "reward": 0.8125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 485 }, { "completion_length": 535.15625, "epoch": 0.5184, "grad_norm": 0.10171255469322205, "kl": 0.009037017822265625, "learning_rate": 1.664573685287393e-06, "loss": 0.0541, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 486 }, { "completion_length": 607.75, "epoch": 0.5194666666666666, "grad_norm": 0.11249890923500061, "kl": 0.004817962646484375, "learning_rate": 1.6590162797878457e-06, "loss": 0.1128, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 487 }, { "completion_length": 608.75, "epoch": 0.5205333333333333, "grad_norm": 0.002099963603541255, "kl": 0.004307746887207031, "learning_rate": 1.653456665847903e-06, "loss": -0.0066, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 488 }, { "completion_length": 575.5625, "epoch": 0.5216, "grad_norm": 0.14549586176872253, "kl": 0.009647369384765625, "learning_rate": 1.6478949206802629e-06, "loss": -0.0253, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 489 }, { "completion_length": 704.25, "epoch": 0.5226666666666666, "grad_norm": 0.1498410552740097, "kl": 0.0046863555908203125, "learning_rate": 1.642331121527223e-06, "loss": 0.0994, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 490 }, { "completion_length": 568.5625, "epoch": 0.5237333333333334, "grad_norm": 0.0022868765518069267, "kl": 0.005413055419921875, "learning_rate": 1.6367653456596054e-06, "loss": 0.0131, "reward": 0.75, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 491 }, { "completion_length": 577.96875, "epoch": 0.5248, "grad_norm": 0.2096066027879715, "kl": 0.0055370330810546875, "learning_rate": 1.6311976703756868e-06, "loss": 0.1495, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 492 }, { "completion_length": 591.15625, "epoch": 0.5258666666666667, "grad_norm": 0.008033943362534046, "kl": 0.0063533782958984375, "learning_rate": 1.6256281730001213e-06, "loss": 0.0236, "reward": 0.46875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 493 }, { "completion_length": 470.5, "epoch": 0.5269333333333334, "grad_norm": 0.1001306027173996, "kl": 0.014371871948242188, "learning_rate": 1.6200569308828705e-06, "loss": -0.0043, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 494 }, { "completion_length": 457.875, "epoch": 0.528, "grad_norm": 0.0024045556783676147, "kl": 0.007180213928222656, "learning_rate": 1.6144840213981257e-06, "loss": -0.0058, "reward": 0.84375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 495 }, { "completion_length": 612.1875, "epoch": 0.5290666666666667, "grad_norm": 0.06458389759063721, "kl": 0.014261245727539062, "learning_rate": 1.6089095219432359e-06, "loss": 0.0285, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 496 }, { "completion_length": 538.40625, "epoch": 0.5301333333333333, "grad_norm": 0.06350859254598618, "kl": 0.00640869140625, "learning_rate": 1.6033335099376315e-06, "loss": 0.0403, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 497 }, { "completion_length": 609.46875, "epoch": 0.5312, "grad_norm": 0.20542192459106445, "kl": 0.0064411163330078125, "learning_rate": 1.5977560628217482e-06, "loss": -0.003, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 498 }, { "completion_length": 680.5625, "epoch": 0.5322666666666667, "grad_norm": 0.0738619863986969, "kl": 0.0035991668701171875, "learning_rate": 1.5921772580559549e-06, "loss": 0.0864, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 499 }, { "completion_length": 537.6875, "epoch": 0.5333333333333333, "grad_norm": 0.06899680942296982, "kl": 0.009592056274414062, "learning_rate": 1.5865971731194738e-06, "loss": -0.0237, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 500 }, { "completion_length": 577.25, "epoch": 0.5344, "grad_norm": 0.3886685073375702, "kl": 0.0175018310546875, "learning_rate": 1.5810158855093075e-06, "loss": -0.0446, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 501 }, { "completion_length": 769.3125, "epoch": 0.5354666666666666, "grad_norm": 0.10831693559885025, "kl": 0.0053043365478515625, "learning_rate": 1.5754334727391613e-06, "loss": 0.0363, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 502 }, { "completion_length": 726.71875, "epoch": 0.5365333333333333, "grad_norm": 0.06156935170292854, "kl": 0.0047588348388671875, "learning_rate": 1.5698500123383657e-06, "loss": -0.0297, "reward": 0.40625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 503 }, { "completion_length": 444.4375, "epoch": 0.5376, "grad_norm": 0.007379237562417984, "kl": 0.009157180786132812, "learning_rate": 1.5642655818508029e-06, "loss": 0.0441, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 504 }, { "completion_length": 537.125, "epoch": 0.5386666666666666, "grad_norm": 0.007401552051305771, "kl": 0.0103607177734375, "learning_rate": 1.5586802588338262e-06, "loss": 0.0004, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 505 }, { "completion_length": 623.03125, "epoch": 0.5397333333333333, "grad_norm": 0.22349928319454193, "kl": 0.0036115646362304688, "learning_rate": 1.553094120857185e-06, "loss": 0.0224, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 506 }, { "completion_length": 534.6875, "epoch": 0.5408, "grad_norm": 0.05681007727980614, "kl": 0.005970954895019531, "learning_rate": 1.547507245501947e-06, "loss": -0.0223, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 507 }, { "completion_length": 573.125, "epoch": 0.5418666666666667, "grad_norm": 0.11273617297410965, "kl": 0.0059757232666015625, "learning_rate": 1.5419197103594208e-06, "loss": -0.0215, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 508 }, { "completion_length": 586.90625, "epoch": 0.5429333333333334, "grad_norm": 0.12455052137374878, "kl": 0.005681037902832031, "learning_rate": 1.5363315930300777e-06, "loss": -0.0089, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 509 }, { "completion_length": 544.5625, "epoch": 0.544, "grad_norm": 0.003329213010147214, "kl": 0.0090179443359375, "learning_rate": 1.5307429711224756e-06, "loss": 0.1037, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 510 }, { "completion_length": 503.71875, "epoch": 0.5450666666666667, "grad_norm": 0.24565471708774567, "kl": 0.00762939453125, "learning_rate": 1.525153922252179e-06, "loss": 0.1348, "reward": 0.9375, "reward_std": 0.125, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 511 }, { "completion_length": 484.59375, "epoch": 0.5461333333333334, "grad_norm": 0.07559490948915482, "kl": 0.01198577880859375, "learning_rate": 1.519564524040682e-06, "loss": 0.0301, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 512 }, { "completion_length": 660.59375, "epoch": 0.5472, "grad_norm": 0.13475289940834045, "kl": 0.0047512054443359375, "learning_rate": 1.5139748541143317e-06, "loss": 0.06, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 513 }, { "completion_length": 690.59375, "epoch": 0.5482666666666667, "grad_norm": 0.07179153710603714, "kl": 0.006458282470703125, "learning_rate": 1.5083849901032472e-06, "loss": 0.0392, "reward": 0.40625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 514 }, { "completion_length": 518.21875, "epoch": 0.5493333333333333, "grad_norm": 0.0057837325148284435, "kl": 0.021331787109375, "learning_rate": 1.5027950096402447e-06, "loss": 0.0569, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 515 }, { "completion_length": 568.21875, "epoch": 0.5504, "grad_norm": 0.0027763855177909136, "kl": 0.006763458251953125, "learning_rate": 1.4972049903597554e-06, "loss": 0.0542, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 516 }, { "completion_length": 535.15625, "epoch": 0.5514666666666667, "grad_norm": 0.08124169707298279, "kl": 0.006328582763671875, "learning_rate": 1.4916150098967525e-06, "loss": 0.0585, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 517 }, { "completion_length": 652.15625, "epoch": 0.5525333333333333, "grad_norm": 0.0013067522086203098, "kl": 0.015289306640625, "learning_rate": 1.4860251458856683e-06, "loss": 0.0018, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 518 }, { "completion_length": 552.90625, "epoch": 0.5536, "grad_norm": 0.15632221102714539, "kl": 0.007717132568359375, "learning_rate": 1.4804354759593176e-06, "loss": 0.002, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 519 }, { "completion_length": 627.15625, "epoch": 0.5546666666666666, "grad_norm": 0.08449096232652664, "kl": 0.00614166259765625, "learning_rate": 1.474846077747821e-06, "loss": 0.0244, "reward": 0.40625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 520 }, { "completion_length": 689.4375, "epoch": 0.5557333333333333, "grad_norm": 0.003158832434564829, "kl": 0.00634765625, "learning_rate": 1.4692570288775243e-06, "loss": -0.0143, "reward": 0.40625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 521 }, { "completion_length": 561.03125, "epoch": 0.5568, "grad_norm": 0.06882524490356445, "kl": 0.00974273681640625, "learning_rate": 1.4636684069699222e-06, "loss": 0.0916, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 522 }, { "completion_length": 558.65625, "epoch": 0.5578666666666666, "grad_norm": 0.005670291371643543, "kl": 0.006908416748046875, "learning_rate": 1.4580802896405793e-06, "loss": 0.0188, "reward": 0.875, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 523 }, { "completion_length": 605.90625, "epoch": 0.5589333333333333, "grad_norm": 0.00541009521111846, "kl": 0.0084228515625, "learning_rate": 1.452492754498053e-06, "loss": 0.0025, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 524 }, { "completion_length": 582.03125, "epoch": 0.56, "grad_norm": 0.20389975607395172, "kl": 0.008869171142578125, "learning_rate": 1.4469058791428154e-06, "loss": 0.0742, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 525 }, { "completion_length": 578.28125, "epoch": 0.5610666666666667, "grad_norm": 0.08337868005037308, "kl": 0.004375457763671875, "learning_rate": 1.4413197411661739e-06, "loss": 0.0, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 526 }, { "completion_length": 589.1875, "epoch": 0.5621333333333334, "grad_norm": 0.22679054737091064, "kl": 0.01139068603515625, "learning_rate": 1.4357344181491972e-06, "loss": 0.0928, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 527 }, { "completion_length": 454.90625, "epoch": 0.5632, "grad_norm": 0.08185582607984543, "kl": 0.009586334228515625, "learning_rate": 1.4301499876616344e-06, "loss": -0.0248, "reward": 0.9375, "reward_std": 0.125, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 528 }, { "completion_length": 638.84375, "epoch": 0.5642666666666667, "grad_norm": 0.003445959649980068, "kl": 0.007947921752929688, "learning_rate": 1.4245665272608392e-06, "loss": 0.0646, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 529 }, { "completion_length": 599.4375, "epoch": 0.5653333333333334, "grad_norm": 0.003123276634141803, "kl": 0.007976531982421875, "learning_rate": 1.4189841144906928e-06, "loss": 0.0198, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 530 }, { "completion_length": 698.71875, "epoch": 0.5664, "grad_norm": 0.007981996051967144, "kl": 0.009836196899414062, "learning_rate": 1.4134028268805265e-06, "loss": 0.0601, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 531 }, { "completion_length": 584.75, "epoch": 0.5674666666666667, "grad_norm": 0.07089339941740036, "kl": 0.007921218872070312, "learning_rate": 1.4078227419440454e-06, "loss": 0.0439, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 532 }, { "completion_length": 511.0, "epoch": 0.5685333333333333, "grad_norm": 0.14340925216674805, "kl": 0.009778976440429688, "learning_rate": 1.402243937178252e-06, "loss": 0.0397, "reward": 0.84375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 533 }, { "completion_length": 463.0625, "epoch": 0.5696, "grad_norm": 0.07953439652919769, "kl": 0.008708953857421875, "learning_rate": 1.396666490062369e-06, "loss": -0.038, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 534 }, { "completion_length": 422.21875, "epoch": 0.5706666666666667, "grad_norm": 0.006398436147719622, "kl": 0.01219940185546875, "learning_rate": 1.3910904780567642e-06, "loss": 0.0019, "reward": 0.96875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 0.0, "step": 535 }, { "completion_length": 582.46875, "epoch": 0.5717333333333333, "grad_norm": 0.0050347172655165195, "kl": 0.009166717529296875, "learning_rate": 1.3855159786018744e-06, "loss": -0.0502, "reward": 0.4375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 536 }, { "completion_length": 613.4375, "epoch": 0.5728, "grad_norm": 0.08542890846729279, "kl": 0.006318092346191406, "learning_rate": 1.37994306911713e-06, "loss": -0.0206, "reward": 0.46875, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 537 }, { "completion_length": 569.84375, "epoch": 0.5738666666666666, "grad_norm": 0.08354827016592026, "kl": 0.009124755859375, "learning_rate": 1.374371826999879e-06, "loss": 0.0481, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 538 }, { "completion_length": 518.21875, "epoch": 0.5749333333333333, "grad_norm": 0.08562985807657242, "kl": 0.010179519653320312, "learning_rate": 1.368802329624314e-06, "loss": 0.0021, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 539 }, { "completion_length": 582.9375, "epoch": 0.576, "grad_norm": 0.08169583976268768, "kl": 0.0074558258056640625, "learning_rate": 1.3632346543403946e-06, "loss": 0.0274, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 540 }, { "completion_length": 716.03125, "epoch": 0.5770666666666666, "grad_norm": 0.1649378091096878, "kl": 0.008716583251953125, "learning_rate": 1.3576688784727775e-06, "loss": 0.0178, "reward": 0.46875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 541 }, { "completion_length": 785.71875, "epoch": 0.5781333333333334, "grad_norm": 0.0016928298864513636, "kl": 0.0049896240234375, "learning_rate": 1.3521050793197374e-06, "loss": 0.0539, "reward": 0.375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 542 }, { "completion_length": 592.09375, "epoch": 0.5792, "grad_norm": 0.07521883398294449, "kl": 0.005626678466796875, "learning_rate": 1.3465433341520975e-06, "loss": -0.0021, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 543 }, { "completion_length": 585.96875, "epoch": 0.5802666666666667, "grad_norm": 0.0026555198710411787, "kl": 0.0054912567138671875, "learning_rate": 1.3409837202121548e-06, "loss": 0.0407, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 544 }, { "completion_length": 529.09375, "epoch": 0.5813333333333334, "grad_norm": 0.06821374595165253, "kl": 0.005103111267089844, "learning_rate": 1.335426314712607e-06, "loss": 0.0291, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 545 }, { "completion_length": 656.5, "epoch": 0.5824, "grad_norm": 0.00222398666664958, "kl": 0.0058460235595703125, "learning_rate": 1.3298711948354818e-06, "loss": -0.0263, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 546 }, { "completion_length": 652.125, "epoch": 0.5834666666666667, "grad_norm": 0.10121048241853714, "kl": 0.005620002746582031, "learning_rate": 1.324318437731063e-06, "loss": 0.1088, "reward": 0.625, "reward_std": 0.4471687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 547 }, { "completion_length": 622.90625, "epoch": 0.5845333333333333, "grad_norm": 0.1462131142616272, "kl": 0.005725860595703125, "learning_rate": 1.3187681205168196e-06, "loss": 0.0158, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 548 }, { "completion_length": 630.0625, "epoch": 0.5856, "grad_norm": 0.08489499241113663, "kl": 0.0041179656982421875, "learning_rate": 1.313220320276336e-06, "loss": -0.0089, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 549 }, { "completion_length": 759.71875, "epoch": 0.5866666666666667, "grad_norm": 0.14951834082603455, "kl": 0.023744583129882812, "learning_rate": 1.3076751140582396e-06, "loss": -0.017, "reward": 0.34375, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 550 }, { "completion_length": 710.75, "epoch": 0.5877333333333333, "grad_norm": 0.0726216658949852, "kl": 0.00521087646484375, "learning_rate": 1.3021325788751322e-06, "loss": 0.0426, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 551 }, { "completion_length": 848.5625, "epoch": 0.5888, "grad_norm": 0.07649441808462143, "kl": 0.004230499267578125, "learning_rate": 1.2965927917025198e-06, "loss": -0.0038, "reward": 0.3125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 552 }, { "completion_length": 475.59375, "epoch": 0.5898666666666667, "grad_norm": 0.10233528912067413, "kl": 0.008541107177734375, "learning_rate": 1.2910558294777435e-06, "loss": 0.115, "reward": 0.90625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 553 }, { "completion_length": 496.03125, "epoch": 0.5909333333333333, "grad_norm": 0.002815983258187771, "kl": 0.006946563720703125, "learning_rate": 1.285521769098911e-06, "loss": 0.0201, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 554 }, { "completion_length": 537.15625, "epoch": 0.592, "grad_norm": 0.08456642180681229, "kl": 0.0066204071044921875, "learning_rate": 1.2799906874238297e-06, "loss": 0.0169, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 555 }, { "completion_length": 560.03125, "epoch": 0.5930666666666666, "grad_norm": 0.004056063015013933, "kl": 0.0061550140380859375, "learning_rate": 1.2744626612689368e-06, "loss": -0.0151, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 556 }, { "completion_length": 601.9375, "epoch": 0.5941333333333333, "grad_norm": 0.16341160237789154, "kl": 0.0063495635986328125, "learning_rate": 1.2689377674082355e-06, "loss": 0.0008, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 557 }, { "completion_length": 684.1875, "epoch": 0.5952, "grad_norm": 0.12563107907772064, "kl": 0.005268096923828125, "learning_rate": 1.263416082572226e-06, "loss": 0.0128, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 558 }, { "completion_length": 661.28125, "epoch": 0.5962666666666666, "grad_norm": 0.004542877431958914, "kl": 0.0076274871826171875, "learning_rate": 1.257897683446842e-06, "loss": 0.0161, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 559 }, { "completion_length": 661.46875, "epoch": 0.5973333333333334, "grad_norm": 0.18340112268924713, "kl": 0.0056018829345703125, "learning_rate": 1.2523826466723843e-06, "loss": 0.0488, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 560 }, { "completion_length": 560.0, "epoch": 0.5984, "grad_norm": 0.06594633311033249, "kl": 0.0051364898681640625, "learning_rate": 1.2468710488424574e-06, "loss": -0.0161, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 561 }, { "completion_length": 613.84375, "epoch": 0.5994666666666667, "grad_norm": 0.003265515435487032, "kl": 0.007442474365234375, "learning_rate": 1.2413629665029049e-06, "loss": 0.0169, "reward": 0.53125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 562 }, { "completion_length": 526.78125, "epoch": 0.6005333333333334, "grad_norm": 0.08629011362791061, "kl": 0.009281158447265625, "learning_rate": 1.2358584761507467e-06, "loss": 0.0057, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 563 }, { "completion_length": 519.375, "epoch": 0.6016, "grad_norm": 0.1898590326309204, "kl": 0.01105499267578125, "learning_rate": 1.2303576542331168e-06, "loss": 0.0934, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 564 }, { "completion_length": 628.53125, "epoch": 0.6026666666666667, "grad_norm": 0.004293272737413645, "kl": 0.00766754150390625, "learning_rate": 1.2248605771462016e-06, "loss": -0.022, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 565 }, { "completion_length": 533.6875, "epoch": 0.6037333333333333, "grad_norm": 0.0017823687521740794, "kl": 0.007015228271484375, "learning_rate": 1.2193673212341784e-06, "loss": 0.0011, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 566 }, { "completion_length": 631.625, "epoch": 0.6048, "grad_norm": 0.09441449493169785, "kl": 0.017290115356445312, "learning_rate": 1.213877962788156e-06, "loss": 0.0114, "reward": 0.4375, "reward_std": 0.25, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 567 }, { "completion_length": 427.625, "epoch": 0.6058666666666667, "grad_norm": 0.0030161200556904078, "kl": 0.009952545166015625, "learning_rate": 1.2083925780451142e-06, "loss": -0.0288, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 568 }, { "completion_length": 520.75, "epoch": 0.6069333333333333, "grad_norm": 0.005410979967564344, "kl": 0.00991058349609375, "learning_rate": 1.2029112431868455e-06, "loss": 0.0939, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 569 }, { "completion_length": 528.1875, "epoch": 0.608, "grad_norm": 0.0031468267552554607, "kl": 0.0100860595703125, "learning_rate": 1.1974340343388974e-06, "loss": 0.037, "reward": 0.5625, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 570 }, { "completion_length": 474.15625, "epoch": 0.6090666666666666, "grad_norm": 0.006301326677203178, "kl": 0.010219573974609375, "learning_rate": 1.1919610275695144e-06, "loss": -0.0107, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 571 }, { "completion_length": 612.84375, "epoch": 0.6101333333333333, "grad_norm": 0.0024508426431566477, "kl": 0.0071887969970703125, "learning_rate": 1.186492298888582e-06, "loss": 0.0081, "reward": 0.53125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 572 }, { "completion_length": 693.875, "epoch": 0.6112, "grad_norm": 0.11995789408683777, "kl": 0.005517005920410156, "learning_rate": 1.1810279242465714e-06, "loss": 0.0162, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 573 }, { "completion_length": 531.90625, "epoch": 0.6122666666666666, "grad_norm": 0.10004834830760956, "kl": 0.0061187744140625, "learning_rate": 1.1755679795334832e-06, "loss": 0.0038, "reward": 0.5625, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 574 }, { "completion_length": 629.875, "epoch": 0.6133333333333333, "grad_norm": 0.17769820988178253, "kl": 0.0042629241943359375, "learning_rate": 1.1701125405777965e-06, "loss": 0.0311, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 575 }, { "completion_length": 531.8125, "epoch": 0.6144, "grad_norm": 0.0022228206507861614, "kl": 0.014484405517578125, "learning_rate": 1.164661683145412e-06, "loss": -0.0083, "reward": 0.875, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 576 }, { "completion_length": 747.71875, "epoch": 0.6154666666666667, "grad_norm": 0.1392834335565567, "kl": 0.0072917938232421875, "learning_rate": 1.1592154829386022e-06, "loss": 0.0112, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 577 }, { "completion_length": 662.8125, "epoch": 0.6165333333333334, "grad_norm": 0.008566298522055149, "kl": 0.009979248046875, "learning_rate": 1.1537740155949595e-06, "loss": -0.0167, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 578 }, { "completion_length": 601.78125, "epoch": 0.6176, "grad_norm": 0.022806497290730476, "kl": 0.011278152465820312, "learning_rate": 1.1483373566863454e-06, "loss": 0.0024, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 579 }, { "completion_length": 552.71875, "epoch": 0.6186666666666667, "grad_norm": 0.16697819530963898, "kl": 0.00832366943359375, "learning_rate": 1.142905581717841e-06, "loss": 0.0386, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 580 }, { "completion_length": 496.6875, "epoch": 0.6197333333333334, "grad_norm": 0.002235228195786476, "kl": 0.0066738128662109375, "learning_rate": 1.1374787661266998e-06, "loss": 0.0672, "reward": 0.90625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 581 }, { "completion_length": 554.9375, "epoch": 0.6208, "grad_norm": 0.002343305153772235, "kl": 0.0049037933349609375, "learning_rate": 1.132056985281297e-06, "loss": 0.0644, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 582 }, { "completion_length": 417.15625, "epoch": 0.6218666666666667, "grad_norm": 0.19298502802848816, "kl": 0.016454696655273438, "learning_rate": 1.1266403144800856e-06, "loss": -0.0263, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 583 }, { "completion_length": 486.625, "epoch": 0.6229333333333333, "grad_norm": 0.0044172764755785465, "kl": 0.008241653442382812, "learning_rate": 1.1212288289505494e-06, "loss": 0.0384, "reward": 0.9375, "reward_std": 0.125, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 584 }, { "completion_length": 614.125, "epoch": 0.624, "grad_norm": 0.11297555267810822, "kl": 0.008264541625976562, "learning_rate": 1.1158226038481584e-06, "loss": 0.0513, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 585 }, { "completion_length": 560.59375, "epoch": 0.6250666666666667, "grad_norm": 0.0020602887962013483, "kl": 0.011857986450195312, "learning_rate": 1.1104217142553247e-06, "loss": -0.0149, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 586 }, { "completion_length": 588.625, "epoch": 0.6261333333333333, "grad_norm": 0.07729622721672058, "kl": 0.006234169006347656, "learning_rate": 1.105026235180361e-06, "loss": 0.071, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 587 }, { "completion_length": 571.625, "epoch": 0.6272, "grad_norm": 0.0027119379956275225, "kl": 0.0062046051025390625, "learning_rate": 1.099636241556437e-06, "loss": 0.0064, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 588 }, { "completion_length": 742.25, "epoch": 0.6282666666666666, "grad_norm": 0.004309109877794981, "kl": 0.004590034484863281, "learning_rate": 1.0942518082405401e-06, "loss": 0.0155, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 589 }, { "completion_length": 629.71875, "epoch": 0.6293333333333333, "grad_norm": 0.0031700979452580214, "kl": 0.0043773651123046875, "learning_rate": 1.0888730100124355e-06, "loss": 0.0152, "reward": 0.40625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 590 }, { "completion_length": 541.8125, "epoch": 0.6304, "grad_norm": 0.0752197876572609, "kl": 0.00925445556640625, "learning_rate": 1.0834999215736271e-06, "loss": 0.021, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 591 }, { "completion_length": 594.5625, "epoch": 0.6314666666666666, "grad_norm": 0.006536629982292652, "kl": 0.008878707885742188, "learning_rate": 1.0781326175463212e-06, "loss": 0.0464, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 592 }, { "completion_length": 533.34375, "epoch": 0.6325333333333333, "grad_norm": 0.128348246216774, "kl": 0.01143646240234375, "learning_rate": 1.0727711724723881e-06, "loss": 0.0146, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 593 }, { "completion_length": 549.0, "epoch": 0.6336, "grad_norm": 0.07991447299718857, "kl": 0.008884429931640625, "learning_rate": 1.0674156608123294e-06, "loss": 0.1072, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 594 }, { "completion_length": 605.4375, "epoch": 0.6346666666666667, "grad_norm": 0.08525198698043823, "kl": 0.00734710693359375, "learning_rate": 1.062066156944242e-06, "loss": -0.0371, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 595 }, { "completion_length": 546.0625, "epoch": 0.6357333333333334, "grad_norm": 0.0020145827438682318, "kl": 0.007998466491699219, "learning_rate": 1.0567227351627864e-06, "loss": -0.0107, "reward": 0.8125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 596 }, { "completion_length": 607.96875, "epoch": 0.6368, "grad_norm": 0.060908038169145584, "kl": 0.0057239532470703125, "learning_rate": 1.0513854696781531e-06, "loss": -0.0207, "reward": 0.84375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 597 }, { "completion_length": 642.90625, "epoch": 0.6378666666666667, "grad_norm": 0.11555396765470505, "kl": 0.0044708251953125, "learning_rate": 1.0460544346150335e-06, "loss": -0.015, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 598 }, { "completion_length": 681.6875, "epoch": 0.6389333333333334, "grad_norm": 0.1107495054602623, "kl": 0.0043354034423828125, "learning_rate": 1.040729704011591e-06, "loss": 0.0251, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 599 }, { "completion_length": 456.96875, "epoch": 0.64, "grad_norm": 0.1629042625427246, "kl": 0.015047073364257812, "learning_rate": 1.0354113518184304e-06, "loss": 0.0196, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 600 }, { "completion_length": 653.53125, "epoch": 0.6410666666666667, "grad_norm": 0.010197960771620274, "kl": 0.0067996978759765625, "learning_rate": 1.0300994518975732e-06, "loss": 0.0328, "reward": 0.75, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 601 }, { "completion_length": 648.75, "epoch": 0.6421333333333333, "grad_norm": 0.004543755203485489, "kl": 0.0068264007568359375, "learning_rate": 1.0247940780214302e-06, "loss": 0.0081, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 602 }, { "completion_length": 502.9375, "epoch": 0.6432, "grad_norm": 0.006155628710985184, "kl": 0.009342193603515625, "learning_rate": 1.0194953038717773e-06, "loss": 0.0004, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 603 }, { "completion_length": 640.5625, "epoch": 0.6442666666666667, "grad_norm": 0.0014783332590013742, "kl": 0.0036678314208984375, "learning_rate": 1.0142032030387342e-06, "loss": 0.0037, "reward": 0.53125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 604 }, { "completion_length": 565.4375, "epoch": 0.6453333333333333, "grad_norm": 0.2626013159751892, "kl": 0.008373260498046875, "learning_rate": 1.008917849019739e-06, "loss": 0.0912, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 605 }, { "completion_length": 483.28125, "epoch": 0.6464, "grad_norm": 0.0029966228175908327, "kl": 0.0058078765869140625, "learning_rate": 1.0036393152185294e-06, "loss": 0.0115, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 606 }, { "completion_length": 547.0, "epoch": 0.6474666666666666, "grad_norm": 0.15929335355758667, "kl": 0.00966644287109375, "learning_rate": 9.983676749441236e-07, "loss": -0.0531, "reward": 0.40625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 607 }, { "completion_length": 531.71875, "epoch": 0.6485333333333333, "grad_norm": 0.002390535082668066, "kl": 0.005558013916015625, "learning_rate": 9.931030014098005e-07, "loss": -0.0094, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 608 }, { "completion_length": 515.125, "epoch": 0.6496, "grad_norm": 0.08344373852014542, "kl": 0.007358551025390625, "learning_rate": 9.878453677320847e-07, "loss": 0.038, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 609 }, { "completion_length": 549.75, "epoch": 0.6506666666666666, "grad_norm": 0.010012468323111534, "kl": 0.009349822998046875, "learning_rate": 9.825948469297303e-07, "loss": 0.0177, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 610 }, { "completion_length": 556.46875, "epoch": 0.6517333333333334, "grad_norm": 0.11691832542419434, "kl": 0.0055027008056640625, "learning_rate": 9.77351511922706e-07, "loss": 0.0056, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 611 }, { "completion_length": 590.15625, "epoch": 0.6528, "grad_norm": 0.003396987682208419, "kl": 0.0043735504150390625, "learning_rate": 9.721154355311845e-07, "loss": 0.0006, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 612 }, { "completion_length": 533.71875, "epoch": 0.6538666666666667, "grad_norm": 0.002546977950260043, "kl": 0.007633209228515625, "learning_rate": 9.668866904745284e-07, "loss": 0.0183, "reward": 0.65625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 613 }, { "completion_length": 559.53125, "epoch": 0.6549333333333334, "grad_norm": 0.07408076524734497, "kl": 0.010529518127441406, "learning_rate": 9.616653493702824e-07, "loss": 0.0294, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 614 }, { "completion_length": 594.0625, "epoch": 0.656, "grad_norm": 0.20089086890220642, "kl": 0.0058078765869140625, "learning_rate": 9.564514847331647e-07, "loss": -0.0734, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 615 }, { "completion_length": 509.90625, "epoch": 0.6570666666666667, "grad_norm": 0.003512981813400984, "kl": 0.01174163818359375, "learning_rate": 9.512451689740579e-07, "loss": 0.0909, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 616 }, { "completion_length": 486.28125, "epoch": 0.6581333333333333, "grad_norm": 0.004877515137195587, "kl": 0.011852264404296875, "learning_rate": 9.460464743990059e-07, "loss": 0.0897, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 617 }, { "completion_length": 539.6875, "epoch": 0.6592, "grad_norm": 0.07878877967596054, "kl": 0.013519287109375, "learning_rate": 9.40855473208208e-07, "loss": 0.1351, "reward": 0.8125, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 618 }, { "completion_length": 574.09375, "epoch": 0.6602666666666667, "grad_norm": 0.16702018678188324, "kl": 0.010162353515625, "learning_rate": 9.356722374950166e-07, "loss": 0.017, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 619 }, { "completion_length": 671.21875, "epoch": 0.6613333333333333, "grad_norm": 0.0017575888196006417, "kl": 0.0077991485595703125, "learning_rate": 9.304968392449361e-07, "loss": 0.0556, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 620 }, { "completion_length": 705.84375, "epoch": 0.6624, "grad_norm": 0.0018127072835341096, "kl": 0.0044765472412109375, "learning_rate": 9.253293503346238e-07, "loss": 0.0209, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 621 }, { "completion_length": 583.03125, "epoch": 0.6634666666666666, "grad_norm": 0.0034383467864245176, "kl": 0.01155853271484375, "learning_rate": 9.201698425308896e-07, "loss": 0.0172, "reward": 0.46875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 622 }, { "completion_length": 627.4375, "epoch": 0.6645333333333333, "grad_norm": 0.11456794291734695, "kl": 0.0067806243896484375, "learning_rate": 9.150183874897021e-07, "loss": 0.0728, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 623 }, { "completion_length": 532.59375, "epoch": 0.6656, "grad_norm": 0.2247643768787384, "kl": 0.010736465454101562, "learning_rate": 9.098750567551911e-07, "loss": 0.0678, "reward": 0.96875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 0.0, "step": 624 }, { "completion_length": 649.96875, "epoch": 0.6666666666666666, "grad_norm": 0.0746990293264389, "kl": 0.009521484375, "learning_rate": 9.047399217586552e-07, "loss": 0.0323, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 625 }, { "completion_length": 609.53125, "epoch": 0.6677333333333333, "grad_norm": 0.003634110791608691, "kl": 0.00774383544921875, "learning_rate": 8.996130538175697e-07, "loss": 0.0286, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 626 }, { "completion_length": 476.25, "epoch": 0.6688, "grad_norm": 0.0027973384130746126, "kl": 0.0070648193359375, "learning_rate": 8.944945241345953e-07, "loss": 0.0003, "reward": 0.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 627 }, { "completion_length": 599.28125, "epoch": 0.6698666666666667, "grad_norm": 0.06830956041812897, "kl": 0.00909423828125, "learning_rate": 8.893844037965898e-07, "loss": -0.0006, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 628 }, { "completion_length": 728.40625, "epoch": 0.6709333333333334, "grad_norm": 0.0020148097537457943, "kl": 0.006480216979980469, "learning_rate": 8.842827637736218e-07, "loss": -0.0166, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 629 }, { "completion_length": 602.90625, "epoch": 0.672, "grad_norm": 0.006371679250150919, "kl": 0.010519027709960938, "learning_rate": 8.791896749179831e-07, "loss": 0.0174, "reward": 0.6875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 630 }, { "completion_length": 497.40625, "epoch": 0.6730666666666667, "grad_norm": 0.11963934451341629, "kl": 0.00641632080078125, "learning_rate": 8.741052079632063e-07, "loss": -0.0001, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 631 }, { "completion_length": 775.21875, "epoch": 0.6741333333333334, "grad_norm": 0.002268408890813589, "kl": 0.0055294036865234375, "learning_rate": 8.690294335230808e-07, "loss": 0.0446, "reward": 0.34375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 632 }, { "completion_length": 673.375, "epoch": 0.6752, "grad_norm": 0.12447670847177505, "kl": 0.006397247314453125, "learning_rate": 8.639624220906747e-07, "loss": 0.0275, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 633 }, { "completion_length": 673.5625, "epoch": 0.6762666666666667, "grad_norm": 0.07962203025817871, "kl": 0.00616455078125, "learning_rate": 8.589042440373532e-07, "loss": 0.0247, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 634 }, { "completion_length": 757.625, "epoch": 0.6773333333333333, "grad_norm": 0.1509641706943512, "kl": 0.004940032958984375, "learning_rate": 8.538549696118023e-07, "loss": 0.0257, "reward": 0.40625, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 635 }, { "completion_length": 694.0625, "epoch": 0.6784, "grad_norm": 0.12462906539440155, "kl": 0.00576019287109375, "learning_rate": 8.488146689390535e-07, "loss": 0.0144, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 636 }, { "completion_length": 550.46875, "epoch": 0.6794666666666667, "grad_norm": 0.07366382330656052, "kl": 0.00632476806640625, "learning_rate": 8.437834120195094e-07, "loss": 0.0056, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 637 }, { "completion_length": 648.28125, "epoch": 0.6805333333333333, "grad_norm": 0.11849912256002426, "kl": 0.011875152587890625, "learning_rate": 8.387612687279718e-07, "loss": 0.1514, "reward": 0.4375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 638 }, { "completion_length": 598.84375, "epoch": 0.6816, "grad_norm": 0.22002704441547394, "kl": 0.011417388916015625, "learning_rate": 8.337483088126709e-07, "loss": 0.115, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 639 }, { "completion_length": 599.28125, "epoch": 0.6826666666666666, "grad_norm": 0.07228660583496094, "kl": 0.0073108673095703125, "learning_rate": 8.287446018942973e-07, "loss": 0.0801, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 640 }, { "completion_length": 508.34375, "epoch": 0.6837333333333333, "grad_norm": 0.13455718755722046, "kl": 0.006267547607421875, "learning_rate": 8.237502174650336e-07, "loss": 0.0286, "reward": 0.78125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 641 }, { "completion_length": 639.03125, "epoch": 0.6848, "grad_norm": 0.19336862862110138, "kl": 0.0064411163330078125, "learning_rate": 8.187652248875924e-07, "loss": 0.1352, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 642 }, { "completion_length": 542.3125, "epoch": 0.6858666666666666, "grad_norm": 0.09589075297117233, "kl": 0.021268844604492188, "learning_rate": 8.137896933942495e-07, "loss": 0.0459, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 643 }, { "completion_length": 770.46875, "epoch": 0.6869333333333333, "grad_norm": 0.0019284122390672565, "kl": 0.004985809326171875, "learning_rate": 8.088236920858835e-07, "loss": 0.0641, "reward": 0.34375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 644 }, { "completion_length": 626.40625, "epoch": 0.688, "grad_norm": 0.09182769805192947, "kl": 0.005756378173828125, "learning_rate": 8.038672899310176e-07, "loss": 0.0965, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 645 }, { "completion_length": 623.96875, "epoch": 0.6890666666666667, "grad_norm": 0.00226573389954865, "kl": 0.00501251220703125, "learning_rate": 7.989205557648598e-07, "loss": -0.0294, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 646 }, { "completion_length": 628.1875, "epoch": 0.6901333333333334, "grad_norm": 0.11915570497512817, "kl": 0.0062999725341796875, "learning_rate": 7.939835582883478e-07, "loss": 0.0762, "reward": 0.59375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 647 }, { "completion_length": 572.21875, "epoch": 0.6912, "grad_norm": 0.09051520377397537, "kl": 0.0098419189453125, "learning_rate": 7.890563660671952e-07, "loss": 0.0635, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 648 }, { "completion_length": 674.28125, "epoch": 0.6922666666666667, "grad_norm": 0.003246230771765113, "kl": 0.006587982177734375, "learning_rate": 7.841390475309386e-07, "loss": 0.0265, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 649 }, { "completion_length": 541.8125, "epoch": 0.6933333333333334, "grad_norm": 0.1549820750951767, "kl": 0.010942459106445312, "learning_rate": 7.792316709719875e-07, "loss": 0.0238, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 650 }, { "completion_length": 678.28125, "epoch": 0.6944, "grad_norm": 0.09520485997200012, "kl": 0.006464958190917969, "learning_rate": 7.743343045446756e-07, "loss": 0.0251, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 651 }, { "completion_length": 631.96875, "epoch": 0.6954666666666667, "grad_norm": 0.07874906063079834, "kl": 0.010732650756835938, "learning_rate": 7.694470162643147e-07, "loss": -0.0033, "reward": 0.53125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 652 }, { "completion_length": 440.65625, "epoch": 0.6965333333333333, "grad_norm": 0.004659709520637989, "kl": 0.00861358642578125, "learning_rate": 7.6456987400625e-07, "loss": 0.0269, "reward": 0.8125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 653 }, { "completion_length": 669.34375, "epoch": 0.6976, "grad_norm": 0.21597163379192352, "kl": 0.007852554321289062, "learning_rate": 7.59702945504917e-07, "loss": 0.0148, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 654 }, { "completion_length": 623.8125, "epoch": 0.6986666666666667, "grad_norm": 0.1814793050289154, "kl": 0.013158798217773438, "learning_rate": 7.548462983529016e-07, "loss": -0.0146, "reward": 0.5, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 655 }, { "completion_length": 601.90625, "epoch": 0.6997333333333333, "grad_norm": 0.004123261664062738, "kl": 0.009037017822265625, "learning_rate": 7.500000000000003e-07, "loss": 0.0026, "reward": 0.53125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 656 }, { "completion_length": 642.59375, "epoch": 0.7008, "grad_norm": 0.2185526043176651, "kl": 0.007839202880859375, "learning_rate": 7.451641177522844e-07, "loss": 0.0978, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 657 }, { "completion_length": 529.5625, "epoch": 0.7018666666666666, "grad_norm": 0.003913010936230421, "kl": 0.009571075439453125, "learning_rate": 7.40338718771165e-07, "loss": -0.0093, "reward": 0.9375, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 658 }, { "completion_length": 769.78125, "epoch": 0.7029333333333333, "grad_norm": 0.17077980935573578, "kl": 0.0043182373046875, "learning_rate": 7.355238700724594e-07, "loss": 0.0063, "reward": 0.5, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 659 }, { "completion_length": 588.3125, "epoch": 0.704, "grad_norm": 0.08980058878660202, "kl": 0.007923126220703125, "learning_rate": 7.307196385254621e-07, "loss": 0.0093, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 660 }, { "completion_length": 540.15625, "epoch": 0.7050666666666666, "grad_norm": 0.10506106168031693, "kl": 0.009083747863769531, "learning_rate": 7.259260908520137e-07, "loss": -0.0017, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 661 }, { "completion_length": 698.21875, "epoch": 0.7061333333333333, "grad_norm": 0.1818976104259491, "kl": 0.005499839782714844, "learning_rate": 7.211432936255779e-07, "loss": 0.041, "reward": 0.6875, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 662 }, { "completion_length": 576.125, "epoch": 0.7072, "grad_norm": 0.11934824287891388, "kl": 0.0060577392578125, "learning_rate": 7.163713132703127e-07, "loss": 0.0141, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 663 }, { "completion_length": 497.15625, "epoch": 0.7082666666666667, "grad_norm": 0.09508629143238068, "kl": 0.009210586547851562, "learning_rate": 7.116102160601505e-07, "loss": -0.0246, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 664 }, { "completion_length": 536.96875, "epoch": 0.7093333333333334, "grad_norm": 0.08164741098880768, "kl": 0.00617218017578125, "learning_rate": 7.068600681178772e-07, "loss": -0.0125, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 665 }, { "completion_length": 565.96875, "epoch": 0.7104, "grad_norm": 0.10633134096860886, "kl": 0.009714126586914062, "learning_rate": 7.021209354142133e-07, "loss": 0.0066, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 666 }, { "completion_length": 618.1875, "epoch": 0.7114666666666667, "grad_norm": 0.1309768259525299, "kl": 0.009613037109375, "learning_rate": 6.97392883766899e-07, "loss": -0.0063, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 667 }, { "completion_length": 588.4375, "epoch": 0.7125333333333334, "grad_norm": 0.09822507947683334, "kl": 0.010354995727539062, "learning_rate": 6.926759788397783e-07, "loss": -0.0892, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 668 }, { "completion_length": 610.125, "epoch": 0.7136, "grad_norm": 0.003065653145313263, "kl": 0.0068511962890625, "learning_rate": 6.879702861418883e-07, "loss": 0.0206, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 669 }, { "completion_length": 726.6875, "epoch": 0.7146666666666667, "grad_norm": 0.111954465508461, "kl": 0.0049381256103515625, "learning_rate": 6.832758710265492e-07, "loss": -0.03, "reward": 0.40625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 670 }, { "completion_length": 675.78125, "epoch": 0.7157333333333333, "grad_norm": 0.0030857266392558813, "kl": 0.0058727264404296875, "learning_rate": 6.785927986904567e-07, "loss": 0.0321, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 671 }, { "completion_length": 639.96875, "epoch": 0.7168, "grad_norm": 0.002022956730797887, "kl": 0.0060062408447265625, "learning_rate": 6.739211341727761e-07, "loss": -0.0062, "reward": 0.46875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 672 }, { "completion_length": 585.65625, "epoch": 0.7178666666666667, "grad_norm": 0.0019693912472575903, "kl": 0.010616302490234375, "learning_rate": 6.692609423542393e-07, "loss": 0.0004, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 673 }, { "completion_length": 702.5625, "epoch": 0.7189333333333333, "grad_norm": 0.1405096799135208, "kl": 0.005766868591308594, "learning_rate": 6.646122879562435e-07, "loss": 0.0282, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 674 }, { "completion_length": 574.8125, "epoch": 0.72, "grad_norm": 0.0018582915654405951, "kl": 0.0062236785888671875, "learning_rate": 6.599752355399538e-07, "loss": 0.0207, "reward": 0.875, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 675 }, { "completion_length": 688.5, "epoch": 0.7210666666666666, "grad_norm": 0.123520627617836, "kl": 0.006732940673828125, "learning_rate": 6.55349849505404e-07, "loss": -0.0152, "reward": 0.40625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 676 }, { "completion_length": 644.8125, "epoch": 0.7221333333333333, "grad_norm": 0.08535100519657135, "kl": 0.0054874420166015625, "learning_rate": 6.507361940906042e-07, "loss": -0.0011, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 677 }, { "completion_length": 581.65625, "epoch": 0.7232, "grad_norm": 0.08444581180810928, "kl": 0.009906768798828125, "learning_rate": 6.461343333706476e-07, "loss": 0.0901, "reward": 0.75, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 678 }, { "completion_length": 685.75, "epoch": 0.7242666666666666, "grad_norm": 0.10592332482337952, "kl": 0.005435943603515625, "learning_rate": 6.415443312568216e-07, "loss": -0.0349, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 679 }, { "completion_length": 599.875, "epoch": 0.7253333333333334, "grad_norm": 0.0034602282103151083, "kl": 0.004803657531738281, "learning_rate": 6.369662514957191e-07, "loss": 0.0054, "reward": 0.84375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 680 }, { "completion_length": 733.96875, "epoch": 0.7264, "grad_norm": 0.13551753759384155, "kl": 0.0059680938720703125, "learning_rate": 6.324001576683539e-07, "loss": -0.0439, "reward": 0.53125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 681 }, { "completion_length": 612.0625, "epoch": 0.7274666666666667, "grad_norm": 0.0033902465365827084, "kl": 0.004238128662109375, "learning_rate": 6.278461131892775e-07, "loss": 0.0418, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 682 }, { "completion_length": 482.3125, "epoch": 0.7285333333333334, "grad_norm": 0.08672572672367096, "kl": 0.010015487670898438, "learning_rate": 6.233041813056982e-07, "loss": 0.0411, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 683 }, { "completion_length": 621.28125, "epoch": 0.7296, "grad_norm": 0.0023047016002237797, "kl": 0.00687408447265625, "learning_rate": 6.187744250966031e-07, "loss": 0.1053, "reward": 0.5, "reward_std": 0.25, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 684 }, { "completion_length": 656.625, "epoch": 0.7306666666666667, "grad_norm": 0.14560002088546753, "kl": 0.0064983367919921875, "learning_rate": 6.142569074718818e-07, "loss": 0.0326, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 685 }, { "completion_length": 637.78125, "epoch": 0.7317333333333333, "grad_norm": 0.003690156154334545, "kl": 0.010103225708007812, "learning_rate": 6.097516911714523e-07, "loss": 0.1044, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 686 }, { "completion_length": 666.25, "epoch": 0.7328, "grad_norm": 0.006694280542433262, "kl": 0.0063991546630859375, "learning_rate": 6.052588387643908e-07, "loss": 0.0819, "reward": 0.4375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 687 }, { "completion_length": 769.875, "epoch": 0.7338666666666667, "grad_norm": 0.1564282327890396, "kl": 0.005238533020019531, "learning_rate": 6.007784126480615e-07, "loss": 0.0471, "reward": 0.40625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 688 }, { "completion_length": 581.78125, "epoch": 0.7349333333333333, "grad_norm": 0.12418191879987717, "kl": 0.0108489990234375, "learning_rate": 5.963104750472507e-07, "loss": 0.011, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 689 }, { "completion_length": 588.25, "epoch": 0.736, "grad_norm": 0.14718829095363617, "kl": 0.018484115600585938, "learning_rate": 5.918550880133018e-07, "loss": -0.0253, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 690 }, { "completion_length": 630.28125, "epoch": 0.7370666666666666, "grad_norm": 0.09304559975862503, "kl": 0.01064300537109375, "learning_rate": 5.874123134232558e-07, "loss": 0.0053, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 691 }, { "completion_length": 485.53125, "epoch": 0.7381333333333333, "grad_norm": 0.0074284011498093605, "kl": 0.010320663452148438, "learning_rate": 5.829822129789891e-07, "loss": -0.01, "reward": 0.90625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 692 }, { "completion_length": 660.21875, "epoch": 0.7392, "grad_norm": 0.0748903825879097, "kl": 0.006557464599609375, "learning_rate": 5.785648482063575e-07, "loss": 0.0029, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 693 }, { "completion_length": 438.9375, "epoch": 0.7402666666666666, "grad_norm": 0.1984458714723587, "kl": 0.013507843017578125, "learning_rate": 5.741602804543429e-07, "loss": 0.004, "reward": 0.9375, "reward_std": 0.125, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 694 }, { "completion_length": 696.65625, "epoch": 0.7413333333333333, "grad_norm": 0.004082027357071638, "kl": 0.006256103515625, "learning_rate": 5.697685708941996e-07, "loss": 0.0485, "reward": 0.375, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 695 }, { "completion_length": 646.75, "epoch": 0.7424, "grad_norm": 0.1002301350235939, "kl": 0.0073757171630859375, "learning_rate": 5.653897805186062e-07, "loss": 0.1464, "reward": 0.59375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 696 }, { "completion_length": 503.09375, "epoch": 0.7434666666666667, "grad_norm": 0.0019200870301574469, "kl": 0.005741119384765625, "learning_rate": 5.610239701408176e-07, "loss": 0.0086, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 697 }, { "completion_length": 797.625, "epoch": 0.7445333333333334, "grad_norm": 0.07168744504451752, "kl": 0.0059375762939453125, "learning_rate": 5.566712003938203e-07, "loss": -0.0097, "reward": 0.40625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 698 }, { "completion_length": 557.375, "epoch": 0.7456, "grad_norm": 0.16634681820869446, "kl": 0.009192466735839844, "learning_rate": 5.52331531729491e-07, "loss": -0.0164, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 699 }, { "completion_length": 632.6875, "epoch": 0.7466666666666667, "grad_norm": 0.16910400986671448, "kl": 0.0063915252685546875, "learning_rate": 5.480050244177573e-07, "loss": 0.015, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 700 }, { "completion_length": 658.375, "epoch": 0.7477333333333334, "grad_norm": 0.00400980981066823, "kl": 0.006122589111328125, "learning_rate": 5.436917385457589e-07, "loss": 0.1333, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 701 }, { "completion_length": 525.9375, "epoch": 0.7488, "grad_norm": 0.008898410946130753, "kl": 0.009622573852539062, "learning_rate": 5.393917340170151e-07, "loss": 0.0544, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 702 }, { "completion_length": 554.90625, "epoch": 0.7498666666666667, "grad_norm": 0.1892712563276291, "kl": 0.00769805908203125, "learning_rate": 5.351050705505919e-07, "loss": -0.0344, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 703 }, { "completion_length": 529.125, "epoch": 0.7509333333333333, "grad_norm": 0.004739647731184959, "kl": 0.021844863891601562, "learning_rate": 5.308318076802728e-07, "loss": -0.0096, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 704 }, { "completion_length": 707.3125, "epoch": 0.752, "grad_norm": 0.17846082150936127, "kl": 0.0061092376708984375, "learning_rate": 5.265720047537318e-07, "loss": 0.0739, "reward": 0.71875, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 705 }, { "completion_length": 676.53125, "epoch": 0.7530666666666667, "grad_norm": 0.1468873918056488, "kl": 0.011857986450195312, "learning_rate": 5.223257209317092e-07, "loss": 0.0031, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 706 }, { "completion_length": 592.21875, "epoch": 0.7541333333333333, "grad_norm": 0.08820891380310059, "kl": 0.0073566436767578125, "learning_rate": 5.180930151871906e-07, "loss": 0.013, "reward": 0.75, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 707 }, { "completion_length": 536.5, "epoch": 0.7552, "grad_norm": 0.0022333976812660694, "kl": 0.005672454833984375, "learning_rate": 5.138739463045863e-07, "loss": 0.0031, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 708 }, { "completion_length": 448.125, "epoch": 0.7562666666666666, "grad_norm": 0.0060645174235105515, "kl": 0.00881195068359375, "learning_rate": 5.096685728789175e-07, "loss": 0.0649, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 709 }, { "completion_length": 577.9375, "epoch": 0.7573333333333333, "grad_norm": 0.17103607952594757, "kl": 0.008312225341796875, "learning_rate": 5.054769533149999e-07, "loss": 0.0396, "reward": 0.75, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 710 }, { "completion_length": 656.90625, "epoch": 0.7584, "grad_norm": 0.15209975838661194, "kl": 0.005420684814453125, "learning_rate": 5.012991458266337e-07, "loss": 0.0264, "reward": 0.5, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 711 }, { "completion_length": 575.71875, "epoch": 0.7594666666666666, "grad_norm": 0.021864358335733414, "kl": 0.008665084838867188, "learning_rate": 4.971352084357953e-07, "loss": 0.0211, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 712 }, { "completion_length": 621.34375, "epoch": 0.7605333333333333, "grad_norm": 0.19523032009601593, "kl": 0.00673675537109375, "learning_rate": 4.92985198971831e-07, "loss": 0.0368, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 713 }, { "completion_length": 613.40625, "epoch": 0.7616, "grad_norm": 0.10075124353170395, "kl": 0.007640838623046875, "learning_rate": 4.888491750706547e-07, "loss": 0.0641, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 714 }, { "completion_length": 534.3125, "epoch": 0.7626666666666667, "grad_norm": 0.07917347550392151, "kl": 0.008226394653320312, "learning_rate": 4.847271941739458e-07, "loss": 0.1107, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 715 }, { "completion_length": 560.5, "epoch": 0.7637333333333334, "grad_norm": 0.063052698969841, "kl": 0.015171051025390625, "learning_rate": 4.806193135283535e-07, "loss": 0.0214, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 716 }, { "completion_length": 533.0, "epoch": 0.7648, "grad_norm": 0.0018102749017998576, "kl": 0.0096435546875, "learning_rate": 4.765255901847003e-07, "loss": 0.0252, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 717 }, { "completion_length": 572.59375, "epoch": 0.7658666666666667, "grad_norm": 0.0036427255254238844, "kl": 0.007823944091796875, "learning_rate": 4.7244608099719e-07, "loss": -0.003, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 718 }, { "completion_length": 596.28125, "epoch": 0.7669333333333334, "grad_norm": 0.0025250730104744434, "kl": 0.009079933166503906, "learning_rate": 4.6838084262261776e-07, "loss": 0.026, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 719 }, { "completion_length": 510.5, "epoch": 0.768, "grad_norm": 0.005339978728443384, "kl": 0.008085250854492188, "learning_rate": 4.643299315195855e-07, "loss": -0.0281, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 720 }, { "completion_length": 561.78125, "epoch": 0.7690666666666667, "grad_norm": 0.006095587741583586, "kl": 0.0074329376220703125, "learning_rate": 4.6029340394771426e-07, "loss": 0.001, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 721 }, { "completion_length": 612.34375, "epoch": 0.7701333333333333, "grad_norm": 0.15021517872810364, "kl": 0.005501747131347656, "learning_rate": 4.562713159668648e-07, "loss": 0.0959, "reward": 0.375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 722 }, { "completion_length": 580.78125, "epoch": 0.7712, "grad_norm": 0.2567138373851776, "kl": 0.010372161865234375, "learning_rate": 4.522637234363593e-07, "loss": -0.0024, "reward": 0.71875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 723 }, { "completion_length": 462.46875, "epoch": 0.7722666666666667, "grad_norm": 0.16775673627853394, "kl": 0.00902557373046875, "learning_rate": 4.4827068201420486e-07, "loss": 0.0308, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 724 }, { "completion_length": 608.78125, "epoch": 0.7733333333333333, "grad_norm": 0.10104429721832275, "kl": 0.0026922225952148438, "learning_rate": 4.442922471563205e-07, "loss": 0.0766, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 725 }, { "completion_length": 639.78125, "epoch": 0.7744, "grad_norm": 0.002231909427791834, "kl": 0.008554458618164062, "learning_rate": 4.4032847411576785e-07, "loss": 0.0003, "reward": 0.375, "reward_std": 0.0, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 726 }, { "completion_length": 640.25, "epoch": 0.7754666666666666, "grad_norm": 0.07825721800327301, "kl": 0.0033969879150390625, "learning_rate": 4.3637941794198264e-07, "loss": 0.0348, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 727 }, { "completion_length": 703.25, "epoch": 0.7765333333333333, "grad_norm": 0.08200140297412872, "kl": 0.005702018737792969, "learning_rate": 4.3244513348001104e-07, "loss": 0.1603, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 728 }, { "completion_length": 512.46875, "epoch": 0.7776, "grad_norm": 0.0029304679483175278, "kl": 0.009218215942382812, "learning_rate": 4.2852567536974705e-07, "loss": 0.0193, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 729 }, { "completion_length": 513.03125, "epoch": 0.7786666666666666, "grad_norm": 0.07975484430789948, "kl": 0.01204681396484375, "learning_rate": 4.24621098045175e-07, "loss": -0.0167, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 730 }, { "completion_length": 772.78125, "epoch": 0.7797333333333333, "grad_norm": 0.0527532659471035, "kl": 0.005401611328125, "learning_rate": 4.2073145573361197e-07, "loss": 0.0132, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 731 }, { "completion_length": 478.84375, "epoch": 0.7808, "grad_norm": 0.009066343307495117, "kl": 0.01059722900390625, "learning_rate": 4.168568024549562e-07, "loss": 0.0004, "reward": 0.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 732 }, { "completion_length": 724.15625, "epoch": 0.7818666666666667, "grad_norm": 0.08316764235496521, "kl": 0.0051021575927734375, "learning_rate": 4.129971920209359e-07, "loss": 0.0428, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 733 }, { "completion_length": 632.375, "epoch": 0.7829333333333334, "grad_norm": 0.16768662631511688, "kl": 0.005443572998046875, "learning_rate": 4.0915267803436186e-07, "loss": 0.0762, "reward": 0.59375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 734 }, { "completion_length": 532.53125, "epoch": 0.784, "grad_norm": 0.13082166016101837, "kl": 0.0067310333251953125, "learning_rate": 4.053233138883835e-07, "loss": 0.0752, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 735 }, { "completion_length": 609.65625, "epoch": 0.7850666666666667, "grad_norm": 0.11487575620412827, "kl": 0.004665374755859375, "learning_rate": 4.015091527657472e-07, "loss": 0.0208, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 736 }, { "completion_length": 547.21875, "epoch": 0.7861333333333334, "grad_norm": 0.28817206621170044, "kl": 0.008367538452148438, "learning_rate": 3.977102476380576e-07, "loss": 0.0284, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 737 }, { "completion_length": 562.0625, "epoch": 0.7872, "grad_norm": 0.148079976439476, "kl": 0.0074291229248046875, "learning_rate": 3.9392665126504196e-07, "loss": -0.0166, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 738 }, { "completion_length": 634.4375, "epoch": 0.7882666666666667, "grad_norm": 0.18064576387405396, "kl": 0.005580902099609375, "learning_rate": 3.901584161938172e-07, "loss": 0.0784, "reward": 0.53125, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 739 }, { "completion_length": 743.25, "epoch": 0.7893333333333333, "grad_norm": 0.010317322798073292, "kl": 0.006394386291503906, "learning_rate": 3.864055947581605e-07, "loss": 0.0615, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 740 }, { "completion_length": 618.96875, "epoch": 0.7904, "grad_norm": 0.21956171095371246, "kl": 0.0065212249755859375, "learning_rate": 3.8266823907778244e-07, "loss": 0.0518, "reward": 0.65625, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 741 }, { "completion_length": 501.625, "epoch": 0.7914666666666667, "grad_norm": 0.003784902859479189, "kl": 0.0068149566650390625, "learning_rate": 3.7894640105760217e-07, "loss": -0.0514, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 742 }, { "completion_length": 551.9375, "epoch": 0.7925333333333333, "grad_norm": 0.14260245859622955, "kl": 0.0075511932373046875, "learning_rate": 3.7524013238702907e-07, "loss": -0.1014, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 743 }, { "completion_length": 633.28125, "epoch": 0.7936, "grad_norm": 0.004009216092526913, "kl": 0.0060577392578125, "learning_rate": 3.715494845392418e-07, "loss": -0.0161, "reward": 0.46875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 744 }, { "completion_length": 658.59375, "epoch": 0.7946666666666666, "grad_norm": 0.006183084100484848, "kl": 0.008142471313476562, "learning_rate": 3.6787450877047543e-07, "loss": 0.0151, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 745 }, { "completion_length": 569.25, "epoch": 0.7957333333333333, "grad_norm": 0.09155230969190598, "kl": 0.009784698486328125, "learning_rate": 3.6421525611930873e-07, "loss": -0.0367, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 746 }, { "completion_length": 539.125, "epoch": 0.7968, "grad_norm": 0.004530595615506172, "kl": 0.008974075317382812, "learning_rate": 3.6057177740595546e-07, "loss": 0.0014, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 747 }, { "completion_length": 690.78125, "epoch": 0.7978666666666666, "grad_norm": 0.0027653155848383904, "kl": 0.007171630859375, "learning_rate": 3.569441232315594e-07, "loss": 0.1021, "reward": 0.4375, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 748 }, { "completion_length": 496.96875, "epoch": 0.7989333333333334, "grad_norm": 0.12031655013561249, "kl": 0.010082244873046875, "learning_rate": 3.5333234397748987e-07, "loss": 0.0152, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 749 }, { "completion_length": 577.28125, "epoch": 0.8, "grad_norm": 0.11285604536533356, "kl": 0.007984161376953125, "learning_rate": 3.4973648980464454e-07, "loss": -0.0178, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 750 }, { "completion_length": 521.90625, "epoch": 0.8010666666666667, "grad_norm": 0.20899592339992523, "kl": 0.0071964263916015625, "learning_rate": 3.4615661065275007e-07, "loss": 0.1318, "reward": 0.65625, "reward_std": 0.5096687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 751 }, { "completion_length": 528.03125, "epoch": 0.8021333333333334, "grad_norm": 0.2387075424194336, "kl": 0.015249252319335938, "learning_rate": 3.425927562396702e-07, "loss": -0.0112, "reward": 0.75, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 752 }, { "completion_length": 603.03125, "epoch": 0.8032, "grad_norm": 0.17253021895885468, "kl": 0.006877899169921875, "learning_rate": 3.3904497606071473e-07, "loss": 0.0081, "reward": 0.53125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 753 }, { "completion_length": 519.59375, "epoch": 0.8042666666666667, "grad_norm": 0.003982819616794586, "kl": 0.007961273193359375, "learning_rate": 3.3551331938795246e-07, "loss": 0.0669, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 754 }, { "completion_length": 643.65625, "epoch": 0.8053333333333333, "grad_norm": 0.003070240607485175, "kl": 0.00600433349609375, "learning_rate": 3.3199783526952656e-07, "loss": 0.0356, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 755 }, { "completion_length": 606.5625, "epoch": 0.8064, "grad_norm": 0.08456442505121231, "kl": 0.0045757293701171875, "learning_rate": 3.284985725289734e-07, "loss": -0.0152, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 756 }, { "completion_length": 509.59375, "epoch": 0.8074666666666667, "grad_norm": 0.00652235746383667, "kl": 0.01210784912109375, "learning_rate": 3.25015579764545e-07, "loss": -0.0144, "reward": 0.84375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 757 }, { "completion_length": 565.8125, "epoch": 0.8085333333333333, "grad_norm": 0.07438244670629501, "kl": 0.00882720947265625, "learning_rate": 3.2154890534853295e-07, "loss": 0.0565, "reward": 0.71875, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 758 }, { "completion_length": 614.59375, "epoch": 0.8096, "grad_norm": 0.0016918244073167443, "kl": 0.004940032958984375, "learning_rate": 3.1809859742659784e-07, "loss": 0.0041, "reward": 0.40625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 759 }, { "completion_length": 701.03125, "epoch": 0.8106666666666666, "grad_norm": 0.07620763033628464, "kl": 0.0043735504150390625, "learning_rate": 3.146647039171002e-07, "loss": 0.0316, "reward": 0.46875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 760 }, { "completion_length": 626.0625, "epoch": 0.8117333333333333, "grad_norm": 0.09237799793481827, "kl": 0.0064983367919921875, "learning_rate": 3.112472725104345e-07, "loss": 0.0716, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 761 }, { "completion_length": 572.65625, "epoch": 0.8128, "grad_norm": 0.004011401906609535, "kl": 0.005825042724609375, "learning_rate": 3.078463506683674e-07, "loss": 0.0008, "reward": 0.75, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 762 }, { "completion_length": 598.65625, "epoch": 0.8138666666666666, "grad_norm": 0.10553431510925293, "kl": 0.011697769165039062, "learning_rate": 3.0446198562337857e-07, "loss": 0.0385, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 763 }, { "completion_length": 674.59375, "epoch": 0.8149333333333333, "grad_norm": 0.05300498381257057, "kl": 0.0069103240966796875, "learning_rate": 3.0109422437800415e-07, "loss": 0.0139, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 764 }, { "completion_length": 496.75, "epoch": 0.816, "grad_norm": 0.1415751576423645, "kl": 0.008638381958007812, "learning_rate": 2.977431137041848e-07, "loss": 0.04, "reward": 0.875, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 765 }, { "completion_length": 484.96875, "epoch": 0.8170666666666667, "grad_norm": 0.10078023374080658, "kl": 0.012969970703125, "learning_rate": 2.944087001426154e-07, "loss": 0.0099, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 766 }, { "completion_length": 681.90625, "epoch": 0.8181333333333334, "grad_norm": 0.06611615419387817, "kl": 0.0070343017578125, "learning_rate": 2.9109103000209945e-07, "loss": 0.1228, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 767 }, { "completion_length": 541.3125, "epoch": 0.8192, "grad_norm": 0.13726595044136047, "kl": 0.007450103759765625, "learning_rate": 2.877901493589048e-07, "loss": 0.0234, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 768 }, { "completion_length": 628.8125, "epoch": 0.8202666666666667, "grad_norm": 0.07682011276483536, "kl": 0.0055084228515625, "learning_rate": 2.8450610405612504e-07, "loss": 0.0224, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 769 }, { "completion_length": 629.59375, "epoch": 0.8213333333333334, "grad_norm": 0.1563805490732193, "kl": 0.0074939727783203125, "learning_rate": 2.8123893970304154e-07, "loss": 0.0026, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 770 }, { "completion_length": 467.65625, "epoch": 0.8224, "grad_norm": 0.006879034917801619, "kl": 0.0088653564453125, "learning_rate": 2.779887016744915e-07, "loss": -0.0291, "reward": 0.875, "reward_std": 0.125, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 771 }, { "completion_length": 543.125, "epoch": 0.8234666666666667, "grad_norm": 0.12483768910169601, "kl": 0.0073032379150390625, "learning_rate": 2.7475543511023627e-07, "loss": 0.0302, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 772 }, { "completion_length": 486.71875, "epoch": 0.8245333333333333, "grad_norm": 0.2511114776134491, "kl": 0.011397361755371094, "learning_rate": 2.715391849143354e-07, "loss": -0.0205, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 773 }, { "completion_length": 557.78125, "epoch": 0.8256, "grad_norm": 0.08086814731359482, "kl": 0.008451461791992188, "learning_rate": 2.6833999575452256e-07, "loss": 0.075, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 774 }, { "completion_length": 669.09375, "epoch": 0.8266666666666667, "grad_norm": 0.0029153493233025074, "kl": 0.008945465087890625, "learning_rate": 2.651579120615855e-07, "loss": 0.005, "reward": 0.46875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 775 }, { "completion_length": 639.21875, "epoch": 0.8277333333333333, "grad_norm": 0.09505913406610489, "kl": 0.006107330322265625, "learning_rate": 2.6199297802874865e-07, "loss": 0.0134, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 776 }, { "completion_length": 516.0, "epoch": 0.8288, "grad_norm": 0.010082271881401539, "kl": 0.025806427001953125, "learning_rate": 2.5884523761106026e-07, "loss": 0.0107, "reward": 0.6875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 777 }, { "completion_length": 621.90625, "epoch": 0.8298666666666666, "grad_norm": 0.13697364926338196, "kl": 0.00799560546875, "learning_rate": 2.5571473452478045e-07, "loss": 0.0899, "reward": 0.46875, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 778 }, { "completion_length": 691.28125, "epoch": 0.8309333333333333, "grad_norm": 0.005412078462541103, "kl": 0.00347900390625, "learning_rate": 2.526015122467751e-07, "loss": 0.0122, "reward": 0.34375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 779 }, { "completion_length": 590.03125, "epoch": 0.832, "grad_norm": 0.05748090520501137, "kl": 0.007892608642578125, "learning_rate": 2.495056140139119e-07, "loss": 0.0285, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 780 }, { "completion_length": 739.8125, "epoch": 0.8330666666666666, "grad_norm": 0.00243077683262527, "kl": 0.00597381591796875, "learning_rate": 2.464270828224597e-07, "loss": 0.0601, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 781 }, { "completion_length": 765.1875, "epoch": 0.8341333333333333, "grad_norm": 0.0823466032743454, "kl": 0.0046520233154296875, "learning_rate": 2.433659614274909e-07, "loss": -0.0265, "reward": 0.53125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 782 }, { "completion_length": 686.6875, "epoch": 0.8352, "grad_norm": 0.0014736105222254992, "kl": 0.00667572021484375, "learning_rate": 2.403222923422895e-07, "loss": -0.0048, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 783 }, { "completion_length": 581.34375, "epoch": 0.8362666666666667, "grad_norm": 0.004039286635816097, "kl": 0.009975433349609375, "learning_rate": 2.372961178377585e-07, "loss": 0.0846, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 784 }, { "completion_length": 690.84375, "epoch": 0.8373333333333334, "grad_norm": 0.07407344877719879, "kl": 0.00466156005859375, "learning_rate": 2.3428747994183364e-07, "loss": 0.0364, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 785 }, { "completion_length": 571.1875, "epoch": 0.8384, "grad_norm": 0.11089657992124557, "kl": 0.014995574951171875, "learning_rate": 2.312964204389e-07, "loss": 0.0172, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 786 }, { "completion_length": 587.9375, "epoch": 0.8394666666666667, "grad_norm": 0.14618657529354095, "kl": 0.009794235229492188, "learning_rate": 2.2832298086921127e-07, "loss": 0.0336, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 787 }, { "completion_length": 558.40625, "epoch": 0.8405333333333334, "grad_norm": 0.10964526981115341, "kl": 0.0075626373291015625, "learning_rate": 2.2536720252831367e-07, "loss": 0.0167, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 788 }, { "completion_length": 505.15625, "epoch": 0.8416, "grad_norm": 0.003082748269662261, "kl": 0.009204864501953125, "learning_rate": 2.2242912646647086e-07, "loss": -0.0154, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 789 }, { "completion_length": 661.6875, "epoch": 0.8426666666666667, "grad_norm": 0.07762983441352844, "kl": 0.008485794067382812, "learning_rate": 2.1950879348809548e-07, "loss": -0.0071, "reward": 0.71875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 790 }, { "completion_length": 763.0, "epoch": 0.8437333333333333, "grad_norm": 0.003289029933512211, "kl": 0.0042476654052734375, "learning_rate": 2.1660624415118158e-07, "loss": 0.0331, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 791 }, { "completion_length": 651.4375, "epoch": 0.8448, "grad_norm": 0.006697571370750666, "kl": 0.00522613525390625, "learning_rate": 2.1372151876674112e-07, "loss": 0.0002, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 792 }, { "completion_length": 685.71875, "epoch": 0.8458666666666667, "grad_norm": 0.08910330384969711, "kl": 0.0056095123291015625, "learning_rate": 2.1085465739824516e-07, "loss": -0.0287, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 793 }, { "completion_length": 549.03125, "epoch": 0.8469333333333333, "grad_norm": 0.0027012338396161795, "kl": 0.005107879638671875, "learning_rate": 2.080056998610662e-07, "loss": 0.0033, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 794 }, { "completion_length": 564.1875, "epoch": 0.848, "grad_norm": 0.35098379850387573, "kl": 0.005840301513671875, "learning_rate": 2.0517468572192632e-07, "loss": 0.0672, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 795 }, { "completion_length": 565.3125, "epoch": 0.8490666666666666, "grad_norm": 0.0030043257866054773, "kl": 0.0060901641845703125, "learning_rate": 2.023616542983466e-07, "loss": 0.0363, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 796 }, { "completion_length": 596.3125, "epoch": 0.8501333333333333, "grad_norm": 0.0034999202471226454, "kl": 0.006389617919921875, "learning_rate": 1.995666446581023e-07, "loss": -0.0212, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 797 }, { "completion_length": 682.5625, "epoch": 0.8512, "grad_norm": 0.20963862538337708, "kl": 0.008108139038085938, "learning_rate": 1.9678969561867894e-07, "loss": 0.1091, "reward": 0.40625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 798 }, { "completion_length": 633.0, "epoch": 0.8522666666666666, "grad_norm": 0.16772130131721497, "kl": 0.0058460235595703125, "learning_rate": 1.9403084574673463e-07, "loss": 0.0073, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 799 }, { "completion_length": 589.625, "epoch": 0.8533333333333334, "grad_norm": 0.002160250674933195, "kl": 0.0067310333251953125, "learning_rate": 1.9129013335756317e-07, "loss": 0.0003, "reward": 0.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 800 }, { "completion_length": 603.4375, "epoch": 0.8544, "grad_norm": 0.11980433762073517, "kl": 0.0064220428466796875, "learning_rate": 1.8856759651456234e-07, "loss": 0.0298, "reward": 0.75, "reward_std": 0.375, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 801 }, { "completion_length": 504.15625, "epoch": 0.8554666666666667, "grad_norm": 0.1250668168067932, "kl": 0.0103607177734375, "learning_rate": 1.8586327302870599e-07, "loss": 0.0671, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 802 }, { "completion_length": 536.90625, "epoch": 0.8565333333333334, "grad_norm": 0.08011102676391602, "kl": 0.0074825286865234375, "learning_rate": 1.8317720045801778e-07, "loss": 0.0901, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 803 }, { "completion_length": 642.46875, "epoch": 0.8576, "grad_norm": 0.003025019308552146, "kl": 0.006636619567871094, "learning_rate": 1.8050941610705053e-07, "loss": -0.0102, "reward": 0.625, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 804 }, { "completion_length": 732.15625, "epoch": 0.8586666666666667, "grad_norm": 0.0868997648358345, "kl": 0.0054607391357421875, "learning_rate": 1.7785995702636698e-07, "loss": -0.0361, "reward": 0.53125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 805 }, { "completion_length": 527.90625, "epoch": 0.8597333333333333, "grad_norm": 0.21921825408935547, "kl": 0.010009765625, "learning_rate": 1.7522886001202687e-07, "loss": 0.126, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 806 }, { "completion_length": 686.1875, "epoch": 0.8608, "grad_norm": 0.13464634120464325, "kl": 0.006175041198730469, "learning_rate": 1.7261616160507403e-07, "loss": 0.0806, "reward": 0.46875, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 807 }, { "completion_length": 511.25, "epoch": 0.8618666666666667, "grad_norm": 0.008266044780611992, "kl": 0.0106658935546875, "learning_rate": 1.700218980910311e-07, "loss": 0.031, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 808 }, { "completion_length": 534.28125, "epoch": 0.8629333333333333, "grad_norm": 0.0028818491846323013, "kl": 0.011322021484375, "learning_rate": 1.6744610549939322e-07, "loss": 0.0586, "reward": 0.96875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 0.0, "step": 809 }, { "completion_length": 676.53125, "epoch": 0.864, "grad_norm": 0.054974932223558426, "kl": 0.007770538330078125, "learning_rate": 1.64888819603129e-07, "loss": 0.0078, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 810 }, { "completion_length": 533.75, "epoch": 0.8650666666666667, "grad_norm": 0.002682663034647703, "kl": 0.0067043304443359375, "learning_rate": 1.6235007591818385e-07, "loss": 0.0945, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 811 }, { "completion_length": 708.15625, "epoch": 0.8661333333333333, "grad_norm": 0.018177397549152374, "kl": 0.006298065185546875, "learning_rate": 1.598299097029859e-07, "loss": -0.0017, "reward": 0.59375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 812 }, { "completion_length": 536.53125, "epoch": 0.8672, "grad_norm": 0.162200465798378, "kl": 0.0058574676513671875, "learning_rate": 1.573283559579572e-07, "loss": 0.1222, "reward": 0.71875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 813 }, { "completion_length": 511.125, "epoch": 0.8682666666666666, "grad_norm": 0.0979967787861824, "kl": 0.007936477661132812, "learning_rate": 1.5484544942502694e-07, "loss": -0.0082, "reward": 0.84375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 814 }, { "completion_length": 419.71875, "epoch": 0.8693333333333333, "grad_norm": 0.002360833575949073, "kl": 0.0063686370849609375, "learning_rate": 1.5238122458714925e-07, "loss": 0.0327, "reward": 0.9375, "reward_std": 0.125, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 815 }, { "completion_length": 499.3125, "epoch": 0.8704, "grad_norm": 0.00411321222782135, "kl": 0.0078029632568359375, "learning_rate": 1.4993571566782404e-07, "loss": 0.0006, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 816 }, { "completion_length": 514.0625, "epoch": 0.8714666666666666, "grad_norm": 0.0898655578494072, "kl": 0.0062408447265625, "learning_rate": 1.475089566306226e-07, "loss": -0.0134, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 817 }, { "completion_length": 769.40625, "epoch": 0.8725333333333334, "grad_norm": 0.0998729020357132, "kl": 0.007190704345703125, "learning_rate": 1.4510098117871462e-07, "loss": 0.0705, "reward": 0.5625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 818 }, { "completion_length": 722.03125, "epoch": 0.8736, "grad_norm": 0.07927057147026062, "kl": 0.00799560546875, "learning_rate": 1.4271182275440077e-07, "loss": 0.0352, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 819 }, { "completion_length": 621.03125, "epoch": 0.8746666666666667, "grad_norm": 0.09437448531389236, "kl": 0.03481006622314453, "learning_rate": 1.4034151453864846e-07, "loss": -0.0186, "reward": 0.46875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 820 }, { "completion_length": 708.53125, "epoch": 0.8757333333333334, "grad_norm": 0.004353997763246298, "kl": 0.0073413848876953125, "learning_rate": 1.3799008945063046e-07, "loss": 0.0025, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 821 }, { "completion_length": 659.28125, "epoch": 0.8768, "grad_norm": 0.10449235141277313, "kl": 0.0045261383056640625, "learning_rate": 1.3565758014726843e-07, "loss": 0.0243, "reward": 0.625, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 822 }, { "completion_length": 598.71875, "epoch": 0.8778666666666667, "grad_norm": 0.003382681868970394, "kl": 0.006282806396484375, "learning_rate": 1.3334401902277849e-07, "loss": 0.0182, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 823 }, { "completion_length": 582.0625, "epoch": 0.8789333333333333, "grad_norm": 0.07301092147827148, "kl": 0.005721092224121094, "learning_rate": 1.3104943820822195e-07, "loss": 0.021, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 824 }, { "completion_length": 630.5625, "epoch": 0.88, "grad_norm": 0.05437426641583443, "kl": 0.011182785034179688, "learning_rate": 1.287738695710592e-07, "loss": 0.033, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 825 }, { "completion_length": 613.59375, "epoch": 0.8810666666666667, "grad_norm": 0.09001360088586807, "kl": 0.009222030639648438, "learning_rate": 1.265173447147064e-07, "loss": 0.0431, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 826 }, { "completion_length": 596.8125, "epoch": 0.8821333333333333, "grad_norm": 0.16213977336883545, "kl": 0.008136749267578125, "learning_rate": 1.2427989497809733e-07, "loss": -0.0349, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 827 }, { "completion_length": 678.90625, "epoch": 0.8832, "grad_norm": 0.002098598051816225, "kl": 0.0055942535400390625, "learning_rate": 1.220615514352479e-07, "loss": 0.0121, "reward": 0.71875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 828 }, { "completion_length": 637.71875, "epoch": 0.8842666666666666, "grad_norm": 0.12294886261224747, "kl": 0.005770683288574219, "learning_rate": 1.19862344894824e-07, "loss": 0.0885, "reward": 0.78125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 829 }, { "completion_length": 690.875, "epoch": 0.8853333333333333, "grad_norm": 0.13096478581428528, "kl": 0.008678436279296875, "learning_rate": 1.1768230589971457e-07, "loss": 0.0209, "reward": 0.3125, "reward_std": 0.25, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 830 }, { "completion_length": 599.375, "epoch": 0.8864, "grad_norm": 0.09292510896921158, "kl": 0.012037277221679688, "learning_rate": 1.1552146472660724e-07, "loss": 0.0213, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 831 }, { "completion_length": 670.03125, "epoch": 0.8874666666666666, "grad_norm": 0.003002967219799757, "kl": 0.004879951477050781, "learning_rate": 1.1337985138556695e-07, "loss": 0.0176, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 832 }, { "completion_length": 603.3125, "epoch": 0.8885333333333333, "grad_norm": 0.0035216340329498053, "kl": 0.006328582763671875, "learning_rate": 1.1125749561962023e-07, "loss": 0.0978, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 833 }, { "completion_length": 467.9375, "epoch": 0.8896, "grad_norm": 0.007442415691912174, "kl": 0.0081787109375, "learning_rate": 1.0915442690434158e-07, "loss": 0.0003, "reward": 0.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 834 }, { "completion_length": 572.84375, "epoch": 0.8906666666666667, "grad_norm": 0.06543497741222382, "kl": 0.0073394775390625, "learning_rate": 1.0707067444744439e-07, "loss": 0.0527, "reward": 0.875, "reward_std": 0.125, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 835 }, { "completion_length": 608.6875, "epoch": 0.8917333333333334, "grad_norm": 0.0033548774663358927, "kl": 0.008630752563476562, "learning_rate": 1.0500626718837453e-07, "loss": 0.0849, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 836 }, { "completion_length": 437.28125, "epoch": 0.8928, "grad_norm": 0.10637304186820984, "kl": 0.008941650390625, "learning_rate": 1.0296123379791039e-07, "loss": -0.0029, "reward": 0.84375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 837 }, { "completion_length": 494.375, "epoch": 0.8938666666666667, "grad_norm": 0.003050882602110505, "kl": 0.01534271240234375, "learning_rate": 1.009356026777618e-07, "loss": 0.0313, "reward": 0.625, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 838 }, { "completion_length": 645.53125, "epoch": 0.8949333333333334, "grad_norm": 0.004718273412436247, "kl": 0.0098419189453125, "learning_rate": 9.89294019601783e-08, "loss": 0.0314, "reward": 0.4375, "reward_std": 0.125, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 839 }, { "completion_length": 655.1875, "epoch": 0.896, "grad_norm": 0.07143094390630722, "kl": 0.009172439575195312, "learning_rate": 9.69426595075566e-08, "loss": 0.0113, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 840 }, { "completion_length": 526.21875, "epoch": 0.8970666666666667, "grad_norm": 0.11427925527095795, "kl": 0.0058422088623046875, "learning_rate": 9.497540291205459e-08, "loss": 0.1414, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 841 }, { "completion_length": 716.625, "epoch": 0.8981333333333333, "grad_norm": 0.11895410716533661, "kl": 0.06463432312011719, "learning_rate": 9.302765949520765e-08, "loss": -0.0218, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 842 }, { "completion_length": 519.28125, "epoch": 0.8992, "grad_norm": 0.005462268833070993, "kl": 0.012447357177734375, "learning_rate": 9.109945630754974e-08, "loss": 0.0369, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 843 }, { "completion_length": 785.03125, "epoch": 0.9002666666666667, "grad_norm": 0.05187647044658661, "kl": 0.007007598876953125, "learning_rate": 8.919082012823675e-08, "loss": -0.0086, "reward": 0.34375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 844 }, { "completion_length": 672.625, "epoch": 0.9013333333333333, "grad_norm": 0.11939622461795807, "kl": 0.0058345794677734375, "learning_rate": 8.730177746467616e-08, "loss": 0.0459, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 845 }, { "completion_length": 771.90625, "epoch": 0.9024, "grad_norm": 0.07743582129478455, "kl": 0.00579833984375, "learning_rate": 8.543235455215687e-08, "loss": 0.0174, "reward": 0.40625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 846 }, { "completion_length": 574.21875, "epoch": 0.9034666666666666, "grad_norm": 0.09704414010047913, "kl": 0.0060062408447265625, "learning_rate": 8.358257735348695e-08, "loss": 0.0294, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 847 }, { "completion_length": 576.5625, "epoch": 0.9045333333333333, "grad_norm": 0.0939788892865181, "kl": 0.0056133270263671875, "learning_rate": 8.175247155863124e-08, "loss": -0.0218, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 848 }, { "completion_length": 495.15625, "epoch": 0.9056, "grad_norm": 0.002678000135347247, "kl": 0.0075054168701171875, "learning_rate": 7.994206258435576e-08, "loss": 0.0216, "reward": 0.9375, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 849 }, { "completion_length": 534.65625, "epoch": 0.9066666666666666, "grad_norm": 0.005566445644944906, "kl": 0.008861541748046875, "learning_rate": 7.81513755738742e-08, "loss": 0.0226, "reward": 0.53125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 850 }, { "completion_length": 628.90625, "epoch": 0.9077333333333333, "grad_norm": 0.004998371005058289, "kl": 0.005831718444824219, "learning_rate": 7.638043539649897e-08, "loss": 0.0391, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 851 }, { "completion_length": 646.6875, "epoch": 0.9088, "grad_norm": 0.004401471000164747, "kl": 0.00722503662109375, "learning_rate": 7.462926664729592e-08, "loss": 0.0169, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 852 }, { "completion_length": 440.59375, "epoch": 0.9098666666666667, "grad_norm": 0.11410848796367645, "kl": 0.009748458862304688, "learning_rate": 7.289789364674165e-08, "loss": -0.0094, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 853 }, { "completion_length": 688.59375, "epoch": 0.9109333333333334, "grad_norm": 0.15487860143184662, "kl": 0.00606536865234375, "learning_rate": 7.118634044038774e-08, "loss": 0.0073, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 854 }, { "completion_length": 516.65625, "epoch": 0.912, "grad_norm": 0.027837609872221947, "kl": 0.012132644653320312, "learning_rate": 6.949463079852491e-08, "loss": -0.0034, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 855 }, { "completion_length": 503.84375, "epoch": 0.9130666666666667, "grad_norm": 0.004226836375892162, "kl": 0.007259368896484375, "learning_rate": 6.782278821585386e-08, "loss": 0.0113, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 856 }, { "completion_length": 580.78125, "epoch": 0.9141333333333334, "grad_norm": 0.004731197375804186, "kl": 0.006359100341796875, "learning_rate": 6.617083591115897e-08, "loss": 0.0048, "reward": 0.78125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 857 }, { "completion_length": 586.25, "epoch": 0.9152, "grad_norm": 0.1929353028535843, "kl": 0.009121894836425781, "learning_rate": 6.453879682698543e-08, "loss": -0.0008, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 858 }, { "completion_length": 486.59375, "epoch": 0.9162666666666667, "grad_norm": 0.1762266308069229, "kl": 0.017578125, "learning_rate": 6.292669362932102e-08, "loss": 0.0542, "reward": 0.6875, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 859 }, { "completion_length": 597.34375, "epoch": 0.9173333333333333, "grad_norm": 0.08684267103672028, "kl": 0.0075283050537109375, "learning_rate": 6.133454870728111e-08, "loss": -0.0109, "reward": 0.84375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 860 }, { "completion_length": 573.96875, "epoch": 0.9184, "grad_norm": 0.1111418753862381, "kl": 0.011514663696289062, "learning_rate": 5.97623841727975e-08, "loss": 0.0318, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 861 }, { "completion_length": 716.375, "epoch": 0.9194666666666667, "grad_norm": 0.08242709934711456, "kl": 0.004878997802734375, "learning_rate": 5.8210221860311774e-08, "loss": 0.0403, "reward": 0.40625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 862 }, { "completion_length": 603.78125, "epoch": 0.9205333333333333, "grad_norm": 0.22665412724018097, "kl": 0.005207061767578125, "learning_rate": 5.6678083326472064e-08, "loss": 0.1211, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 863 }, { "completion_length": 578.78125, "epoch": 0.9216, "grad_norm": 0.16141948103904724, "kl": 0.015872955322265625, "learning_rate": 5.516598984983279e-08, "loss": 0.0655, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 864 }, { "completion_length": 592.34375, "epoch": 0.9226666666666666, "grad_norm": 0.07737910747528076, "kl": 0.010690689086914062, "learning_rate": 5.367396243056022e-08, "loss": 0.0843, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 865 }, { "completion_length": 521.1875, "epoch": 0.9237333333333333, "grad_norm": 0.1283794343471527, "kl": 0.0081634521484375, "learning_rate": 5.2202021790140884e-08, "loss": 0.0236, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 866 }, { "completion_length": 740.84375, "epoch": 0.9248, "grad_norm": 0.07489572465419769, "kl": 0.0056514739990234375, "learning_rate": 5.075018837109263e-08, "loss": 0.0638, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 867 }, { "completion_length": 660.625, "epoch": 0.9258666666666666, "grad_norm": 0.16480883955955505, "kl": 0.006222724914550781, "learning_rate": 4.9318482336681515e-08, "loss": 0.0457, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 868 }, { "completion_length": 548.53125, "epoch": 0.9269333333333334, "grad_norm": 0.001837251358665526, "kl": 0.0061798095703125, "learning_rate": 4.7906923570641695e-08, "loss": 0.0344, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 869 }, { "completion_length": 656.84375, "epoch": 0.928, "grad_norm": 0.1044219359755516, "kl": 0.00885772705078125, "learning_rate": 4.6515531676899316e-08, "loss": -0.0184, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 870 }, { "completion_length": 728.90625, "epoch": 0.9290666666666667, "grad_norm": 0.11431588977575302, "kl": 0.00390625, "learning_rate": 4.514432597930007e-08, "loss": 0.0671, "reward": 0.625, "reward_std": 0.375, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 871 }, { "completion_length": 593.53125, "epoch": 0.9301333333333334, "grad_norm": 0.11885813623666763, "kl": 0.004863739013671875, "learning_rate": 4.379332552134124e-08, "loss": 0.0197, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 872 }, { "completion_length": 623.03125, "epoch": 0.9312, "grad_norm": 0.0018202133942395449, "kl": 0.0054187774658203125, "learning_rate": 4.246254906590641e-08, "loss": 0.0145, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 873 }, { "completion_length": 562.53125, "epoch": 0.9322666666666667, "grad_norm": 0.005210235249251127, "kl": 0.008880615234375, "learning_rate": 4.115201509500582e-08, "loss": -0.041, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 874 }, { "completion_length": 614.28125, "epoch": 0.9333333333333333, "grad_norm": 0.14878682792186737, "kl": 0.009272575378417969, "learning_rate": 3.986174180951896e-08, "loss": 0.0559, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 875 }, { "completion_length": 561.90625, "epoch": 0.9344, "grad_norm": 0.06940197199583054, "kl": 0.0061855316162109375, "learning_rate": 3.8591747128942033e-08, "loss": 0.0038, "reward": 0.78125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 876 }, { "completion_length": 559.34375, "epoch": 0.9354666666666667, "grad_norm": 0.16986717283725739, "kl": 0.01032257080078125, "learning_rate": 3.734204869113955e-08, "loss": 0.0713, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 877 }, { "completion_length": 659.59375, "epoch": 0.9365333333333333, "grad_norm": 0.07494507730007172, "kl": 0.006107330322265625, "learning_rate": 3.611266385209849e-08, "loss": 0.0668, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 878 }, { "completion_length": 732.75, "epoch": 0.9376, "grad_norm": 0.11545497924089432, "kl": 0.004036903381347656, "learning_rate": 3.490360968568801e-08, "loss": 0.0101, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 879 }, { "completion_length": 458.5, "epoch": 0.9386666666666666, "grad_norm": 0.00638976925984025, "kl": 0.01093292236328125, "learning_rate": 3.3714902983421944e-08, "loss": 0.0152, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 880 }, { "completion_length": 628.40625, "epoch": 0.9397333333333333, "grad_norm": 0.08193977922201157, "kl": 0.00589752197265625, "learning_rate": 3.254656025422553e-08, "loss": 0.0837, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 881 }, { "completion_length": 714.1875, "epoch": 0.9408, "grad_norm": 0.06505157053470612, "kl": 0.0052356719970703125, "learning_rate": 3.1398597724206555e-08, "loss": -0.0051, "reward": 0.375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 882 }, { "completion_length": 518.75, "epoch": 0.9418666666666666, "grad_norm": 0.0023964536376297474, "kl": 0.00778961181640625, "learning_rate": 3.027103133642972e-08, "loss": -0.0051, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 883 }, { "completion_length": 562.4375, "epoch": 0.9429333333333333, "grad_norm": 0.10536618530750275, "kl": 0.008493423461914062, "learning_rate": 2.9163876750694986e-08, "loss": -0.0105, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 884 }, { "completion_length": 534.71875, "epoch": 0.944, "grad_norm": 0.1756277233362198, "kl": 0.0052318572998046875, "learning_rate": 2.807714934332073e-08, "loss": 0.1409, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 885 }, { "completion_length": 553.78125, "epoch": 0.9450666666666667, "grad_norm": 0.10607413202524185, "kl": 0.0072078704833984375, "learning_rate": 2.7010864206929443e-08, "loss": 0.0819, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 886 }, { "completion_length": 552.53125, "epoch": 0.9461333333333334, "grad_norm": 0.18058444559574127, "kl": 0.0071277618408203125, "learning_rate": 2.5965036150238706e-08, "loss": 0.1041, "reward": 0.71875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 887 }, { "completion_length": 699.875, "epoch": 0.9472, "grad_norm": 0.07983206957578659, "kl": 0.0066356658935546875, "learning_rate": 2.4939679697855212e-08, "loss": 0.0122, "reward": 0.375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 888 }, { "completion_length": 538.5625, "epoch": 0.9482666666666667, "grad_norm": 0.11233188211917877, "kl": 0.0059490203857421875, "learning_rate": 2.393480909007306e-08, "loss": 0.0079, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 889 }, { "completion_length": 561.8125, "epoch": 0.9493333333333334, "grad_norm": 0.10350865870714188, "kl": 0.007923126220703125, "learning_rate": 2.2950438282676455e-08, "loss": -0.0198, "reward": 0.78125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 890 }, { "completion_length": 646.75, "epoch": 0.9504, "grad_norm": 0.002868642332032323, "kl": 0.010474205017089844, "learning_rate": 2.1986580946744993e-08, "loss": 0.0096, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 891 }, { "completion_length": 640.09375, "epoch": 0.9514666666666667, "grad_norm": 0.1619413197040558, "kl": 0.01331329345703125, "learning_rate": 2.104325046846467e-08, "loss": -0.0517, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 892 }, { "completion_length": 488.875, "epoch": 0.9525333333333333, "grad_norm": 0.12944656610488892, "kl": 0.00604248046875, "learning_rate": 2.012045994894135e-08, "loss": 0.074, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 893 }, { "completion_length": 631.28125, "epoch": 0.9536, "grad_norm": 0.0607440322637558, "kl": 0.010813713073730469, "learning_rate": 1.9218222204019087e-08, "loss": 0.0279, "reward": 0.46875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 894 }, { "completion_length": 505.0625, "epoch": 0.9546666666666667, "grad_norm": 0.16310951113700867, "kl": 0.006206512451171875, "learning_rate": 1.8336549764102594e-08, "loss": 0.0713, "reward": 0.78125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 895 }, { "completion_length": 550.46875, "epoch": 0.9557333333333333, "grad_norm": 0.10113345086574554, "kl": 0.00696563720703125, "learning_rate": 1.7475454873982057e-08, "loss": 0.0434, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 896 }, { "completion_length": 573.84375, "epoch": 0.9568, "grad_norm": 0.015663795173168182, "kl": 0.010442733764648438, "learning_rate": 1.6634949492664253e-08, "loss": 0.1722, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 897 }, { "completion_length": 660.375, "epoch": 0.9578666666666666, "grad_norm": 0.09063471853733063, "kl": 0.0049152374267578125, "learning_rate": 1.5815045293205544e-08, "loss": 0.0444, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 898 }, { "completion_length": 479.15625, "epoch": 0.9589333333333333, "grad_norm": 0.003343922318890691, "kl": 0.008802413940429688, "learning_rate": 1.5015753662550813e-08, "loss": -0.0098, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 899 }, { "completion_length": 668.53125, "epoch": 0.96, "grad_norm": 0.06262471526861191, "kl": 0.0045318603515625, "learning_rate": 1.4237085701374109e-08, "loss": 0.0856, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 900 }, { "completion_length": 552.6875, "epoch": 0.9610666666666666, "grad_norm": 0.005250674206763506, "kl": 0.012552261352539062, "learning_rate": 1.3479052223925259e-08, "loss": 0.0495, "reward": 0.8125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 901 }, { "completion_length": 638.0, "epoch": 0.9621333333333333, "grad_norm": 0.06022271886467934, "kl": 0.008663177490234375, "learning_rate": 1.2741663757879496e-08, "loss": 0.0012, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 902 }, { "completion_length": 481.78125, "epoch": 0.9632, "grad_norm": 0.35478001832962036, "kl": 0.008554458618164062, "learning_rate": 1.2024930544191237e-08, "loss": 0.136, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 903 }, { "completion_length": 559.75, "epoch": 0.9642666666666667, "grad_norm": 0.004272103309631348, "kl": 0.006519317626953125, "learning_rate": 1.1328862536952033e-08, "loss": 0.0048, "reward": 0.65625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 904 }, { "completion_length": 701.625, "epoch": 0.9653333333333334, "grad_norm": 0.08515181392431259, "kl": 0.014339447021484375, "learning_rate": 1.0653469403252015e-08, "loss": 0.0414, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 905 }, { "completion_length": 581.5625, "epoch": 0.9664, "grad_norm": 0.0028841840103268623, "kl": 0.006646156311035156, "learning_rate": 9.998760523045492e-09, "loss": -0.0131, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 906 }, { "completion_length": 638.59375, "epoch": 0.9674666666666667, "grad_norm": 0.12667216360569, "kl": 0.008531570434570312, "learning_rate": 9.3647449890214e-09, "loss": -0.0201, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 907 }, { "completion_length": 488.0, "epoch": 0.9685333333333334, "grad_norm": 0.01387849822640419, "kl": 0.01461029052734375, "learning_rate": 8.751431606476234e-09, "loss": 0.15, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 908 }, { "completion_length": 574.09375, "epoch": 0.9696, "grad_norm": 0.16224469244480133, "kl": 0.012041091918945312, "learning_rate": 8.158828893192471e-09, "loss": 0.0935, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 909 }, { "completion_length": 676.78125, "epoch": 0.9706666666666667, "grad_norm": 0.00495490524917841, "kl": 0.0061244964599609375, "learning_rate": 7.586945079319673e-09, "loss": 0.0052, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 910 }, { "completion_length": 576.34375, "epoch": 0.9717333333333333, "grad_norm": 0.10055922716856003, "kl": 0.011476516723632812, "learning_rate": 7.035788107260244e-09, "loss": -0.0566, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 911 }, { "completion_length": 738.75, "epoch": 0.9728, "grad_norm": 0.07312820106744766, "kl": 0.0050201416015625, "learning_rate": 6.5053656315598455e-09, "loss": 0.0516, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 912 }, { "completion_length": 726.84375, "epoch": 0.9738666666666667, "grad_norm": 0.09372871369123459, "kl": 0.0065288543701171875, "learning_rate": 5.9956850187998235e-09, "loss": 0.0428, "reward": 0.46875, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 913 }, { "completion_length": 485.25, "epoch": 0.9749333333333333, "grad_norm": 0.1746622771024704, "kl": 0.0070590972900390625, "learning_rate": 5.506753347496285e-09, "loss": 0.0464, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 914 }, { "completion_length": 542.34375, "epoch": 0.976, "grad_norm": 0.08513789623975754, "kl": 0.005536079406738281, "learning_rate": 5.038577408000844e-09, "loss": -0.01, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 915 }, { "completion_length": 421.09375, "epoch": 0.9770666666666666, "grad_norm": 0.0060369400307536125, "kl": 0.008894920349121094, "learning_rate": 4.591163702406531e-09, "loss": -0.016, "reward": 0.96875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 0.0, "step": 916 }, { "completion_length": 539.8125, "epoch": 0.9781333333333333, "grad_norm": 0.08802180737257004, "kl": 0.0045413970947265625, "learning_rate": 4.1645184444575325e-09, "loss": 0.0998, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 917 }, { "completion_length": 611.375, "epoch": 0.9792, "grad_norm": 0.008430156856775284, "kl": 0.0075206756591796875, "learning_rate": 3.758647559463091e-09, "loss": 0.0528, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 918 }, { "completion_length": 609.59375, "epoch": 0.9802666666666666, "grad_norm": 0.0055547901429235935, "kl": 0.008502960205078125, "learning_rate": 3.37355668421524e-09, "loss": -0.0245, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 919 }, { "completion_length": 733.09375, "epoch": 0.9813333333333333, "grad_norm": 0.004653254523873329, "kl": 0.0074443817138671875, "learning_rate": 3.009251166909699e-09, "loss": 0.032, "reward": 0.375, "reward_std": 0.125, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 920 }, { "completion_length": 589.71875, "epoch": 0.9824, "grad_norm": 0.13699905574321747, "kl": 0.006884574890136719, "learning_rate": 2.665736067072766e-09, "loss": 0.0324, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 921 }, { "completion_length": 661.9375, "epoch": 0.9834666666666667, "grad_norm": 0.002555394545197487, "kl": 0.008514404296875, "learning_rate": 2.343016155490374e-09, "loss": 0.0629, "reward": 0.4375, "reward_std": 0.125, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 922 }, { "completion_length": 592.96875, "epoch": 0.9845333333333334, "grad_norm": 0.09909087419509888, "kl": 0.00714874267578125, "learning_rate": 2.041095914141644e-09, "loss": -0.0363, "reward": 0.75, "reward_std": 0.375, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 923 }, { "completion_length": 727.28125, "epoch": 0.9856, "grad_norm": 0.0029475067276507616, "kl": 0.0067882537841796875, "learning_rate": 1.7599795361376015e-09, "loss": 0.0084, "reward": 0.40625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 924 }, { "completion_length": 536.3125, "epoch": 0.9866666666666667, "grad_norm": 0.09156189113855362, "kl": 0.0075321197509765625, "learning_rate": 1.4996709256617225e-09, "loss": 0.0332, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 925 }, { "completion_length": 720.59375, "epoch": 0.9877333333333334, "grad_norm": 0.001600004849024117, "kl": 0.0053653717041015625, "learning_rate": 1.260173697916478e-09, "loss": -0.0118, "reward": 0.4375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 926 }, { "completion_length": 479.6875, "epoch": 0.9888, "grad_norm": 0.002138319192454219, "kl": 0.009784698486328125, "learning_rate": 1.0414911790730397e-09, "loss": 0.0235, "reward": 0.96875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 0.0, "step": 927 }, { "completion_length": 529.53125, "epoch": 0.9898666666666667, "grad_norm": 0.12831760942935944, "kl": 0.0067882537841796875, "learning_rate": 8.436264062248178e-10, "loss": -0.0308, "reward": 0.71875, "reward_std": 0.35117512941360474, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 928 }, { "completion_length": 626.625, "epoch": 0.9909333333333333, "grad_norm": 0.003438361920416355, "kl": 0.005381584167480469, "learning_rate": 6.665821273456607e-10, "loss": 0.0123, "reward": 0.6875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 929 }, { "completion_length": 589.59375, "epoch": 0.992, "grad_norm": 0.003927535377442837, "kl": 0.006459236145019531, "learning_rate": 5.103608012512195e-10, "loss": 0.0391, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 930 }, { "completion_length": 516.96875, "epoch": 0.9930666666666667, "grad_norm": 0.2046954482793808, "kl": 0.010519027709960938, "learning_rate": 3.749645975653082e-10, "loss": 0.0527, "reward": 0.71875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 931 }, { "completion_length": 667.59375, "epoch": 0.9941333333333333, "grad_norm": 0.056261733174324036, "kl": 0.008993148803710938, "learning_rate": 2.6039539668909486e-10, "loss": -0.017, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 932 }, { "completion_length": 615.4375, "epoch": 0.9952, "grad_norm": 0.0015674337046220899, "kl": 0.008129119873046875, "learning_rate": 1.666547897761217e-10, "loss": 0.0134, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 933 }, { "completion_length": 611.1875, "epoch": 0.9962666666666666, "grad_norm": 0.09837815165519714, "kl": 0.00637054443359375, "learning_rate": 9.374407870882396e-11, "loss": -0.0373, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 934 }, { "completion_length": 729.6875, "epoch": 0.9973333333333333, "grad_norm": 0.0021954781841486692, "kl": 0.00510406494140625, "learning_rate": 4.1664276081376796e-11, "loss": 0.0002, "reward": 0.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 935 }, { "completion_length": 559.21875, "epoch": 0.9984, "grad_norm": 0.22538290917873383, "kl": 0.011171340942382812, "learning_rate": 1.0416105185373503e-11, "loss": -0.0043, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 936 }, { "completion_length": 642.9375, "epoch": 0.9994666666666666, "grad_norm": 0.17377488315105438, "kl": 0.0067577362060546875, "learning_rate": 0.0, "loss": 0.1485, "reward": 0.6875, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 937 }, { "epoch": 0.9994666666666666, "eval_completion_length": 590.31625, "eval_kl": 0.009681316375732421, "eval_loss": 0.018218113109469414, "eval_reward": 0.5881, "eval_reward_std": 0.21186545033454895, "eval_rewards/accuracy_reward": 0.5881, "eval_rewards/format_reward": 0.0, "eval_runtime": 21141.4031, "eval_samples_per_second": 0.237, "eval_steps_per_second": 0.059, "step": 937 }, { "epoch": 0.9994666666666666, "step": 937, "total_flos": 0.0, "train_loss": 0.029535205167337087, "train_runtime": 60208.9204, "train_samples_per_second": 0.125, "train_steps_per_second": 0.016 } ], "logging_steps": 1, "max_steps": 937, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }