{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.997333333333333, "eval_steps": 3000, "global_step": 2811, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 687.15625, "epoch": 0.0010666666666666667, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.0638297872340425e-08, "loss": -0.0, "reward": 0.4375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1 }, { "completion_length": 770.46875, "epoch": 0.0021333333333333334, "grad_norm": 0.01287431176751852, "kl": 0.0, "learning_rate": 2.127659574468085e-08, "loss": -0.0, "reward": 0.28125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 2 }, { "completion_length": 740.5625, "epoch": 0.0032, "grad_norm": 0.01456542033702135, "kl": 4.088878631591797e-05, "learning_rate": 3.191489361702128e-08, "loss": 0.0, "reward": 0.34375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 3 }, { "completion_length": 799.46875, "epoch": 0.004266666666666667, "grad_norm": 0.05252040922641754, "kl": 3.8586556911468506e-05, "learning_rate": 4.25531914893617e-08, "loss": 0.0, "reward": 0.5, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 4 }, { "completion_length": 679.71875, "epoch": 0.005333333333333333, "grad_norm": 0.0946703627705574, "kl": 3.99760901927948e-05, "learning_rate": 5.319148936170213e-08, "loss": 0.0, "reward": 0.40625, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 5 }, { "completion_length": 810.15625, "epoch": 0.0064, "grad_norm": 0.026912199333310127, "kl": 3.8780272006988525e-05, "learning_rate": 6.382978723404255e-08, "loss": 0.0, "reward": 0.34375, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 6 }, { "completion_length": 666.96875, "epoch": 0.007466666666666667, "grad_norm": 0.009772908873856068, "kl": 3.6932528018951416e-05, "learning_rate": 7.446808510638299e-08, "loss": 0.0, "reward": 0.375, "reward_std": 0.25, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 7 }, { "completion_length": 739.75, "epoch": 0.008533333333333334, "grad_norm": 0.006907371804118156, "kl": 0.00011970847845077515, "learning_rate": 8.51063829787234e-08, "loss": 0.0, "reward": 0.46875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 8 }, { "completion_length": 638.3125, "epoch": 0.0096, "grad_norm": 0.0889580026268959, "kl": 5.345046520233154e-05, "learning_rate": 9.574468085106382e-08, "loss": 0.0, "reward": 0.5, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 9 }, { "completion_length": 660.40625, "epoch": 0.010666666666666666, "grad_norm": 0.01722249947488308, "kl": 4.62457537651062e-05, "learning_rate": 1.0638297872340426e-07, "loss": 0.0, "reward": 0.3125, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 10 }, { "completion_length": 787.125, "epoch": 0.011733333333333333, "grad_norm": 0.017362741753458977, "kl": 2.0425766706466675e-05, "learning_rate": 1.1702127659574468e-07, "loss": 0.0, "reward": 0.46875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 11 }, { "completion_length": 721.1875, "epoch": 0.0128, "grad_norm": 7.832473056623712e-05, "kl": 3.7476420402526855e-05, "learning_rate": 1.276595744680851e-07, "loss": 0.0, "reward": 0.4375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 12 }, { "completion_length": 671.15625, "epoch": 0.013866666666666666, "grad_norm": 0.02314009703695774, "kl": 0.00011057034134864807, "learning_rate": 1.3829787234042553e-07, "loss": 0.0, "reward": 0.28125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 13 }, { "completion_length": 727.96875, "epoch": 0.014933333333333333, "grad_norm": 0.020008860155940056, "kl": 5.7213008403778076e-05, "learning_rate": 1.4893617021276598e-07, "loss": 0.0, "reward": 0.21875, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.0, "step": 14 }, { "completion_length": 736.9375, "epoch": 0.016, "grad_norm": 0.038316287100315094, "kl": 6.014108657836914e-05, "learning_rate": 1.5957446808510638e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 15 }, { "completion_length": 682.9375, "epoch": 0.017066666666666667, "grad_norm": 0.14651907980442047, "kl": 6.151199340820312e-05, "learning_rate": 1.702127659574468e-07, "loss": 0.0, "reward": 0.3125, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 16 }, { "completion_length": 710.59375, "epoch": 0.018133333333333335, "grad_norm": 0.21507208049297333, "kl": 4.805624485015869e-05, "learning_rate": 1.8085106382978722e-07, "loss": 0.0, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 17 }, { "completion_length": 796.34375, "epoch": 0.0192, "grad_norm": 0.02087116241455078, "kl": 0.00011198222637176514, "learning_rate": 1.9148936170212765e-07, "loss": 0.0, "reward": 0.3125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 18 }, { "completion_length": 736.875, "epoch": 0.020266666666666665, "grad_norm": 0.029343435540795326, "kl": 4.124455153942108e-05, "learning_rate": 2.021276595744681e-07, "loss": 0.0, "reward": 0.375, "reward_std": 0.4471687823534012, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 19 }, { "completion_length": 673.09375, "epoch": 0.021333333333333333, "grad_norm": 0.014750908128917217, "kl": 7.419288158416748e-05, "learning_rate": 2.1276595744680852e-07, "loss": 0.0, "reward": 0.21875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.0, "step": 20 }, { "completion_length": 776.0, "epoch": 0.0224, "grad_norm": 0.010532191954553127, "kl": 3.378838300704956e-05, "learning_rate": 2.2340425531914894e-07, "loss": 0.0, "reward": 0.375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 21 }, { "completion_length": 716.53125, "epoch": 0.023466666666666667, "grad_norm": 0.04072289541363716, "kl": 8.209794759750366e-05, "learning_rate": 2.3404255319148937e-07, "loss": 0.0, "reward": 0.28125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 22 }, { "completion_length": 754.46875, "epoch": 0.024533333333333334, "grad_norm": 0.011522076092660427, "kl": 4.0648505091667175e-05, "learning_rate": 2.446808510638298e-07, "loss": 0.0, "reward": 0.21875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.0, "step": 23 }, { "completion_length": 644.09375, "epoch": 0.0256, "grad_norm": 0.028750240802764893, "kl": 3.549456596374512e-05, "learning_rate": 2.553191489361702e-07, "loss": 0.0, "reward": 0.5, "reward_std": 0.375, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 24 }, { "completion_length": 674.65625, "epoch": 0.02666666666666667, "grad_norm": 0.32458436489105225, "kl": 9.870529174804688e-05, "learning_rate": 2.6595744680851066e-07, "loss": 0.0, "reward": 0.3125, "reward_std": 0.41367512941360474, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 25 }, { "completion_length": 681.5, "epoch": 0.027733333333333332, "grad_norm": 0.02871345914900303, "kl": 8.100271224975586e-05, "learning_rate": 2.7659574468085106e-07, "loss": 0.0, "reward": 0.21875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.0, "step": 26 }, { "completion_length": 798.71875, "epoch": 0.0288, "grad_norm": 0.02275983616709709, "kl": 5.792081356048584e-05, "learning_rate": 2.872340425531915e-07, "loss": 0.0, "reward": 0.40625, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 27 }, { "completion_length": 709.3125, "epoch": 0.029866666666666666, "grad_norm": 0.025387700647115707, "kl": 5.4076313972473145e-05, "learning_rate": 2.9787234042553196e-07, "loss": 0.0, "reward": 0.4375, "reward_std": 0.4471687823534012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 28 }, { "completion_length": 570.65625, "epoch": 0.030933333333333334, "grad_norm": 0.027490632608532906, "kl": 5.172938108444214e-05, "learning_rate": 3.0851063829787236e-07, "loss": 0.0, "reward": 0.375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 29 }, { "completion_length": 715.84375, "epoch": 0.032, "grad_norm": 7.5897405622527e-05, "kl": 3.725290298461914e-05, "learning_rate": 3.1914893617021275e-07, "loss": 0.0, "reward": 0.46875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 30 }, { "completion_length": 659.1875, "epoch": 0.03306666666666667, "grad_norm": 0.018309790641069412, "kl": 4.550069570541382e-05, "learning_rate": 3.297872340425532e-07, "loss": 0.0, "reward": 0.40625, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 31 }, { "completion_length": 774.9375, "epoch": 0.034133333333333335, "grad_norm": 0.00026976066874340177, "kl": 5.422532558441162e-05, "learning_rate": 3.404255319148936e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 32 }, { "completion_length": 650.34375, "epoch": 0.0352, "grad_norm": 0.015839088708162308, "kl": 3.1717121601104736e-05, "learning_rate": 3.5106382978723405e-07, "loss": 0.0, "reward": 0.28125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 33 }, { "completion_length": 701.40625, "epoch": 0.03626666666666667, "grad_norm": 0.05953094735741615, "kl": 6.0245394706726074e-05, "learning_rate": 3.6170212765957445e-07, "loss": 0.0, "reward": 0.34375, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 34 }, { "completion_length": 715.65625, "epoch": 0.037333333333333336, "grad_norm": 0.044842008501291275, "kl": 8.72686505317688e-05, "learning_rate": 3.723404255319149e-07, "loss": 0.0, "reward": 0.59375, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 35 }, { "completion_length": 634.125, "epoch": 0.0384, "grad_norm": 0.039516087621450424, "kl": 7.07283616065979e-05, "learning_rate": 3.829787234042553e-07, "loss": 0.0, "reward": 0.5, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 36 }, { "completion_length": 840.34375, "epoch": 0.039466666666666664, "grad_norm": 9.249732829630375e-05, "kl": 4.0844082832336426e-05, "learning_rate": 3.9361702127659574e-07, "loss": 0.0, "reward": 0.15625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.0, "step": 37 }, { "completion_length": 651.46875, "epoch": 0.04053333333333333, "grad_norm": 0.06706740707159042, "kl": 3.0037015676498413e-05, "learning_rate": 4.042553191489362e-07, "loss": 0.0, "reward": 0.5, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 38 }, { "completion_length": 858.40625, "epoch": 0.0416, "grad_norm": 0.030603496357798576, "kl": 3.235042095184326e-05, "learning_rate": 4.1489361702127664e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 39 }, { "completion_length": 699.59375, "epoch": 0.042666666666666665, "grad_norm": 0.03424488380551338, "kl": 0.0001620650291442871, "learning_rate": 4.2553191489361704e-07, "loss": 0.0, "reward": 0.4375, "reward_std": 0.41367512941360474, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 40 }, { "completion_length": 778.78125, "epoch": 0.04373333333333333, "grad_norm": 0.01530576404184103, "kl": 4.236958920955658e-05, "learning_rate": 4.3617021276595744e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 41 }, { "completion_length": 648.75, "epoch": 0.0448, "grad_norm": 0.010215582326054573, "kl": 4.511326551437378e-05, "learning_rate": 4.468085106382979e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.375, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 42 }, { "completion_length": 700.25, "epoch": 0.04586666666666667, "grad_norm": 0.01878911629319191, "kl": 5.554407835006714e-05, "learning_rate": 4.574468085106383e-07, "loss": 0.0, "reward": 0.4375, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 43 }, { "completion_length": 672.375, "epoch": 0.046933333333333334, "grad_norm": 0.0009520026505924761, "kl": 0.00013020634651184082, "learning_rate": 4.6808510638297873e-07, "loss": 0.0, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 44 }, { "completion_length": 626.34375, "epoch": 0.048, "grad_norm": 0.029306991025805473, "kl": 5.278363823890686e-05, "learning_rate": 4.787234042553192e-07, "loss": 0.0, "reward": 0.3125, "reward_std": 0.375, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 45 }, { "completion_length": 729.84375, "epoch": 0.04906666666666667, "grad_norm": 0.08475662767887115, "kl": 5.285441875457764e-05, "learning_rate": 4.893617021276596e-07, "loss": 0.0, "reward": 0.375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 46 }, { "completion_length": 842.28125, "epoch": 0.050133333333333335, "grad_norm": 0.012463957071304321, "kl": 5.363672971725464e-05, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.28125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 47 }, { "completion_length": 739.125, "epoch": 0.0512, "grad_norm": 0.012969022616744041, "kl": 1.6987323760986328e-05, "learning_rate": 5.106382978723404e-07, "loss": 0.0, "reward": 0.40625, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 48 }, { "completion_length": 785.8125, "epoch": 0.05226666666666667, "grad_norm": 0.0001407959935022518, "kl": 9.769201278686523e-05, "learning_rate": 5.212765957446809e-07, "loss": 0.0, "reward": 0.28125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 49 }, { "completion_length": 697.375, "epoch": 0.05333333333333334, "grad_norm": 0.026049425825476646, "kl": 7.943063974380493e-05, "learning_rate": 5.319148936170213e-07, "loss": 0.0, "reward": 0.21875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.0, "step": 50 }, { "completion_length": 785.125, "epoch": 0.0544, "grad_norm": 0.022382045164704323, "kl": 5.015730857849121e-05, "learning_rate": 5.425531914893618e-07, "loss": 0.0, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 51 }, { "completion_length": 758.78125, "epoch": 0.055466666666666664, "grad_norm": 0.03363676369190216, "kl": 2.7686357498168945e-05, "learning_rate": 5.531914893617021e-07, "loss": 0.0, "reward": 0.09375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.0, "step": 52 }, { "completion_length": 699.84375, "epoch": 0.05653333333333333, "grad_norm": 0.027558164671063423, "kl": 6.554275751113892e-05, "learning_rate": 5.638297872340426e-07, "loss": 0.0, "reward": 0.3125, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 53 }, { "completion_length": 805.0625, "epoch": 0.0576, "grad_norm": 0.016504371538758278, "kl": 4.573911428451538e-05, "learning_rate": 5.74468085106383e-07, "loss": 0.0, "reward": 0.3125, "reward_std": 0.25, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 54 }, { "completion_length": 791.625, "epoch": 0.058666666666666666, "grad_norm": 0.023718422278761864, "kl": 0.000507015734910965, "learning_rate": 5.851063829787235e-07, "loss": 0.0, "reward": 0.21875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.0, "step": 55 }, { "completion_length": 767.34375, "epoch": 0.05973333333333333, "grad_norm": 0.028951559215784073, "kl": 0.0001323595643043518, "learning_rate": 5.957446808510639e-07, "loss": 0.0, "reward": 0.5, "reward_std": 0.46650634706020355, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 56 }, { "completion_length": 719.65625, "epoch": 0.0608, "grad_norm": 0.029293889179825783, "kl": 6.875395774841309e-05, "learning_rate": 6.063829787234043e-07, "loss": 0.0, "reward": 0.375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 57 }, { "completion_length": 686.90625, "epoch": 0.06186666666666667, "grad_norm": 0.0001223793369717896, "kl": 5.3063035011291504e-05, "learning_rate": 6.170212765957447e-07, "loss": 0.0, "reward": 0.375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 58 }, { "completion_length": 712.28125, "epoch": 0.06293333333333333, "grad_norm": 0.04681137576699257, "kl": 9.772926568984985e-05, "learning_rate": 6.276595744680852e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 59 }, { "completion_length": 772.03125, "epoch": 0.064, "grad_norm": 0.024741122499108315, "kl": 5.759298801422119e-05, "learning_rate": 6.382978723404255e-07, "loss": 0.0, "reward": 0.34375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 60 }, { "completion_length": 741.84375, "epoch": 0.06506666666666666, "grad_norm": 0.11622889339923859, "kl": 5.472451448440552e-05, "learning_rate": 6.48936170212766e-07, "loss": 0.0, "reward": 0.34375, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 61 }, { "completion_length": 641.21875, "epoch": 0.06613333333333334, "grad_norm": 0.0001778965670382604, "kl": 6.44102692604065e-05, "learning_rate": 6.595744680851064e-07, "loss": 0.0, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 62 }, { "completion_length": 762.65625, "epoch": 0.0672, "grad_norm": 0.017755355685949326, "kl": 6.605684757232666e-05, "learning_rate": 6.702127659574468e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 63 }, { "completion_length": 662.03125, "epoch": 0.06826666666666667, "grad_norm": 0.043530408293008804, "kl": 8.497387170791626e-05, "learning_rate": 6.808510638297872e-07, "loss": 0.0, "reward": 0.34375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 64 }, { "completion_length": 737.625, "epoch": 0.06933333333333333, "grad_norm": 0.03578560799360275, "kl": 5.6862831115722656e-05, "learning_rate": 6.914893617021277e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 65 }, { "completion_length": 738.34375, "epoch": 0.0704, "grad_norm": 0.0009032224188558757, "kl": 0.00011768937110900879, "learning_rate": 7.021276595744681e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.125, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 66 }, { "completion_length": 745.0625, "epoch": 0.07146666666666666, "grad_norm": 0.009712659753859043, "kl": 3.89590859413147e-05, "learning_rate": 7.127659574468086e-07, "loss": 0.0, "reward": 0.28125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 67 }, { "completion_length": 764.21875, "epoch": 0.07253333333333334, "grad_norm": 0.04932890087366104, "kl": 5.8144330978393555e-05, "learning_rate": 7.234042553191489e-07, "loss": 0.0, "reward": 0.375, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 68 }, { "completion_length": 756.09375, "epoch": 0.0736, "grad_norm": 0.0002563186571933329, "kl": 5.480647087097168e-05, "learning_rate": 7.340425531914893e-07, "loss": 0.0, "reward": 0.3125, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 69 }, { "completion_length": 753.9375, "epoch": 0.07466666666666667, "grad_norm": 0.014124361798167229, "kl": 4.352442920207977e-05, "learning_rate": 7.446808510638298e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.25, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 70 }, { "completion_length": 685.09375, "epoch": 0.07573333333333333, "grad_norm": 0.07064241915941238, "kl": 9.156763553619385e-05, "learning_rate": 7.553191489361701e-07, "loss": 0.0, "reward": 0.40625, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 71 }, { "completion_length": 619.3125, "epoch": 0.0768, "grad_norm": 0.029690541326999664, "kl": 6.736814975738525e-05, "learning_rate": 7.659574468085106e-07, "loss": 0.0, "reward": 0.40625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 72 }, { "completion_length": 666.25, "epoch": 0.07786666666666667, "grad_norm": 0.03450064733624458, "kl": 8.529424667358398e-05, "learning_rate": 7.76595744680851e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 73 }, { "completion_length": 688.1875, "epoch": 0.07893333333333333, "grad_norm": 0.008248378522694111, "kl": 5.7466328144073486e-05, "learning_rate": 7.872340425531915e-07, "loss": 0.0, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 74 }, { "completion_length": 740.375, "epoch": 0.08, "grad_norm": 0.018864931538701057, "kl": 8.817017078399658e-05, "learning_rate": 7.978723404255319e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 75 }, { "completion_length": 772.1875, "epoch": 0.08106666666666666, "grad_norm": 0.1350451111793518, "kl": 0.00020660459995269775, "learning_rate": 8.085106382978724e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 76 }, { "completion_length": 731.21875, "epoch": 0.08213333333333334, "grad_norm": 0.017250308766961098, "kl": 7.304549217224121e-05, "learning_rate": 8.191489361702128e-07, "loss": 0.0, "reward": 0.375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 77 }, { "completion_length": 750.15625, "epoch": 0.0832, "grad_norm": 0.07677006721496582, "kl": 8.55661928653717e-05, "learning_rate": 8.297872340425533e-07, "loss": 0.0, "reward": 0.4375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 78 }, { "completion_length": 644.90625, "epoch": 0.08426666666666667, "grad_norm": 0.035879358649253845, "kl": 0.00011707842350006104, "learning_rate": 8.404255319148937e-07, "loss": 0.0, "reward": 0.46875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 79 }, { "completion_length": 835.125, "epoch": 0.08533333333333333, "grad_norm": 0.02688344195485115, "kl": 8.541345596313477e-05, "learning_rate": 8.510638297872341e-07, "loss": 0.0, "reward": 0.40625, "reward_std": 0.47617512941360474, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 80 }, { "completion_length": 821.84375, "epoch": 0.0864, "grad_norm": 0.026960188522934914, "kl": 0.00010132044553756714, "learning_rate": 8.617021276595745e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 81 }, { "completion_length": 698.0, "epoch": 0.08746666666666666, "grad_norm": 0.03875778615474701, "kl": 7.51987099647522e-05, "learning_rate": 8.723404255319149e-07, "loss": 0.0, "reward": 0.28125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 82 }, { "completion_length": 666.625, "epoch": 0.08853333333333334, "grad_norm": 0.21586580574512482, "kl": 0.0004012584686279297, "learning_rate": 8.829787234042553e-07, "loss": 0.0, "reward": 0.28125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 83 }, { "completion_length": 708.5625, "epoch": 0.0896, "grad_norm": 0.22359302639961243, "kl": 0.000194549560546875, "learning_rate": 8.936170212765958e-07, "loss": 0.0, "reward": 0.28125, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 84 }, { "completion_length": 843.4375, "epoch": 0.09066666666666667, "grad_norm": 5.8405719755683094e-05, "kl": 0.00010073184967041016, "learning_rate": 9.042553191489361e-07, "loss": 0.0, "reward": 0.375, "reward_std": 0.4330126941204071, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 85 }, { "completion_length": 752.25, "epoch": 0.09173333333333333, "grad_norm": 0.021342478692531586, "kl": 0.00016987323760986328, "learning_rate": 9.148936170212766e-07, "loss": 0.0, "reward": 0.40625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 86 }, { "completion_length": 574.125, "epoch": 0.0928, "grad_norm": 0.014598211273550987, "kl": 0.00015026330947875977, "learning_rate": 9.25531914893617e-07, "loss": 0.0, "reward": 0.5, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 87 }, { "completion_length": 792.15625, "epoch": 0.09386666666666667, "grad_norm": 0.016709517687559128, "kl": 0.00010377168655395508, "learning_rate": 9.361702127659575e-07, "loss": 0.0, "reward": 0.28125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 88 }, { "completion_length": 747.4375, "epoch": 0.09493333333333333, "grad_norm": 0.017524098977446556, "kl": 0.000180855393409729, "learning_rate": 9.468085106382979e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.125, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 89 }, { "completion_length": 688.90625, "epoch": 0.096, "grad_norm": 0.042594924569129944, "kl": 0.00021407008171081543, "learning_rate": 9.574468085106384e-07, "loss": 0.0, "reward": 0.28125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 90 }, { "completion_length": 716.125, "epoch": 0.09706666666666666, "grad_norm": 0.0001490006543463096, "kl": 8.70823860168457e-05, "learning_rate": 9.680851063829788e-07, "loss": 0.0, "reward": 0.34375, "reward_std": 0.35117512941360474, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 91 }, { "completion_length": 692.75, "epoch": 0.09813333333333334, "grad_norm": 0.0002908668830059469, "kl": 0.00013011693954467773, "learning_rate": 9.787234042553193e-07, "loss": 0.0, "reward": 0.28125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 92 }, { "completion_length": 773.28125, "epoch": 0.0992, "grad_norm": 0.01875477470457554, "kl": 0.00019121170043945312, "learning_rate": 9.893617021276595e-07, "loss": 0.0, "reward": 0.21875, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.0, "step": 93 }, { "completion_length": 737.0625, "epoch": 0.10026666666666667, "grad_norm": 0.15415102243423462, "kl": 0.00015585124492645264, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.40625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 94 }, { "completion_length": 607.71875, "epoch": 0.10133333333333333, "grad_norm": 0.03830459341406822, "kl": 0.0003159940242767334, "learning_rate": 1.0106382978723404e-06, "loss": 0.0, "reward": 0.625, "reward_std": 0.46650634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 95 }, { "completion_length": 800.21875, "epoch": 0.1024, "grad_norm": 0.016723917797207832, "kl": 0.0008231997489929199, "learning_rate": 1.0212765957446809e-06, "loss": 0.0, "reward": 0.34375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 96 }, { "completion_length": 836.46875, "epoch": 0.10346666666666667, "grad_norm": 0.016839798539876938, "kl": 0.0001538097858428955, "learning_rate": 1.0319148936170213e-06, "loss": 0.0, "reward": 0.34375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 97 }, { "completion_length": 652.46875, "epoch": 0.10453333333333334, "grad_norm": 0.1086062341928482, "kl": 0.0001483401283621788, "learning_rate": 1.0425531914893618e-06, "loss": 0.0, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 98 }, { "completion_length": 570.53125, "epoch": 0.1056, "grad_norm": 0.013866174034774303, "kl": 0.0005609393119812012, "learning_rate": 1.0531914893617022e-06, "loss": 0.0, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 99 }, { "completion_length": 642.90625, "epoch": 0.10666666666666667, "grad_norm": 0.019983241334557533, "kl": 0.0005436539649963379, "learning_rate": 1.0638297872340427e-06, "loss": 0.0, "reward": 0.34375, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 100 }, { "completion_length": 749.75, "epoch": 0.10773333333333333, "grad_norm": 0.01535655278712511, "kl": 0.00021314620971679688, "learning_rate": 1.074468085106383e-06, "loss": 0.0, "reward": 0.53125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 101 }, { "completion_length": 721.25, "epoch": 0.1088, "grad_norm": 0.033598609268665314, "kl": 0.0002485215663909912, "learning_rate": 1.0851063829787236e-06, "loss": 0.0, "reward": 0.40625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 102 }, { "completion_length": 695.71875, "epoch": 0.10986666666666667, "grad_norm": 0.024030551314353943, "kl": 0.0002474784851074219, "learning_rate": 1.095744680851064e-06, "loss": 0.0, "reward": 0.53125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 103 }, { "completion_length": 797.40625, "epoch": 0.11093333333333333, "grad_norm": 0.02765764482319355, "kl": 0.00035834312438964844, "learning_rate": 1.1063829787234042e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 104 }, { "completion_length": 770.78125, "epoch": 0.112, "grad_norm": 0.031106343492865562, "kl": 0.0003383159637451172, "learning_rate": 1.1170212765957447e-06, "loss": 0.0, "reward": 0.34375, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 105 }, { "completion_length": 653.78125, "epoch": 0.11306666666666666, "grad_norm": 0.0008745308150537312, "kl": 0.0005409419536590576, "learning_rate": 1.1276595744680851e-06, "loss": 0.0, "reward": 0.375, "reward_std": 0.125, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 106 }, { "completion_length": 829.4375, "epoch": 0.11413333333333334, "grad_norm": 0.0003123157366644591, "kl": 0.00017520785331726074, "learning_rate": 1.1382978723404256e-06, "loss": 0.0, "reward": 0.375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 107 }, { "completion_length": 698.125, "epoch": 0.1152, "grad_norm": 0.02439175546169281, "kl": 0.00022399425506591797, "learning_rate": 1.148936170212766e-06, "loss": 0.0, "reward": 0.5, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 108 }, { "completion_length": 786.34375, "epoch": 0.11626666666666667, "grad_norm": 0.02094857208430767, "kl": 0.00044858455657958984, "learning_rate": 1.1595744680851065e-06, "loss": 0.0, "reward": 0.34375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 109 }, { "completion_length": 665.65625, "epoch": 0.11733333333333333, "grad_norm": 0.017399359494447708, "kl": 0.0003718137741088867, "learning_rate": 1.170212765957447e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 110 }, { "completion_length": 715.5625, "epoch": 0.1184, "grad_norm": 0.0001095441184588708, "kl": 0.0003916621208190918, "learning_rate": 1.1808510638297874e-06, "loss": 0.0, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 111 }, { "completion_length": 729.65625, "epoch": 0.11946666666666667, "grad_norm": 0.02591477520763874, "kl": 0.0004892945289611816, "learning_rate": 1.1914893617021278e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 112 }, { "completion_length": 716.1875, "epoch": 0.12053333333333334, "grad_norm": 0.017047494649887085, "kl": 0.0007135160267353058, "learning_rate": 1.2021276595744683e-06, "loss": 0.0, "reward": 0.4375, "reward_std": 0.375, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 113 }, { "completion_length": 711.71875, "epoch": 0.1216, "grad_norm": 0.0003403484297450632, "kl": 0.0002443939447402954, "learning_rate": 1.2127659574468085e-06, "loss": 0.0, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 114 }, { "completion_length": 765.46875, "epoch": 0.12266666666666666, "grad_norm": 0.037503283470869064, "kl": 0.0006112754344940186, "learning_rate": 1.223404255319149e-06, "loss": 0.0, "reward": 0.15625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.0, "step": 115 }, { "completion_length": 662.3125, "epoch": 0.12373333333333333, "grad_norm": 0.023322822526097298, "kl": 0.002521216869354248, "learning_rate": 1.2340425531914894e-06, "loss": 0.0001, "reward": 0.4375, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 116 }, { "completion_length": 562.3125, "epoch": 0.1248, "grad_norm": 0.01133518572896719, "kl": 0.0005480051040649414, "learning_rate": 1.2446808510638299e-06, "loss": 0.0, "reward": 0.53125, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 117 }, { "completion_length": 694.15625, "epoch": 0.12586666666666665, "grad_norm": 0.016713587567210197, "kl": 0.0004071742296218872, "learning_rate": 1.2553191489361703e-06, "loss": 0.0, "reward": 0.375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 118 }, { "completion_length": 762.71875, "epoch": 0.12693333333333334, "grad_norm": 0.02082926407456398, "kl": 0.0009309053421020508, "learning_rate": 1.2659574468085106e-06, "loss": 0.0, "reward": 0.28125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 119 }, { "completion_length": 749.90625, "epoch": 0.128, "grad_norm": 0.018572334200143814, "kl": 0.0007722377777099609, "learning_rate": 1.276595744680851e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 120 }, { "completion_length": 744.5625, "epoch": 0.12906666666666666, "grad_norm": 0.08475305885076523, "kl": 0.0005460977554321289, "learning_rate": 1.2872340425531915e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 121 }, { "completion_length": 711.59375, "epoch": 0.13013333333333332, "grad_norm": 0.014888680540025234, "kl": 0.0010457634925842285, "learning_rate": 1.297872340425532e-06, "loss": 0.0, "reward": 0.5, "reward_std": 0.4471687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 122 }, { "completion_length": 553.84375, "epoch": 0.1312, "grad_norm": 0.010981667786836624, "kl": 0.0029885172843933105, "learning_rate": 1.3085106382978724e-06, "loss": 0.0001, "reward": 0.34375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 123 }, { "completion_length": 771.5, "epoch": 0.13226666666666667, "grad_norm": 0.021123724058270454, "kl": 0.0008013248443603516, "learning_rate": 1.3191489361702128e-06, "loss": 0.0, "reward": 0.40625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 124 }, { "completion_length": 721.21875, "epoch": 0.13333333333333333, "grad_norm": 0.015703797340393066, "kl": 0.0010248124599456787, "learning_rate": 1.329787234042553e-06, "loss": 0.0, "reward": 0.4375, "reward_std": 0.4858439117670059, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 125 }, { "completion_length": 642.65625, "epoch": 0.1344, "grad_norm": 0.023636141791939735, "kl": 0.0003209114074707031, "learning_rate": 1.3404255319148935e-06, "loss": 0.0, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 126 }, { "completion_length": 710.3125, "epoch": 0.13546666666666668, "grad_norm": 0.009757593274116516, "kl": 0.0009133219718933105, "learning_rate": 1.351063829787234e-06, "loss": 0.0, "reward": 0.28125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 127 }, { "completion_length": 801.15625, "epoch": 0.13653333333333334, "grad_norm": 0.009237082675099373, "kl": 0.0008905529975891113, "learning_rate": 1.3617021276595744e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 128 }, { "completion_length": 765.03125, "epoch": 0.1376, "grad_norm": 0.013944610953330994, "kl": 0.001104593276977539, "learning_rate": 1.3723404255319149e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 129 }, { "completion_length": 654.21875, "epoch": 0.13866666666666666, "grad_norm": 0.0336986668407917, "kl": 0.0015014410018920898, "learning_rate": 1.3829787234042553e-06, "loss": 0.0001, "reward": 0.46875, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 130 }, { "completion_length": 675.40625, "epoch": 0.13973333333333332, "grad_norm": 0.0227128304541111, "kl": 0.0013393163681030273, "learning_rate": 1.3936170212765958e-06, "loss": 0.0001, "reward": 0.34375, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 131 }, { "completion_length": 664.5, "epoch": 0.1408, "grad_norm": 0.0464305505156517, "kl": 0.0011380910873413086, "learning_rate": 1.4042553191489362e-06, "loss": 0.0, "reward": 0.46875, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 132 }, { "completion_length": 694.375, "epoch": 0.14186666666666667, "grad_norm": 0.02618665061891079, "kl": 0.0009541511535644531, "learning_rate": 1.4148936170212767e-06, "loss": 0.0, "reward": 0.375, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 133 }, { "completion_length": 658.375, "epoch": 0.14293333333333333, "grad_norm": 0.006451260298490524, "kl": 0.0018731355667114258, "learning_rate": 1.425531914893617e-06, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 134 }, { "completion_length": 696.34375, "epoch": 0.144, "grad_norm": 0.0004955111653544009, "kl": 0.0007361173629760742, "learning_rate": 1.4361702127659576e-06, "loss": 0.0, "reward": 0.28125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 135 }, { "completion_length": 533.96875, "epoch": 0.14506666666666668, "grad_norm": 0.026780234649777412, "kl": 0.0015807151794433594, "learning_rate": 1.4468085106382978e-06, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 136 }, { "completion_length": 637.0, "epoch": 0.14613333333333334, "grad_norm": 0.019693076610565186, "kl": 0.0012477636337280273, "learning_rate": 1.4574468085106382e-06, "loss": 0.0, "reward": 0.46875, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 137 }, { "completion_length": 561.625, "epoch": 0.1472, "grad_norm": 0.019571883603930473, "kl": 0.002072155475616455, "learning_rate": 1.4680851063829787e-06, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 138 }, { "completion_length": 682.625, "epoch": 0.14826666666666666, "grad_norm": 0.028477225452661514, "kl": 0.0010127425193786621, "learning_rate": 1.4787234042553191e-06, "loss": 0.0, "reward": 0.4375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 139 }, { "completion_length": 518.25, "epoch": 0.14933333333333335, "grad_norm": 0.001025684643536806, "kl": 0.0029706954956054688, "learning_rate": 1.4893617021276596e-06, "loss": 0.0001, "reward": 0.46875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 140 }, { "completion_length": 728.0, "epoch": 0.1504, "grad_norm": 0.03756802901625633, "kl": 0.0013059675693511963, "learning_rate": 1.5e-06, "loss": 0.0001, "reward": 0.3125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 141 }, { "completion_length": 566.8125, "epoch": 0.15146666666666667, "grad_norm": 0.10770288109779358, "kl": 0.002039670944213867, "learning_rate": 1.5106382978723403e-06, "loss": 0.0001, "reward": 0.34375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 142 }, { "completion_length": 838.21875, "epoch": 0.15253333333333333, "grad_norm": 0.008824730291962624, "kl": 0.0004410743713378906, "learning_rate": 1.521276595744681e-06, "loss": 0.0, "reward": 0.3125, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 143 }, { "completion_length": 622.28125, "epoch": 0.1536, "grad_norm": 0.1009758934378624, "kl": 0.0015273690223693848, "learning_rate": 1.5319148936170212e-06, "loss": 0.0001, "reward": 0.3125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 144 }, { "completion_length": 609.65625, "epoch": 0.15466666666666667, "grad_norm": 0.015401325188577175, "kl": 0.0007725954055786133, "learning_rate": 1.5425531914893618e-06, "loss": 0.0, "reward": 0.40625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 145 }, { "completion_length": 591.5, "epoch": 0.15573333333333333, "grad_norm": 0.08682723343372345, "kl": 0.0019183158874511719, "learning_rate": 1.553191489361702e-06, "loss": 0.0001, "reward": 0.4375, "reward_std": 0.46650634706020355, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 146 }, { "completion_length": 560.25, "epoch": 0.1568, "grad_norm": 0.01633138209581375, "kl": 0.004166722297668457, "learning_rate": 1.5638297872340427e-06, "loss": 0.0002, "reward": 0.40625, "reward_std": 0.35117512941360474, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 147 }, { "completion_length": 697.90625, "epoch": 0.15786666666666666, "grad_norm": 0.24196137487888336, "kl": 0.0016993880271911621, "learning_rate": 1.574468085106383e-06, "loss": 0.0001, "reward": 0.3125, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 148 }, { "completion_length": 787.78125, "epoch": 0.15893333333333334, "grad_norm": 0.014901064336299896, "kl": 0.001746058464050293, "learning_rate": 1.5851063829787236e-06, "loss": 0.0001, "reward": 0.28125, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 149 }, { "completion_length": 620.125, "epoch": 0.16, "grad_norm": 0.02242707833647728, "kl": 0.0009870529174804688, "learning_rate": 1.5957446808510639e-06, "loss": 0.0, "reward": 0.53125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 150 }, { "completion_length": 595.90625, "epoch": 0.16106666666666666, "grad_norm": 0.03209035471081734, "kl": 0.0012347698211669922, "learning_rate": 1.6063829787234045e-06, "loss": 0.0, "reward": 0.46875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 151 }, { "completion_length": 654.125, "epoch": 0.16213333333333332, "grad_norm": 0.007217520847916603, "kl": 0.0014528632164001465, "learning_rate": 1.6170212765957448e-06, "loss": 0.0001, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 152 }, { "completion_length": 623.4375, "epoch": 0.1632, "grad_norm": 0.028249917551875114, "kl": 0.0015864372253417969, "learning_rate": 1.627659574468085e-06, "loss": 0.0001, "reward": 0.53125, "reward_std": 0.35117512941360474, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 153 }, { "completion_length": 801.03125, "epoch": 0.16426666666666667, "grad_norm": 0.01931595243513584, "kl": 0.001577913761138916, "learning_rate": 1.6382978723404257e-06, "loss": 0.0001, "reward": 0.46875, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 154 }, { "completion_length": 547.6875, "epoch": 0.16533333333333333, "grad_norm": 0.3100874125957489, "kl": 0.004728972911834717, "learning_rate": 1.648936170212766e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.46650634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 155 }, { "completion_length": 509.3125, "epoch": 0.1664, "grad_norm": 0.022565457969903946, "kl": 0.01290738582611084, "learning_rate": 1.6595744680851066e-06, "loss": 0.0005, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 156 }, { "completion_length": 740.75, "epoch": 0.16746666666666668, "grad_norm": 0.009149301797151566, "kl": 0.0018447637557983398, "learning_rate": 1.6702127659574468e-06, "loss": 0.0001, "reward": 0.40625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 157 }, { "completion_length": 674.96875, "epoch": 0.16853333333333334, "grad_norm": 0.025140373036265373, "kl": 0.0015971660614013672, "learning_rate": 1.6808510638297875e-06, "loss": 0.0001, "reward": 0.40625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 158 }, { "completion_length": 786.03125, "epoch": 0.1696, "grad_norm": 0.007993107661604881, "kl": 0.0030270814895629883, "learning_rate": 1.6914893617021277e-06, "loss": 0.0001, "reward": 0.46875, "reward_std": 0.4375, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 159 }, { "completion_length": 585.78125, "epoch": 0.17066666666666666, "grad_norm": 0.022685786709189415, "kl": 0.0016987323760986328, "learning_rate": 1.7021276595744682e-06, "loss": 0.0001, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 160 }, { "completion_length": 539.875, "epoch": 0.17173333333333332, "grad_norm": 0.005118262488394976, "kl": 0.04174947738647461, "learning_rate": 1.7127659574468086e-06, "loss": 0.0017, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 161 }, { "completion_length": 728.25, "epoch": 0.1728, "grad_norm": 0.009407681412994862, "kl": 0.0012710094451904297, "learning_rate": 1.723404255319149e-06, "loss": 0.0001, "reward": 0.4375, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 162 }, { "completion_length": 586.5, "epoch": 0.17386666666666667, "grad_norm": 0.02205805853009224, "kl": 0.0038857460021972656, "learning_rate": 1.7340425531914895e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.46650634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 163 }, { "completion_length": 513.21875, "epoch": 0.17493333333333333, "grad_norm": 0.018200954422354698, "kl": 0.002648591995239258, "learning_rate": 1.7446808510638297e-06, "loss": 0.0001, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 164 }, { "completion_length": 818.71875, "epoch": 0.176, "grad_norm": 0.010649412870407104, "kl": 0.001058816909790039, "learning_rate": 1.7553191489361702e-06, "loss": 0.0, "reward": 0.4375, "reward_std": 0.25, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 165 }, { "completion_length": 609.1875, "epoch": 0.17706666666666668, "grad_norm": 0.030808577314019203, "kl": 0.002752065658569336, "learning_rate": 1.7659574468085106e-06, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 166 }, { "completion_length": 688.53125, "epoch": 0.17813333333333334, "grad_norm": 0.024388346821069717, "kl": 0.0017253756523132324, "learning_rate": 1.776595744680851e-06, "loss": 0.0001, "reward": 0.375, "reward_std": 0.41367512941360474, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 167 }, { "completion_length": 766.625, "epoch": 0.1792, "grad_norm": 0.036688026040792465, "kl": 0.002141594886779785, "learning_rate": 1.7872340425531915e-06, "loss": 0.0001, "reward": 0.40625, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 168 }, { "completion_length": 596.625, "epoch": 0.18026666666666666, "grad_norm": 0.00027937357663176954, "kl": 0.0018033981323242188, "learning_rate": 1.797872340425532e-06, "loss": 0.0001, "reward": 0.5, "reward_std": 0.375, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 169 }, { "completion_length": 666.3125, "epoch": 0.18133333333333335, "grad_norm": 0.07085057348012924, "kl": 0.002529919147491455, "learning_rate": 1.8085106382978722e-06, "loss": 0.0001, "reward": 0.4375, "reward_std": 0.25, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 170 }, { "completion_length": 704.40625, "epoch": 0.1824, "grad_norm": 0.00043036678107455373, "kl": 0.0022946596145629883, "learning_rate": 1.819148936170213e-06, "loss": 0.0001, "reward": 0.5, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 171 }, { "completion_length": 689.875, "epoch": 0.18346666666666667, "grad_norm": 0.0165407694876194, "kl": 0.0030864477157592773, "learning_rate": 1.8297872340425531e-06, "loss": 0.0001, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 172 }, { "completion_length": 772.71875, "epoch": 0.18453333333333333, "grad_norm": 0.016702372580766678, "kl": 0.0010273456573486328, "learning_rate": 1.8404255319148938e-06, "loss": 0.0, "reward": 0.4375, "reward_std": 0.375, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 173 }, { "completion_length": 518.5, "epoch": 0.1856, "grad_norm": 0.0162484273314476, "kl": 0.002815842628479004, "learning_rate": 1.851063829787234e-06, "loss": 0.0001, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 174 }, { "completion_length": 684.0625, "epoch": 0.18666666666666668, "grad_norm": 0.02503184974193573, "kl": 0.003011345863342285, "learning_rate": 1.8617021276595743e-06, "loss": 0.0001, "reward": 0.5625, "reward_std": 0.46650634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 175 }, { "completion_length": 742.53125, "epoch": 0.18773333333333334, "grad_norm": 0.02499910071492195, "kl": 0.0018498897552490234, "learning_rate": 1.872340425531915e-06, "loss": 0.0001, "reward": 0.34375, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 176 }, { "completion_length": 617.34375, "epoch": 0.1888, "grad_norm": 0.025400439277291298, "kl": 0.0025887489318847656, "learning_rate": 1.8829787234042552e-06, "loss": 0.0001, "reward": 0.59375, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 177 }, { "completion_length": 560.375, "epoch": 0.18986666666666666, "grad_norm": 0.0003198765916749835, "kl": 0.0034513473510742188, "learning_rate": 1.8936170212765958e-06, "loss": 0.0001, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 178 }, { "completion_length": 597.09375, "epoch": 0.19093333333333334, "grad_norm": 0.026760054752230644, "kl": 0.0035753250122070312, "learning_rate": 1.904255319148936e-06, "loss": 0.0001, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 179 }, { "completion_length": 761.4375, "epoch": 0.192, "grad_norm": 0.009813476353883743, "kl": 0.0015587806701660156, "learning_rate": 1.9148936170212767e-06, "loss": 0.0001, "reward": 0.5625, "reward_std": 0.41367512941360474, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 180 }, { "completion_length": 582.9375, "epoch": 0.19306666666666666, "grad_norm": 0.021682659164071083, "kl": 0.004066944122314453, "learning_rate": 1.925531914893617e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 181 }, { "completion_length": 756.03125, "epoch": 0.19413333333333332, "grad_norm": 0.02755606360733509, "kl": 0.0009990930557250977, "learning_rate": 1.9361702127659576e-06, "loss": 0.0, "reward": 0.3125, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 182 }, { "completion_length": 752.3125, "epoch": 0.1952, "grad_norm": 0.02935238741338253, "kl": 0.001980900764465332, "learning_rate": 1.946808510638298e-06, "loss": 0.0001, "reward": 0.34375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 183 }, { "completion_length": 552.09375, "epoch": 0.19626666666666667, "grad_norm": 0.0165439173579216, "kl": 0.0014767646789550781, "learning_rate": 1.9574468085106385e-06, "loss": 0.0001, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 184 }, { "completion_length": 617.25, "epoch": 0.19733333333333333, "grad_norm": 0.009672555141150951, "kl": 0.004748225212097168, "learning_rate": 1.968085106382979e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 185 }, { "completion_length": 690.78125, "epoch": 0.1984, "grad_norm": 0.033036261796951294, "kl": 0.0038604736328125, "learning_rate": 1.978723404255319e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 186 }, { "completion_length": 665.21875, "epoch": 0.19946666666666665, "grad_norm": 0.012649191543459892, "kl": 0.002491474151611328, "learning_rate": 1.9893617021276595e-06, "loss": 0.0001, "reward": 0.40625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 187 }, { "completion_length": 650.625, "epoch": 0.20053333333333334, "grad_norm": 0.008793331682682037, "kl": 0.0023102760314941406, "learning_rate": 2e-06, "loss": 0.0001, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 188 }, { "completion_length": 661.59375, "epoch": 0.2016, "grad_norm": 0.02079566940665245, "kl": 0.001768350601196289, "learning_rate": 2.0106382978723404e-06, "loss": 0.0001, "reward": 0.5625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 189 }, { "completion_length": 554.5, "epoch": 0.20266666666666666, "grad_norm": 0.019554797559976578, "kl": 0.0032861828804016113, "learning_rate": 2.021276595744681e-06, "loss": 0.0001, "reward": 0.84375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 190 }, { "completion_length": 524.15625, "epoch": 0.20373333333333332, "grad_norm": 0.020346352830529213, "kl": 0.0025827884674072266, "learning_rate": 2.0319148936170213e-06, "loss": 0.0001, "reward": 0.71875, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 191 }, { "completion_length": 474.03125, "epoch": 0.2048, "grad_norm": 0.01965460367500782, "kl": 0.004469156265258789, "learning_rate": 2.0425531914893617e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 192 }, { "completion_length": 511.75, "epoch": 0.20586666666666667, "grad_norm": 0.023867124691605568, "kl": 0.0029397010803222656, "learning_rate": 2.053191489361702e-06, "loss": 0.0001, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 193 }, { "completion_length": 656.40625, "epoch": 0.20693333333333333, "grad_norm": 0.01788882352411747, "kl": 0.004164695739746094, "learning_rate": 2.0638297872340426e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 194 }, { "completion_length": 683.875, "epoch": 0.208, "grad_norm": 0.010092389769852161, "kl": 0.003366708755493164, "learning_rate": 2.074468085106383e-06, "loss": 0.0001, "reward": 0.34375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 195 }, { "completion_length": 726.40625, "epoch": 0.20906666666666668, "grad_norm": 0.00036869486211799085, "kl": 0.0019375085830688477, "learning_rate": 2.0851063829787235e-06, "loss": 0.0001, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 196 }, { "completion_length": 598.6875, "epoch": 0.21013333333333334, "grad_norm": 0.009064393118023872, "kl": 0.0025229454040527344, "learning_rate": 2.0957446808510635e-06, "loss": 0.0001, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 197 }, { "completion_length": 601.21875, "epoch": 0.2112, "grad_norm": 0.0001451366551918909, "kl": 0.0018297433853149414, "learning_rate": 2.1063829787234044e-06, "loss": 0.0001, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 198 }, { "completion_length": 530.8125, "epoch": 0.21226666666666666, "grad_norm": 0.0004379557794891298, "kl": 0.0016460418701171875, "learning_rate": 2.1170212765957444e-06, "loss": 0.0001, "reward": 0.8125, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 199 }, { "completion_length": 486.0, "epoch": 0.21333333333333335, "grad_norm": 0.03041146509349346, "kl": 0.009382069110870361, "learning_rate": 2.1276595744680853e-06, "loss": 0.0004, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 200 }, { "completion_length": 584.53125, "epoch": 0.2144, "grad_norm": 0.01817508041858673, "kl": 0.0027344226837158203, "learning_rate": 2.1382978723404253e-06, "loss": 0.0001, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 201 }, { "completion_length": 693.375, "epoch": 0.21546666666666667, "grad_norm": 0.0017712180269882083, "kl": 0.0029832124710083008, "learning_rate": 2.148936170212766e-06, "loss": 0.0001, "reward": 0.53125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 202 }, { "completion_length": 574.6875, "epoch": 0.21653333333333333, "grad_norm": 0.07188230752944946, "kl": 0.005743861198425293, "learning_rate": 2.1595744680851062e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 203 }, { "completion_length": 691.71875, "epoch": 0.2176, "grad_norm": 0.01790175586938858, "kl": 0.00594174861907959, "learning_rate": 2.170212765957447e-06, "loss": 0.0002, "reward": 0.4375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 204 }, { "completion_length": 509.4375, "epoch": 0.21866666666666668, "grad_norm": 0.12823010981082916, "kl": 0.005196571350097656, "learning_rate": 2.180851063829787e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 205 }, { "completion_length": 643.1875, "epoch": 0.21973333333333334, "grad_norm": 0.01034118328243494, "kl": 0.003435373306274414, "learning_rate": 2.191489361702128e-06, "loss": 0.0001, "reward": 0.46875, "reward_std": 0.47617512941360474, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 206 }, { "completion_length": 653.5, "epoch": 0.2208, "grad_norm": 0.001333974301815033, "kl": 0.001821279525756836, "learning_rate": 2.202127659574468e-06, "loss": 0.0001, "reward": 0.4375, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 207 }, { "completion_length": 660.21875, "epoch": 0.22186666666666666, "grad_norm": 0.19321835041046143, "kl": 0.007348299026489258, "learning_rate": 2.2127659574468085e-06, "loss": 0.0003, "reward": 0.34375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 208 }, { "completion_length": 619.46875, "epoch": 0.22293333333333334, "grad_norm": 0.05672023817896843, "kl": 0.011715054512023926, "learning_rate": 2.223404255319149e-06, "loss": 0.0005, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 209 }, { "completion_length": 465.75, "epoch": 0.224, "grad_norm": 0.00019568836432881653, "kl": 0.0030509233474731445, "learning_rate": 2.2340425531914894e-06, "loss": 0.0001, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 210 }, { "completion_length": 553.125, "epoch": 0.22506666666666666, "grad_norm": 0.026909220963716507, "kl": 0.0026636123657226562, "learning_rate": 2.24468085106383e-06, "loss": 0.0001, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 211 }, { "completion_length": 639.84375, "epoch": 0.22613333333333333, "grad_norm": 0.013105744495987892, "kl": 0.007585048675537109, "learning_rate": 2.2553191489361703e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 212 }, { "completion_length": 587.65625, "epoch": 0.2272, "grad_norm": 0.01975521631538868, "kl": 0.003843069076538086, "learning_rate": 2.2659574468085107e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.5193375647068024, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 213 }, { "completion_length": 599.5625, "epoch": 0.22826666666666667, "grad_norm": 0.0011258217273280025, "kl": 0.004014253616333008, "learning_rate": 2.276595744680851e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 214 }, { "completion_length": 733.875, "epoch": 0.22933333333333333, "grad_norm": 0.03465702757239342, "kl": 0.0034313201904296875, "learning_rate": 2.2872340425531916e-06, "loss": 0.0001, "reward": 0.40625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 215 }, { "completion_length": 625.03125, "epoch": 0.2304, "grad_norm": 0.009327179752290249, "kl": 0.004913806915283203, "learning_rate": 2.297872340425532e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 216 }, { "completion_length": 478.9375, "epoch": 0.23146666666666665, "grad_norm": 0.017738617956638336, "kl": 0.008952617645263672, "learning_rate": 2.308510638297872e-06, "loss": 0.0004, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 217 }, { "completion_length": 657.15625, "epoch": 0.23253333333333334, "grad_norm": 0.016096297651529312, "kl": 0.002866029739379883, "learning_rate": 2.319148936170213e-06, "loss": 0.0001, "reward": 0.40625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 218 }, { "completion_length": 622.34375, "epoch": 0.2336, "grad_norm": 0.018967749550938606, "kl": 0.0022373199462890625, "learning_rate": 2.329787234042553e-06, "loss": 0.0001, "reward": 0.59375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 219 }, { "completion_length": 592.03125, "epoch": 0.23466666666666666, "grad_norm": 0.018686668947339058, "kl": 0.004595398902893066, "learning_rate": 2.340425531914894e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 220 }, { "completion_length": 631.78125, "epoch": 0.23573333333333332, "grad_norm": 0.05838574841618538, "kl": 0.021146655082702637, "learning_rate": 2.351063829787234e-06, "loss": 0.0008, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 221 }, { "completion_length": 653.71875, "epoch": 0.2368, "grad_norm": 0.010850964114069939, "kl": 0.003604412078857422, "learning_rate": 2.3617021276595748e-06, "loss": 0.0001, "reward": 0.59375, "reward_std": 0.35117512941360474, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 222 }, { "completion_length": 814.15625, "epoch": 0.23786666666666667, "grad_norm": 0.000482215458760038, "kl": 0.0016349554061889648, "learning_rate": 2.372340425531915e-06, "loss": 0.0001, "reward": 0.4375, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 223 }, { "completion_length": 631.875, "epoch": 0.23893333333333333, "grad_norm": 0.02401360496878624, "kl": 0.003062725067138672, "learning_rate": 2.3829787234042557e-06, "loss": 0.0001, "reward": 0.75, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 224 }, { "completion_length": 525.53125, "epoch": 0.24, "grad_norm": 0.0008045344147831202, "kl": 0.005678653717041016, "learning_rate": 2.3936170212765957e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 225 }, { "completion_length": 633.25, "epoch": 0.24106666666666668, "grad_norm": 0.012414127588272095, "kl": 0.0045855045318603516, "learning_rate": 2.4042553191489366e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 226 }, { "completion_length": 618.4375, "epoch": 0.24213333333333334, "grad_norm": 0.025065410882234573, "kl": 0.009622812271118164, "learning_rate": 2.4148936170212766e-06, "loss": 0.0004, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 227 }, { "completion_length": 545.0625, "epoch": 0.2432, "grad_norm": 0.015575526282191277, "kl": 0.00996255874633789, "learning_rate": 2.425531914893617e-06, "loss": 0.0004, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 228 }, { "completion_length": 497.4375, "epoch": 0.24426666666666666, "grad_norm": 0.01669159345328808, "kl": 0.0037565231323242188, "learning_rate": 2.4361702127659575e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.41367512941360474, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 229 }, { "completion_length": 625.125, "epoch": 0.24533333333333332, "grad_norm": 0.00021798652596771717, "kl": 0.002303868532180786, "learning_rate": 2.446808510638298e-06, "loss": 0.0001, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 230 }, { "completion_length": 679.375, "epoch": 0.2464, "grad_norm": 0.01050932053476572, "kl": 0.003066539764404297, "learning_rate": 2.4574468085106384e-06, "loss": 0.0001, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 231 }, { "completion_length": 470.40625, "epoch": 0.24746666666666667, "grad_norm": 0.0006186446407809854, "kl": 0.0037720203399658203, "learning_rate": 2.468085106382979e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 232 }, { "completion_length": 580.75, "epoch": 0.24853333333333333, "grad_norm": 0.014556772075593472, "kl": 0.0035796165466308594, "learning_rate": 2.4787234042553193e-06, "loss": 0.0001, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 233 }, { "completion_length": 534.0, "epoch": 0.2496, "grad_norm": 0.031690776348114014, "kl": 0.004459857940673828, "learning_rate": 2.4893617021276598e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 234 }, { "completion_length": 511.0, "epoch": 0.25066666666666665, "grad_norm": 0.030084887519478798, "kl": 0.006151676177978516, "learning_rate": 2.5e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 235 }, { "completion_length": 638.8125, "epoch": 0.2517333333333333, "grad_norm": 0.001415239297784865, "kl": 0.004750549793243408, "learning_rate": 2.5106382978723407e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 236 }, { "completion_length": 630.71875, "epoch": 0.2528, "grad_norm": 0.013918296433985233, "kl": 0.0028977394104003906, "learning_rate": 2.521276595744681e-06, "loss": 0.0001, "reward": 0.375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 237 }, { "completion_length": 563.40625, "epoch": 0.2538666666666667, "grad_norm": 0.0007024174556136131, "kl": 0.003877878189086914, "learning_rate": 2.531914893617021e-06, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 238 }, { "completion_length": 651.28125, "epoch": 0.25493333333333335, "grad_norm": 0.022146346047520638, "kl": 0.004809379577636719, "learning_rate": 2.5425531914893616e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 239 }, { "completion_length": 848.34375, "epoch": 0.256, "grad_norm": 0.00100491545163095, "kl": 0.0048749446868896484, "learning_rate": 2.553191489361702e-06, "loss": 0.0002, "reward": 0.3125, "reward_std": 0.125, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 240 }, { "completion_length": 541.65625, "epoch": 0.25706666666666667, "grad_norm": 0.016655534505844116, "kl": 0.0076541900634765625, "learning_rate": 2.5638297872340425e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 241 }, { "completion_length": 553.59375, "epoch": 0.2581333333333333, "grad_norm": 0.01321939192712307, "kl": 0.004337787628173828, "learning_rate": 2.574468085106383e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 242 }, { "completion_length": 592.5, "epoch": 0.2592, "grad_norm": 0.01090206578373909, "kl": 0.003604412078857422, "learning_rate": 2.5851063829787234e-06, "loss": 0.0001, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 243 }, { "completion_length": 566.03125, "epoch": 0.26026666666666665, "grad_norm": 0.00014029307931195945, "kl": 0.007012844085693359, "learning_rate": 2.595744680851064e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 244 }, { "completion_length": 546.84375, "epoch": 0.2613333333333333, "grad_norm": 0.013933254405856133, "kl": 0.00244903564453125, "learning_rate": 2.6063829787234043e-06, "loss": 0.0001, "reward": 0.71875, "reward_std": 0.35117512941360474, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 245 }, { "completion_length": 528.875, "epoch": 0.2624, "grad_norm": 0.016392020508646965, "kl": 0.008983135223388672, "learning_rate": 2.6170212765957447e-06, "loss": 0.0004, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 246 }, { "completion_length": 662.375, "epoch": 0.2634666666666667, "grad_norm": 0.014161713421344757, "kl": 0.0021657943725585938, "learning_rate": 2.627659574468085e-06, "loss": 0.0001, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 247 }, { "completion_length": 648.8125, "epoch": 0.26453333333333334, "grad_norm": 0.010316681116819382, "kl": 0.0027036666870117188, "learning_rate": 2.6382978723404256e-06, "loss": 0.0001, "reward": 0.53125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 248 }, { "completion_length": 548.6875, "epoch": 0.2656, "grad_norm": 0.03963794186711311, "kl": 0.00719141960144043, "learning_rate": 2.648936170212766e-06, "loss": 0.0003, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 249 }, { "completion_length": 739.6875, "epoch": 0.26666666666666666, "grad_norm": 0.013032750226557255, "kl": 0.0015473365783691406, "learning_rate": 2.659574468085106e-06, "loss": 0.0001, "reward": 0.28125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 250 }, { "completion_length": 580.25, "epoch": 0.2677333333333333, "grad_norm": 0.00020730389223899692, "kl": 0.003509044647216797, "learning_rate": 2.670212765957447e-06, "loss": 0.0001, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 251 }, { "completion_length": 633.28125, "epoch": 0.2688, "grad_norm": 0.00016932433936744928, "kl": 0.0015611648559570312, "learning_rate": 2.680851063829787e-06, "loss": 0.0001, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 252 }, { "completion_length": 572.53125, "epoch": 0.26986666666666664, "grad_norm": 0.00880137924104929, "kl": 0.012176990509033203, "learning_rate": 2.691489361702128e-06, "loss": 0.0005, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 253 }, { "completion_length": 616.375, "epoch": 0.27093333333333336, "grad_norm": 0.009029196575284004, "kl": 0.004579067230224609, "learning_rate": 2.702127659574468e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 254 }, { "completion_length": 558.46875, "epoch": 0.272, "grad_norm": 0.03505995497107506, "kl": 0.004702329635620117, "learning_rate": 2.7127659574468088e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 255 }, { "completion_length": 576.25, "epoch": 0.2730666666666667, "grad_norm": 0.038831546902656555, "kl": 0.004638195037841797, "learning_rate": 2.723404255319149e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.41367512941360474, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 256 }, { "completion_length": 573.78125, "epoch": 0.27413333333333334, "grad_norm": 0.01366236712783575, "kl": 0.0038547515869140625, "learning_rate": 2.7340425531914897e-06, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 257 }, { "completion_length": 578.5625, "epoch": 0.2752, "grad_norm": 0.09003030508756638, "kl": 0.055806636810302734, "learning_rate": 2.7446808510638297e-06, "loss": 0.0022, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 258 }, { "completion_length": 786.34375, "epoch": 0.27626666666666666, "grad_norm": 0.00011297337186988443, "kl": 0.0038585662841796875, "learning_rate": 2.7553191489361706e-06, "loss": 0.0002, "reward": 0.46875, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 259 }, { "completion_length": 642.5625, "epoch": 0.2773333333333333, "grad_norm": 0.024154657498002052, "kl": 0.0033812522888183594, "learning_rate": 2.7659574468085106e-06, "loss": 0.0001, "reward": 0.625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 260 }, { "completion_length": 437.46875, "epoch": 0.2784, "grad_norm": 0.0002516274689696729, "kl": 0.006310462951660156, "learning_rate": 2.776595744680851e-06, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 261 }, { "completion_length": 715.71875, "epoch": 0.27946666666666664, "grad_norm": 0.0006281447131186724, "kl": 0.002422332763671875, "learning_rate": 2.7872340425531915e-06, "loss": 0.0001, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 262 }, { "completion_length": 596.15625, "epoch": 0.28053333333333336, "grad_norm": 0.026324383914470673, "kl": 0.006761074066162109, "learning_rate": 2.797872340425532e-06, "loss": 0.0003, "reward": 0.5, "reward_std": 0.4471687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 263 }, { "completion_length": 719.34375, "epoch": 0.2816, "grad_norm": 0.016079621389508247, "kl": 0.0018062591552734375, "learning_rate": 2.8085106382978724e-06, "loss": 0.0001, "reward": 0.5, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 264 }, { "completion_length": 600.5625, "epoch": 0.2826666666666667, "grad_norm": 0.013946063816547394, "kl": 0.0018639564514160156, "learning_rate": 2.819148936170213e-06, "loss": 0.0001, "reward": 0.375, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 265 }, { "completion_length": 586.84375, "epoch": 0.28373333333333334, "grad_norm": 0.014659928157925606, "kl": 0.0031958818435668945, "learning_rate": 2.8297872340425533e-06, "loss": 0.0001, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 266 }, { "completion_length": 808.03125, "epoch": 0.2848, "grad_norm": 0.014611341990530491, "kl": 0.0013309717178344727, "learning_rate": 2.8404255319148938e-06, "loss": 0.0001, "reward": 0.46875, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 267 }, { "completion_length": 578.9375, "epoch": 0.28586666666666666, "grad_norm": 0.0004987601423636079, "kl": 0.002590179443359375, "learning_rate": 2.851063829787234e-06, "loss": 0.0001, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 268 }, { "completion_length": 590.25, "epoch": 0.2869333333333333, "grad_norm": 0.017039192840456963, "kl": 0.0062007904052734375, "learning_rate": 2.8617021276595747e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 269 }, { "completion_length": 657.1875, "epoch": 0.288, "grad_norm": 0.000450820050900802, "kl": 0.0035066604614257812, "learning_rate": 2.872340425531915e-06, "loss": 0.0001, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 270 }, { "completion_length": 541.875, "epoch": 0.2890666666666667, "grad_norm": 0.061217278242111206, "kl": 0.004911899566650391, "learning_rate": 2.8829787234042556e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.4471687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 271 }, { "completion_length": 534.5625, "epoch": 0.29013333333333335, "grad_norm": 0.010480289347469807, "kl": 0.00614166259765625, "learning_rate": 2.8936170212765956e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.41367512941360474, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 272 }, { "completion_length": 708.875, "epoch": 0.2912, "grad_norm": 0.01584521494805813, "kl": 0.0018754005432128906, "learning_rate": 2.9042553191489365e-06, "loss": 0.0001, "reward": 0.6875, "reward_std": 0.4471687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 273 }, { "completion_length": 708.9375, "epoch": 0.2922666666666667, "grad_norm": 0.00032572413329035044, "kl": 0.0022772550582885742, "learning_rate": 2.9148936170212765e-06, "loss": 0.0001, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 274 }, { "completion_length": 528.75, "epoch": 0.29333333333333333, "grad_norm": 0.010505298152565956, "kl": 0.0035791397094726562, "learning_rate": 2.9255319148936174e-06, "loss": 0.0001, "reward": 0.71875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 275 }, { "completion_length": 596.46875, "epoch": 0.2944, "grad_norm": 0.0008775272872298956, "kl": 0.007037639617919922, "learning_rate": 2.9361702127659574e-06, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 276 }, { "completion_length": 695.78125, "epoch": 0.29546666666666666, "grad_norm": 0.011259150691330433, "kl": 0.004460334777832031, "learning_rate": 2.946808510638298e-06, "loss": 0.0002, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 277 }, { "completion_length": 681.25, "epoch": 0.2965333333333333, "grad_norm": 0.022454949095845222, "kl": 0.011694908142089844, "learning_rate": 2.9574468085106383e-06, "loss": 0.0005, "reward": 0.53125, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 278 }, { "completion_length": 720.25, "epoch": 0.2976, "grad_norm": 0.026837235316634178, "kl": 0.004060268402099609, "learning_rate": 2.9680851063829787e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 279 }, { "completion_length": 717.71875, "epoch": 0.2986666666666667, "grad_norm": 0.01963728480041027, "kl": 0.003849506378173828, "learning_rate": 2.978723404255319e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 280 }, { "completion_length": 678.46875, "epoch": 0.29973333333333335, "grad_norm": 0.01537579856812954, "kl": 0.004266500473022461, "learning_rate": 2.9893617021276596e-06, "loss": 0.0002, "reward": 0.4375, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 281 }, { "completion_length": 739.09375, "epoch": 0.3008, "grad_norm": 0.013939772732555866, "kl": 0.002907276153564453, "learning_rate": 3e-06, "loss": 0.0001, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 282 }, { "completion_length": 643.59375, "epoch": 0.30186666666666667, "grad_norm": 0.013309529982507229, "kl": 0.003556966781616211, "learning_rate": 2.999998842653789e-06, "loss": 0.0001, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 283 }, { "completion_length": 523.53125, "epoch": 0.30293333333333333, "grad_norm": 0.000422754033934325, "kl": 0.004003763198852539, "learning_rate": 2.999995370616941e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 284 }, { "completion_length": 643.21875, "epoch": 0.304, "grad_norm": 0.018220622092485428, "kl": 0.008458614349365234, "learning_rate": 2.9999895838948146e-06, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 285 }, { "completion_length": 690.625, "epoch": 0.30506666666666665, "grad_norm": 0.020835474133491516, "kl": 0.003901243209838867, "learning_rate": 2.999981482496339e-06, "loss": 0.0002, "reward": 0.40625, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 286 }, { "completion_length": 735.84375, "epoch": 0.3061333333333333, "grad_norm": 0.000364295847248286, "kl": 0.0023581981658935547, "learning_rate": 2.9999710664340162e-06, "loss": 0.0001, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 287 }, { "completion_length": 688.78125, "epoch": 0.3072, "grad_norm": 0.011256012134253979, "kl": 0.002776622772216797, "learning_rate": 2.999958335723919e-06, "loss": 0.0001, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 288 }, { "completion_length": 525.0625, "epoch": 0.3082666666666667, "grad_norm": 0.015167263336479664, "kl": 0.002924203872680664, "learning_rate": 2.999943290385692e-06, "loss": 0.0001, "reward": 0.75, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 289 }, { "completion_length": 732.875, "epoch": 0.30933333333333335, "grad_norm": 0.017961522564291954, "kl": 0.005305767059326172, "learning_rate": 2.9999259304425536e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 290 }, { "completion_length": 568.65625, "epoch": 0.3104, "grad_norm": 0.028753379359841347, "kl": 0.004523038864135742, "learning_rate": 2.9999062559212913e-06, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 291 }, { "completion_length": 699.8125, "epoch": 0.31146666666666667, "grad_norm": 0.010433925315737724, "kl": 0.016932964324951172, "learning_rate": 2.9998842668522657e-06, "loss": 0.0007, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 292 }, { "completion_length": 718.78125, "epoch": 0.31253333333333333, "grad_norm": 0.008864966221153736, "kl": 0.0072307586669921875, "learning_rate": 2.9998599632694086e-06, "loss": 0.0003, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 293 }, { "completion_length": 619.6875, "epoch": 0.3136, "grad_norm": 0.015136854723095894, "kl": 0.002384185791015625, "learning_rate": 2.9998333452102236e-06, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 294 }, { "completion_length": 656.25, "epoch": 0.31466666666666665, "grad_norm": 0.002506328048184514, "kl": 0.005772829055786133, "learning_rate": 2.9998044127157864e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 295 }, { "completion_length": 635.9375, "epoch": 0.3157333333333333, "grad_norm": 0.00012022285227430984, "kl": 0.0019412040710449219, "learning_rate": 2.999773165830743e-06, "loss": 0.0001, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 296 }, { "completion_length": 477.3125, "epoch": 0.3168, "grad_norm": 0.0016096349572762847, "kl": 0.005011558532714844, "learning_rate": 2.999739604603311e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 297 }, { "completion_length": 639.5, "epoch": 0.3178666666666667, "grad_norm": 0.029251884669065475, "kl": 0.0052928924560546875, "learning_rate": 2.99970372908528e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.47617512941360474, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 298 }, { "completion_length": 653.53125, "epoch": 0.31893333333333335, "grad_norm": 0.011562664993107319, "kl": 0.0031147003173828125, "learning_rate": 2.999665539332011e-06, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 299 }, { "completion_length": 600.96875, "epoch": 0.32, "grad_norm": 0.018612360581755638, "kl": 0.006523609161376953, "learning_rate": 2.9996250354024346e-06, "loss": 0.0003, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 300 }, { "completion_length": 567.1875, "epoch": 0.32106666666666667, "grad_norm": 0.01740344613790512, "kl": 0.005106925964355469, "learning_rate": 2.999582217359055e-06, "loss": 0.0002, "reward": 0.4375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 301 }, { "completion_length": 498.65625, "epoch": 0.3221333333333333, "grad_norm": 0.010179299861192703, "kl": 0.00528717041015625, "learning_rate": 2.999537085267945e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 302 }, { "completion_length": 647.53125, "epoch": 0.3232, "grad_norm": 0.00013672947534359992, "kl": 0.0020449161529541016, "learning_rate": 2.9994896391987487e-06, "loss": 0.0001, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 303 }, { "completion_length": 472.21875, "epoch": 0.32426666666666665, "grad_norm": 0.02147592045366764, "kl": 0.005955219268798828, "learning_rate": 2.9994398792246826e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 304 }, { "completion_length": 595.53125, "epoch": 0.3253333333333333, "grad_norm": 0.0020941004622727633, "kl": 0.0038394927978515625, "learning_rate": 2.9993878054225324e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 305 }, { "completion_length": 640.78125, "epoch": 0.3264, "grad_norm": 0.010173437185585499, "kl": 0.00591278076171875, "learning_rate": 2.9993334178726546e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 306 }, { "completion_length": 633.90625, "epoch": 0.3274666666666667, "grad_norm": 0.00913633219897747, "kl": 0.005397796630859375, "learning_rate": 2.9992767166589756e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 307 }, { "completion_length": 741.3125, "epoch": 0.32853333333333334, "grad_norm": 0.008947618305683136, "kl": 0.0025506019592285156, "learning_rate": 2.9992177018689933e-06, "loss": 0.0001, "reward": 0.4375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 308 }, { "completion_length": 537.0, "epoch": 0.3296, "grad_norm": 0.02185234986245632, "kl": 0.00562286376953125, "learning_rate": 2.9991563735937752e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 309 }, { "completion_length": 485.78125, "epoch": 0.33066666666666666, "grad_norm": 0.0013148058205842972, "kl": 0.0036897659301757812, "learning_rate": 2.9990927319279583e-06, "loss": 0.0001, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 310 }, { "completion_length": 637.4375, "epoch": 0.3317333333333333, "grad_norm": 0.009814383462071419, "kl": 0.0017566680908203125, "learning_rate": 2.9990267769697495e-06, "loss": 0.0001, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 311 }, { "completion_length": 584.5, "epoch": 0.3328, "grad_norm": 0.015313081443309784, "kl": 0.0048722028732299805, "learning_rate": 2.9989585088209272e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 312 }, { "completion_length": 485.21875, "epoch": 0.33386666666666664, "grad_norm": 0.010899900458753109, "kl": 0.010268688201904297, "learning_rate": 2.9988879275868364e-06, "loss": 0.0004, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 313 }, { "completion_length": 331.4375, "epoch": 0.33493333333333336, "grad_norm": 0.8106955885887146, "kl": 0.018530845642089844, "learning_rate": 2.9988150333763933e-06, "loss": 0.0007, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 314 }, { "completion_length": 547.09375, "epoch": 0.336, "grad_norm": 0.0699722021818161, "kl": 0.012126922607421875, "learning_rate": 2.9987398263020837e-06, "loss": 0.0005, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 315 }, { "completion_length": 648.125, "epoch": 0.3370666666666667, "grad_norm": 0.015973873436450958, "kl": 0.004998683929443359, "learning_rate": 2.998662306479961e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 316 }, { "completion_length": 618.375, "epoch": 0.33813333333333334, "grad_norm": 0.0001457137695979327, "kl": 0.0064165592193603516, "learning_rate": 2.9985824740296484e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 317 }, { "completion_length": 467.53125, "epoch": 0.3392, "grad_norm": 0.0029133378993719816, "kl": 0.009346961975097656, "learning_rate": 2.998500329074338e-06, "loss": 0.0004, "reward": 0.9375, "reward_std": 0.125, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 318 }, { "completion_length": 681.6875, "epoch": 0.34026666666666666, "grad_norm": 0.0028332266956567764, "kl": 0.004339694976806641, "learning_rate": 2.99841587174079e-06, "loss": 0.0002, "reward": 0.28125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 319 }, { "completion_length": 603.75, "epoch": 0.3413333333333333, "grad_norm": 0.0002190942468587309, "kl": 0.007296562194824219, "learning_rate": 2.9983291021593325e-06, "loss": 0.0003, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 320 }, { "completion_length": 567.75, "epoch": 0.3424, "grad_norm": 0.00020475649216677994, "kl": 0.003010272979736328, "learning_rate": 2.9982400204638626e-06, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 321 }, { "completion_length": 638.71875, "epoch": 0.34346666666666664, "grad_norm": 0.010518234223127365, "kl": 0.0036482810974121094, "learning_rate": 2.9981486267918442e-06, "loss": 0.0001, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 322 }, { "completion_length": 566.625, "epoch": 0.34453333333333336, "grad_norm": 0.010432605631649494, "kl": 0.0031833648681640625, "learning_rate": 2.9980549212843096e-06, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 323 }, { "completion_length": 627.34375, "epoch": 0.3456, "grad_norm": 0.010619793087244034, "kl": 0.003939628601074219, "learning_rate": 2.9979589040858586e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 324 }, { "completion_length": 674.21875, "epoch": 0.3466666666666667, "grad_norm": 0.0001256561663467437, "kl": 0.0028426647186279297, "learning_rate": 2.997860575344658e-06, "loss": 0.0001, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 325 }, { "completion_length": 604.625, "epoch": 0.34773333333333334, "grad_norm": 0.00021704714163206518, "kl": 0.004082679748535156, "learning_rate": 2.9977599352124413e-06, "loss": 0.0002, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 326 }, { "completion_length": 573.59375, "epoch": 0.3488, "grad_norm": 0.007511872332543135, "kl": 0.015904664993286133, "learning_rate": 2.9976569838445097e-06, "loss": 0.0006, "reward": 0.84375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 327 }, { "completion_length": 607.875, "epoch": 0.34986666666666666, "grad_norm": 0.01350938156247139, "kl": 0.004421234130859375, "learning_rate": 2.99755172139973e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 328 }, { "completion_length": 724.625, "epoch": 0.3509333333333333, "grad_norm": 0.009416555985808372, "kl": 0.002410411834716797, "learning_rate": 2.9974441480405364e-06, "loss": 0.0001, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 329 }, { "completion_length": 632.71875, "epoch": 0.352, "grad_norm": 0.001441554049961269, "kl": 0.006537914276123047, "learning_rate": 2.997334263932927e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 330 }, { "completion_length": 607.5625, "epoch": 0.35306666666666664, "grad_norm": 0.0003136749437544495, "kl": 0.001628875732421875, "learning_rate": 2.9972220692464686e-06, "loss": 0.0001, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 331 }, { "completion_length": 715.78125, "epoch": 0.35413333333333336, "grad_norm": 0.012231791391968727, "kl": 0.0027086734771728516, "learning_rate": 2.997107564154291e-06, "loss": 0.0001, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 332 }, { "completion_length": 621.34375, "epoch": 0.3552, "grad_norm": 0.017207741737365723, "kl": 0.003948211669921875, "learning_rate": 2.9969907488330905e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 333 }, { "completion_length": 630.5, "epoch": 0.3562666666666667, "grad_norm": 0.00022834233823232353, "kl": 0.004485607147216797, "learning_rate": 2.996871623463128e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 334 }, { "completion_length": 554.6875, "epoch": 0.35733333333333334, "grad_norm": 0.0023713880218565464, "kl": 0.0122222900390625, "learning_rate": 2.9967501882282296e-06, "loss": 0.0005, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 335 }, { "completion_length": 506.0625, "epoch": 0.3584, "grad_norm": 0.018982652574777603, "kl": 0.009603500366210938, "learning_rate": 2.996626443315785e-06, "loss": 0.0004, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 336 }, { "completion_length": 568.5, "epoch": 0.35946666666666666, "grad_norm": 0.020026221871376038, "kl": 0.009020805358886719, "learning_rate": 2.9965003889167486e-06, "loss": 0.0004, "reward": 0.75, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 337 }, { "completion_length": 527.125, "epoch": 0.3605333333333333, "grad_norm": 0.0014817442279309034, "kl": 0.005825042724609375, "learning_rate": 2.996372025225639e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 338 }, { "completion_length": 537.8125, "epoch": 0.3616, "grad_norm": 0.014620468020439148, "kl": 0.005228996276855469, "learning_rate": 2.996241352440537e-06, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 339 }, { "completion_length": 579.90625, "epoch": 0.3626666666666667, "grad_norm": 0.0027745182160288095, "kl": 0.00603485107421875, "learning_rate": 2.996108370763088e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 340 }, { "completion_length": 682.96875, "epoch": 0.36373333333333335, "grad_norm": 0.00047093009925447404, "kl": 0.01155233383178711, "learning_rate": 2.9959730803984997e-06, "loss": 0.0005, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 341 }, { "completion_length": 540.875, "epoch": 0.3648, "grad_norm": 0.016975590959191322, "kl": 0.006184577941894531, "learning_rate": 2.9958354815555427e-06, "loss": 0.0002, "reward": 0.84375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 342 }, { "completion_length": 488.75, "epoch": 0.3658666666666667, "grad_norm": 0.013845536857843399, "kl": 0.007259368896484375, "learning_rate": 2.995695574446549e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 343 }, { "completion_length": 587.375, "epoch": 0.36693333333333333, "grad_norm": 0.019418131560087204, "kl": 0.007791042327880859, "learning_rate": 2.995553359287414e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 344 }, { "completion_length": 734.1875, "epoch": 0.368, "grad_norm": 0.015215403400361538, "kl": 0.0051288604736328125, "learning_rate": 2.9954088362975936e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 345 }, { "completion_length": 638.21875, "epoch": 0.36906666666666665, "grad_norm": 0.0166973527520895, "kl": 0.01331329345703125, "learning_rate": 2.9952620057001054e-06, "loss": 0.0005, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 346 }, { "completion_length": 507.65625, "epoch": 0.3701333333333333, "grad_norm": 0.0006425857427529991, "kl": 0.0049152374267578125, "learning_rate": 2.9951128677215278e-06, "loss": 0.0002, "reward": 0.84375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 347 }, { "completion_length": 527.3125, "epoch": 0.3712, "grad_norm": 0.003172910073772073, "kl": 0.005016803741455078, "learning_rate": 2.994961422591999e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 348 }, { "completion_length": 560.0625, "epoch": 0.3722666666666667, "grad_norm": 0.011149630881845951, "kl": 0.004617214202880859, "learning_rate": 2.99480767054522e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 349 }, { "completion_length": 610.09375, "epoch": 0.37333333333333335, "grad_norm": 0.017184428870677948, "kl": 0.0027256011962890625, "learning_rate": 2.9946516118184484e-06, "loss": 0.0001, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 350 }, { "completion_length": 568.78125, "epoch": 0.3744, "grad_norm": 0.019933583214879036, "kl": 0.005936145782470703, "learning_rate": 2.994493246652504e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 351 }, { "completion_length": 714.65625, "epoch": 0.37546666666666667, "grad_norm": 0.01311857346445322, "kl": 0.004230976104736328, "learning_rate": 2.9943325752917634e-06, "loss": 0.0002, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 352 }, { "completion_length": 639.3125, "epoch": 0.37653333333333333, "grad_norm": 0.020361067727208138, "kl": 0.004485130310058594, "learning_rate": 2.994169597984164e-06, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 353 }, { "completion_length": 623.1875, "epoch": 0.3776, "grad_norm": 0.0006100770551711321, "kl": 0.0031099319458007812, "learning_rate": 2.9940043149812002e-06, "loss": 0.0001, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 354 }, { "completion_length": 666.28125, "epoch": 0.37866666666666665, "grad_norm": 0.012898469343781471, "kl": 0.0037097930908203125, "learning_rate": 2.9938367265379253e-06, "loss": 0.0001, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 355 }, { "completion_length": 585.6875, "epoch": 0.3797333333333333, "grad_norm": 0.028065824881196022, "kl": 0.005574226379394531, "learning_rate": 2.9936668329129493e-06, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 356 }, { "completion_length": 569.1875, "epoch": 0.3808, "grad_norm": 0.01310973521322012, "kl": 0.006196022033691406, "learning_rate": 2.9934946343684403e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 357 }, { "completion_length": 701.875, "epoch": 0.3818666666666667, "grad_norm": 0.0007879008771851659, "kl": 0.004968166351318359, "learning_rate": 2.9933201311701224e-06, "loss": 0.0002, "reward": 0.40625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 358 }, { "completion_length": 503.15625, "epoch": 0.38293333333333335, "grad_norm": 0.015873542055487633, "kl": 0.0057773590087890625, "learning_rate": 2.9931433235872766e-06, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 359 }, { "completion_length": 711.71875, "epoch": 0.384, "grad_norm": 0.0011296796146780252, "kl": 0.004998207092285156, "learning_rate": 2.99296421189274e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 360 }, { "completion_length": 681.46875, "epoch": 0.38506666666666667, "grad_norm": 0.016437439247965813, "kl": 0.0033011436462402344, "learning_rate": 2.992782796362904e-06, "loss": 0.0001, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 361 }, { "completion_length": 545.40625, "epoch": 0.38613333333333333, "grad_norm": 0.03800331428647041, "kl": 0.010239601135253906, "learning_rate": 2.992599077277717e-06, "loss": 0.0004, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 362 }, { "completion_length": 566.125, "epoch": 0.3872, "grad_norm": 0.017701519653201103, "kl": 0.00528717041015625, "learning_rate": 2.9924130549206804e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 363 }, { "completion_length": 585.71875, "epoch": 0.38826666666666665, "grad_norm": 0.009904692880809307, "kl": 0.010193347930908203, "learning_rate": 2.992224729578851e-06, "loss": 0.0004, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 364 }, { "completion_length": 522.0625, "epoch": 0.3893333333333333, "grad_norm": 0.008593027479946613, "kl": 0.00626826286315918, "learning_rate": 2.992034101542839e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 365 }, { "completion_length": 570.75, "epoch": 0.3904, "grad_norm": 0.0003527602821122855, "kl": 0.007651805877685547, "learning_rate": 2.9918411711068073e-06, "loss": 0.0003, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 366 }, { "completion_length": 695.53125, "epoch": 0.3914666666666667, "grad_norm": 0.0004007024399470538, "kl": 0.006751060485839844, "learning_rate": 2.9916459385684737e-06, "loss": 0.0003, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 367 }, { "completion_length": 523.28125, "epoch": 0.39253333333333335, "grad_norm": 0.014467107132077217, "kl": 0.008258819580078125, "learning_rate": 2.991448404229105e-06, "loss": 0.0003, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 368 }, { "completion_length": 642.9375, "epoch": 0.3936, "grad_norm": 0.018693504855036736, "kl": 0.0022597312927246094, "learning_rate": 2.991248568393524e-06, "loss": 0.0001, "reward": 0.3125, "reward_std": 0.25, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 369 }, { "completion_length": 632.0, "epoch": 0.39466666666666667, "grad_norm": 0.011440223082900047, "kl": 0.004567623138427734, "learning_rate": 2.991046431370102e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 370 }, { "completion_length": 539.65625, "epoch": 0.3957333333333333, "grad_norm": 0.012967856600880623, "kl": 0.0037984848022460938, "learning_rate": 2.990841993470762e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 371 }, { "completion_length": 607.125, "epoch": 0.3968, "grad_norm": 0.0011862716637551785, "kl": 0.0055255889892578125, "learning_rate": 2.9906352550109787e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 372 }, { "completion_length": 584.375, "epoch": 0.39786666666666665, "grad_norm": 0.0003888385253958404, "kl": 0.003849029541015625, "learning_rate": 2.990426216309776e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 373 }, { "completion_length": 713.78125, "epoch": 0.3989333333333333, "grad_norm": 0.0304151251912117, "kl": 0.0048961639404296875, "learning_rate": 2.990214877689727e-06, "loss": 0.0002, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 374 }, { "completion_length": 642.375, "epoch": 0.4, "grad_norm": 0.015213887207210064, "kl": 0.0038127899169921875, "learning_rate": 2.9900012394769546e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 375 }, { "completion_length": 550.84375, "epoch": 0.4010666666666667, "grad_norm": 0.00041373155545443296, "kl": 0.0026640892028808594, "learning_rate": 2.98978530200113e-06, "loss": 0.0001, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 376 }, { "completion_length": 522.28125, "epoch": 0.40213333333333334, "grad_norm": 0.021802354604005814, "kl": 0.0075893402099609375, "learning_rate": 2.989567065595472e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 377 }, { "completion_length": 601.90625, "epoch": 0.4032, "grad_norm": 0.014192330650985241, "kl": 0.01462554931640625, "learning_rate": 2.989346530596748e-06, "loss": 0.0006, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 378 }, { "completion_length": 622.65625, "epoch": 0.40426666666666666, "grad_norm": 0.02768220193684101, "kl": 0.006869316101074219, "learning_rate": 2.989123697345271e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 379 }, { "completion_length": 654.9375, "epoch": 0.4053333333333333, "grad_norm": 0.017390117049217224, "kl": 0.009341239929199219, "learning_rate": 2.9888985661849026e-06, "loss": 0.0004, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 380 }, { "completion_length": 536.375, "epoch": 0.4064, "grad_norm": 0.000210919592063874, "kl": 0.004836082458496094, "learning_rate": 2.988671137463048e-06, "loss": 0.0002, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 381 }, { "completion_length": 474.15625, "epoch": 0.40746666666666664, "grad_norm": 0.007432183716446161, "kl": 0.012440681457519531, "learning_rate": 2.988441411530659e-06, "loss": 0.0005, "reward": 0.75, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 382 }, { "completion_length": 739.25, "epoch": 0.40853333333333336, "grad_norm": 0.009663654491305351, "kl": 0.009844779968261719, "learning_rate": 2.9882093887422323e-06, "loss": 0.0004, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 383 }, { "completion_length": 518.90625, "epoch": 0.4096, "grad_norm": 0.00031278515234589577, "kl": 0.008198738098144531, "learning_rate": 2.987975069455809e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 384 }, { "completion_length": 494.46875, "epoch": 0.4106666666666667, "grad_norm": 0.0005137796397320926, "kl": 0.00933074951171875, "learning_rate": 2.9877384540329735e-06, "loss": 0.0004, "reward": 0.8125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 385 }, { "completion_length": 507.78125, "epoch": 0.41173333333333334, "grad_norm": 0.017697621136903763, "kl": 0.004237174987792969, "learning_rate": 2.987499542838854e-06, "loss": 0.0002, "reward": 0.875, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 386 }, { "completion_length": 611.4375, "epoch": 0.4128, "grad_norm": 0.01701945997774601, "kl": 0.004082679748535156, "learning_rate": 2.9872583362421204e-06, "loss": 0.0002, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 387 }, { "completion_length": 546.59375, "epoch": 0.41386666666666666, "grad_norm": 0.015463443472981453, "kl": 0.0034155845642089844, "learning_rate": 2.9870148346149868e-06, "loss": 0.0001, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 388 }, { "completion_length": 617.46875, "epoch": 0.4149333333333333, "grad_norm": 0.009415809996426105, "kl": 0.007979393005371094, "learning_rate": 2.9867690383332063e-06, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.35117512941360474, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 389 }, { "completion_length": 589.25, "epoch": 0.416, "grad_norm": 0.00993348378688097, "kl": 0.007511138916015625, "learning_rate": 2.986520947776075e-06, "loss": 0.0003, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 390 }, { "completion_length": 742.34375, "epoch": 0.41706666666666664, "grad_norm": 0.0004101224767509848, "kl": 0.002597332000732422, "learning_rate": 2.9862705633264276e-06, "loss": 0.0001, "reward": 0.375, "reward_std": 0.25, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 391 }, { "completion_length": 603.0, "epoch": 0.41813333333333336, "grad_norm": 0.018657488748431206, "kl": 0.015562057495117188, "learning_rate": 2.98601788537064e-06, "loss": 0.0006, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 392 }, { "completion_length": 647.65625, "epoch": 0.4192, "grad_norm": 0.014791999943554401, "kl": 0.009648799896240234, "learning_rate": 2.985762914298626e-06, "loss": 0.0004, "reward": 0.625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 393 }, { "completion_length": 571.0625, "epoch": 0.4202666666666667, "grad_norm": 0.02015090547502041, "kl": 0.011153697967529297, "learning_rate": 2.9855056505038393e-06, "loss": 0.0004, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 394 }, { "completion_length": 647.21875, "epoch": 0.42133333333333334, "grad_norm": 0.0240541510283947, "kl": 0.004594326019287109, "learning_rate": 2.985246094383271e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 395 }, { "completion_length": 550.875, "epoch": 0.4224, "grad_norm": 0.011368232779204845, "kl": 0.009186744689941406, "learning_rate": 2.984984246337449e-06, "loss": 0.0004, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 396 }, { "completion_length": 687.90625, "epoch": 0.42346666666666666, "grad_norm": 0.00970472488552332, "kl": 0.0023741722106933594, "learning_rate": 2.984720106770439e-06, "loss": 0.0001, "reward": 0.53125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 397 }, { "completion_length": 524.3125, "epoch": 0.4245333333333333, "grad_norm": 0.001062108320184052, "kl": 0.009205818176269531, "learning_rate": 2.984453676089842e-06, "loss": 0.0004, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 398 }, { "completion_length": 684.625, "epoch": 0.4256, "grad_norm": 0.014250725507736206, "kl": 0.004937171936035156, "learning_rate": 2.9841849547067944e-06, "loss": 0.0002, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 399 }, { "completion_length": 852.0625, "epoch": 0.4266666666666667, "grad_norm": 0.0004637645906768739, "kl": 0.002128124237060547, "learning_rate": 2.983913943035968e-06, "loss": 0.0001, "reward": 0.21875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.0, "step": 400 }, { "completion_length": 507.3125, "epoch": 0.42773333333333335, "grad_norm": 0.026904702186584473, "kl": 0.008220672607421875, "learning_rate": 2.983640641495569e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 401 }, { "completion_length": 512.6875, "epoch": 0.4288, "grad_norm": 0.0018714962061494589, "kl": 0.009521484375, "learning_rate": 2.983365050507336e-06, "loss": 0.0004, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 402 }, { "completion_length": 682.1875, "epoch": 0.4298666666666667, "grad_norm": 0.02158605121076107, "kl": 0.004206657409667969, "learning_rate": 2.983087170496542e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.25, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 403 }, { "completion_length": 630.625, "epoch": 0.43093333333333333, "grad_norm": 0.013902643695473671, "kl": 0.007696628570556641, "learning_rate": 2.98280700189199e-06, "loss": 0.0003, "reward": 0.4375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 404 }, { "completion_length": 575.5625, "epoch": 0.432, "grad_norm": 0.00044944375986233354, "kl": 0.008257865905761719, "learning_rate": 2.982524545126018e-06, "loss": 0.0003, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 405 }, { "completion_length": 752.125, "epoch": 0.43306666666666666, "grad_norm": 0.012442001141607761, "kl": 0.004263877868652344, "learning_rate": 2.982239800634492e-06, "loss": 0.0002, "reward": 0.3125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 406 }, { "completion_length": 704.59375, "epoch": 0.4341333333333333, "grad_norm": 0.015578828752040863, "kl": 0.007513999938964844, "learning_rate": 2.9819527688568096e-06, "loss": 0.0003, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 407 }, { "completion_length": 541.09375, "epoch": 0.4352, "grad_norm": 0.024595100432634354, "kl": 0.0043697357177734375, "learning_rate": 2.9816634502358974e-06, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 408 }, { "completion_length": 557.75, "epoch": 0.4362666666666667, "grad_norm": 0.0006291675381362438, "kl": 0.0064373016357421875, "learning_rate": 2.9813718452182116e-06, "loss": 0.0003, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 409 }, { "completion_length": 581.15625, "epoch": 0.43733333333333335, "grad_norm": 0.016718359664082527, "kl": 0.006863117218017578, "learning_rate": 2.9810779542537355e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 410 }, { "completion_length": 606.4375, "epoch": 0.4384, "grad_norm": 0.0004325674963183701, "kl": 0.007610321044921875, "learning_rate": 2.980781777795981e-06, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 411 }, { "completion_length": 566.4375, "epoch": 0.43946666666666667, "grad_norm": 0.0015424692537635565, "kl": 0.006273746490478516, "learning_rate": 2.9804833163019864e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 412 }, { "completion_length": 720.15625, "epoch": 0.44053333333333333, "grad_norm": 0.012978849932551384, "kl": 0.004194736480712891, "learning_rate": 2.9801825702323157e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 413 }, { "completion_length": 674.09375, "epoch": 0.4416, "grad_norm": 0.01466032862663269, "kl": 0.003066539764404297, "learning_rate": 2.979879540051059e-06, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 414 }, { "completion_length": 653.25, "epoch": 0.44266666666666665, "grad_norm": 0.012147861532866955, "kl": 0.005400657653808594, "learning_rate": 2.9795742262258305e-06, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 415 }, { "completion_length": 631.0, "epoch": 0.4437333333333333, "grad_norm": 0.01153766829520464, "kl": 0.0030612945556640625, "learning_rate": 2.979266629227769e-06, "loss": 0.0001, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 416 }, { "completion_length": 555.15625, "epoch": 0.4448, "grad_norm": 0.00019905416411347687, "kl": 0.005218505859375, "learning_rate": 2.978956749531536e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 417 }, { "completion_length": 699.03125, "epoch": 0.4458666666666667, "grad_norm": 0.0004437533498276025, "kl": 0.0034351348876953125, "learning_rate": 2.9786445876153146e-06, "loss": 0.0001, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 418 }, { "completion_length": 606.03125, "epoch": 0.44693333333333335, "grad_norm": 0.01068231463432312, "kl": 0.0047397613525390625, "learning_rate": 2.9783301439608125e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 419 }, { "completion_length": 565.5625, "epoch": 0.448, "grad_norm": 0.000570202711969614, "kl": 0.004669189453125, "learning_rate": 2.9780134190532553e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 420 }, { "completion_length": 710.375, "epoch": 0.44906666666666667, "grad_norm": 0.00023157105897553265, "kl": 0.0029668807983398438, "learning_rate": 2.9776944133813904e-06, "loss": 0.0001, "reward": 0.40625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 421 }, { "completion_length": 712.15625, "epoch": 0.45013333333333333, "grad_norm": 0.009561925195157528, "kl": 0.0024852752685546875, "learning_rate": 2.9773731274374846e-06, "loss": 0.0001, "reward": 0.59375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 422 }, { "completion_length": 551.46875, "epoch": 0.4512, "grad_norm": 0.011633405461907387, "kl": 0.008056640625, "learning_rate": 2.977049561717324e-06, "loss": 0.0003, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 423 }, { "completion_length": 579.25, "epoch": 0.45226666666666665, "grad_norm": 0.008544915355741978, "kl": 0.013455867767333984, "learning_rate": 2.9767237167202104e-06, "loss": 0.0005, "reward": 0.6875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 424 }, { "completion_length": 562.5625, "epoch": 0.4533333333333333, "grad_norm": 0.010396486148238182, "kl": 0.005542755126953125, "learning_rate": 2.976395592948966e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 425 }, { "completion_length": 648.84375, "epoch": 0.4544, "grad_norm": 0.001284679165109992, "kl": 0.008027076721191406, "learning_rate": 2.976065190909927e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 426 }, { "completion_length": 612.0625, "epoch": 0.4554666666666667, "grad_norm": 0.0012355963699519634, "kl": 0.015245437622070312, "learning_rate": 2.9757325111129464e-06, "loss": 0.0006, "reward": 0.4375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 427 }, { "completion_length": 519.875, "epoch": 0.45653333333333335, "grad_norm": 0.0004237108223605901, "kl": 0.012912750244140625, "learning_rate": 2.975397554071392e-06, "loss": 0.0005, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 428 }, { "completion_length": 503.0, "epoch": 0.4576, "grad_norm": 0.011089821346104145, "kl": 0.0089111328125, "learning_rate": 2.975060320302145e-06, "loss": 0.0004, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 429 }, { "completion_length": 557.6875, "epoch": 0.45866666666666667, "grad_norm": 0.015200500376522541, "kl": 0.005700111389160156, "learning_rate": 2.9747208103256007e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 430 }, { "completion_length": 613.84375, "epoch": 0.4597333333333333, "grad_norm": 0.012821314856410027, "kl": 0.0043888092041015625, "learning_rate": 2.9743790246656667e-06, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 431 }, { "completion_length": 674.25, "epoch": 0.4608, "grad_norm": 0.013197031803429127, "kl": 0.002574920654296875, "learning_rate": 2.9740349638497614e-06, "loss": 0.0001, "reward": 0.375, "reward_std": 0.125, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 432 }, { "completion_length": 677.46875, "epoch": 0.46186666666666665, "grad_norm": 0.00019827534561045468, "kl": 0.0025281906127929688, "learning_rate": 2.9736886284088156e-06, "loss": 0.0001, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 433 }, { "completion_length": 604.96875, "epoch": 0.4629333333333333, "grad_norm": 0.007748033851385117, "kl": 0.0039234161376953125, "learning_rate": 2.973340018877269e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 434 }, { "completion_length": 557.125, "epoch": 0.464, "grad_norm": 0.011543280445039272, "kl": 0.008193016052246094, "learning_rate": 2.972989135793071e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 435 }, { "completion_length": 427.46875, "epoch": 0.4650666666666667, "grad_norm": 0.0009555015712976456, "kl": 0.012777328491210938, "learning_rate": 2.9726359796976785e-06, "loss": 0.0005, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.0, "step": 436 }, { "completion_length": 597.9375, "epoch": 0.46613333333333334, "grad_norm": 0.010240907780826092, "kl": 0.0067806243896484375, "learning_rate": 2.972280551136057e-06, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 437 }, { "completion_length": 580.15625, "epoch": 0.4672, "grad_norm": 0.013614580035209656, "kl": 0.008882522583007812, "learning_rate": 2.971922850656679e-06, "loss": 0.0004, "reward": 0.75, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 438 }, { "completion_length": 615.84375, "epoch": 0.46826666666666666, "grad_norm": 0.010193286463618279, "kl": 0.00386810302734375, "learning_rate": 2.9715628788115222e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 439 }, { "completion_length": 550.90625, "epoch": 0.4693333333333333, "grad_norm": 0.006852325052022934, "kl": 0.006357669830322266, "learning_rate": 2.9712006361560686e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 440 }, { "completion_length": 620.875, "epoch": 0.4704, "grad_norm": 0.0256188977509737, "kl": 0.003753662109375, "learning_rate": 2.970836123249305e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.3125, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 441 }, { "completion_length": 513.78125, "epoch": 0.47146666666666665, "grad_norm": 0.000208748722798191, "kl": 0.00464630126953125, "learning_rate": 2.970469340653722e-06, "loss": 0.0002, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 442 }, { "completion_length": 522.6875, "epoch": 0.47253333333333336, "grad_norm": 0.011258557438850403, "kl": 0.007799625396728516, "learning_rate": 2.9701002889353128e-06, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 443 }, { "completion_length": 558.75, "epoch": 0.4736, "grad_norm": 0.015158751979470253, "kl": 0.0062770843505859375, "learning_rate": 2.9697289686635704e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 444 }, { "completion_length": 681.59375, "epoch": 0.4746666666666667, "grad_norm": 0.01578466407954693, "kl": 0.007943153381347656, "learning_rate": 2.96935538041149e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 445 }, { "completion_length": 624.5, "epoch": 0.47573333333333334, "grad_norm": 0.0004024689842481166, "kl": 0.006413459777832031, "learning_rate": 2.968979524755567e-06, "loss": 0.0003, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 446 }, { "completion_length": 637.375, "epoch": 0.4768, "grad_norm": 0.00016359915025532246, "kl": 0.003912925720214844, "learning_rate": 2.9686014022757936e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 447 }, { "completion_length": 657.46875, "epoch": 0.47786666666666666, "grad_norm": 0.009645196609199047, "kl": 0.0038394927978515625, "learning_rate": 2.968221013555662e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 448 }, { "completion_length": 700.8125, "epoch": 0.4789333333333333, "grad_norm": 0.012868169695138931, "kl": 0.0039033889770507812, "learning_rate": 2.967838359182161e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 449 }, { "completion_length": 630.3125, "epoch": 0.48, "grad_norm": 0.013048221357166767, "kl": 0.0038213729858398438, "learning_rate": 2.967453439745775e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 450 }, { "completion_length": 704.125, "epoch": 0.48106666666666664, "grad_norm": 0.00035750700044445693, "kl": 0.0032601356506347656, "learning_rate": 2.9670662558404837e-06, "loss": 0.0001, "reward": 0.46875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 451 }, { "completion_length": 534.78125, "epoch": 0.48213333333333336, "grad_norm": 0.02216750755906105, "kl": 0.004940032958984375, "learning_rate": 2.966676808063762e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 452 }, { "completion_length": 533.15625, "epoch": 0.4832, "grad_norm": 0.04013386741280556, "kl": 0.008158683776855469, "learning_rate": 2.9662850970165785e-06, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 453 }, { "completion_length": 660.8125, "epoch": 0.4842666666666667, "grad_norm": 0.000826124451123178, "kl": 0.005814552307128906, "learning_rate": 2.965891123303392e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 454 }, { "completion_length": 662.75, "epoch": 0.48533333333333334, "grad_norm": 0.007340976968407631, "kl": 0.004125118255615234, "learning_rate": 2.965494887532156e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 455 }, { "completion_length": 620.65625, "epoch": 0.4864, "grad_norm": 0.01874152384698391, "kl": 0.017952919006347656, "learning_rate": 2.9650963903143124e-06, "loss": 0.0007, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 456 }, { "completion_length": 658.90625, "epoch": 0.48746666666666666, "grad_norm": 0.0012816210510209203, "kl": 0.0049343109130859375, "learning_rate": 2.964695632264793e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 457 }, { "completion_length": 600.90625, "epoch": 0.4885333333333333, "grad_norm": 0.0003645849064923823, "kl": 0.005745887756347656, "learning_rate": 2.96429261400202e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 458 }, { "completion_length": 605.625, "epoch": 0.4896, "grad_norm": 0.013659607619047165, "kl": 0.004446506500244141, "learning_rate": 2.9638873361479016e-06, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 459 }, { "completion_length": 655.3125, "epoch": 0.49066666666666664, "grad_norm": 0.012068059295415878, "kl": 0.005688667297363281, "learning_rate": 2.9634797993278337e-06, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 460 }, { "completion_length": 612.84375, "epoch": 0.49173333333333336, "grad_norm": 0.0002629317168612033, "kl": 0.007597923278808594, "learning_rate": 2.9630700041706976e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 461 }, { "completion_length": 753.5625, "epoch": 0.4928, "grad_norm": 0.00022212577459868044, "kl": 0.0024080276489257812, "learning_rate": 2.9626579513088605e-06, "loss": 0.0001, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 462 }, { "completion_length": 504.5, "epoch": 0.4938666666666667, "grad_norm": 0.002488181460648775, "kl": 0.006175994873046875, "learning_rate": 2.9622436413781723e-06, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 463 }, { "completion_length": 664.5, "epoch": 0.49493333333333334, "grad_norm": 0.00023711935500614345, "kl": 0.008127212524414062, "learning_rate": 2.9618270750179667e-06, "loss": 0.0003, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 464 }, { "completion_length": 571.03125, "epoch": 0.496, "grad_norm": 0.011891782283782959, "kl": 0.006152153015136719, "learning_rate": 2.961408252871058e-06, "loss": 0.0002, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 465 }, { "completion_length": 681.4375, "epoch": 0.49706666666666666, "grad_norm": 0.011133427731692791, "kl": 0.004070281982421875, "learning_rate": 2.9609871755837436e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 466 }, { "completion_length": 640.75, "epoch": 0.4981333333333333, "grad_norm": 0.0002494356594979763, "kl": 0.008803367614746094, "learning_rate": 2.9605638438057997e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 467 }, { "completion_length": 665.6875, "epoch": 0.4992, "grad_norm": 0.011637119576334953, "kl": 0.0034303665161132812, "learning_rate": 2.9601382581904815e-06, "loss": 0.0001, "reward": 0.90625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 468 }, { "completion_length": 555.3125, "epoch": 0.5002666666666666, "grad_norm": 0.00023262508329935372, "kl": 0.014132499694824219, "learning_rate": 2.9597104193945213e-06, "loss": 0.0006, "reward": 0.84375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 469 }, { "completion_length": 575.125, "epoch": 0.5013333333333333, "grad_norm": 0.0003765725705306977, "kl": 0.0035448074340820312, "learning_rate": 2.9592803280781303e-06, "loss": 0.0001, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 470 }, { "completion_length": 743.28125, "epoch": 0.5024, "grad_norm": 0.009008532389998436, "kl": 0.0036334991455078125, "learning_rate": 2.958847984904994e-06, "loss": 0.0001, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 471 }, { "completion_length": 788.90625, "epoch": 0.5034666666666666, "grad_norm": 0.015579535625874996, "kl": 0.0026912689208984375, "learning_rate": 2.9584133905422743e-06, "loss": 0.0001, "reward": 0.46875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 472 }, { "completion_length": 712.8125, "epoch": 0.5045333333333333, "grad_norm": 0.018008917570114136, "kl": 0.0029935836791992188, "learning_rate": 2.9579765456606046e-06, "loss": 0.0001, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 473 }, { "completion_length": 660.0625, "epoch": 0.5056, "grad_norm": 0.011660581454634666, "kl": 0.0046939849853515625, "learning_rate": 2.9575374509340937e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 474 }, { "completion_length": 655.125, "epoch": 0.5066666666666667, "grad_norm": 0.0002525939780753106, "kl": 0.006257057189941406, "learning_rate": 2.9570961070403205e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 475 }, { "completion_length": 574.09375, "epoch": 0.5077333333333334, "grad_norm": 0.0014308923855423927, "kl": 0.006047248840332031, "learning_rate": 2.9566525146603358e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 476 }, { "completion_length": 797.71875, "epoch": 0.5088, "grad_norm": 0.009624009020626545, "kl": 0.003665924072265625, "learning_rate": 2.9562066744786588e-06, "loss": 0.0001, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 477 }, { "completion_length": 643.8125, "epoch": 0.5098666666666667, "grad_norm": 0.011025181040167809, "kl": 0.00516057014465332, "learning_rate": 2.955758587183279e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 478 }, { "completion_length": 557.90625, "epoch": 0.5109333333333334, "grad_norm": 0.00021827536693308502, "kl": 0.015137672424316406, "learning_rate": 2.9553082534656518e-06, "loss": 0.0006, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 479 }, { "completion_length": 676.75, "epoch": 0.512, "grad_norm": 0.0007678312249481678, "kl": 0.005969524383544922, "learning_rate": 2.9548556740207e-06, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 480 }, { "completion_length": 683.96875, "epoch": 0.5130666666666667, "grad_norm": 0.020808730274438858, "kl": 0.0050449371337890625, "learning_rate": 2.954400849546812e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 481 }, { "completion_length": 597.3125, "epoch": 0.5141333333333333, "grad_norm": 0.033232733607292175, "kl": 0.0066585540771484375, "learning_rate": 2.9539437807458407e-06, "loss": 0.0003, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 482 }, { "completion_length": 531.4375, "epoch": 0.5152, "grad_norm": 0.03282199427485466, "kl": 0.01348114013671875, "learning_rate": 2.9534844683231005e-06, "loss": 0.0005, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 483 }, { "completion_length": 738.75, "epoch": 0.5162666666666667, "grad_norm": 0.010669718496501446, "kl": 0.0037622451782226562, "learning_rate": 2.9530229129873715e-06, "loss": 0.0002, "reward": 0.28125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 484 }, { "completion_length": 577.84375, "epoch": 0.5173333333333333, "grad_norm": 0.01322509627789259, "kl": 0.010467529296875, "learning_rate": 2.952559115450891e-06, "loss": 0.0004, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 485 }, { "completion_length": 576.59375, "epoch": 0.5184, "grad_norm": 0.000289003161014989, "kl": 0.011141777038574219, "learning_rate": 2.9520930764293584e-06, "loss": 0.0004, "reward": 0.65625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 486 }, { "completion_length": 583.6875, "epoch": 0.5194666666666666, "grad_norm": 0.016726212576031685, "kl": 0.0044362545013427734, "learning_rate": 2.9516247966419324e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 487 }, { "completion_length": 620.3125, "epoch": 0.5205333333333333, "grad_norm": 0.014382002875208855, "kl": 0.005072593688964844, "learning_rate": 2.9511542768112284e-06, "loss": 0.0002, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 488 }, { "completion_length": 665.375, "epoch": 0.5216, "grad_norm": 0.016073143109679222, "kl": 0.0068950653076171875, "learning_rate": 2.9506815176633184e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 489 }, { "completion_length": 684.5, "epoch": 0.5226666666666666, "grad_norm": 0.0070151654072105885, "kl": 0.003946781158447266, "learning_rate": 2.9502065199277312e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 490 }, { "completion_length": 633.875, "epoch": 0.5237333333333334, "grad_norm": 0.0003880112781189382, "kl": 0.00537109375, "learning_rate": 2.9497292843374493e-06, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 491 }, { "completion_length": 538.625, "epoch": 0.5248, "grad_norm": 0.0012069186195731163, "kl": 0.0043125152587890625, "learning_rate": 2.949249811628907e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 492 }, { "completion_length": 594.65625, "epoch": 0.5258666666666667, "grad_norm": 0.016607198864221573, "kl": 0.0039958953857421875, "learning_rate": 2.948768102541994e-06, "loss": 0.0002, "reward": 0.4375, "reward_std": 0.25, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 493 }, { "completion_length": 565.5625, "epoch": 0.5269333333333334, "grad_norm": 0.022813716903328896, "kl": 0.015956878662109375, "learning_rate": 2.9482841578200484e-06, "loss": 0.0006, "reward": 0.75, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 494 }, { "completion_length": 515.25, "epoch": 0.528, "grad_norm": 0.0003781349223572761, "kl": 0.010016441345214844, "learning_rate": 2.9477979782098592e-06, "loss": 0.0004, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 495 }, { "completion_length": 620.53125, "epoch": 0.5290666666666667, "grad_norm": 0.009888713248074055, "kl": 0.012181758880615234, "learning_rate": 2.947309564461663e-06, "loss": 0.0005, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 496 }, { "completion_length": 615.78125, "epoch": 0.5301333333333333, "grad_norm": 0.009243768639862537, "kl": 0.00537872314453125, "learning_rate": 2.9468189173291466e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 497 }, { "completion_length": 591.0, "epoch": 0.5312, "grad_norm": 0.00041428921394981444, "kl": 0.004521369934082031, "learning_rate": 2.94632603756944e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 498 }, { "completion_length": 717.78125, "epoch": 0.5322666666666667, "grad_norm": 0.02224251627922058, "kl": 0.0028748512268066406, "learning_rate": 2.94583092594312e-06, "loss": 0.0001, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 499 }, { "completion_length": 524.34375, "epoch": 0.5333333333333333, "grad_norm": 0.014848812483251095, "kl": 0.0067806243896484375, "learning_rate": 2.945333583214208e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 500 }, { "completion_length": 634.0, "epoch": 0.5344, "grad_norm": 0.0010310994694009423, "kl": 0.0086517333984375, "learning_rate": 2.9448340101501676e-06, "loss": 0.0003, "reward": 0.46875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 501 }, { "completion_length": 675.8125, "epoch": 0.5354666666666666, "grad_norm": 0.01651492342352867, "kl": 0.0029125213623046875, "learning_rate": 2.9443322075219035e-06, "loss": 0.0001, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 502 }, { "completion_length": 800.625, "epoch": 0.5365333333333333, "grad_norm": 0.0074979462660849094, "kl": 0.0037398338317871094, "learning_rate": 2.943828176103762e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 503 }, { "completion_length": 437.6875, "epoch": 0.5376, "grad_norm": 0.001844571204856038, "kl": 0.012218475341796875, "learning_rate": 2.9433219166735286e-06, "loss": 0.0005, "reward": 0.78125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 504 }, { "completion_length": 554.5, "epoch": 0.5386666666666666, "grad_norm": 0.0010526195401325822, "kl": 0.010954856872558594, "learning_rate": 2.9428134300124254e-06, "loss": 0.0004, "reward": 0.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 505 }, { "completion_length": 658.96875, "epoch": 0.5397333333333333, "grad_norm": 0.008975003845989704, "kl": 0.0026636123657226562, "learning_rate": 2.9423027169051134e-06, "loss": 0.0001, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 506 }, { "completion_length": 578.5, "epoch": 0.5408, "grad_norm": 0.006297847256064415, "kl": 0.005532264709472656, "learning_rate": 2.9417897781396884e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 507 }, { "completion_length": 575.71875, "epoch": 0.5418666666666667, "grad_norm": 0.012038152664899826, "kl": 0.004849433898925781, "learning_rate": 2.9412746145076803e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 508 }, { "completion_length": 617.71875, "epoch": 0.5429333333333334, "grad_norm": 0.017482763156294823, "kl": 0.003939628601074219, "learning_rate": 2.9407572268040536e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 509 }, { "completion_length": 543.125, "epoch": 0.544, "grad_norm": 0.012404415756464005, "kl": 0.00839853286743164, "learning_rate": 2.9402376158272022e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 510 }, { "completion_length": 495.25, "epoch": 0.5450666666666667, "grad_norm": 0.015496907755732536, "kl": 0.0070056915283203125, "learning_rate": 2.9397157823789543e-06, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.125, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 511 }, { "completion_length": 552.3125, "epoch": 0.5461333333333334, "grad_norm": 0.013821126893162727, "kl": 0.01119232177734375, "learning_rate": 2.9391917272645656e-06, "loss": 0.0004, "reward": 0.71875, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 512 }, { "completion_length": 667.90625, "epoch": 0.5472, "grad_norm": 0.0202901903539896, "kl": 0.008314132690429688, "learning_rate": 2.938665451292719e-06, "loss": 0.0003, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 513 }, { "completion_length": 758.46875, "epoch": 0.5482666666666667, "grad_norm": 0.006708648521453142, "kl": 0.0052204132080078125, "learning_rate": 2.938136955275527e-06, "loss": 0.0002, "reward": 0.375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 514 }, { "completion_length": 534.09375, "epoch": 0.5493333333333333, "grad_norm": 0.03503170236945152, "kl": 0.013090133666992188, "learning_rate": 2.9376062400285266e-06, "loss": 0.0005, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 515 }, { "completion_length": 588.25, "epoch": 0.5504, "grad_norm": 0.017718300223350525, "kl": 0.0069103240966796875, "learning_rate": 2.937073306370679e-06, "loss": 0.0003, "reward": 0.84375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 516 }, { "completion_length": 517.46875, "epoch": 0.5514666666666667, "grad_norm": 0.00019085436360910535, "kl": 0.005069732666015625, "learning_rate": 2.9365381551243696e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 517 }, { "completion_length": 602.4375, "epoch": 0.5525333333333333, "grad_norm": 0.00015097300638444722, "kl": 0.006511688232421875, "learning_rate": 2.936000787115406e-06, "loss": 0.0003, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 518 }, { "completion_length": 594.125, "epoch": 0.5536, "grad_norm": 0.0005697436863556504, "kl": 0.005973339080810547, "learning_rate": 2.9354612031730146e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 519 }, { "completion_length": 640.1875, "epoch": 0.5546666666666666, "grad_norm": 0.01023330632597208, "kl": 0.005776405334472656, "learning_rate": 2.934919404129844e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 520 }, { "completion_length": 773.28125, "epoch": 0.5557333333333333, "grad_norm": 0.00032951452885754406, "kl": 0.0018930435180664062, "learning_rate": 2.9343753908219588e-06, "loss": 0.0001, "reward": 0.28125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 521 }, { "completion_length": 619.25, "epoch": 0.5568, "grad_norm": 0.010818441398441792, "kl": 0.0055370330810546875, "learning_rate": 2.933829164088841e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 522 }, { "completion_length": 548.84375, "epoch": 0.5578666666666666, "grad_norm": 0.020488116890192032, "kl": 0.0062808990478515625, "learning_rate": 2.93328072477339e-06, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 523 }, { "completion_length": 614.875, "epoch": 0.5589333333333333, "grad_norm": 0.0006227426929399371, "kl": 0.005901336669921875, "learning_rate": 2.9327300737219165e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 524 }, { "completion_length": 548.96875, "epoch": 0.56, "grad_norm": 0.0004909950657747686, "kl": 0.0046291351318359375, "learning_rate": 2.9321772117841463e-06, "loss": 0.0002, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 525 }, { "completion_length": 571.90625, "epoch": 0.5610666666666667, "grad_norm": 0.00804485846310854, "kl": 0.0064792633056640625, "learning_rate": 2.9316221398132164e-06, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 526 }, { "completion_length": 580.59375, "epoch": 0.5621333333333334, "grad_norm": 0.010090681724250317, "kl": 0.008022308349609375, "learning_rate": 2.931064858665674e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 527 }, { "completion_length": 470.125, "epoch": 0.5632, "grad_norm": 0.013203335925936699, "kl": 0.008550643920898438, "learning_rate": 2.9305053692014753e-06, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.125, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 528 }, { "completion_length": 676.21875, "epoch": 0.5642666666666667, "grad_norm": 0.03657197579741478, "kl": 0.025409698486328125, "learning_rate": 2.9299436722839844e-06, "loss": 0.001, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 529 }, { "completion_length": 616.625, "epoch": 0.5653333333333334, "grad_norm": 0.00027229863917455077, "kl": 0.005267143249511719, "learning_rate": 2.929379768779972e-06, "loss": 0.0002, "reward": 0.46875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 530 }, { "completion_length": 741.8125, "epoch": 0.5664, "grad_norm": 0.0091477669775486, "kl": 0.005991935729980469, "learning_rate": 2.928813659559612e-06, "loss": 0.0002, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 531 }, { "completion_length": 569.96875, "epoch": 0.5674666666666667, "grad_norm": 0.009948879480361938, "kl": 0.005314826965332031, "learning_rate": 2.9282453454964856e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 532 }, { "completion_length": 542.28125, "epoch": 0.5685333333333333, "grad_norm": 0.02141588181257248, "kl": 0.006855010986328125, "learning_rate": 2.9276748274675734e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 533 }, { "completion_length": 438.3125, "epoch": 0.5696, "grad_norm": 0.000568330055102706, "kl": 0.008587837219238281, "learning_rate": 2.9271021063532586e-06, "loss": 0.0003, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.0, "step": 534 }, { "completion_length": 460.3125, "epoch": 0.5706666666666667, "grad_norm": 0.0013857261510565877, "kl": 0.0279541015625, "learning_rate": 2.926527183037322e-06, "loss": 0.0011, "reward": 0.90625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 535 }, { "completion_length": 609.25, "epoch": 0.5717333333333333, "grad_norm": 0.013318554498255253, "kl": 0.005204200744628906, "learning_rate": 2.9259500584069446e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 536 }, { "completion_length": 736.96875, "epoch": 0.5728, "grad_norm": 0.0175465140491724, "kl": 0.005375862121582031, "learning_rate": 2.925370733352704e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 537 }, { "completion_length": 542.71875, "epoch": 0.5738666666666666, "grad_norm": 0.015509523451328278, "kl": 0.008704185485839844, "learning_rate": 2.9247892087685733e-06, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 538 }, { "completion_length": 491.5, "epoch": 0.5749333333333333, "grad_norm": 0.0045721265487372875, "kl": 0.011397361755371094, "learning_rate": 2.9242054855519192e-06, "loss": 0.0005, "reward": 0.5625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 539 }, { "completion_length": 628.46875, "epoch": 0.576, "grad_norm": 0.008767809718847275, "kl": 0.005634307861328125, "learning_rate": 2.923619564603501e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 540 }, { "completion_length": 759.6875, "epoch": 0.5770666666666666, "grad_norm": 0.0032128519378602505, "kl": 0.006397247314453125, "learning_rate": 2.9230314468274705e-06, "loss": 0.0003, "reward": 0.4375, "reward_std": 0.125, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 541 }, { "completion_length": 805.34375, "epoch": 0.5781333333333334, "grad_norm": 0.00847261119633913, "kl": 0.00362396240234375, "learning_rate": 2.922441133131369e-06, "loss": 0.0001, "reward": 0.34375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 542 }, { "completion_length": 614.71875, "epoch": 0.5792, "grad_norm": 0.00023280727327801287, "kl": 0.0040683746337890625, "learning_rate": 2.921848624426126e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 543 }, { "completion_length": 611.375, "epoch": 0.5802666666666667, "grad_norm": 0.018690019845962524, "kl": 0.005413055419921875, "learning_rate": 2.9212539216260585e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 544 }, { "completion_length": 541.625, "epoch": 0.5813333333333334, "grad_norm": 0.018009712919592857, "kl": 0.005265235900878906, "learning_rate": 2.9206570256488684e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 545 }, { "completion_length": 666.59375, "epoch": 0.5824, "grad_norm": 0.009851488284766674, "kl": 0.005206108093261719, "learning_rate": 2.9200579374156446e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 546 }, { "completion_length": 653.21875, "epoch": 0.5834666666666667, "grad_norm": 0.016272353008389473, "kl": 0.005240440368652344, "learning_rate": 2.919456657850855e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 547 }, { "completion_length": 715.59375, "epoch": 0.5845333333333333, "grad_norm": 0.01591268926858902, "kl": 0.0035142898559570312, "learning_rate": 2.918853187882353e-06, "loss": 0.0001, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 548 }, { "completion_length": 692.5625, "epoch": 0.5856, "grad_norm": 0.0006671418668702245, "kl": 0.004006862640380859, "learning_rate": 2.918247528441369e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 549 }, { "completion_length": 749.46875, "epoch": 0.5866666666666667, "grad_norm": 0.015433361753821373, "kl": 0.007351398468017578, "learning_rate": 2.9176396804625136e-06, "loss": 0.0003, "reward": 0.34375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 550 }, { "completion_length": 667.9375, "epoch": 0.5877333333333333, "grad_norm": 0.010214360430836678, "kl": 0.0036258697509765625, "learning_rate": 2.9170296448837736e-06, "loss": 0.0001, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 551 }, { "completion_length": 877.34375, "epoch": 0.5888, "grad_norm": 0.0010206216247752309, "kl": 0.003863811492919922, "learning_rate": 2.9164174226465136e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.125, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 552 }, { "completion_length": 432.21875, "epoch": 0.5898666666666667, "grad_norm": 0.013620034791529179, "kl": 0.00737762451171875, "learning_rate": 2.9158030146954693e-06, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 553 }, { "completion_length": 458.6875, "epoch": 0.5909333333333333, "grad_norm": 0.00031959477928467095, "kl": 0.0073699951171875, "learning_rate": 2.915186421978752e-06, "loss": 0.0003, "reward": 0.84375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 554 }, { "completion_length": 566.3125, "epoch": 0.592, "grad_norm": 0.015480337664484978, "kl": 0.005290508270263672, "learning_rate": 2.9145676454478435e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 555 }, { "completion_length": 613.3125, "epoch": 0.5930666666666666, "grad_norm": 0.0004408096428960562, "kl": 0.0056610107421875, "learning_rate": 2.9139466860575948e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 556 }, { "completion_length": 658.15625, "epoch": 0.5941333333333333, "grad_norm": 0.016599472612142563, "kl": 0.004977226257324219, "learning_rate": 2.9133235447662256e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 557 }, { "completion_length": 716.71875, "epoch": 0.5952, "grad_norm": 0.0059960405342280865, "kl": 0.007671833038330078, "learning_rate": 2.912698222535324e-06, "loss": 0.0003, "reward": 0.53125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 558 }, { "completion_length": 691.4375, "epoch": 0.5962666666666666, "grad_norm": 0.010529587976634502, "kl": 0.00292205810546875, "learning_rate": 2.9120707203298415e-06, "loss": 0.0001, "reward": 0.59375, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 559 }, { "completion_length": 666.09375, "epoch": 0.5973333333333334, "grad_norm": 0.011277162469923496, "kl": 0.005146980285644531, "learning_rate": 2.911441039118095e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 560 }, { "completion_length": 573.125, "epoch": 0.5984, "grad_norm": 0.008928652852773666, "kl": 0.004477500915527344, "learning_rate": 2.9108091798717634e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 561 }, { "completion_length": 670.65625, "epoch": 0.5994666666666667, "grad_norm": 0.011688292026519775, "kl": 0.007103919982910156, "learning_rate": 2.910175143565886e-06, "loss": 0.0003, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 562 }, { "completion_length": 527.40625, "epoch": 0.6005333333333334, "grad_norm": 0.008980919606983662, "kl": 0.012593269348144531, "learning_rate": 2.9095389311788626e-06, "loss": 0.0005, "reward": 0.75, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 563 }, { "completion_length": 519.8125, "epoch": 0.6016, "grad_norm": 0.013014058582484722, "kl": 0.012108802795410156, "learning_rate": 2.9089005436924505e-06, "loss": 0.0005, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 564 }, { "completion_length": 577.375, "epoch": 0.6026666666666667, "grad_norm": 0.013014730997383595, "kl": 0.004660606384277344, "learning_rate": 2.9082599820917634e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 565 }, { "completion_length": 518.21875, "epoch": 0.6037333333333333, "grad_norm": 0.0003920719027519226, "kl": 0.0044040679931640625, "learning_rate": 2.9076172473652697e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 566 }, { "completion_length": 693.09375, "epoch": 0.6048, "grad_norm": 0.0004542594833765179, "kl": 0.00556182861328125, "learning_rate": 2.9069723405047926e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 567 }, { "completion_length": 545.25, "epoch": 0.6058666666666667, "grad_norm": 0.012369707226753235, "kl": 0.007365226745605469, "learning_rate": 2.906325262505505e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 568 }, { "completion_length": 504.3125, "epoch": 0.6069333333333333, "grad_norm": 0.012067841365933418, "kl": 0.007879257202148438, "learning_rate": 2.9056760143659314e-06, "loss": 0.0003, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 569 }, { "completion_length": 632.21875, "epoch": 0.608, "grad_norm": 0.00045792717719450593, "kl": 0.008809089660644531, "learning_rate": 2.9050245970879456e-06, "loss": 0.0004, "reward": 0.40625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 570 }, { "completion_length": 493.5, "epoch": 0.6090666666666666, "grad_norm": 0.018400445580482483, "kl": 0.008938789367675781, "learning_rate": 2.9043710116767675e-06, "loss": 0.0004, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 571 }, { "completion_length": 598.1875, "epoch": 0.6101333333333333, "grad_norm": 0.026352543383836746, "kl": 0.014644622802734375, "learning_rate": 2.9037152591409635e-06, "loss": 0.0006, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 572 }, { "completion_length": 680.46875, "epoch": 0.6112, "grad_norm": 0.000608589150942862, "kl": 0.0038356781005859375, "learning_rate": 2.903057340492444e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 573 }, { "completion_length": 560.59375, "epoch": 0.6122666666666666, "grad_norm": 0.018229441717267036, "kl": 0.005358695983886719, "learning_rate": 2.9023972567464606e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 574 }, { "completion_length": 649.96875, "epoch": 0.6133333333333333, "grad_norm": 0.0003965966170653701, "kl": 0.006955146789550781, "learning_rate": 2.901735008921609e-06, "loss": 0.0003, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 575 }, { "completion_length": 500.75, "epoch": 0.6144, "grad_norm": 0.013563805259764194, "kl": 0.010982990264892578, "learning_rate": 2.901070598039822e-06, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 576 }, { "completion_length": 741.46875, "epoch": 0.6154666666666667, "grad_norm": 0.010929805226624012, "kl": 0.0038003921508789062, "learning_rate": 2.90040402512637e-06, "loss": 0.0002, "reward": 0.4375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 577 }, { "completion_length": 637.9375, "epoch": 0.6165333333333334, "grad_norm": 0.003929684404283762, "kl": 0.010465621948242188, "learning_rate": 2.8997352912098615e-06, "loss": 0.0004, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 578 }, { "completion_length": 657.3125, "epoch": 0.6176, "grad_norm": 0.01188444159924984, "kl": 0.006396293640136719, "learning_rate": 2.8990643973222383e-06, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 579 }, { "completion_length": 552.28125, "epoch": 0.6186666666666667, "grad_norm": 0.014514490030705929, "kl": 0.006892204284667969, "learning_rate": 2.8983913444987754e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 580 }, { "completion_length": 530.5, "epoch": 0.6197333333333334, "grad_norm": 0.01996157504618168, "kl": 0.0036325454711914062, "learning_rate": 2.8977161337780804e-06, "loss": 0.0001, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 581 }, { "completion_length": 596.0625, "epoch": 0.6208, "grad_norm": 0.0003033584216609597, "kl": 0.00481414794921875, "learning_rate": 2.89703876620209e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 582 }, { "completion_length": 545.5625, "epoch": 0.6218666666666667, "grad_norm": 0.020244471728801727, "kl": 0.00811004638671875, "learning_rate": 2.8963592428160685e-06, "loss": 0.0003, "reward": 0.4375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 583 }, { "completion_length": 499.875, "epoch": 0.6229333333333333, "grad_norm": 0.0268291924148798, "kl": 0.019049644470214844, "learning_rate": 2.895677564668608e-06, "loss": 0.0008, "reward": 0.875, "reward_std": 0.125, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 584 }, { "completion_length": 640.90625, "epoch": 0.624, "grad_norm": 0.014487153850495815, "kl": 0.007864952087402344, "learning_rate": 2.8949937328116252e-06, "loss": 0.0003, "reward": 0.5625, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 585 }, { "completion_length": 591.90625, "epoch": 0.6250666666666667, "grad_norm": 0.011975343339145184, "kl": 0.006648063659667969, "learning_rate": 2.894307748300361e-06, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 586 }, { "completion_length": 628.4375, "epoch": 0.6261333333333333, "grad_norm": 0.00022000343597028404, "kl": 0.010987281799316406, "learning_rate": 2.8936196121933773e-06, "loss": 0.0004, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 587 }, { "completion_length": 601.46875, "epoch": 0.6272, "grad_norm": 0.01484644040465355, "kl": 0.006609916687011719, "learning_rate": 2.8929293255525563e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 588 }, { "completion_length": 784.15625, "epoch": 0.6282666666666666, "grad_norm": 0.017806801944971085, "kl": 0.008015155792236328, "learning_rate": 2.892236889443098e-06, "loss": 0.0003, "reward": 0.40625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 589 }, { "completion_length": 730.15625, "epoch": 0.6293333333333333, "grad_norm": 0.0002842250687535852, "kl": 0.003571033477783203, "learning_rate": 2.8915423049335213e-06, "loss": 0.0001, "reward": 0.4375, "reward_std": 0.25, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 590 }, { "completion_length": 572.0625, "epoch": 0.6304, "grad_norm": 0.0016260062111541629, "kl": 0.0060710906982421875, "learning_rate": 2.8908455730956588e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 591 }, { "completion_length": 612.40625, "epoch": 0.6314666666666666, "grad_norm": 0.019706984981894493, "kl": 0.013186454772949219, "learning_rate": 2.890146695004657e-06, "loss": 0.0005, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 592 }, { "completion_length": 593.5625, "epoch": 0.6325333333333333, "grad_norm": 0.010683751665055752, "kl": 0.010266304016113281, "learning_rate": 2.8894456717389744e-06, "loss": 0.0004, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 593 }, { "completion_length": 500.25, "epoch": 0.6336, "grad_norm": 0.008839493617415428, "kl": 0.021924972534179688, "learning_rate": 2.88874250438038e-06, "loss": 0.0009, "reward": 0.625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 594 }, { "completion_length": 665.15625, "epoch": 0.6346666666666667, "grad_norm": 0.027645152062177658, "kl": 0.014159202575683594, "learning_rate": 2.8880371940139512e-06, "loss": 0.0006, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 595 }, { "completion_length": 591.78125, "epoch": 0.6357333333333334, "grad_norm": 0.0020488055888563395, "kl": 0.0067462921142578125, "learning_rate": 2.8873297417280723e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 596 }, { "completion_length": 606.875, "epoch": 0.6368, "grad_norm": 0.00035236490657553077, "kl": 0.003966331481933594, "learning_rate": 2.8866201486144333e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 597 }, { "completion_length": 680.375, "epoch": 0.6378666666666667, "grad_norm": 0.008822206407785416, "kl": 0.00411224365234375, "learning_rate": 2.885908415768027e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 598 }, { "completion_length": 662.0, "epoch": 0.6389333333333334, "grad_norm": 0.0002078510296996683, "kl": 0.00244903564453125, "learning_rate": 2.8851945442871484e-06, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 599 }, { "completion_length": 457.53125, "epoch": 0.64, "grad_norm": 0.016664685681462288, "kl": 0.019021034240722656, "learning_rate": 2.884478535273393e-06, "loss": 0.0008, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 600 }, { "completion_length": 644.15625, "epoch": 0.6410666666666667, "grad_norm": 0.00854107178747654, "kl": 0.0026426315307617188, "learning_rate": 2.8837603898316547e-06, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 601 }, { "completion_length": 610.3125, "epoch": 0.6421333333333333, "grad_norm": 0.005155178718268871, "kl": 0.008219718933105469, "learning_rate": 2.8830401090701236e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 602 }, { "completion_length": 532.6875, "epoch": 0.6432, "grad_norm": 0.0006381439743563533, "kl": 0.007794380187988281, "learning_rate": 2.8823176941002853e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 603 }, { "completion_length": 720.9375, "epoch": 0.6442666666666667, "grad_norm": 0.00022054360306356102, "kl": 0.004654407501220703, "learning_rate": 2.8815931460369196e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 604 }, { "completion_length": 619.84375, "epoch": 0.6453333333333333, "grad_norm": 0.0006738692172802985, "kl": 0.008760452270507812, "learning_rate": 2.8808664659980966e-06, "loss": 0.0004, "reward": 0.78125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 605 }, { "completion_length": 537.625, "epoch": 0.6464, "grad_norm": 0.0003963988274335861, "kl": 0.004450798034667969, "learning_rate": 2.880137655105176e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 606 }, { "completion_length": 650.84375, "epoch": 0.6474666666666666, "grad_norm": 0.005231819115579128, "kl": 0.0061779022216796875, "learning_rate": 2.879406714482808e-06, "loss": 0.0002, "reward": 0.375, "reward_std": 0.125, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 607 }, { "completion_length": 608.15625, "epoch": 0.6485333333333333, "grad_norm": 0.0002694206195883453, "kl": 0.005240440368652344, "learning_rate": 2.8786736452589266e-06, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 608 }, { "completion_length": 525.65625, "epoch": 0.6496, "grad_norm": 0.012278929352760315, "kl": 0.004891872406005859, "learning_rate": 2.877938448564752e-06, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 609 }, { "completion_length": 552.09375, "epoch": 0.6506666666666666, "grad_norm": 0.016463812440633774, "kl": 0.010085105895996094, "learning_rate": 2.8772011255347877e-06, "loss": 0.0004, "reward": 0.65625, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 610 }, { "completion_length": 598.59375, "epoch": 0.6517333333333334, "grad_norm": 0.015558790415525436, "kl": 0.0031652450561523438, "learning_rate": 2.876461677306817e-06, "loss": 0.0001, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 611 }, { "completion_length": 687.40625, "epoch": 0.6528, "grad_norm": 0.0081456508487463, "kl": 0.0034189224243164062, "learning_rate": 2.875720105021903e-06, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 612 }, { "completion_length": 658.625, "epoch": 0.6538666666666667, "grad_norm": 0.0002637979050632566, "kl": 0.005127906799316406, "learning_rate": 2.8749764098243874e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 613 }, { "completion_length": 585.75, "epoch": 0.6549333333333334, "grad_norm": 0.014391692355275154, "kl": 0.006214141845703125, "learning_rate": 2.874230592861887e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 614 }, { "completion_length": 642.28125, "epoch": 0.656, "grad_norm": 0.01190915983170271, "kl": 0.005779266357421875, "learning_rate": 2.8734826552852934e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 615 }, { "completion_length": 559.125, "epoch": 0.6570666666666667, "grad_norm": 0.024006783962249756, "kl": 0.017531394958496094, "learning_rate": 2.8727325982487705e-06, "loss": 0.0007, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 616 }, { "completion_length": 519.125, "epoch": 0.6581333333333333, "grad_norm": 0.001533330068923533, "kl": 0.009514808654785156, "learning_rate": 2.871980422909752e-06, "loss": 0.0004, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 617 }, { "completion_length": 483.3125, "epoch": 0.6592, "grad_norm": 0.011283020488917828, "kl": 0.010379791259765625, "learning_rate": 2.8712261304289407e-06, "loss": 0.0004, "reward": 0.8125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 618 }, { "completion_length": 521.96875, "epoch": 0.6602666666666667, "grad_norm": 0.024091627448797226, "kl": 0.005954742431640625, "learning_rate": 2.8704697219703076e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 619 }, { "completion_length": 677.375, "epoch": 0.6613333333333333, "grad_norm": 0.23988154530525208, "kl": 0.019903182983398438, "learning_rate": 2.869711198701087e-06, "loss": 0.0008, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 620 }, { "completion_length": 706.8125, "epoch": 0.6624, "grad_norm": 0.01414924394339323, "kl": 0.0028858184814453125, "learning_rate": 2.868950561791778e-06, "loss": 0.0001, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 621 }, { "completion_length": 616.15625, "epoch": 0.6634666666666666, "grad_norm": 0.00026602670550346375, "kl": 0.005688667297363281, "learning_rate": 2.868187812416141e-06, "loss": 0.0002, "reward": 0.375, "reward_std": 0.0, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 622 }, { "completion_length": 601.5, "epoch": 0.6645333333333333, "grad_norm": 0.009553692303597927, "kl": 0.0051097869873046875, "learning_rate": 2.8674229517511965e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 623 }, { "completion_length": 571.09375, "epoch": 0.6656, "grad_norm": 0.01156543381512165, "kl": 0.009934425354003906, "learning_rate": 2.8666559809772215e-06, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 624 }, { "completion_length": 721.96875, "epoch": 0.6666666666666666, "grad_norm": 0.008135410025715828, "kl": 0.003688812255859375, "learning_rate": 2.865886901277751e-06, "loss": 0.0001, "reward": 0.5625, "reward_std": 0.41367512941360474, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 625 }, { "completion_length": 602.40625, "epoch": 0.6677333333333333, "grad_norm": 0.013730203732848167, "kl": 0.006855010986328125, "learning_rate": 2.8651157138395744e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 626 }, { "completion_length": 525.75, "epoch": 0.6688, "grad_norm": 0.010397675447165966, "kl": 0.0043659210205078125, "learning_rate": 2.8643424198527314e-06, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 627 }, { "completion_length": 678.90625, "epoch": 0.6698666666666667, "grad_norm": 0.013718275353312492, "kl": 0.006817817687988281, "learning_rate": 2.863567020510515e-06, "loss": 0.0003, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 628 }, { "completion_length": 723.6875, "epoch": 0.6709333333333334, "grad_norm": 0.00019352907838765532, "kl": 0.018953323364257812, "learning_rate": 2.862789517009465e-06, "loss": 0.0008, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 629 }, { "completion_length": 653.03125, "epoch": 0.672, "grad_norm": 0.03498412296175957, "kl": 0.006134986877441406, "learning_rate": 2.86200991054937e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 630 }, { "completion_length": 567.8125, "epoch": 0.6730666666666667, "grad_norm": 0.019495613873004913, "kl": 0.006812095642089844, "learning_rate": 2.861228202333261e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 631 }, { "completion_length": 669.6875, "epoch": 0.6741333333333334, "grad_norm": 0.0003170263080392033, "kl": 0.006834983825683594, "learning_rate": 2.860444393567416e-06, "loss": 0.0003, "reward": 0.28125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 632 }, { "completion_length": 709.6875, "epoch": 0.6752, "grad_norm": 0.015090515837073326, "kl": 0.004731655120849609, "learning_rate": 2.8596584854613513e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 633 }, { "completion_length": 745.9375, "epoch": 0.6762666666666667, "grad_norm": 0.019890518859028816, "kl": 0.005583763122558594, "learning_rate": 2.8588704792278248e-06, "loss": 0.0002, "reward": 0.40625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 634 }, { "completion_length": 772.0, "epoch": 0.6773333333333333, "grad_norm": 0.018118126317858696, "kl": 0.0025959014892578125, "learning_rate": 2.8580803760828303e-06, "loss": 0.0001, "reward": 0.5625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 635 }, { "completion_length": 755.65625, "epoch": 0.6784, "grad_norm": 0.013760793954133987, "kl": 0.005806922912597656, "learning_rate": 2.8572881772455993e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 636 }, { "completion_length": 586.3125, "epoch": 0.6794666666666667, "grad_norm": 0.009293455630540848, "kl": 0.011334419250488281, "learning_rate": 2.856493883938595e-06, "loss": 0.0005, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 637 }, { "completion_length": 636.125, "epoch": 0.6805333333333333, "grad_norm": 0.008400475606322289, "kl": 0.0062255859375, "learning_rate": 2.855697497387515e-06, "loss": 0.0002, "reward": 0.4375, "reward_std": 0.125, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 638 }, { "completion_length": 651.96875, "epoch": 0.6816, "grad_norm": 0.0003445114998612553, "kl": 0.0043849945068359375, "learning_rate": 2.8548990188212853e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 639 }, { "completion_length": 578.21875, "epoch": 0.6826666666666666, "grad_norm": 0.018615154549479485, "kl": 0.010465621948242188, "learning_rate": 2.8540984494720613e-06, "loss": 0.0004, "reward": 0.8125, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 640 }, { "completion_length": 505.03125, "epoch": 0.6837333333333333, "grad_norm": 0.020088370889425278, "kl": 0.00720977783203125, "learning_rate": 2.853295790575223e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 641 }, { "completion_length": 619.375, "epoch": 0.6848, "grad_norm": 0.009899207390844822, "kl": 0.0068721771240234375, "learning_rate": 2.852491043369377e-06, "loss": 0.0003, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 642 }, { "completion_length": 576.34375, "epoch": 0.6858666666666666, "grad_norm": 0.01290865894407034, "kl": 0.010202407836914062, "learning_rate": 2.851684209096352e-06, "loss": 0.0004, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 643 }, { "completion_length": 764.46875, "epoch": 0.6869333333333333, "grad_norm": 0.0002215656277257949, "kl": 0.004214286804199219, "learning_rate": 2.8508752890011957e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 644 }, { "completion_length": 696.03125, "epoch": 0.688, "grad_norm": 0.01402111817151308, "kl": 0.00518035888671875, "learning_rate": 2.850064284332176e-06, "loss": 0.0002, "reward": 0.40625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 645 }, { "completion_length": 655.71875, "epoch": 0.6890666666666667, "grad_norm": 0.011736895889043808, "kl": 0.004242897033691406, "learning_rate": 2.849251196340777e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 646 }, { "completion_length": 639.75, "epoch": 0.6901333333333334, "grad_norm": 0.01001067366451025, "kl": 0.005946159362792969, "learning_rate": 2.848436026281698e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 647 }, { "completion_length": 627.34375, "epoch": 0.6912, "grad_norm": 0.0003673291066661477, "kl": 0.0066547393798828125, "learning_rate": 2.847618775412851e-06, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 648 }, { "completion_length": 731.71875, "epoch": 0.6922666666666667, "grad_norm": 0.00027935576508753, "kl": 0.007260322570800781, "learning_rate": 2.8467994449953587e-06, "loss": 0.0003, "reward": 0.53125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 649 }, { "completion_length": 566.78125, "epoch": 0.6933333333333334, "grad_norm": 0.011498169973492622, "kl": 0.0058536529541015625, "learning_rate": 2.845978036293553e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 650 }, { "completion_length": 628.65625, "epoch": 0.6944, "grad_norm": 0.00047453658771701157, "kl": 0.003914833068847656, "learning_rate": 2.845154550574973e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 651 }, { "completion_length": 674.0625, "epoch": 0.6954666666666667, "grad_norm": 0.010088195092976093, "kl": 0.007476806640625, "learning_rate": 2.8443289891103634e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 652 }, { "completion_length": 437.09375, "epoch": 0.6965333333333333, "grad_norm": 0.020231474190950394, "kl": 0.008612632751464844, "learning_rate": 2.8435013531736702e-06, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 653 }, { "completion_length": 657.5625, "epoch": 0.6976, "grad_norm": 0.023091481998562813, "kl": 0.0053730010986328125, "learning_rate": 2.842671644042043e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 654 }, { "completion_length": 694.78125, "epoch": 0.6986666666666667, "grad_norm": 0.016826534643769264, "kl": 0.0056552886962890625, "learning_rate": 2.8418398629958283e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 655 }, { "completion_length": 627.46875, "epoch": 0.6997333333333333, "grad_norm": 0.0012312447652220726, "kl": 0.00795745849609375, "learning_rate": 2.8410060113185724e-06, "loss": 0.0003, "reward": 0.5, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 656 }, { "completion_length": 580.90625, "epoch": 0.7008, "grad_norm": 0.010590738616883755, "kl": 0.0080108642578125, "learning_rate": 2.840170090297014e-06, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 657 }, { "completion_length": 544.78125, "epoch": 0.7018666666666666, "grad_norm": 0.010773815214633942, "kl": 0.009365081787109375, "learning_rate": 2.839332101221088e-06, "loss": 0.0004, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 658 }, { "completion_length": 740.875, "epoch": 0.7029333333333333, "grad_norm": 0.010175376199185848, "kl": 0.003631591796875, "learning_rate": 2.8384920453839167e-06, "loss": 0.0001, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 659 }, { "completion_length": 591.71875, "epoch": 0.704, "grad_norm": 0.00030824533314444125, "kl": 0.004932403564453125, "learning_rate": 2.8376499240818166e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 660 }, { "completion_length": 554.8125, "epoch": 0.7050666666666666, "grad_norm": 0.01454853918403387, "kl": 0.010072708129882812, "learning_rate": 2.8368057386142873e-06, "loss": 0.0004, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 661 }, { "completion_length": 644.28125, "epoch": 0.7061333333333333, "grad_norm": 0.034467704594135284, "kl": 0.0032863616943359375, "learning_rate": 2.835959490284015e-06, "loss": 0.0001, "reward": 0.625, "reward_std": 0.46650634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 662 }, { "completion_length": 636.40625, "epoch": 0.7072, "grad_norm": 0.021194133907556534, "kl": 0.004588127136230469, "learning_rate": 2.8351111803968714e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 663 }, { "completion_length": 533.125, "epoch": 0.7082666666666667, "grad_norm": 0.009460462257266045, "kl": 0.005370140075683594, "learning_rate": 2.834260810261905e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 664 }, { "completion_length": 540.71875, "epoch": 0.7093333333333334, "grad_norm": 0.00046741889673285186, "kl": 0.006671905517578125, "learning_rate": 2.833408381191348e-06, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.125, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 665 }, { "completion_length": 614.59375, "epoch": 0.7104, "grad_norm": 0.001474593416787684, "kl": 0.009721755981445312, "learning_rate": 2.8325538945006067e-06, "loss": 0.0004, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 666 }, { "completion_length": 663.875, "epoch": 0.7114666666666667, "grad_norm": 0.01318595465272665, "kl": 0.011205673217773438, "learning_rate": 2.8316973515082644e-06, "loss": 0.0004, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 667 }, { "completion_length": 645.71875, "epoch": 0.7125333333333334, "grad_norm": 0.0014674562262371182, "kl": 0.011259078979492188, "learning_rate": 2.830838753536076e-06, "loss": 0.0005, "reward": 0.5625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 668 }, { "completion_length": 639.75, "epoch": 0.7136, "grad_norm": 0.0007133535691536963, "kl": 0.008459091186523438, "learning_rate": 2.829978101908969e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 669 }, { "completion_length": 791.59375, "epoch": 0.7146666666666667, "grad_norm": 0.007292947266250849, "kl": 0.0045871734619140625, "learning_rate": 2.829115397955039e-06, "loss": 0.0002, "reward": 0.40625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 670 }, { "completion_length": 720.5, "epoch": 0.7157333333333333, "grad_norm": 0.018207119777798653, "kl": 0.007128715515136719, "learning_rate": 2.828250643005549e-06, "loss": 0.0003, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 671 }, { "completion_length": 697.0, "epoch": 0.7168, "grad_norm": 0.0002471816842444241, "kl": 0.002880096435546875, "learning_rate": 2.827383838394926e-06, "loss": 0.0001, "reward": 0.4375, "reward_std": 0.125, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 672 }, { "completion_length": 598.5, "epoch": 0.7178666666666667, "grad_norm": 0.010509217157959938, "kl": 0.0073108673095703125, "learning_rate": 2.8265149854607618e-06, "loss": 0.0003, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 673 }, { "completion_length": 679.15625, "epoch": 0.7189333333333333, "grad_norm": 0.023570960387587547, "kl": 0.006847381591796875, "learning_rate": 2.8256440855438077e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 674 }, { "completion_length": 633.65625, "epoch": 0.72, "grad_norm": 0.00032179427216760814, "kl": 0.0038509368896484375, "learning_rate": 2.8247711399879734e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 675 }, { "completion_length": 669.09375, "epoch": 0.7210666666666666, "grad_norm": 0.017245784401893616, "kl": 0.005671024322509766, "learning_rate": 2.8238961501403262e-06, "loss": 0.0002, "reward": 0.375, "reward_std": 0.25, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 676 }, { "completion_length": 689.90625, "epoch": 0.7221333333333333, "grad_norm": 0.0002460820251144469, "kl": 0.004245758056640625, "learning_rate": 2.8230191173510885e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 677 }, { "completion_length": 530.03125, "epoch": 0.7232, "grad_norm": 0.013272827491164207, "kl": 0.009649276733398438, "learning_rate": 2.8221400429736333e-06, "loss": 0.0004, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 678 }, { "completion_length": 708.65625, "epoch": 0.7242666666666666, "grad_norm": 0.008196728304028511, "kl": 0.00290679931640625, "learning_rate": 2.8212589283644856e-06, "loss": 0.0001, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 679 }, { "completion_length": 650.75, "epoch": 0.7253333333333334, "grad_norm": 0.02227405458688736, "kl": 0.004702568054199219, "learning_rate": 2.820375774883318e-06, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 680 }, { "completion_length": 771.9375, "epoch": 0.7264, "grad_norm": 0.0005650169914588332, "kl": 0.005627155303955078, "learning_rate": 2.81949058389295e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 681 }, { "completion_length": 651.375, "epoch": 0.7274666666666667, "grad_norm": 0.0003658430650830269, "kl": 0.0029048919677734375, "learning_rate": 2.8186033567593446e-06, "loss": 0.0001, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 682 }, { "completion_length": 474.78125, "epoch": 0.7285333333333334, "grad_norm": 0.012081252411007881, "kl": 0.009145736694335938, "learning_rate": 2.8177140948516067e-06, "loss": 0.0004, "reward": 0.78125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 683 }, { "completion_length": 613.40625, "epoch": 0.7296, "grad_norm": 0.01865590550005436, "kl": 0.0041790008544921875, "learning_rate": 2.8168227995419826e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 684 }, { "completion_length": 664.84375, "epoch": 0.7306666666666667, "grad_norm": 0.009056413546204567, "kl": 0.004294395446777344, "learning_rate": 2.815929472205854e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 685 }, { "completion_length": 647.28125, "epoch": 0.7317333333333333, "grad_norm": 0.013040699064731598, "kl": 0.013623237609863281, "learning_rate": 2.8150341142217405e-06, "loss": 0.0005, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 686 }, { "completion_length": 675.25, "epoch": 0.7328, "grad_norm": 0.0002812909660860896, "kl": 0.0044422149658203125, "learning_rate": 2.8141367269712943e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 687 }, { "completion_length": 796.1875, "epoch": 0.7338666666666667, "grad_norm": 0.016197621822357178, "kl": 0.0035953521728515625, "learning_rate": 2.8132373118392986e-06, "loss": 0.0001, "reward": 0.34375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 688 }, { "completion_length": 656.375, "epoch": 0.7349333333333333, "grad_norm": 0.01772383600473404, "kl": 0.0032711029052734375, "learning_rate": 2.8123358702136667e-06, "loss": 0.0001, "reward": 0.4375, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 689 }, { "completion_length": 628.375, "epoch": 0.736, "grad_norm": 0.0004787276266142726, "kl": 0.003551483154296875, "learning_rate": 2.8114324034854378e-06, "loss": 0.0001, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 690 }, { "completion_length": 618.5, "epoch": 0.7370666666666666, "grad_norm": 0.0020484740380197763, "kl": 0.0101165771484375, "learning_rate": 2.8105269130487782e-06, "loss": 0.0004, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 691 }, { "completion_length": 439.3125, "epoch": 0.7381333333333333, "grad_norm": 0.0009759754175320268, "kl": 0.012451171875, "learning_rate": 2.809619400300975e-06, "loss": 0.0005, "reward": 0.9375, "reward_std": 0.125, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 692 }, { "completion_length": 602.03125, "epoch": 0.7392, "grad_norm": 0.0007432409911416471, "kl": 0.008961677551269531, "learning_rate": 2.808709866642437e-06, "loss": 0.0004, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 693 }, { "completion_length": 424.96875, "epoch": 0.7402666666666666, "grad_norm": 0.0004706382460426539, "kl": 0.012332916259765625, "learning_rate": 2.8077983134766917e-06, "loss": 0.0005, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 694 }, { "completion_length": 792.59375, "epoch": 0.7413333333333333, "grad_norm": 0.000559438718482852, "kl": 0.004839897155761719, "learning_rate": 2.806884742210382e-06, "loss": 0.0002, "reward": 0.375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 695 }, { "completion_length": 641.71875, "epoch": 0.7424, "grad_norm": 0.009233251214027405, "kl": 0.004183769226074219, "learning_rate": 2.8059691542532654e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 696 }, { "completion_length": 524.125, "epoch": 0.7434666666666667, "grad_norm": 0.013877530582249165, "kl": 0.008880615234375, "learning_rate": 2.8050515510182122e-06, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 697 }, { "completion_length": 851.6875, "epoch": 0.7445333333333334, "grad_norm": 0.010849891230463982, "kl": 0.0026521682739257812, "learning_rate": 2.8041319339212018e-06, "loss": 0.0001, "reward": 0.3125, "reward_std": 0.25, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 698 }, { "completion_length": 624.78125, "epoch": 0.7456, "grad_norm": 0.019717881456017494, "kl": 0.0074558258056640625, "learning_rate": 2.8032103043813213e-06, "loss": 0.0003, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 699 }, { "completion_length": 626.625, "epoch": 0.7466666666666667, "grad_norm": 0.00045851836330257356, "kl": 0.0047664642333984375, "learning_rate": 2.802286663820763e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 700 }, { "completion_length": 665.8125, "epoch": 0.7477333333333334, "grad_norm": 0.008402171544730663, "kl": 0.004455089569091797, "learning_rate": 2.801361013664823e-06, "loss": 0.0002, "reward": 0.4375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 701 }, { "completion_length": 547.6875, "epoch": 0.7488, "grad_norm": 0.0009568886598572135, "kl": 0.009202957153320312, "learning_rate": 2.800433355341898e-06, "loss": 0.0004, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 702 }, { "completion_length": 679.78125, "epoch": 0.7498666666666667, "grad_norm": 0.015504049137234688, "kl": 0.00505828857421875, "learning_rate": 2.7995036902834842e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 703 }, { "completion_length": 574.875, "epoch": 0.7509333333333333, "grad_norm": 0.00042479511466808617, "kl": 0.0062694549560546875, "learning_rate": 2.7985720199241735e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 704 }, { "completion_length": 696.0, "epoch": 0.752, "grad_norm": 0.01733817346394062, "kl": 0.004398345947265625, "learning_rate": 2.7976383457016535e-06, "loss": 0.0002, "reward": 0.46875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 705 }, { "completion_length": 717.15625, "epoch": 0.7530666666666667, "grad_norm": 0.010603638365864754, "kl": 0.00623321533203125, "learning_rate": 2.7967026690567026e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 706 }, { "completion_length": 618.71875, "epoch": 0.7541333333333333, "grad_norm": 0.011342287994921207, "kl": 0.006252288818359375, "learning_rate": 2.7957649914331906e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 707 }, { "completion_length": 526.84375, "epoch": 0.7552, "grad_norm": 0.01683817431330681, "kl": 0.004589080810546875, "learning_rate": 2.7948253142780738e-06, "loss": 0.0002, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 708 }, { "completion_length": 528.3125, "epoch": 0.7562666666666666, "grad_norm": 0.00047780765453353524, "kl": 0.0067462921142578125, "learning_rate": 2.793883639041395e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 709 }, { "completion_length": 618.4375, "epoch": 0.7573333333333333, "grad_norm": 0.012495452538132668, "kl": 0.00507354736328125, "learning_rate": 2.792939967176279e-06, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 710 }, { "completion_length": 722.0, "epoch": 0.7584, "grad_norm": 0.019856996834278107, "kl": 0.0037784576416015625, "learning_rate": 2.791994300138934e-06, "loss": 0.0002, "reward": 0.375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 711 }, { "completion_length": 667.5625, "epoch": 0.7594666666666666, "grad_norm": 0.02640010602772236, "kl": 0.0057086944580078125, "learning_rate": 2.791046639388644e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 712 }, { "completion_length": 653.6875, "epoch": 0.7605333333333333, "grad_norm": 0.01939987763762474, "kl": 0.006229400634765625, "learning_rate": 2.7900969863877726e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 713 }, { "completion_length": 612.75, "epoch": 0.7616, "grad_norm": 0.010076269507408142, "kl": 0.004137992858886719, "learning_rate": 2.789145342601755e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 714 }, { "completion_length": 532.28125, "epoch": 0.7626666666666667, "grad_norm": 0.00031461994512937963, "kl": 0.007311820983886719, "learning_rate": 2.7881917094991003e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 715 }, { "completion_length": 565.21875, "epoch": 0.7637333333333334, "grad_norm": 0.008332892321050167, "kl": 0.011868476867675781, "learning_rate": 2.787236088551386e-06, "loss": 0.0005, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 716 }, { "completion_length": 597.15625, "epoch": 0.7648, "grad_norm": 0.0184959527105093, "kl": 0.010919570922851562, "learning_rate": 2.786278481233259e-06, "loss": 0.0004, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 717 }, { "completion_length": 551.8125, "epoch": 0.7658666666666667, "grad_norm": 0.0002757021284196526, "kl": 0.007518768310546875, "learning_rate": 2.785318889022429e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 718 }, { "completion_length": 662.96875, "epoch": 0.7669333333333334, "grad_norm": 0.00022415598505176604, "kl": 0.007508277893066406, "learning_rate": 2.784357313399671e-06, "loss": 0.0003, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 719 }, { "completion_length": 617.5625, "epoch": 0.768, "grad_norm": 0.008371838368475437, "kl": 0.00464630126953125, "learning_rate": 2.7833937558488187e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.25, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 720 }, { "completion_length": 627.71875, "epoch": 0.7690666666666667, "grad_norm": 0.02649940736591816, "kl": 0.00829315185546875, "learning_rate": 2.7824282178567654e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 721 }, { "completion_length": 648.875, "epoch": 0.7701333333333333, "grad_norm": 0.013536710292100906, "kl": 0.00525665283203125, "learning_rate": 2.7814607009134595e-06, "loss": 0.0002, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 722 }, { "completion_length": 584.15625, "epoch": 0.7712, "grad_norm": 0.00929069984704256, "kl": 0.004647254943847656, "learning_rate": 2.7804912065119048e-06, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 723 }, { "completion_length": 482.78125, "epoch": 0.7722666666666667, "grad_norm": 0.019654832780361176, "kl": 0.0072193145751953125, "learning_rate": 2.7795197361481544e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 724 }, { "completion_length": 615.34375, "epoch": 0.7733333333333333, "grad_norm": 0.000366286258213222, "kl": 0.005772590637207031, "learning_rate": 2.7785462913213127e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 725 }, { "completion_length": 755.375, "epoch": 0.7744, "grad_norm": 0.00995542947202921, "kl": 0.006714820861816406, "learning_rate": 2.777570873533529e-06, "loss": 0.0003, "reward": 0.3125, "reward_std": 0.125, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 726 }, { "completion_length": 737.3125, "epoch": 0.7754666666666666, "grad_norm": 0.013506078161299229, "kl": 0.004246711730957031, "learning_rate": 2.776593484289999e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 727 }, { "completion_length": 655.59375, "epoch": 0.7765333333333333, "grad_norm": 0.00957317091524601, "kl": 0.006318092346191406, "learning_rate": 2.7756141250989593e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 728 }, { "completion_length": 533.6875, "epoch": 0.7776, "grad_norm": 0.0006646747351624072, "kl": 0.008450508117675781, "learning_rate": 2.7746327974716863e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 729 }, { "completion_length": 540.625, "epoch": 0.7786666666666666, "grad_norm": 0.008065053261816502, "kl": 0.010904312133789062, "learning_rate": 2.7736495029224953e-06, "loss": 0.0004, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 730 }, { "completion_length": 746.65625, "epoch": 0.7797333333333333, "grad_norm": 0.0084078973159194, "kl": 0.0047130584716796875, "learning_rate": 2.7726642429687353e-06, "loss": 0.0002, "reward": 0.4375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 731 }, { "completion_length": 520.0, "epoch": 0.7808, "grad_norm": 0.0010971873998641968, "kl": 0.009288787841796875, "learning_rate": 2.7716770191307885e-06, "loss": 0.0004, "reward": 0.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 732 }, { "completion_length": 731.96875, "epoch": 0.7818666666666667, "grad_norm": 0.015010342933237553, "kl": 0.004813194274902344, "learning_rate": 2.770687832932069e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 733 }, { "completion_length": 639.25, "epoch": 0.7829333333333334, "grad_norm": 0.000582807173486799, "kl": 0.0036945343017578125, "learning_rate": 2.769696685899017e-06, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 734 }, { "completion_length": 692.75, "epoch": 0.784, "grad_norm": 0.016185257583856583, "kl": 0.00920867919921875, "learning_rate": 2.7687035795611003e-06, "loss": 0.0004, "reward": 0.375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 735 }, { "completion_length": 608.28125, "epoch": 0.7850666666666667, "grad_norm": 0.012611024081707, "kl": 0.004680633544921875, "learning_rate": 2.7677085154508086e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 736 }, { "completion_length": 617.25, "epoch": 0.7861333333333334, "grad_norm": 0.06754057109355927, "kl": 0.014749526977539062, "learning_rate": 2.766711495103654e-06, "loss": 0.0006, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 737 }, { "completion_length": 635.5, "epoch": 0.7872, "grad_norm": 0.01845647767186165, "kl": 0.008381843566894531, "learning_rate": 2.7657125200581663e-06, "loss": 0.0003, "reward": 0.53125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 738 }, { "completion_length": 643.8125, "epoch": 0.7882666666666667, "grad_norm": 0.03364390879869461, "kl": 0.007282257080078125, "learning_rate": 2.7647115918558927e-06, "loss": 0.0003, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 739 }, { "completion_length": 743.375, "epoch": 0.7893333333333333, "grad_norm": 0.00030285323737189174, "kl": 0.003833770751953125, "learning_rate": 2.763708712041394e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 740 }, { "completion_length": 649.28125, "epoch": 0.7904, "grad_norm": 0.0003280292439740151, "kl": 0.004235267639160156, "learning_rate": 2.7627038821622417e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 741 }, { "completion_length": 514.28125, "epoch": 0.7914666666666667, "grad_norm": 0.022062215954065323, "kl": 0.005588531494140625, "learning_rate": 2.7616971037690173e-06, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 742 }, { "completion_length": 654.125, "epoch": 0.7925333333333333, "grad_norm": 0.013974696397781372, "kl": 0.00518035888671875, "learning_rate": 2.7606883784153092e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.25, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 743 }, { "completion_length": 709.96875, "epoch": 0.7936, "grad_norm": 0.011198596097528934, "kl": 0.004878997802734375, "learning_rate": 2.7596777076577106e-06, "loss": 0.0002, "reward": 0.46875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 744 }, { "completion_length": 668.875, "epoch": 0.7946666666666666, "grad_norm": 0.00036224836367182434, "kl": 0.0069904327392578125, "learning_rate": 2.7586650930558147e-06, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 745 }, { "completion_length": 598.15625, "epoch": 0.7957333333333333, "grad_norm": 0.017516283318400383, "kl": 0.00899505615234375, "learning_rate": 2.7576505361722175e-06, "loss": 0.0004, "reward": 0.6875, "reward_std": 0.375, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 746 }, { "completion_length": 622.6875, "epoch": 0.7968, "grad_norm": 0.0014303566422313452, "kl": 0.00586700439453125, "learning_rate": 2.7566340385725087e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 747 }, { "completion_length": 741.15625, "epoch": 0.7978666666666666, "grad_norm": 0.0013913874281570315, "kl": 0.00809478759765625, "learning_rate": 2.755615601825276e-06, "loss": 0.0003, "reward": 0.40625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 748 }, { "completion_length": 478.34375, "epoch": 0.7989333333333334, "grad_norm": 0.014842181466519833, "kl": 0.012628555297851562, "learning_rate": 2.754595227502097e-06, "loss": 0.0005, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 749 }, { "completion_length": 538.1875, "epoch": 0.8, "grad_norm": 0.0003115941653959453, "kl": 0.0087127685546875, "learning_rate": 2.7535729171775408e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 750 }, { "completion_length": 573.875, "epoch": 0.8010666666666667, "grad_norm": 0.016353828832507133, "kl": 0.006892204284667969, "learning_rate": 2.7525486724291622e-06, "loss": 0.0003, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 751 }, { "completion_length": 601.0, "epoch": 0.8021333333333334, "grad_norm": 0.023019231855869293, "kl": 0.009263992309570312, "learning_rate": 2.7515224948375037e-06, "loss": 0.0004, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 752 }, { "completion_length": 606.40625, "epoch": 0.8032, "grad_norm": 0.01757683791220188, "kl": 0.005967140197753906, "learning_rate": 2.7504943859860883e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 753 }, { "completion_length": 539.34375, "epoch": 0.8042666666666667, "grad_norm": 0.00023874426551628858, "kl": 0.011943817138671875, "learning_rate": 2.7494643474614196e-06, "loss": 0.0005, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 754 }, { "completion_length": 655.15625, "epoch": 0.8053333333333333, "grad_norm": 0.00019749377679545432, "kl": 0.004841804504394531, "learning_rate": 2.7484323808529795e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 755 }, { "completion_length": 655.25, "epoch": 0.8064, "grad_norm": 0.0009243980748578906, "kl": 0.00533294677734375, "learning_rate": 2.7473984877532248e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 756 }, { "completion_length": 533.5, "epoch": 0.8074666666666667, "grad_norm": 0.0009689651778899133, "kl": 0.008434295654296875, "learning_rate": 2.7463626697575855e-06, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 757 }, { "completion_length": 570.96875, "epoch": 0.8085333333333333, "grad_norm": 0.011601416394114494, "kl": 0.0076141357421875, "learning_rate": 2.7453249284644608e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 758 }, { "completion_length": 681.96875, "epoch": 0.8096, "grad_norm": 0.0002894381177611649, "kl": 0.0059566497802734375, "learning_rate": 2.7442852654752197e-06, "loss": 0.0002, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 759 }, { "completion_length": 771.9375, "epoch": 0.8106666666666666, "grad_norm": 0.0072905984707176685, "kl": 0.004899024963378906, "learning_rate": 2.743243682394195e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.25, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 760 }, { "completion_length": 650.40625, "epoch": 0.8117333333333333, "grad_norm": 0.0006961122853681445, "kl": 0.005787849426269531, "learning_rate": 2.7422001808286824e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 761 }, { "completion_length": 634.53125, "epoch": 0.8128, "grad_norm": 0.014018021523952484, "kl": 0.007555961608886719, "learning_rate": 2.74115476238894e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 762 }, { "completion_length": 603.125, "epoch": 0.8138666666666666, "grad_norm": 0.0005528103210963309, "kl": 0.00939178466796875, "learning_rate": 2.7401074286881813e-06, "loss": 0.0004, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 763 }, { "completion_length": 676.53125, "epoch": 0.8149333333333333, "grad_norm": 0.008303823880851269, "kl": 0.0216064453125, "learning_rate": 2.7390581813425774e-06, "loss": 0.0009, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 764 }, { "completion_length": 541.3125, "epoch": 0.816, "grad_norm": 0.014395921491086483, "kl": 0.0075931549072265625, "learning_rate": 2.7380070219712514e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 765 }, { "completion_length": 590.59375, "epoch": 0.8170666666666667, "grad_norm": 0.00041994437924586236, "kl": 0.012170791625976562, "learning_rate": 2.736953952196277e-06, "loss": 0.0005, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 766 }, { "completion_length": 609.03125, "epoch": 0.8181333333333334, "grad_norm": 0.0004047114634886384, "kl": 0.0049571990966796875, "learning_rate": 2.7358989736426756e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 767 }, { "completion_length": 681.9375, "epoch": 0.8192, "grad_norm": 0.0005269525572657585, "kl": 0.0059909820556640625, "learning_rate": 2.734842087938415e-06, "loss": 0.0002, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 768 }, { "completion_length": 672.78125, "epoch": 0.8202666666666667, "grad_norm": 0.00023020764638204128, "kl": 0.0041637420654296875, "learning_rate": 2.733783296714405e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 769 }, { "completion_length": 691.21875, "epoch": 0.8213333333333334, "grad_norm": 0.012420719489455223, "kl": 0.0045928955078125, "learning_rate": 2.7327226016044963e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 770 }, { "completion_length": 530.15625, "epoch": 0.8224, "grad_norm": 0.0016836767317727208, "kl": 0.013116836547851562, "learning_rate": 2.731660004245478e-06, "loss": 0.0005, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 771 }, { "completion_length": 576.78125, "epoch": 0.8234666666666667, "grad_norm": 0.0179737638682127, "kl": 0.00467681884765625, "learning_rate": 2.7305955062770735e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.3125, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 772 }, { "completion_length": 517.375, "epoch": 0.8245333333333333, "grad_norm": 0.0007867430103942752, "kl": 0.009656906127929688, "learning_rate": 2.72952910934194e-06, "loss": 0.0004, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 773 }, { "completion_length": 651.34375, "epoch": 0.8256, "grad_norm": 0.0005440239910967648, "kl": 0.00899505615234375, "learning_rate": 2.728460815085665e-06, "loss": 0.0004, "reward": 0.46875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 774 }, { "completion_length": 679.84375, "epoch": 0.8266666666666667, "grad_norm": 0.0005016731447540224, "kl": 0.0071620941162109375, "learning_rate": 2.7273906251567635e-06, "loss": 0.0003, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 775 }, { "completion_length": 630.0625, "epoch": 0.8277333333333333, "grad_norm": 0.01220296323299408, "kl": 0.0043735504150390625, "learning_rate": 2.7263185412066755e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 776 }, { "completion_length": 586.875, "epoch": 0.8288, "grad_norm": 0.010459656827151775, "kl": 0.008470535278320312, "learning_rate": 2.725244564889764e-06, "loss": 0.0003, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 777 }, { "completion_length": 650.46875, "epoch": 0.8298666666666666, "grad_norm": 0.015976592898368835, "kl": 0.0063800811767578125, "learning_rate": 2.7241686978633133e-06, "loss": 0.0003, "reward": 0.59375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 778 }, { "completion_length": 715.0625, "epoch": 0.8309333333333333, "grad_norm": 0.01457007136195898, "kl": 0.0038299560546875, "learning_rate": 2.7230909417875224e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.4471687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 779 }, { "completion_length": 575.8125, "epoch": 0.832, "grad_norm": 0.012336722575128078, "kl": 0.005451202392578125, "learning_rate": 2.722011298325509e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 780 }, { "completion_length": 705.84375, "epoch": 0.8330666666666666, "grad_norm": 0.0003338439855724573, "kl": 0.005756378173828125, "learning_rate": 2.7209297691433e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 781 }, { "completion_length": 785.0, "epoch": 0.8341333333333333, "grad_norm": 0.009804078377783298, "kl": 0.004208564758300781, "learning_rate": 2.719846355909835e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.4233439117670059, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 782 }, { "completion_length": 712.90625, "epoch": 0.8352, "grad_norm": 0.00017027444846462458, "kl": 0.004589080810546875, "learning_rate": 2.7187610602969586e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 783 }, { "completion_length": 616.03125, "epoch": 0.8362666666666667, "grad_norm": 0.008157726377248764, "kl": 0.0052242279052734375, "learning_rate": 2.7176738839794217e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 784 }, { "completion_length": 639.3125, "epoch": 0.8373333333333334, "grad_norm": 0.009266475215554237, "kl": 0.005977630615234375, "learning_rate": 2.7165848286348766e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 785 }, { "completion_length": 561.875, "epoch": 0.8384, "grad_norm": 0.00040640970109961927, "kl": 0.008733749389648438, "learning_rate": 2.7154938959438756e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 786 }, { "completion_length": 569.78125, "epoch": 0.8394666666666667, "grad_norm": 0.01029556430876255, "kl": 0.008544921875, "learning_rate": 2.714401087589867e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 787 }, { "completion_length": 572.125, "epoch": 0.8405333333333334, "grad_norm": 0.01159331202507019, "kl": 0.0066928863525390625, "learning_rate": 2.7133064052591952e-06, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 788 }, { "completion_length": 577.9375, "epoch": 0.8416, "grad_norm": 0.000474442116683349, "kl": 0.0075206756591796875, "learning_rate": 2.7122098506410955e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 789 }, { "completion_length": 718.9375, "epoch": 0.8426666666666667, "grad_norm": 0.013127983547747135, "kl": 0.00617218017578125, "learning_rate": 2.711111425427692e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 790 }, { "completion_length": 811.53125, "epoch": 0.8437333333333333, "grad_norm": 0.0004485237586777657, "kl": 0.00365447998046875, "learning_rate": 2.7100111313139953e-06, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 791 }, { "completion_length": 612.21875, "epoch": 0.8448, "grad_norm": 0.012691598385572433, "kl": 0.0037984848022460938, "learning_rate": 2.7089089699979008e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 792 }, { "completion_length": 617.0625, "epoch": 0.8458666666666667, "grad_norm": 0.012110497802495956, "kl": 0.008352279663085938, "learning_rate": 2.7078049431801846e-06, "loss": 0.0003, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 793 }, { "completion_length": 566.75, "epoch": 0.8469333333333333, "grad_norm": 0.0008438747609034181, "kl": 0.0049190521240234375, "learning_rate": 2.706699052564503e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 794 }, { "completion_length": 551.6875, "epoch": 0.848, "grad_norm": 0.01227597240358591, "kl": 0.006608009338378906, "learning_rate": 2.705591299857385e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 795 }, { "completion_length": 527.5625, "epoch": 0.8490666666666666, "grad_norm": 0.0002504742005839944, "kl": 0.005626678466796875, "learning_rate": 2.7044816867682364e-06, "loss": 0.0002, "reward": 0.96875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 0.0, "step": 796 }, { "completion_length": 652.125, "epoch": 0.8501333333333333, "grad_norm": 0.011703969910740852, "kl": 0.0037784576416015625, "learning_rate": 2.703370215009332e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 797 }, { "completion_length": 725.78125, "epoch": 0.8512, "grad_norm": 0.0005230876267887652, "kl": 0.007472038269042969, "learning_rate": 2.7022568862958153e-06, "loss": 0.0003, "reward": 0.40625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 798 }, { "completion_length": 653.1875, "epoch": 0.8522666666666666, "grad_norm": 0.01244034431874752, "kl": 0.00424957275390625, "learning_rate": 2.7011417023456963e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 799 }, { "completion_length": 677.03125, "epoch": 0.8533333333333334, "grad_norm": 0.00029842305229976773, "kl": 0.0073032379150390625, "learning_rate": 2.700024664879846e-06, "loss": 0.0003, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 800 }, { "completion_length": 657.46875, "epoch": 0.8544, "grad_norm": 0.014329240657389164, "kl": 0.007369041442871094, "learning_rate": 2.6989057756219958e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 801 }, { "completion_length": 481.375, "epoch": 0.8554666666666667, "grad_norm": 0.000803402450401336, "kl": 0.008209228515625, "learning_rate": 2.6977850362987373e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 802 }, { "completion_length": 609.71875, "epoch": 0.8565333333333334, "grad_norm": 0.0003260039957240224, "kl": 0.0054073333740234375, "learning_rate": 2.6966624486395138e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 803 }, { "completion_length": 663.03125, "epoch": 0.8576, "grad_norm": 0.007598612457513809, "kl": 0.0051708221435546875, "learning_rate": 2.6955380143766217e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 804 }, { "completion_length": 698.3125, "epoch": 0.8586666666666667, "grad_norm": 0.012806645594537258, "kl": 0.0043735504150390625, "learning_rate": 2.694411735245208e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 805 }, { "completion_length": 534.28125, "epoch": 0.8597333333333333, "grad_norm": 0.017031138762831688, "kl": 0.00995635986328125, "learning_rate": 2.693283612983266e-06, "loss": 0.0004, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 806 }, { "completion_length": 732.9375, "epoch": 0.8608, "grad_norm": 0.016913702711462975, "kl": 0.0032014846801757812, "learning_rate": 2.6921536493316326e-06, "loss": 0.0001, "reward": 0.5625, "reward_std": 0.375, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 807 }, { "completion_length": 512.96875, "epoch": 0.8618666666666667, "grad_norm": 0.0006967844674363732, "kl": 0.00762939453125, "learning_rate": 2.691021846033987e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 808 }, { "completion_length": 529.84375, "epoch": 0.8629333333333333, "grad_norm": 0.018760880455374718, "kl": 0.010786056518554688, "learning_rate": 2.689888204836847e-06, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 809 }, { "completion_length": 715.28125, "epoch": 0.864, "grad_norm": 0.011731596663594246, "kl": 0.009011268615722656, "learning_rate": 2.6887527274895657e-06, "loss": 0.0004, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 810 }, { "completion_length": 514.28125, "epoch": 0.8650666666666667, "grad_norm": 0.013049411587417126, "kl": 0.0443572998046875, "learning_rate": 2.687615415744331e-06, "loss": 0.0018, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 811 }, { "completion_length": 715.875, "epoch": 0.8661333333333333, "grad_norm": 0.00025296068633906543, "kl": 0.0043735504150390625, "learning_rate": 2.68647627135616e-06, "loss": 0.0002, "reward": 0.4375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 812 }, { "completion_length": 583.46875, "epoch": 0.8672, "grad_norm": 0.026675868779420853, "kl": 0.005352020263671875, "learning_rate": 2.6853352960829e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 813 }, { "completion_length": 566.15625, "epoch": 0.8682666666666666, "grad_norm": 0.014673098921775818, "kl": 0.009843826293945312, "learning_rate": 2.684192491685221e-06, "loss": 0.0004, "reward": 0.75, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 814 }, { "completion_length": 524.625, "epoch": 0.8693333333333333, "grad_norm": 0.0006159254116937518, "kl": 0.0074615478515625, "learning_rate": 2.6830478599266172e-06, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 0.0, "step": 815 }, { "completion_length": 569.25, "epoch": 0.8704, "grad_norm": 0.0010987553978338838, "kl": 0.008037567138671875, "learning_rate": 2.6819014025734022e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 816 }, { "completion_length": 591.65625, "epoch": 0.8714666666666666, "grad_norm": 0.010781894437968731, "kl": 0.003757476806640625, "learning_rate": 2.680753121394707e-06, "loss": 0.0002, "reward": 0.90625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 817 }, { "completion_length": 648.90625, "epoch": 0.8725333333333334, "grad_norm": 0.013959936797618866, "kl": 0.017754554748535156, "learning_rate": 2.679603018162476e-06, "loss": 0.0007, "reward": 0.59375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 818 }, { "completion_length": 748.0625, "epoch": 0.8736, "grad_norm": 0.013710324652493, "kl": 0.0039520263671875, "learning_rate": 2.678451094651467e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 819 }, { "completion_length": 740.5625, "epoch": 0.8746666666666667, "grad_norm": 0.008411146700382233, "kl": 0.008114814758300781, "learning_rate": 2.6772973526392455e-06, "loss": 0.0003, "reward": 0.40625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 820 }, { "completion_length": 720.5625, "epoch": 0.8757333333333334, "grad_norm": 0.0003357541863806546, "kl": 0.006561279296875, "learning_rate": 2.676141793906183e-06, "loss": 0.0003, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 821 }, { "completion_length": 706.5625, "epoch": 0.8768, "grad_norm": 0.012535568326711655, "kl": 0.0034885406494140625, "learning_rate": 2.6749844202354553e-06, "loss": 0.0001, "reward": 0.59375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 822 }, { "completion_length": 660.25, "epoch": 0.8778666666666667, "grad_norm": 0.013969277031719685, "kl": 0.0056915283203125, "learning_rate": 2.673825233413038e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 823 }, { "completion_length": 580.78125, "epoch": 0.8789333333333333, "grad_norm": 0.0002416718052700162, "kl": 0.0065288543701171875, "learning_rate": 2.6726642352277056e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 824 }, { "completion_length": 650.28125, "epoch": 0.88, "grad_norm": 0.00032705653575249016, "kl": 0.0067119598388671875, "learning_rate": 2.6715014274710265e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 825 }, { "completion_length": 554.65625, "epoch": 0.8810666666666667, "grad_norm": 0.016795441508293152, "kl": 0.0068950653076171875, "learning_rate": 2.670336811937363e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 826 }, { "completion_length": 575.03125, "epoch": 0.8821333333333333, "grad_norm": 0.014281952753663063, "kl": 0.005947113037109375, "learning_rate": 2.669170390423866e-06, "loss": 0.0002, "reward": 0.46875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 827 }, { "completion_length": 688.78125, "epoch": 0.8832, "grad_norm": 0.0006481899181380868, "kl": 0.005497932434082031, "learning_rate": 2.6680021647304735e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 828 }, { "completion_length": 644.78125, "epoch": 0.8842666666666666, "grad_norm": 0.012468862347304821, "kl": 0.00623321533203125, "learning_rate": 2.6668321366599074e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 829 }, { "completion_length": 739.34375, "epoch": 0.8853333333333333, "grad_norm": 0.0005253269337117672, "kl": 0.004868507385253906, "learning_rate": 2.6656603080176714e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 830 }, { "completion_length": 641.9375, "epoch": 0.8864, "grad_norm": 0.010739587247371674, "kl": 0.006115913391113281, "learning_rate": 2.6644866806120474e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 831 }, { "completion_length": 691.03125, "epoch": 0.8874666666666666, "grad_norm": 0.015617311000823975, "kl": 0.005164146423339844, "learning_rate": 2.663311256254093e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 832 }, { "completion_length": 777.375, "epoch": 0.8885333333333333, "grad_norm": 0.018240660429000854, "kl": 0.0063190460205078125, "learning_rate": 2.662134036757639e-06, "loss": 0.0003, "reward": 0.4375, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 833 }, { "completion_length": 491.875, "epoch": 0.8896, "grad_norm": 0.022015118971467018, "kl": 0.0075855255126953125, "learning_rate": 2.6609550239392854e-06, "loss": 0.0003, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 834 }, { "completion_length": 534.4375, "epoch": 0.8906666666666667, "grad_norm": 0.012538203969597816, "kl": 0.006648063659667969, "learning_rate": 2.6597742196184007e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 835 }, { "completion_length": 537.09375, "epoch": 0.8917333333333334, "grad_norm": 0.00024037387629505247, "kl": 0.0073909759521484375, "learning_rate": 2.658591625617117e-06, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 836 }, { "completion_length": 475.84375, "epoch": 0.8928, "grad_norm": 0.0004951999289914966, "kl": 0.006587982177734375, "learning_rate": 2.65740724376033e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 837 }, { "completion_length": 535.21875, "epoch": 0.8938666666666667, "grad_norm": 0.008811399340629578, "kl": 0.009235382080078125, "learning_rate": 2.6562210758756914e-06, "loss": 0.0004, "reward": 0.65625, "reward_std": 0.3125, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 838 }, { "completion_length": 673.03125, "epoch": 0.8949333333333334, "grad_norm": 0.014581223018467426, "kl": 0.005611419677734375, "learning_rate": 2.65503312379361e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 839 }, { "completion_length": 642.75, "epoch": 0.896, "grad_norm": 0.007945782504975796, "kl": 0.00630950927734375, "learning_rate": 2.65384338934725e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 840 }, { "completion_length": 625.5, "epoch": 0.8970666666666667, "grad_norm": 0.009022640995681286, "kl": 0.00653839111328125, "learning_rate": 2.6526518743725227e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 841 }, { "completion_length": 719.28125, "epoch": 0.8981333333333333, "grad_norm": 0.013554815202951431, "kl": 0.0047817230224609375, "learning_rate": 2.6514585807080895e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 842 }, { "completion_length": 570.9375, "epoch": 0.8992, "grad_norm": 0.0012489096261560917, "kl": 0.005629539489746094, "learning_rate": 2.6502635101953553e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 843 }, { "completion_length": 836.84375, "epoch": 0.9002666666666667, "grad_norm": 0.00023374531883746386, "kl": 0.004832267761230469, "learning_rate": 2.649066664678467e-06, "loss": 0.0002, "reward": 0.375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 844 }, { "completion_length": 616.09375, "epoch": 0.9013333333333333, "grad_norm": 0.017865004017949104, "kl": 0.006592750549316406, "learning_rate": 2.6478680460043114e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 845 }, { "completion_length": 832.21875, "epoch": 0.9024, "grad_norm": 0.010193963535130024, "kl": 0.0046405792236328125, "learning_rate": 2.64666765602251e-06, "loss": 0.0002, "reward": 0.40625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 846 }, { "completion_length": 586.28125, "epoch": 0.9034666666666666, "grad_norm": 0.00983855128288269, "kl": 0.0053768157958984375, "learning_rate": 2.645465496585419e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 847 }, { "completion_length": 615.625, "epoch": 0.9045333333333333, "grad_norm": 0.010635520331561565, "kl": 0.00635528564453125, "learning_rate": 2.6442615695481244e-06, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 848 }, { "completion_length": 507.0625, "epoch": 0.9056, "grad_norm": 0.01147035975009203, "kl": 0.0072689056396484375, "learning_rate": 2.6430558767684408e-06, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 849 }, { "completion_length": 627.34375, "epoch": 0.9066666666666666, "grad_norm": 0.013354601338505745, "kl": 0.005587577819824219, "learning_rate": 2.641848420106906e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 850 }, { "completion_length": 609.90625, "epoch": 0.9077333333333333, "grad_norm": 0.0003687863936647773, "kl": 0.004558563232421875, "learning_rate": 2.640639201426781e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 851 }, { "completion_length": 656.3125, "epoch": 0.9088, "grad_norm": 0.014340121299028397, "kl": 0.00799560546875, "learning_rate": 2.6394282225940447e-06, "loss": 0.0003, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 852 }, { "completion_length": 533.90625, "epoch": 0.9098666666666667, "grad_norm": 0.013962801545858383, "kl": 0.009637832641601562, "learning_rate": 2.6382154854773934e-06, "loss": 0.0004, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 853 }, { "completion_length": 684.8125, "epoch": 0.9109333333333334, "grad_norm": 0.00043485674541443586, "kl": 0.0052814483642578125, "learning_rate": 2.637000991948236e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 854 }, { "completion_length": 514.0625, "epoch": 0.912, "grad_norm": 0.0018594093853607774, "kl": 0.009302139282226562, "learning_rate": 2.6357847438806916e-06, "loss": 0.0004, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 855 }, { "completion_length": 522.375, "epoch": 0.9130666666666667, "grad_norm": 0.0006943045882508159, "kl": 0.004974365234375, "learning_rate": 2.634566743151587e-06, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 856 }, { "completion_length": 606.03125, "epoch": 0.9141333333333334, "grad_norm": 0.0013079551281407475, "kl": 0.0059871673583984375, "learning_rate": 2.6333469916404536e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 857 }, { "completion_length": 609.96875, "epoch": 0.9152, "grad_norm": 0.0015670583816245198, "kl": 0.00766754150390625, "learning_rate": 2.6321254912295243e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 858 }, { "completion_length": 550.03125, "epoch": 0.9162666666666667, "grad_norm": 0.012882603332400322, "kl": 0.01419830322265625, "learning_rate": 2.630902243803732e-06, "loss": 0.0006, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 859 }, { "completion_length": 615.34375, "epoch": 0.9173333333333333, "grad_norm": 0.0013106217375025153, "kl": 0.00576019287109375, "learning_rate": 2.6296772512507024e-06, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 860 }, { "completion_length": 604.6875, "epoch": 0.9184, "grad_norm": 0.008072001859545708, "kl": 0.0063381195068359375, "learning_rate": 2.628450515460758e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 861 }, { "completion_length": 713.375, "epoch": 0.9194666666666667, "grad_norm": 0.008954057469964027, "kl": 0.0043926239013671875, "learning_rate": 2.6272220383269087e-06, "loss": 0.0002, "reward": 0.3125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 862 }, { "completion_length": 634.59375, "epoch": 0.9205333333333333, "grad_norm": 0.015773821622133255, "kl": 0.005482673645019531, "learning_rate": 2.625991821744852e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 863 }, { "completion_length": 567.3125, "epoch": 0.9216, "grad_norm": 0.0008676642901264131, "kl": 0.0107879638671875, "learning_rate": 2.624759867612971e-06, "loss": 0.0004, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 864 }, { "completion_length": 635.03125, "epoch": 0.9226666666666666, "grad_norm": 0.014835801906883717, "kl": 0.0065441131591796875, "learning_rate": 2.623526177832328e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 865 }, { "completion_length": 591.5, "epoch": 0.9237333333333333, "grad_norm": 0.030355295166373253, "kl": 0.004802703857421875, "learning_rate": 2.6222907543066645e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 866 }, { "completion_length": 820.78125, "epoch": 0.9248, "grad_norm": 0.016608120873570442, "kl": 0.0042018890380859375, "learning_rate": 2.621053598942398e-06, "loss": 0.0002, "reward": 0.375, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 867 }, { "completion_length": 654.0, "epoch": 0.9258666666666666, "grad_norm": 0.0017810954013839364, "kl": 0.0068206787109375, "learning_rate": 2.6198147136486174e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 868 }, { "completion_length": 552.78125, "epoch": 0.9269333333333334, "grad_norm": 0.016631867736577988, "kl": 0.00550079345703125, "learning_rate": 2.6185741003370826e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 869 }, { "completion_length": 627.96875, "epoch": 0.928, "grad_norm": 0.014242174103856087, "kl": 0.0095977783203125, "learning_rate": 2.617331760922218e-06, "loss": 0.0004, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 870 }, { "completion_length": 753.5625, "epoch": 0.9290666666666667, "grad_norm": 0.013521532528102398, "kl": 0.00359344482421875, "learning_rate": 2.6160876973211136e-06, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 871 }, { "completion_length": 636.15625, "epoch": 0.9301333333333334, "grad_norm": 0.007346646394580603, "kl": 0.007966995239257812, "learning_rate": 2.6148419114535184e-06, "loss": 0.0003, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 872 }, { "completion_length": 641.0, "epoch": 0.9312, "grad_norm": 0.01146378181874752, "kl": 0.0060272216796875, "learning_rate": 2.61359440524184e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 873 }, { "completion_length": 544.875, "epoch": 0.9322666666666667, "grad_norm": 0.0008338844054378569, "kl": 0.007816314697265625, "learning_rate": 2.6123451806111406e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 874 }, { "completion_length": 621.9375, "epoch": 0.9333333333333333, "grad_norm": 0.009419040754437447, "kl": 0.008195877075195312, "learning_rate": 2.611094239489134e-06, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 875 }, { "completion_length": 627.40625, "epoch": 0.9344, "grad_norm": 0.0013613710179924965, "kl": 0.004856109619140625, "learning_rate": 2.6098415838061832e-06, "loss": 0.0002, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 876 }, { "completion_length": 562.0625, "epoch": 0.9354666666666667, "grad_norm": 0.011018440127372742, "kl": 0.0117034912109375, "learning_rate": 2.6085872154952964e-06, "loss": 0.0005, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 877 }, { "completion_length": 699.15625, "epoch": 0.9365333333333333, "grad_norm": 0.011109993793070316, "kl": 0.006313323974609375, "learning_rate": 2.6073311364921243e-06, "loss": 0.0003, "reward": 0.5, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 878 }, { "completion_length": 758.21875, "epoch": 0.9376, "grad_norm": 0.00026839610654860735, "kl": 0.00447845458984375, "learning_rate": 2.6060733487349584e-06, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 879 }, { "completion_length": 490.3125, "epoch": 0.9386666666666666, "grad_norm": 0.02261410653591156, "kl": 0.00911712646484375, "learning_rate": 2.604813854164727e-06, "loss": 0.0004, "reward": 0.90625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 880 }, { "completion_length": 597.78125, "epoch": 0.9397333333333333, "grad_norm": 0.01019956637173891, "kl": 0.0049457550048828125, "learning_rate": 2.6035526547249907e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 881 }, { "completion_length": 771.84375, "epoch": 0.9408, "grad_norm": 0.015273666940629482, "kl": 0.0030908584594726562, "learning_rate": 2.6022897523619424e-06, "loss": 0.0001, "reward": 0.375, "reward_std": 0.25, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 882 }, { "completion_length": 615.71875, "epoch": 0.9418666666666666, "grad_norm": 0.015822449699044228, "kl": 0.007829666137695312, "learning_rate": 2.6010251490244037e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 883 }, { "completion_length": 589.3125, "epoch": 0.9429333333333333, "grad_norm": 0.0004960999940522015, "kl": 0.0063838958740234375, "learning_rate": 2.599758846663818e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 884 }, { "completion_length": 556.125, "epoch": 0.944, "grad_norm": 0.00045131350634619594, "kl": 0.005328178405761719, "learning_rate": 2.598490847234253e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 885 }, { "completion_length": 528.375, "epoch": 0.9450666666666667, "grad_norm": 0.021820463240146637, "kl": 0.008905410766601562, "learning_rate": 2.5972211526923948e-06, "loss": 0.0004, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 886 }, { "completion_length": 547.6875, "epoch": 0.9461333333333334, "grad_norm": 0.00029295377316884696, "kl": 0.00946807861328125, "learning_rate": 2.595949764997545e-06, "loss": 0.0004, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 887 }, { "completion_length": 752.90625, "epoch": 0.9472, "grad_norm": 0.00024067796766757965, "kl": 0.0046749114990234375, "learning_rate": 2.5946766861116167e-06, "loss": 0.0002, "reward": 0.46875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 888 }, { "completion_length": 576.71875, "epoch": 0.9482666666666667, "grad_norm": 0.012543228454887867, "kl": 0.0071582794189453125, "learning_rate": 2.5934019179991356e-06, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 889 }, { "completion_length": 605.03125, "epoch": 0.9493333333333334, "grad_norm": 0.0004865596420131624, "kl": 0.0048980712890625, "learning_rate": 2.5921254626272312e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 890 }, { "completion_length": 653.34375, "epoch": 0.9504, "grad_norm": 0.017550453543663025, "kl": 0.008927345275878906, "learning_rate": 2.5908473219656386e-06, "loss": 0.0004, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 891 }, { "completion_length": 667.40625, "epoch": 0.9514666666666667, "grad_norm": 0.020741192623972893, "kl": 0.0067195892333984375, "learning_rate": 2.5895674979866925e-06, "loss": 0.0003, "reward": 0.5625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 892 }, { "completion_length": 493.875, "epoch": 0.9525333333333333, "grad_norm": 0.015714094042778015, "kl": 0.0076313018798828125, "learning_rate": 2.588285992665325e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 893 }, { "completion_length": 611.625, "epoch": 0.9536, "grad_norm": 0.0002899506944231689, "kl": 0.004901885986328125, "learning_rate": 2.5870028079790647e-06, "loss": 0.0002, "reward": 0.46875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 894 }, { "completion_length": 585.5, "epoch": 0.9546666666666667, "grad_norm": 0.024114040657877922, "kl": 0.0068569183349609375, "learning_rate": 2.5857179459080285e-06, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 895 }, { "completion_length": 555.125, "epoch": 0.9557333333333333, "grad_norm": 0.00023932506155688316, "kl": 0.0075092315673828125, "learning_rate": 2.584431408434924e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 896 }, { "completion_length": 556.0625, "epoch": 0.9568, "grad_norm": 0.0007229121983982623, "kl": 0.009876251220703125, "learning_rate": 2.583143197545044e-06, "loss": 0.0004, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 897 }, { "completion_length": 671.09375, "epoch": 0.9578666666666666, "grad_norm": 0.011179661378264427, "kl": 0.0036182403564453125, "learning_rate": 2.5818533152262628e-06, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 898 }, { "completion_length": 538.8125, "epoch": 0.9589333333333333, "grad_norm": 0.015439197421073914, "kl": 0.0070400238037109375, "learning_rate": 2.580561763469034e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 899 }, { "completion_length": 675.21875, "epoch": 0.96, "grad_norm": 0.012190368957817554, "kl": 0.004290580749511719, "learning_rate": 2.5792685442663883e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 900 }, { "completion_length": 516.03125, "epoch": 0.9610666666666666, "grad_norm": 0.0008082238491624594, "kl": 0.008180618286132812, "learning_rate": 2.577973659613928e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 901 }, { "completion_length": 691.1875, "epoch": 0.9621333333333333, "grad_norm": 0.01949690654873848, "kl": 0.009410858154296875, "learning_rate": 2.5766771115098273e-06, "loss": 0.0004, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 902 }, { "completion_length": 521.8125, "epoch": 0.9632, "grad_norm": 0.015193672850728035, "kl": 0.010288238525390625, "learning_rate": 2.5753789019548255e-06, "loss": 0.0004, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 903 }, { "completion_length": 550.0, "epoch": 0.9642666666666667, "grad_norm": 0.0012995407450944185, "kl": 0.009920120239257812, "learning_rate": 2.574079032952226e-06, "loss": 0.0004, "reward": 0.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 904 }, { "completion_length": 671.53125, "epoch": 0.9653333333333334, "grad_norm": 0.0004439554177224636, "kl": 0.00638580322265625, "learning_rate": 2.572777506507895e-06, "loss": 0.0003, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 905 }, { "completion_length": 589.3125, "epoch": 0.9664, "grad_norm": 0.0004652448697015643, "kl": 0.005245208740234375, "learning_rate": 2.571474324630253e-06, "loss": 0.0002, "reward": 0.9375, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 906 }, { "completion_length": 703.53125, "epoch": 0.9674666666666667, "grad_norm": 0.01991853676736355, "kl": 0.005695343017578125, "learning_rate": 2.570169489330278e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 907 }, { "completion_length": 498.9375, "epoch": 0.9685333333333334, "grad_norm": 0.012732761912047863, "kl": 0.013568878173828125, "learning_rate": 2.568863002621498e-06, "loss": 0.0005, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 908 }, { "completion_length": 573.09375, "epoch": 0.9696, "grad_norm": 0.008016749285161495, "kl": 0.010602951049804688, "learning_rate": 2.567554866519989e-06, "loss": 0.0004, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 909 }, { "completion_length": 698.3125, "epoch": 0.9706666666666667, "grad_norm": 0.000736990652512759, "kl": 0.0063190460205078125, "learning_rate": 2.5662450830443734e-06, "loss": 0.0003, "reward": 0.40625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 910 }, { "completion_length": 629.21875, "epoch": 0.9717333333333333, "grad_norm": 0.010720309801399708, "kl": 0.007903099060058594, "learning_rate": 2.5649336542158156e-06, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 911 }, { "completion_length": 703.84375, "epoch": 0.9728, "grad_norm": 0.0002641054743435234, "kl": 0.004146575927734375, "learning_rate": 2.5636205820580173e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 912 }, { "completion_length": 747.34375, "epoch": 0.9738666666666667, "grad_norm": 0.00811687670648098, "kl": 0.009695053100585938, "learning_rate": 2.562305868597218e-06, "loss": 0.0004, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 913 }, { "completion_length": 487.875, "epoch": 0.9749333333333333, "grad_norm": 0.021018683910369873, "kl": 0.0070133209228515625, "learning_rate": 2.5609895158621892e-06, "loss": 0.0003, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 914 }, { "completion_length": 597.84375, "epoch": 0.976, "grad_norm": 0.011356687173247337, "kl": 0.0041790008544921875, "learning_rate": 2.559671525884232e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 915 }, { "completion_length": 434.5, "epoch": 0.9770666666666666, "grad_norm": 0.0010487177642062306, "kl": 0.0097198486328125, "learning_rate": 2.558351900697174e-06, "loss": 0.0004, "reward": 0.875, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 916 }, { "completion_length": 536.96875, "epoch": 0.9781333333333333, "grad_norm": 0.016089199110865593, "kl": 0.0068759918212890625, "learning_rate": 2.5570306423373663e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 917 }, { "completion_length": 595.9375, "epoch": 0.9792, "grad_norm": 0.0010833560954779387, "kl": 0.006916046142578125, "learning_rate": 2.5557077528436792e-06, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 918 }, { "completion_length": 646.78125, "epoch": 0.9802666666666666, "grad_norm": 0.018550047650933266, "kl": 0.0067882537841796875, "learning_rate": 2.5543832342575023e-06, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 919 }, { "completion_length": 685.6875, "epoch": 0.9813333333333333, "grad_norm": 0.0005851062596775591, "kl": 0.007938385009765625, "learning_rate": 2.5530570886227364e-06, "loss": 0.0003, "reward": 0.34375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 920 }, { "completion_length": 554.78125, "epoch": 0.9824, "grad_norm": 0.01962989941239357, "kl": 0.007152557373046875, "learning_rate": 2.551729317985795e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 921 }, { "completion_length": 663.1875, "epoch": 0.9834666666666667, "grad_norm": 0.00033909830381162465, "kl": 0.008436203002929688, "learning_rate": 2.550399924395599e-06, "loss": 0.0003, "reward": 0.46875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 922 }, { "completion_length": 549.6875, "epoch": 0.9845333333333334, "grad_norm": 0.0010208606254309416, "kl": 0.005443572998046875, "learning_rate": 2.5490689099035716e-06, "loss": 0.0002, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 923 }, { "completion_length": 731.5625, "epoch": 0.9856, "grad_norm": 0.000740385614335537, "kl": 0.0057125091552734375, "learning_rate": 2.5477362765636408e-06, "loss": 0.0002, "reward": 0.4375, "reward_std": 0.125, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 924 }, { "completion_length": 623.0625, "epoch": 0.9866666666666667, "grad_norm": 0.0004005076189059764, "kl": 0.0047740936279296875, "learning_rate": 2.5464020264322293e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 925 }, { "completion_length": 707.5625, "epoch": 0.9877333333333334, "grad_norm": 0.0004680531856138259, "kl": 0.003253936767578125, "learning_rate": 2.5450661615682574e-06, "loss": 0.0001, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 926 }, { "completion_length": 578.0625, "epoch": 0.9888, "grad_norm": 0.0007405169890262187, "kl": 0.012470245361328125, "learning_rate": 2.5437286840331353e-06, "loss": 0.0005, "reward": 0.875, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 927 }, { "completion_length": 607.8125, "epoch": 0.9898666666666667, "grad_norm": 0.015324437990784645, "kl": 0.0040874481201171875, "learning_rate": 2.5423895958907624e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 928 }, { "completion_length": 621.625, "epoch": 0.9909333333333333, "grad_norm": 0.019542980939149857, "kl": 0.005070686340332031, "learning_rate": 2.541048899207523e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 929 }, { "completion_length": 588.15625, "epoch": 0.992, "grad_norm": 0.01357611920684576, "kl": 0.005295753479003906, "learning_rate": 2.539706596052286e-06, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 930 }, { "completion_length": 506.4375, "epoch": 0.9930666666666667, "grad_norm": 0.000509569828864187, "kl": 0.0061435699462890625, "learning_rate": 2.538362688496395e-06, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 931 }, { "completion_length": 641.84375, "epoch": 0.9941333333333333, "grad_norm": 0.0003489904338493943, "kl": 0.008029937744140625, "learning_rate": 2.537017178613673e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 932 }, { "completion_length": 631.75, "epoch": 0.9952, "grad_norm": 0.007361831143498421, "kl": 0.007538318634033203, "learning_rate": 2.535670068480414e-06, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 933 }, { "completion_length": 675.4375, "epoch": 0.9962666666666666, "grad_norm": 0.007297220639884472, "kl": 0.0068416595458984375, "learning_rate": 2.5343213601753824e-06, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 934 }, { "completion_length": 712.3125, "epoch": 0.9973333333333333, "grad_norm": 0.009332768619060516, "kl": 0.0061969757080078125, "learning_rate": 2.532971055779807e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 935 }, { "completion_length": 580.71875, "epoch": 0.9984, "grad_norm": 0.021246889606118202, "kl": 0.008609771728515625, "learning_rate": 2.531619157377382e-06, "loss": 0.0003, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 936 }, { "completion_length": 623.875, "epoch": 0.9994666666666666, "grad_norm": 0.016928791999816895, "kl": 0.009525299072265625, "learning_rate": 2.530265667054259e-06, "loss": 0.0004, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 937 }, { "completion_length": 676.0, "epoch": 1.0, "grad_norm": 0.016928791999816895, "kl": 0.004459381103515625, "learning_rate": 2.5289105868990477e-06, "loss": 0.0001, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 938 }, { "completion_length": 614.21875, "epoch": 1.0010666666666668, "grad_norm": 0.01553938165307045, "kl": 0.00904083251953125, "learning_rate": 2.5275539190028104e-06, "loss": 0.0004, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 939 }, { "completion_length": 669.75, "epoch": 1.0021333333333333, "grad_norm": 0.0011938703246414661, "kl": 0.0057277679443359375, "learning_rate": 2.52619566545906e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 940 }, { "completion_length": 501.84375, "epoch": 1.0032, "grad_norm": 0.0005276530864648521, "kl": 0.01006317138671875, "learning_rate": 2.5248358283637552e-06, "loss": 0.0004, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 941 }, { "completion_length": 710.71875, "epoch": 1.0042666666666666, "grad_norm": 0.0003861947625409812, "kl": 0.008724212646484375, "learning_rate": 2.5234744098153e-06, "loss": 0.0003, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 942 }, { "completion_length": 536.21875, "epoch": 1.0053333333333334, "grad_norm": 0.011328183114528656, "kl": 0.004550933837890625, "learning_rate": 2.5221114119145376e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 943 }, { "completion_length": 494.125, "epoch": 1.0064, "grad_norm": 0.004493300337344408, "kl": 0.01004791259765625, "learning_rate": 2.5207468367647483e-06, "loss": 0.0004, "reward": 0.9375, "reward_std": 0.125, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 944 }, { "completion_length": 595.09375, "epoch": 1.0074666666666667, "grad_norm": 0.01210754830390215, "kl": 0.005889892578125, "learning_rate": 2.5193806864716466e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 945 }, { "completion_length": 517.09375, "epoch": 1.0085333333333333, "grad_norm": 0.0004555713967420161, "kl": 0.01024627685546875, "learning_rate": 2.518012963143378e-06, "loss": 0.0004, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 946 }, { "completion_length": 531.6875, "epoch": 1.0096, "grad_norm": 0.015268140472471714, "kl": 0.00548553466796875, "learning_rate": 2.516643668890515e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 947 }, { "completion_length": 545.6875, "epoch": 1.0106666666666666, "grad_norm": 0.0002634288393892348, "kl": 0.008087158203125, "learning_rate": 2.5152728058260543e-06, "loss": 0.0003, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 948 }, { "completion_length": 752.34375, "epoch": 1.0117333333333334, "grad_norm": 0.00045437461812980473, "kl": 0.004314422607421875, "learning_rate": 2.5139003760654132e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 949 }, { "completion_length": 628.78125, "epoch": 1.0128, "grad_norm": 0.009531361982226372, "kl": 0.0041332244873046875, "learning_rate": 2.5125263817264273e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 950 }, { "completion_length": 829.59375, "epoch": 1.0138666666666667, "grad_norm": 0.013533144257962704, "kl": 0.004551887512207031, "learning_rate": 2.5111508249293456e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 951 }, { "completion_length": 522.8125, "epoch": 1.0149333333333332, "grad_norm": 0.011375023983418941, "kl": 0.0072841644287109375, "learning_rate": 2.509773707796829e-06, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.125, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 952 }, { "completion_length": 688.09375, "epoch": 1.016, "grad_norm": 0.010334829799830914, "kl": 0.006244659423828125, "learning_rate": 2.508395032453946e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 953 }, { "completion_length": 563.40625, "epoch": 1.0170666666666666, "grad_norm": 0.026415111497044563, "kl": 0.010816574096679688, "learning_rate": 2.507014801028169e-06, "loss": 0.0004, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 954 }, { "completion_length": 686.46875, "epoch": 1.0181333333333333, "grad_norm": 0.01483286265283823, "kl": 0.0068492889404296875, "learning_rate": 2.505633015649373e-06, "loss": 0.0003, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 955 }, { "completion_length": 611.1875, "epoch": 1.0192, "grad_norm": 0.0008380042854696512, "kl": 0.00547027587890625, "learning_rate": 2.5042496784498292e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 956 }, { "completion_length": 625.125, "epoch": 1.0202666666666667, "grad_norm": 0.0005107581382617354, "kl": 0.0049533843994140625, "learning_rate": 2.502864791564205e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 957 }, { "completion_length": 614.1875, "epoch": 1.0213333333333334, "grad_norm": 0.019187120720744133, "kl": 0.0066375732421875, "learning_rate": 2.5014783571295578e-06, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 958 }, { "completion_length": 601.09375, "epoch": 1.0224, "grad_norm": 0.01322366762906313, "kl": 0.008024215698242188, "learning_rate": 2.500090377285335e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 959 }, { "completion_length": 662.15625, "epoch": 1.0234666666666667, "grad_norm": 0.01069776713848114, "kl": 0.010537147521972656, "learning_rate": 2.4987008541733663e-06, "loss": 0.0004, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 960 }, { "completion_length": 528.90625, "epoch": 1.0245333333333333, "grad_norm": 0.0006989406538195908, "kl": 0.005641937255859375, "learning_rate": 2.497309789937865e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 961 }, { "completion_length": 638.28125, "epoch": 1.0256, "grad_norm": 0.010846171528100967, "kl": 0.0063877105712890625, "learning_rate": 2.495917186725421e-06, "loss": 0.0003, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 962 }, { "completion_length": 676.4375, "epoch": 1.0266666666666666, "grad_norm": 0.0005630499217659235, "kl": 0.010509490966796875, "learning_rate": 2.494523046685e-06, "loss": 0.0004, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 963 }, { "completion_length": 653.78125, "epoch": 1.0277333333333334, "grad_norm": 0.00038896300247870386, "kl": 0.004772186279296875, "learning_rate": 2.4931273719679395e-06, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 964 }, { "completion_length": 719.21875, "epoch": 1.0288, "grad_norm": 0.012909491546452045, "kl": 0.0064449310302734375, "learning_rate": 2.4917301647279434e-06, "loss": 0.0003, "reward": 0.4375, "reward_std": 0.25, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 965 }, { "completion_length": 530.28125, "epoch": 1.0298666666666667, "grad_norm": 0.0008845519041642547, "kl": 0.007686614990234375, "learning_rate": 2.4903314271210824e-06, "loss": 0.0003, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 966 }, { "completion_length": 510.1875, "epoch": 1.0309333333333333, "grad_norm": 0.0003987682575825602, "kl": 0.009368896484375, "learning_rate": 2.488931161305788e-06, "loss": 0.0004, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 967 }, { "completion_length": 696.78125, "epoch": 1.032, "grad_norm": 0.0005256184958852828, "kl": 0.006094932556152344, "learning_rate": 2.4875293694428505e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 968 }, { "completion_length": 588.625, "epoch": 1.0330666666666666, "grad_norm": 0.009046658873558044, "kl": 0.011715888977050781, "learning_rate": 2.486126053695414e-06, "loss": 0.0005, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 969 }, { "completion_length": 629.6875, "epoch": 1.0341333333333333, "grad_norm": 0.0019655709620565176, "kl": 0.007925033569335938, "learning_rate": 2.4847212162289746e-06, "loss": 0.0003, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 970 }, { "completion_length": 510.4375, "epoch": 1.0352, "grad_norm": 0.015002971515059471, "kl": 0.00620269775390625, "learning_rate": 2.483314859211377e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 971 }, { "completion_length": 584.6875, "epoch": 1.0362666666666667, "grad_norm": 0.013410159386694431, "kl": 0.007330894470214844, "learning_rate": 2.48190698481281e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 972 }, { "completion_length": 727.46875, "epoch": 1.0373333333333334, "grad_norm": 0.01597684621810913, "kl": 0.007228851318359375, "learning_rate": 2.480497595205805e-06, "loss": 0.0003, "reward": 0.375, "reward_std": 0.125, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 973 }, { "completion_length": 578.1875, "epoch": 1.0384, "grad_norm": 0.007179691921919584, "kl": 0.006122589111328125, "learning_rate": 2.4790866925652307e-06, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 974 }, { "completion_length": 609.65625, "epoch": 1.0394666666666668, "grad_norm": 0.010455789044499397, "kl": 0.005130767822265625, "learning_rate": 2.477674279068291e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 975 }, { "completion_length": 575.0625, "epoch": 1.0405333333333333, "grad_norm": 0.023190859705209732, "kl": 0.02565765380859375, "learning_rate": 2.4762603568945215e-06, "loss": 0.001, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 976 }, { "completion_length": 691.1875, "epoch": 1.0416, "grad_norm": 0.01128036342561245, "kl": 0.00531005859375, "learning_rate": 2.474844928225784e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 977 }, { "completion_length": 674.15625, "epoch": 1.0426666666666666, "grad_norm": 0.013844319619238377, "kl": 0.010164260864257812, "learning_rate": 2.473427995246269e-06, "loss": 0.0004, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 978 }, { "completion_length": 689.125, "epoch": 1.0437333333333334, "grad_norm": 0.021254122257232666, "kl": 0.006207466125488281, "learning_rate": 2.4720095601424834e-06, "loss": 0.0002, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 979 }, { "completion_length": 550.9375, "epoch": 1.0448, "grad_norm": 0.0023792232386767864, "kl": 0.0110321044921875, "learning_rate": 2.4705896251032555e-06, "loss": 0.0004, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 980 }, { "completion_length": 589.75, "epoch": 1.0458666666666667, "grad_norm": 0.00040645390981808305, "kl": 0.007541656494140625, "learning_rate": 2.4691681923197277e-06, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 981 }, { "completion_length": 582.9375, "epoch": 1.0469333333333333, "grad_norm": 0.019268469884991646, "kl": 0.011278152465820312, "learning_rate": 2.4677452639853527e-06, "loss": 0.0005, "reward": 0.625, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 982 }, { "completion_length": 681.53125, "epoch": 1.048, "grad_norm": 0.015344303101301193, "kl": 0.0064182281494140625, "learning_rate": 2.4663208422958906e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 983 }, { "completion_length": 562.75, "epoch": 1.0490666666666666, "grad_norm": 0.00044597667874768376, "kl": 0.0089874267578125, "learning_rate": 2.464894929449408e-06, "loss": 0.0004, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 984 }, { "completion_length": 553.8125, "epoch": 1.0501333333333334, "grad_norm": 0.01745174452662468, "kl": 0.007213592529296875, "learning_rate": 2.463467527646272e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 985 }, { "completion_length": 572.25, "epoch": 1.0512, "grad_norm": 0.002041492611169815, "kl": 0.007061004638671875, "learning_rate": 2.4620386390891443e-06, "loss": 0.0003, "reward": 0.84375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 986 }, { "completion_length": 586.8125, "epoch": 1.0522666666666667, "grad_norm": 0.00039585557533428073, "kl": 0.0061321258544921875, "learning_rate": 2.460608265982985e-06, "loss": 0.0002, "reward": 0.90625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 987 }, { "completion_length": 642.375, "epoch": 1.0533333333333332, "grad_norm": 0.00029571764753200114, "kl": 0.005196571350097656, "learning_rate": 2.459176410535043e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 988 }, { "completion_length": 797.21875, "epoch": 1.0544, "grad_norm": 0.016163086518645287, "kl": 0.007063865661621094, "learning_rate": 2.457743074954855e-06, "loss": 0.0003, "reward": 0.5, "reward_std": 0.25, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 989 }, { "completion_length": 530.59375, "epoch": 1.0554666666666668, "grad_norm": 0.012930072844028473, "kl": 0.009815216064453125, "learning_rate": 2.4563082614542412e-06, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 990 }, { "completion_length": 638.375, "epoch": 1.0565333333333333, "grad_norm": 0.00034069232060573995, "kl": 0.004581451416015625, "learning_rate": 2.4548719722473035e-06, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 991 }, { "completion_length": 486.21875, "epoch": 1.0576, "grad_norm": 0.03185776621103287, "kl": 0.009411811828613281, "learning_rate": 2.45343420955042e-06, "loss": 0.0004, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 992 }, { "completion_length": 568.03125, "epoch": 1.0586666666666666, "grad_norm": 0.0012702896492555737, "kl": 0.0073108673095703125, "learning_rate": 2.4519949755822433e-06, "loss": 0.0003, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 993 }, { "completion_length": 547.71875, "epoch": 1.0597333333333334, "grad_norm": 0.018346931785345078, "kl": 0.0054149627685546875, "learning_rate": 2.450554272563695e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 994 }, { "completion_length": 594.1875, "epoch": 1.0608, "grad_norm": 0.013586803339421749, "kl": 0.0073909759521484375, "learning_rate": 2.4491121027179663e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 995 }, { "completion_length": 719.28125, "epoch": 1.0618666666666667, "grad_norm": 0.0006317626102827489, "kl": 0.0041046142578125, "learning_rate": 2.447668468270509e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 996 }, { "completion_length": 501.625, "epoch": 1.0629333333333333, "grad_norm": 0.013026021420955658, "kl": 0.009717941284179688, "learning_rate": 2.4462233714490373e-06, "loss": 0.0004, "reward": 0.90625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 997 }, { "completion_length": 657.09375, "epoch": 1.064, "grad_norm": 0.0006782050477340817, "kl": 0.00873565673828125, "learning_rate": 2.4447768144835197e-06, "loss": 0.0003, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 998 }, { "completion_length": 618.90625, "epoch": 1.0650666666666666, "grad_norm": 0.02651985175907612, "kl": 0.008718490600585938, "learning_rate": 2.44332879960618e-06, "loss": 0.0003, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 999 }, { "completion_length": 530.9375, "epoch": 1.0661333333333334, "grad_norm": 0.011718569323420525, "kl": 0.010198593139648438, "learning_rate": 2.441879329051491e-06, "loss": 0.0004, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1000 }, { "completion_length": 517.75, "epoch": 1.0672, "grad_norm": 0.003662874223664403, "kl": 0.0063915252685546875, "learning_rate": 2.440428405056171e-06, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 1001 }, { "completion_length": 527.75, "epoch": 1.0682666666666667, "grad_norm": 0.0005655893473885953, "kl": 0.009126663208007812, "learning_rate": 2.4389760298591824e-06, "loss": 0.0004, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1002 }, { "completion_length": 620.90625, "epoch": 1.0693333333333332, "grad_norm": 0.000720503565389663, "kl": 0.0051937103271484375, "learning_rate": 2.437522205701727e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1003 }, { "completion_length": 522.875, "epoch": 1.0704, "grad_norm": 0.013333648443222046, "kl": 0.008050918579101562, "learning_rate": 2.4360669348272407e-06, "loss": 0.0003, "reward": 0.84375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1004 }, { "completion_length": 550.4375, "epoch": 1.0714666666666666, "grad_norm": 0.0025833994150161743, "kl": 0.00891876220703125, "learning_rate": 2.4346102194813937e-06, "loss": 0.0004, "reward": 0.75, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1005 }, { "completion_length": 544.46875, "epoch": 1.0725333333333333, "grad_norm": 0.0005505552981048822, "kl": 0.00524139404296875, "learning_rate": 2.4331520619120854e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1006 }, { "completion_length": 548.1875, "epoch": 1.0735999999999999, "grad_norm": 0.00027913643862120807, "kl": 0.0047855377197265625, "learning_rate": 2.4316924643694386e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1007 }, { "completion_length": 816.25, "epoch": 1.0746666666666667, "grad_norm": 0.000232126607443206, "kl": 0.007966995239257812, "learning_rate": 2.4302314291058004e-06, "loss": 0.0003, "reward": 0.3125, "reward_std": 0.125, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 1008 }, { "completion_length": 651.84375, "epoch": 1.0757333333333334, "grad_norm": 0.0006476174457930028, "kl": 0.00614166259765625, "learning_rate": 2.4287689583757355e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1009 }, { "completion_length": 722.5, "epoch": 1.0768, "grad_norm": 0.015134432353079319, "kl": 0.005955696105957031, "learning_rate": 2.427305054436024e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1010 }, { "completion_length": 645.5, "epoch": 1.0778666666666668, "grad_norm": 0.010260011069476604, "kl": 0.00554656982421875, "learning_rate": 2.4258397195456573e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1011 }, { "completion_length": 573.6875, "epoch": 1.0789333333333333, "grad_norm": 0.01338683720678091, "kl": 0.008762359619140625, "learning_rate": 2.4243729559658347e-06, "loss": 0.0004, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1012 }, { "completion_length": 648.84375, "epoch": 1.08, "grad_norm": 0.015186600387096405, "kl": 0.007651329040527344, "learning_rate": 2.422904765959962e-06, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1013 }, { "completion_length": 669.6875, "epoch": 1.0810666666666666, "grad_norm": 0.007131900172680616, "kl": 0.005222320556640625, "learning_rate": 2.4214351517936423e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1014 }, { "completion_length": 637.25, "epoch": 1.0821333333333334, "grad_norm": 0.022795015946030617, "kl": 0.005680084228515625, "learning_rate": 2.4199641157346813e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1015 }, { "completion_length": 518.0625, "epoch": 1.0832, "grad_norm": 0.01759764365851879, "kl": 0.0081329345703125, "learning_rate": 2.4184916600530743e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1016 }, { "completion_length": 440.5, "epoch": 1.0842666666666667, "grad_norm": 0.0016089283162727952, "kl": 0.008075714111328125, "learning_rate": 2.4170177870210112e-06, "loss": 0.0003, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 1017 }, { "completion_length": 569.25, "epoch": 1.0853333333333333, "grad_norm": 0.0004580810200423002, "kl": 0.009702682495117188, "learning_rate": 2.4155424989128654e-06, "loss": 0.0004, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1018 }, { "completion_length": 582.84375, "epoch": 1.0864, "grad_norm": 0.010900857858359814, "kl": 0.005916595458984375, "learning_rate": 2.414065798005197e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1019 }, { "completion_length": 634.875, "epoch": 1.0874666666666666, "grad_norm": 0.031134778633713722, "kl": 0.0062160491943359375, "learning_rate": 2.4125876865767443e-06, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1020 }, { "completion_length": 629.875, "epoch": 1.0885333333333334, "grad_norm": 0.0008806391851976514, "kl": 0.008904457092285156, "learning_rate": 2.4111081669084224e-06, "loss": 0.0004, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1021 }, { "completion_length": 672.09375, "epoch": 1.0896, "grad_norm": 0.010930759832262993, "kl": 0.0058956146240234375, "learning_rate": 2.4096272412833213e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1022 }, { "completion_length": 731.0625, "epoch": 1.0906666666666667, "grad_norm": 0.012975483201444149, "kl": 0.008886337280273438, "learning_rate": 2.4081449119866983e-06, "loss": 0.0004, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1023 }, { "completion_length": 765.1875, "epoch": 1.0917333333333334, "grad_norm": 0.011402794159948826, "kl": 0.004702568054199219, "learning_rate": 2.4066611813059774e-06, "loss": 0.0002, "reward": 0.4375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1024 }, { "completion_length": 592.46875, "epoch": 1.0928, "grad_norm": 0.010501089505851269, "kl": 0.010557174682617188, "learning_rate": 2.405176051530746e-06, "loss": 0.0004, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1025 }, { "completion_length": 560.25, "epoch": 1.0938666666666668, "grad_norm": 0.0003612639557104558, "kl": 0.006855964660644531, "learning_rate": 2.40368952495275e-06, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1026 }, { "completion_length": 554.40625, "epoch": 1.0949333333333333, "grad_norm": 0.021788932383060455, "kl": 0.008289337158203125, "learning_rate": 2.4022016038658896e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1027 }, { "completion_length": 512.0, "epoch": 1.096, "grad_norm": 0.012089061550796032, "kl": 0.008085250854492188, "learning_rate": 2.4007122905662187e-06, "loss": 0.0003, "reward": 0.875, "reward_std": 0.125, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 1028 }, { "completion_length": 593.3125, "epoch": 1.0970666666666666, "grad_norm": 0.024261735379695892, "kl": 0.010622024536132812, "learning_rate": 2.399221587351939e-06, "loss": 0.0004, "reward": 0.5, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1029 }, { "completion_length": 777.4375, "epoch": 1.0981333333333334, "grad_norm": 0.0114437285810709, "kl": 0.0060825347900390625, "learning_rate": 2.3977294965233963e-06, "loss": 0.0002, "reward": 0.4375, "reward_std": 0.25, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1030 }, { "completion_length": 633.21875, "epoch": 1.0992, "grad_norm": 0.018525194376707077, "kl": 0.0136871337890625, "learning_rate": 2.3962360203830777e-06, "loss": 0.0005, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1031 }, { "completion_length": 684.28125, "epoch": 1.1002666666666667, "grad_norm": 0.014475378207862377, "kl": 0.0061550140380859375, "learning_rate": 2.3947411612356092e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1032 }, { "completion_length": 569.34375, "epoch": 1.1013333333333333, "grad_norm": 0.001047609024681151, "kl": 0.008119583129882812, "learning_rate": 2.3932449213877505e-06, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1033 }, { "completion_length": 521.25, "epoch": 1.1024, "grad_norm": 0.0003591480781324208, "kl": 0.00827789306640625, "learning_rate": 2.391747303148391e-06, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 1034 }, { "completion_length": 627.59375, "epoch": 1.1034666666666666, "grad_norm": 0.0006856649415567517, "kl": 0.0076847076416015625, "learning_rate": 2.390248308828548e-06, "loss": 0.0003, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1035 }, { "completion_length": 654.59375, "epoch": 1.1045333333333334, "grad_norm": 0.012299902737140656, "kl": 0.0045928955078125, "learning_rate": 2.3887479407413617e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1036 }, { "completion_length": 587.46875, "epoch": 1.1056, "grad_norm": 0.009356142953038216, "kl": 0.00801849365234375, "learning_rate": 2.387246201202093e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1037 }, { "completion_length": 575.0, "epoch": 1.1066666666666667, "grad_norm": 0.01558208279311657, "kl": 0.008428573608398438, "learning_rate": 2.3857430925281186e-06, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1038 }, { "completion_length": 482.9375, "epoch": 1.1077333333333332, "grad_norm": 0.012071652337908745, "kl": 0.010345458984375, "learning_rate": 2.3842386170389277e-06, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1039 }, { "completion_length": 528.0, "epoch": 1.1088, "grad_norm": 0.001177618163637817, "kl": 0.0172119140625, "learning_rate": 2.382732777056119e-06, "loss": 0.0007, "reward": 0.90625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 1040 }, { "completion_length": 618.59375, "epoch": 1.1098666666666666, "grad_norm": 0.011930396780371666, "kl": 0.01010894775390625, "learning_rate": 2.3812255749033975e-06, "loss": 0.0004, "reward": 0.625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1041 }, { "completion_length": 588.65625, "epoch": 1.1109333333333333, "grad_norm": 0.018861323595046997, "kl": 0.008932113647460938, "learning_rate": 2.379717012906568e-06, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1042 }, { "completion_length": 584.1875, "epoch": 1.112, "grad_norm": 0.010661518201231956, "kl": 0.00939178466796875, "learning_rate": 2.3782070933935363e-06, "loss": 0.0004, "reward": 0.78125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1043 }, { "completion_length": 633.59375, "epoch": 1.1130666666666666, "grad_norm": 0.002169155515730381, "kl": 0.0052051544189453125, "learning_rate": 2.3766958186943022e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1044 }, { "completion_length": 554.375, "epoch": 1.1141333333333334, "grad_norm": 0.0004730051732622087, "kl": 0.007932662963867188, "learning_rate": 2.3751831911409554e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1045 }, { "completion_length": 683.125, "epoch": 1.1152, "grad_norm": 0.014937917701900005, "kl": 0.007822036743164062, "learning_rate": 2.373669213067675e-06, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.3125, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1046 }, { "completion_length": 615.625, "epoch": 1.1162666666666667, "grad_norm": 0.011868114583194256, "kl": 0.00522613525390625, "learning_rate": 2.3721538868107225e-06, "loss": 0.0002, "reward": 0.34375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 1047 }, { "completion_length": 621.21875, "epoch": 1.1173333333333333, "grad_norm": 0.013019127771258354, "kl": 0.0067615509033203125, "learning_rate": 2.370637214708442e-06, "loss": 0.0003, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1048 }, { "completion_length": 565.71875, "epoch": 1.1184, "grad_norm": 0.01918908953666687, "kl": 0.010660171508789062, "learning_rate": 2.3691191991012524e-06, "loss": 0.0004, "reward": 0.71875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1049 }, { "completion_length": 599.90625, "epoch": 1.1194666666666666, "grad_norm": 0.012187689542770386, "kl": 0.011203765869140625, "learning_rate": 2.367599842331646e-06, "loss": 0.0004, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1050 }, { "completion_length": 613.34375, "epoch": 1.1205333333333334, "grad_norm": 0.012218169867992401, "kl": 0.008213043212890625, "learning_rate": 2.3660791467441863e-06, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1051 }, { "completion_length": 589.0, "epoch": 1.1216, "grad_norm": 0.008478758856654167, "kl": 0.0052204132080078125, "learning_rate": 2.364557114685501e-06, "loss": 0.0002, "reward": 0.84375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1052 }, { "completion_length": 562.5625, "epoch": 1.1226666666666667, "grad_norm": 0.010902628302574158, "kl": 0.006885528564453125, "learning_rate": 2.3630337485042807e-06, "loss": 0.0003, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1053 }, { "completion_length": 429.28125, "epoch": 1.1237333333333333, "grad_norm": 0.009677627123892307, "kl": 0.01123809814453125, "learning_rate": 2.3615090505512755e-06, "loss": 0.0005, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1054 }, { "completion_length": 651.78125, "epoch": 1.1248, "grad_norm": 0.012361954897642136, "kl": 0.007305145263671875, "learning_rate": 2.3599830231792896e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1055 }, { "completion_length": 615.21875, "epoch": 1.1258666666666666, "grad_norm": 0.000579266925342381, "kl": 0.0051898956298828125, "learning_rate": 2.3584556687431787e-06, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1056 }, { "completion_length": 655.5625, "epoch": 1.1269333333333333, "grad_norm": 0.00038487129495479167, "kl": 0.012563705444335938, "learning_rate": 2.3569269895998465e-06, "loss": 0.0005, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1057 }, { "completion_length": 529.84375, "epoch": 1.1280000000000001, "grad_norm": 0.0004189469909761101, "kl": 0.012237548828125, "learning_rate": 2.355396988108242e-06, "loss": 0.0005, "reward": 0.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1058 }, { "completion_length": 547.15625, "epoch": 1.1290666666666667, "grad_norm": 0.00047873525181785226, "kl": 0.0053386688232421875, "learning_rate": 2.3538656666293525e-06, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1059 }, { "completion_length": 482.875, "epoch": 1.1301333333333332, "grad_norm": 0.012812867760658264, "kl": 0.008245468139648438, "learning_rate": 2.352333027526204e-06, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1060 }, { "completion_length": 551.5, "epoch": 1.1312, "grad_norm": 0.016537869349122047, "kl": 0.009138107299804688, "learning_rate": 2.350799073163856e-06, "loss": 0.0004, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1061 }, { "completion_length": 725.3125, "epoch": 1.1322666666666668, "grad_norm": 0.00019561979570426047, "kl": 0.004185676574707031, "learning_rate": 2.3492638059093957e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1062 }, { "completion_length": 529.3125, "epoch": 1.1333333333333333, "grad_norm": 0.014647443778812885, "kl": 0.008878707885742188, "learning_rate": 2.3477272281319386e-06, "loss": 0.0004, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1063 }, { "completion_length": 460.53125, "epoch": 1.1344, "grad_norm": 0.00046819730778224766, "kl": 0.0155792236328125, "learning_rate": 2.3461893422026198e-06, "loss": 0.0006, "reward": 0.84375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1064 }, { "completion_length": 606.4375, "epoch": 1.1354666666666666, "grad_norm": 0.014641834422945976, "kl": 0.008245468139648438, "learning_rate": 2.344650150494596e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1065 }, { "completion_length": 577.125, "epoch": 1.1365333333333334, "grad_norm": 0.009389547631144524, "kl": 0.00620269775390625, "learning_rate": 2.343109655383037e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1066 }, { "completion_length": 717.34375, "epoch": 1.1376, "grad_norm": 0.00020591873908415437, "kl": 0.006434440612792969, "learning_rate": 2.341567859245124e-06, "loss": 0.0003, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1067 }, { "completion_length": 717.34375, "epoch": 1.1386666666666667, "grad_norm": 0.002627094741910696, "kl": 0.0052623748779296875, "learning_rate": 2.340024764460046e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1068 }, { "completion_length": 613.21875, "epoch": 1.1397333333333333, "grad_norm": 0.014577399007976055, "kl": 0.011533737182617188, "learning_rate": 2.3384803734089975e-06, "loss": 0.0005, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1069 }, { "completion_length": 462.5625, "epoch": 1.1408, "grad_norm": 0.016221966594457626, "kl": 0.010059356689453125, "learning_rate": 2.3369346884751707e-06, "loss": 0.0004, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1070 }, { "completion_length": 687.125, "epoch": 1.1418666666666666, "grad_norm": 0.00889455247670412, "kl": 0.008184432983398438, "learning_rate": 2.3353877120437565e-06, "loss": 0.0003, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1071 }, { "completion_length": 473.0625, "epoch": 1.1429333333333334, "grad_norm": 0.019018476828932762, "kl": 0.008113861083984375, "learning_rate": 2.333839446501938e-06, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 1072 }, { "completion_length": 458.0, "epoch": 1.144, "grad_norm": 0.0006568889948539436, "kl": 0.012599945068359375, "learning_rate": 2.3322898942388862e-06, "loss": 0.0005, "reward": 0.90625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 1073 }, { "completion_length": 524.3125, "epoch": 1.1450666666666667, "grad_norm": 0.0005744692170992494, "kl": 0.004756927490234375, "learning_rate": 2.330739057645761e-06, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1074 }, { "completion_length": 518.4375, "epoch": 1.1461333333333332, "grad_norm": 0.00034784144372679293, "kl": 0.011058807373046875, "learning_rate": 2.329186939115701e-06, "loss": 0.0004, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1075 }, { "completion_length": 614.53125, "epoch": 1.1472, "grad_norm": 0.010061669163405895, "kl": 0.008920669555664062, "learning_rate": 2.3276335410438246e-06, "loss": 0.0004, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1076 }, { "completion_length": 510.0, "epoch": 1.1482666666666668, "grad_norm": 0.03813467174768448, "kl": 0.0163421630859375, "learning_rate": 2.3260788658272246e-06, "loss": 0.0007, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 1077 }, { "completion_length": 543.40625, "epoch": 1.1493333333333333, "grad_norm": 0.0008230321109294891, "kl": 0.0077972412109375, "learning_rate": 2.324522915864963e-06, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1078 }, { "completion_length": 691.34375, "epoch": 1.1504, "grad_norm": 0.012825827114284039, "kl": 0.009683609008789062, "learning_rate": 2.3229656935580715e-06, "loss": 0.0004, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1079 }, { "completion_length": 549.3125, "epoch": 1.1514666666666666, "grad_norm": 0.016602125018835068, "kl": 0.013025283813476562, "learning_rate": 2.3214072013095436e-06, "loss": 0.0005, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1080 }, { "completion_length": 602.8125, "epoch": 1.1525333333333334, "grad_norm": 0.006718806456774473, "kl": 0.0082244873046875, "learning_rate": 2.3198474415243323e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1081 }, { "completion_length": 516.625, "epoch": 1.1536, "grad_norm": 0.019506407901644707, "kl": 0.007171630859375, "learning_rate": 2.3182864166093476e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1082 }, { "completion_length": 542.5, "epoch": 1.1546666666666667, "grad_norm": 0.0005245170323178172, "kl": 0.012073516845703125, "learning_rate": 2.3167241289734514e-06, "loss": 0.0005, "reward": 0.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1083 }, { "completion_length": 515.65625, "epoch": 1.1557333333333333, "grad_norm": 0.0008858436485752463, "kl": 0.012317657470703125, "learning_rate": 2.3151605810274527e-06, "loss": 0.0005, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1084 }, { "completion_length": 559.8125, "epoch": 1.1568, "grad_norm": 0.009225446730852127, "kl": 0.0087432861328125, "learning_rate": 2.3135957751841075e-06, "loss": 0.0004, "reward": 0.59375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1085 }, { "completion_length": 557.0, "epoch": 1.1578666666666666, "grad_norm": 0.023872263729572296, "kl": 0.008855819702148438, "learning_rate": 2.312029713858112e-06, "loss": 0.0004, "reward": 0.78125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1086 }, { "completion_length": 623.8125, "epoch": 1.1589333333333334, "grad_norm": 0.018188845366239548, "kl": 0.00466156005859375, "learning_rate": 2.3104623994660996e-06, "loss": 0.0002, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1087 }, { "completion_length": 533.6875, "epoch": 1.16, "grad_norm": 0.011855081655085087, "kl": 0.0071868896484375, "learning_rate": 2.308893834426637e-06, "loss": 0.0003, "reward": 0.84375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1088 }, { "completion_length": 550.0625, "epoch": 1.1610666666666667, "grad_norm": 0.0004817430744878948, "kl": 0.011798858642578125, "learning_rate": 2.307324021160222e-06, "loss": 0.0005, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1089 }, { "completion_length": 554.28125, "epoch": 1.1621333333333332, "grad_norm": 0.02092517726123333, "kl": 0.009326934814453125, "learning_rate": 2.3057529620892775e-06, "loss": 0.0004, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1090 }, { "completion_length": 551.34375, "epoch": 1.1632, "grad_norm": 0.0005532547947950661, "kl": 0.007266998291015625, "learning_rate": 2.3041806596381493e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1091 }, { "completion_length": 544.03125, "epoch": 1.1642666666666668, "grad_norm": 0.0007244640146382153, "kl": 0.0048160552978515625, "learning_rate": 2.302607116233101e-06, "loss": 0.0002, "reward": 0.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 1092 }, { "completion_length": 517.0625, "epoch": 1.1653333333333333, "grad_norm": 0.01631855219602585, "kl": 0.007354736328125, "learning_rate": 2.3010323343023135e-06, "loss": 0.0003, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1093 }, { "completion_length": 545.65625, "epoch": 1.1663999999999999, "grad_norm": 0.000859578256495297, "kl": 0.0066928863525390625, "learning_rate": 2.2994563162758758e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1094 }, { "completion_length": 488.4375, "epoch": 1.1674666666666667, "grad_norm": 0.011287637986242771, "kl": 0.008213043212890625, "learning_rate": 2.2978790645857867e-06, "loss": 0.0003, "reward": 0.84375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1095 }, { "completion_length": 726.09375, "epoch": 1.1685333333333334, "grad_norm": 0.03011258691549301, "kl": 0.008062362670898438, "learning_rate": 2.2963005816659477e-06, "loss": 0.0003, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1096 }, { "completion_length": 535.9375, "epoch": 1.1696, "grad_norm": 0.02621668204665184, "kl": 0.013271331787109375, "learning_rate": 2.2947208699521608e-06, "loss": 0.0005, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1097 }, { "completion_length": 694.03125, "epoch": 1.1706666666666667, "grad_norm": 0.005752963945269585, "kl": 0.0068874359130859375, "learning_rate": 2.293139931882123e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1098 }, { "completion_length": 691.125, "epoch": 1.1717333333333333, "grad_norm": 0.0005337951588444412, "kl": 0.0057525634765625, "learning_rate": 2.2915577698954254e-06, "loss": 0.0002, "reward": 0.4375, "reward_std": 0.125, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1099 }, { "completion_length": 557.8125, "epoch": 1.1728, "grad_norm": 0.013920563273131847, "kl": 0.007091522216796875, "learning_rate": 2.2899743864335463e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1100 }, { "completion_length": 666.875, "epoch": 1.1738666666666666, "grad_norm": 0.0037772157229483128, "kl": 0.012079238891601562, "learning_rate": 2.28838978393985e-06, "loss": 0.0005, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1101 }, { "completion_length": 580.1875, "epoch": 1.1749333333333334, "grad_norm": 0.0002662607003003359, "kl": 0.011381149291992188, "learning_rate": 2.2868039648595807e-06, "loss": 0.0005, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1102 }, { "completion_length": 569.8125, "epoch": 1.176, "grad_norm": 0.0006004689494147897, "kl": 0.009122848510742188, "learning_rate": 2.2852169316398612e-06, "loss": 0.0004, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1103 }, { "completion_length": 566.125, "epoch": 1.1770666666666667, "grad_norm": 0.0010300504509359598, "kl": 0.015949249267578125, "learning_rate": 2.2836286867296872e-06, "loss": 0.0006, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1104 }, { "completion_length": 672.71875, "epoch": 1.1781333333333333, "grad_norm": 0.013759510591626167, "kl": 0.008871078491210938, "learning_rate": 2.282039232579925e-06, "loss": 0.0004, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1105 }, { "completion_length": 720.4375, "epoch": 1.1792, "grad_norm": 0.0003082614275626838, "kl": 0.0047283172607421875, "learning_rate": 2.280448571643305e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1106 }, { "completion_length": 685.53125, "epoch": 1.1802666666666666, "grad_norm": 0.013137449510395527, "kl": 0.008203506469726562, "learning_rate": 2.278856706374422e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1107 }, { "completion_length": 608.0, "epoch": 1.1813333333333333, "grad_norm": 0.01041882298886776, "kl": 0.007549285888671875, "learning_rate": 2.277263639229728e-06, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1108 }, { "completion_length": 495.625, "epoch": 1.1824, "grad_norm": 0.022505979984998703, "kl": 0.010679244995117188, "learning_rate": 2.2756693726675305e-06, "loss": 0.0004, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1109 }, { "completion_length": 580.96875, "epoch": 1.1834666666666667, "grad_norm": 0.007970555685460567, "kl": 0.02672576904296875, "learning_rate": 2.274073909147986e-06, "loss": 0.0011, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1110 }, { "completion_length": 695.75, "epoch": 1.1845333333333334, "grad_norm": 0.014978848397731781, "kl": 0.014066696166992188, "learning_rate": 2.2724772511331015e-06, "loss": 0.0006, "reward": 0.53125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1111 }, { "completion_length": 489.65625, "epoch": 1.1856, "grad_norm": 0.012171509675681591, "kl": 0.01018524169921875, "learning_rate": 2.2708794010867225e-06, "loss": 0.0004, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1112 }, { "completion_length": 582.75, "epoch": 1.1866666666666668, "grad_norm": 0.021932998672127724, "kl": 0.00579071044921875, "learning_rate": 2.2692803614745386e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1113 }, { "completion_length": 596.0625, "epoch": 1.1877333333333333, "grad_norm": 0.0006007946212776005, "kl": 0.010829925537109375, "learning_rate": 2.267680134764072e-06, "loss": 0.0004, "reward": 0.6875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1114 }, { "completion_length": 546.96875, "epoch": 1.1888, "grad_norm": 0.007610719185322523, "kl": 0.0062084197998046875, "learning_rate": 2.266078723424679e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1115 }, { "completion_length": 797.375, "epoch": 1.1898666666666666, "grad_norm": 0.013942292891442776, "kl": 0.0057086944580078125, "learning_rate": 2.264476129927541e-06, "loss": 0.0002, "reward": 0.40625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 1116 }, { "completion_length": 728.53125, "epoch": 1.1909333333333334, "grad_norm": 0.1281808316707611, "kl": 0.04720497131347656, "learning_rate": 2.262872356745667e-06, "loss": 0.0019, "reward": 0.375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 1117 }, { "completion_length": 776.03125, "epoch": 1.192, "grad_norm": 0.01756027154624462, "kl": 0.0060024261474609375, "learning_rate": 2.2612674063538836e-06, "loss": 0.0002, "reward": 0.34375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 1118 }, { "completion_length": 654.71875, "epoch": 1.1930666666666667, "grad_norm": 0.01908211223781109, "kl": 0.004630088806152344, "learning_rate": 2.259661281228836e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1119 }, { "completion_length": 473.8125, "epoch": 1.1941333333333333, "grad_norm": 0.01656883768737316, "kl": 0.00832366943359375, "learning_rate": 2.2580539838489804e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1120 }, { "completion_length": 729.28125, "epoch": 1.1952, "grad_norm": 0.011806572787463665, "kl": 0.0046062469482421875, "learning_rate": 2.2564455166945836e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1121 }, { "completion_length": 582.21875, "epoch": 1.1962666666666666, "grad_norm": 0.006238792557269335, "kl": 0.015272140502929688, "learning_rate": 2.254835882247716e-06, "loss": 0.0006, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1122 }, { "completion_length": 646.75, "epoch": 1.1973333333333334, "grad_norm": 0.02237238734960556, "kl": 0.006000518798828125, "learning_rate": 2.2532250829922504e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1123 }, { "completion_length": 733.03125, "epoch": 1.1984, "grad_norm": 0.013292050920426846, "kl": 0.0055904388427734375, "learning_rate": 2.251613121413858e-06, "loss": 0.0002, "reward": 0.28125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 1124 }, { "completion_length": 779.5, "epoch": 1.1994666666666667, "grad_norm": 0.011388933286070824, "kl": 0.0063323974609375, "learning_rate": 2.25e-06, "loss": 0.0003, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1125 }, { "completion_length": 641.0625, "epoch": 1.2005333333333335, "grad_norm": 0.009037202224135399, "kl": 0.0075168609619140625, "learning_rate": 2.248385721239931e-06, "loss": 0.0003, "reward": 0.5625, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1126 }, { "completion_length": 646.09375, "epoch": 1.2016, "grad_norm": 0.01855260320007801, "kl": 0.0071277618408203125, "learning_rate": 2.2467702876246907e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1127 }, { "completion_length": 674.71875, "epoch": 1.2026666666666666, "grad_norm": 0.01736130379140377, "kl": 0.0095367431640625, "learning_rate": 2.245153701647099e-06, "loss": 0.0004, "reward": 0.5625, "reward_std": 0.375, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1128 }, { "completion_length": 563.03125, "epoch": 1.2037333333333333, "grad_norm": 0.014031539671123028, "kl": 0.0064716339111328125, "learning_rate": 2.2435359658017558e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1129 }, { "completion_length": 599.59375, "epoch": 1.2048, "grad_norm": 0.01800990104675293, "kl": 0.0091094970703125, "learning_rate": 2.2419170825850363e-06, "loss": 0.0004, "reward": 0.65625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1130 }, { "completion_length": 560.125, "epoch": 1.2058666666666666, "grad_norm": 0.000375211559003219, "kl": 0.0064907073974609375, "learning_rate": 2.2402970544950836e-06, "loss": 0.0003, "reward": 0.84375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1131 }, { "completion_length": 811.09375, "epoch": 1.2069333333333334, "grad_norm": 0.015832360833883286, "kl": 0.010030746459960938, "learning_rate": 2.238675884031809e-06, "loss": 0.0004, "reward": 0.4375, "reward_std": 0.25, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1132 }, { "completion_length": 590.34375, "epoch": 1.208, "grad_norm": 0.0004541722300928086, "kl": 0.012119293212890625, "learning_rate": 2.237053573696887e-06, "loss": 0.0005, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1133 }, { "completion_length": 570.96875, "epoch": 1.2090666666666667, "grad_norm": 0.0006265526753850281, "kl": 0.011880874633789062, "learning_rate": 2.23543012599375e-06, "loss": 0.0005, "reward": 0.59375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1134 }, { "completion_length": 482.78125, "epoch": 1.2101333333333333, "grad_norm": 0.023369288071990013, "kl": 0.009256362915039062, "learning_rate": 2.2338055434275873e-06, "loss": 0.0004, "reward": 0.65625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1135 }, { "completion_length": 658.5625, "epoch": 1.2112, "grad_norm": 0.013758067972958088, "kl": 0.0120391845703125, "learning_rate": 2.2321798285053366e-06, "loss": 0.0005, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1136 }, { "completion_length": 705.9375, "epoch": 1.2122666666666666, "grad_norm": 0.015511819161474705, "kl": 0.00739288330078125, "learning_rate": 2.230552983735686e-06, "loss": 0.0003, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1137 }, { "completion_length": 710.46875, "epoch": 1.2133333333333334, "grad_norm": 0.020236384123563766, "kl": 0.01003265380859375, "learning_rate": 2.2289250116290644e-06, "loss": 0.0004, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1138 }, { "completion_length": 599.71875, "epoch": 1.2144, "grad_norm": 0.0009744733688421547, "kl": 0.00830078125, "learning_rate": 2.2272959146976422e-06, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1139 }, { "completion_length": 616.6875, "epoch": 1.2154666666666667, "grad_norm": 0.014037062413990498, "kl": 0.009141921997070312, "learning_rate": 2.225665695455325e-06, "loss": 0.0004, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1140 }, { "completion_length": 546.15625, "epoch": 1.2165333333333332, "grad_norm": 0.01843961514532566, "kl": 0.008996963500976562, "learning_rate": 2.2240343564177498e-06, "loss": 0.0004, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1141 }, { "completion_length": 669.4375, "epoch": 1.2176, "grad_norm": 0.013629628345370293, "kl": 0.008558273315429688, "learning_rate": 2.2224019001022824e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1142 }, { "completion_length": 715.59375, "epoch": 1.2186666666666666, "grad_norm": 0.008678143844008446, "kl": 0.0074615478515625, "learning_rate": 2.220768329028013e-06, "loss": 0.0003, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1143 }, { "completion_length": 560.65625, "epoch": 1.2197333333333333, "grad_norm": 0.010131691582500935, "kl": 0.0104827880859375, "learning_rate": 2.21913364571575e-06, "loss": 0.0004, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1144 }, { "completion_length": 642.125, "epoch": 1.2208, "grad_norm": 0.0004585182759910822, "kl": 0.006140708923339844, "learning_rate": 2.21749785268802e-06, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1145 }, { "completion_length": 674.3125, "epoch": 1.2218666666666667, "grad_norm": 0.010328765958547592, "kl": 0.0075092315673828125, "learning_rate": 2.2158609524690615e-06, "loss": 0.0003, "reward": 0.375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 1146 }, { "completion_length": 625.0, "epoch": 1.2229333333333334, "grad_norm": 0.01271140482276678, "kl": 0.008211135864257812, "learning_rate": 2.214222947584822e-06, "loss": 0.0003, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1147 }, { "completion_length": 609.03125, "epoch": 1.224, "grad_norm": 0.0004703441518358886, "kl": 0.00830841064453125, "learning_rate": 2.2125838405629517e-06, "loss": 0.0003, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1148 }, { "completion_length": 505.625, "epoch": 1.2250666666666667, "grad_norm": 0.015601180493831635, "kl": 0.010877609252929688, "learning_rate": 2.210943633932805e-06, "loss": 0.0004, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1149 }, { "completion_length": 577.03125, "epoch": 1.2261333333333333, "grad_norm": 0.01735014282166958, "kl": 0.011502265930175781, "learning_rate": 2.2093023302254297e-06, "loss": 0.0005, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1150 }, { "completion_length": 587.9375, "epoch": 1.2272, "grad_norm": 0.010533609427511692, "kl": 0.00722503662109375, "learning_rate": 2.207659931973568e-06, "loss": 0.0003, "reward": 0.84375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1151 }, { "completion_length": 721.6875, "epoch": 1.2282666666666666, "grad_norm": 0.0172139722853899, "kl": 0.007476806640625, "learning_rate": 2.206016441711652e-06, "loss": 0.0003, "reward": 0.5625, "reward_std": 0.4471687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1152 }, { "completion_length": 667.65625, "epoch": 1.2293333333333334, "grad_norm": 0.015525475144386292, "kl": 0.0053348541259765625, "learning_rate": 2.204371861975798e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1153 }, { "completion_length": 628.90625, "epoch": 1.2304, "grad_norm": 0.00037340386188589036, "kl": 0.013690948486328125, "learning_rate": 2.202726195303802e-06, "loss": 0.0005, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1154 }, { "completion_length": 506.8125, "epoch": 1.2314666666666667, "grad_norm": 0.015397511422634125, "kl": 0.010847091674804688, "learning_rate": 2.20107944423514e-06, "loss": 0.0004, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 1155 }, { "completion_length": 526.875, "epoch": 1.2325333333333333, "grad_norm": 0.00968928076326847, "kl": 0.009616851806640625, "learning_rate": 2.1994316113109607e-06, "loss": 0.0004, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1156 }, { "completion_length": 541.25, "epoch": 1.2336, "grad_norm": 0.02263009175658226, "kl": 0.0054683685302734375, "learning_rate": 2.1977826990740798e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1157 }, { "completion_length": 565.65625, "epoch": 1.2346666666666666, "grad_norm": 0.017205731943249702, "kl": 0.016357421875, "learning_rate": 2.1961327100689823e-06, "loss": 0.0007, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1158 }, { "completion_length": 765.0, "epoch": 1.2357333333333334, "grad_norm": 0.06228814274072647, "kl": 0.005672454833984375, "learning_rate": 2.1944816468418123e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1159 }, { "completion_length": 615.65625, "epoch": 1.2368000000000001, "grad_norm": 0.012319199740886688, "kl": 0.0043964385986328125, "learning_rate": 2.1928295119403713e-06, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1160 }, { "completion_length": 633.375, "epoch": 1.2378666666666667, "grad_norm": 0.019157785922288895, "kl": 0.007946014404296875, "learning_rate": 2.1911763079141163e-06, "loss": 0.0003, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1161 }, { "completion_length": 640.6875, "epoch": 1.2389333333333332, "grad_norm": 0.012698204256594181, "kl": 0.00716400146484375, "learning_rate": 2.1895220373141533e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1162 }, { "completion_length": 652.1875, "epoch": 1.24, "grad_norm": 0.009386896155774593, "kl": 0.0113372802734375, "learning_rate": 2.1878667026932333e-06, "loss": 0.0005, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1163 }, { "completion_length": 718.15625, "epoch": 1.2410666666666668, "grad_norm": 0.014553130604326725, "kl": 0.008291244506835938, "learning_rate": 2.1862103066057508e-06, "loss": 0.0003, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1164 }, { "completion_length": 595.6875, "epoch": 1.2421333333333333, "grad_norm": 0.011394434608519077, "kl": 0.011522293090820312, "learning_rate": 2.1845528516077364e-06, "loss": 0.0005, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1165 }, { "completion_length": 589.6875, "epoch": 1.2432, "grad_norm": 0.0002169158833567053, "kl": 0.00995635986328125, "learning_rate": 2.182894340256857e-06, "loss": 0.0004, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1166 }, { "completion_length": 656.8125, "epoch": 1.2442666666666666, "grad_norm": 0.0006030050572007895, "kl": 0.0066585540771484375, "learning_rate": 2.1812347751124072e-06, "loss": 0.0003, "reward": 0.375, "reward_std": 0.25, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 1167 }, { "completion_length": 369.53125, "epoch": 1.2453333333333334, "grad_norm": 0.011113031767308712, "kl": 0.010950088500976562, "learning_rate": 2.17957415873531e-06, "loss": 0.0004, "reward": 0.96875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 0.0, "step": 1168 }, { "completion_length": 786.21875, "epoch": 1.2464, "grad_norm": 0.0006681903614662588, "kl": 0.00670623779296875, "learning_rate": 2.177912493688109e-06, "loss": 0.0003, "reward": 0.53125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1169 }, { "completion_length": 713.4375, "epoch": 1.2474666666666667, "grad_norm": 0.017123019322752953, "kl": 0.0066070556640625, "learning_rate": 2.1762497825349665e-06, "loss": 0.0003, "reward": 0.34375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 1170 }, { "completion_length": 498.1875, "epoch": 1.2485333333333333, "grad_norm": 0.014773450791835785, "kl": 0.00798797607421875, "learning_rate": 2.1745860278416587e-06, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.125, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 1171 }, { "completion_length": 579.53125, "epoch": 1.2496, "grad_norm": 0.0004623143468052149, "kl": 0.0076751708984375, "learning_rate": 2.1729212321755738e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.375, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1172 }, { "completion_length": 507.53125, "epoch": 1.2506666666666666, "grad_norm": 0.00038370766560547054, "kl": 0.008205413818359375, "learning_rate": 2.171255398105703e-06, "loss": 0.0003, "reward": 0.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 1173 }, { "completion_length": 560.0, "epoch": 1.2517333333333334, "grad_norm": 0.020410023629665375, "kl": 0.01709747314453125, "learning_rate": 2.169588528202644e-06, "loss": 0.0007, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1174 }, { "completion_length": 670.90625, "epoch": 1.2528000000000001, "grad_norm": 0.013416011817753315, "kl": 0.00598907470703125, "learning_rate": 2.16792062503859e-06, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1175 }, { "completion_length": 609.9375, "epoch": 1.2538666666666667, "grad_norm": 0.01285651046782732, "kl": 0.00811004638671875, "learning_rate": 2.166251691187329e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1176 }, { "completion_length": 650.875, "epoch": 1.2549333333333332, "grad_norm": 0.012745539657771587, "kl": 0.0076141357421875, "learning_rate": 2.1645817292242415e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1177 }, { "completion_length": 567.9375, "epoch": 1.256, "grad_norm": 0.0003793219511862844, "kl": 0.00530242919921875, "learning_rate": 2.162910741726292e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1178 }, { "completion_length": 459.8125, "epoch": 1.2570666666666668, "grad_norm": 0.025026338174939156, "kl": 0.01636505126953125, "learning_rate": 2.1612387312720286e-06, "loss": 0.0007, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1179 }, { "completion_length": 609.09375, "epoch": 1.2581333333333333, "grad_norm": 0.009737813845276833, "kl": 0.008028030395507812, "learning_rate": 2.159565700441578e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1180 }, { "completion_length": 631.40625, "epoch": 1.2591999999999999, "grad_norm": 0.0005282722995616496, "kl": 0.013525009155273438, "learning_rate": 2.1578916518166414e-06, "loss": 0.0005, "reward": 0.75, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1181 }, { "completion_length": 463.40625, "epoch": 1.2602666666666666, "grad_norm": 0.023690953850746155, "kl": 0.016309738159179688, "learning_rate": 2.156216587980491e-06, "loss": 0.0007, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1182 }, { "completion_length": 707.125, "epoch": 1.2613333333333334, "grad_norm": 0.03520140051841736, "kl": 0.01123046875, "learning_rate": 2.154540511517964e-06, "loss": 0.0004, "reward": 0.40625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 1183 }, { "completion_length": 740.46875, "epoch": 1.2624, "grad_norm": 0.00896117091178894, "kl": 0.0057544708251953125, "learning_rate": 2.152863425015463e-06, "loss": 0.0002, "reward": 0.40625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 1184 }, { "completion_length": 659.34375, "epoch": 1.2634666666666667, "grad_norm": 0.012418188154697418, "kl": 0.008584976196289062, "learning_rate": 2.1511853310609467e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1185 }, { "completion_length": 751.6875, "epoch": 1.2645333333333333, "grad_norm": 137.45782470703125, "kl": 12.13136100769043, "learning_rate": 2.1495062322439295e-06, "loss": 0.4835, "reward": 0.5, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1186 }, { "completion_length": 600.34375, "epoch": 1.2656, "grad_norm": 0.010285577736794949, "kl": 0.008573532104492188, "learning_rate": 2.147826131155476e-06, "loss": 0.0003, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1187 }, { "completion_length": 623.25, "epoch": 1.2666666666666666, "grad_norm": 0.019483089447021484, "kl": 0.009019851684570312, "learning_rate": 2.146145030388198e-06, "loss": 0.0004, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1188 }, { "completion_length": 668.03125, "epoch": 1.2677333333333334, "grad_norm": 0.0005610976950265467, "kl": 0.008230209350585938, "learning_rate": 2.1444629325362495e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1189 }, { "completion_length": 620.4375, "epoch": 1.2688, "grad_norm": 0.017677342519164085, "kl": 0.009508132934570312, "learning_rate": 2.1427798401953234e-06, "loss": 0.0004, "reward": 0.46875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1190 }, { "completion_length": 576.75, "epoch": 1.2698666666666667, "grad_norm": 0.011769439093768597, "kl": 0.00794219970703125, "learning_rate": 2.141095755962647e-06, "loss": 0.0003, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1191 }, { "completion_length": 732.90625, "epoch": 1.2709333333333332, "grad_norm": 0.0009140791371464729, "kl": 0.00836181640625, "learning_rate": 2.1394106824369786e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 1192 }, { "completion_length": 504.5625, "epoch": 1.272, "grad_norm": 0.0003776104422286153, "kl": 0.015415191650390625, "learning_rate": 2.137724622218602e-06, "loss": 0.0006, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1193 }, { "completion_length": 648.9375, "epoch": 1.2730666666666668, "grad_norm": 0.011455425061285496, "kl": 0.0071353912353515625, "learning_rate": 2.1360375779093257e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1194 }, { "completion_length": 595.03125, "epoch": 1.2741333333333333, "grad_norm": 0.015172876417636871, "kl": 0.0050907135009765625, "learning_rate": 2.134349552112474e-06, "loss": 0.0002, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1195 }, { "completion_length": 733.21875, "epoch": 1.2752, "grad_norm": 0.015222933143377304, "kl": 0.009700775146484375, "learning_rate": 2.1326605474328885e-06, "loss": 0.0004, "reward": 0.5, "reward_std": 0.25, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1196 }, { "completion_length": 571.1875, "epoch": 1.2762666666666667, "grad_norm": 0.001907902886159718, "kl": 0.012099266052246094, "learning_rate": 2.1309705664769195e-06, "loss": 0.0005, "reward": 0.75, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1197 }, { "completion_length": 614.0, "epoch": 1.2773333333333334, "grad_norm": 0.0010743369348347187, "kl": 0.0067844390869140625, "learning_rate": 2.1292796118524247e-06, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.125, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 1198 }, { "completion_length": 676.9375, "epoch": 1.2784, "grad_norm": 0.00923280231654644, "kl": 0.007068634033203125, "learning_rate": 2.127587686168763e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1199 }, { "completion_length": 666.75, "epoch": 1.2794666666666665, "grad_norm": 0.012538731098175049, "kl": 0.012334823608398438, "learning_rate": 2.1258947920367943e-06, "loss": 0.0005, "reward": 0.625, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1200 }, { "completion_length": 572.875, "epoch": 1.2805333333333333, "grad_norm": 0.029093483462929726, "kl": 0.0093841552734375, "learning_rate": 2.1242009320688705e-06, "loss": 0.0004, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1201 }, { "completion_length": 467.5, "epoch": 1.2816, "grad_norm": 0.018018577247858047, "kl": 0.00757598876953125, "learning_rate": 2.122506108878835e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1202 }, { "completion_length": 478.4375, "epoch": 1.2826666666666666, "grad_norm": 0.015252448618412018, "kl": 0.016679763793945312, "learning_rate": 2.120810325082017e-06, "loss": 0.0007, "reward": 0.8125, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1203 }, { "completion_length": 577.75, "epoch": 1.2837333333333334, "grad_norm": 0.014902959577739239, "kl": 0.008331298828125, "learning_rate": 2.11911358329523e-06, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1204 }, { "completion_length": 647.34375, "epoch": 1.2848, "grad_norm": 0.001308619393967092, "kl": 0.00862884521484375, "learning_rate": 2.1174158861367616e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1205 }, { "completion_length": 738.625, "epoch": 1.2858666666666667, "grad_norm": 0.0002523988368920982, "kl": 0.0060672760009765625, "learning_rate": 2.1157172362263782e-06, "loss": 0.0002, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1206 }, { "completion_length": 675.78125, "epoch": 1.2869333333333333, "grad_norm": 0.0004171332693658769, "kl": 0.009471893310546875, "learning_rate": 2.1140176361853143e-06, "loss": 0.0004, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1207 }, { "completion_length": 625.9375, "epoch": 1.288, "grad_norm": 0.0005467624287120998, "kl": 0.008031845092773438, "learning_rate": 2.112317088636271e-06, "loss": 0.0003, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1208 }, { "completion_length": 614.78125, "epoch": 1.2890666666666668, "grad_norm": 0.00030096518457867205, "kl": 0.013889312744140625, "learning_rate": 2.1106155962034103e-06, "loss": 0.0006, "reward": 0.46875, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1209 }, { "completion_length": 534.21875, "epoch": 1.2901333333333334, "grad_norm": 0.03858719766139984, "kl": 0.015508651733398438, "learning_rate": 2.108913161512354e-06, "loss": 0.0006, "reward": 0.71875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1210 }, { "completion_length": 619.28125, "epoch": 1.2912, "grad_norm": 0.019034195691347122, "kl": 0.012470245361328125, "learning_rate": 2.1072097871901775e-06, "loss": 0.0005, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1211 }, { "completion_length": 488.28125, "epoch": 1.2922666666666667, "grad_norm": 0.024340176954865456, "kl": 0.009571075439453125, "learning_rate": 2.1055054758654056e-06, "loss": 0.0004, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1212 }, { "completion_length": 546.375, "epoch": 1.2933333333333334, "grad_norm": 0.03409958630800247, "kl": 0.01474761962890625, "learning_rate": 2.103800230168009e-06, "loss": 0.0006, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1213 }, { "completion_length": 638.53125, "epoch": 1.2944, "grad_norm": 0.0006501022144220769, "kl": 0.015333175659179688, "learning_rate": 2.1020940527294e-06, "loss": 0.0006, "reward": 0.4375, "reward_std": 0.125, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1214 }, { "completion_length": 732.625, "epoch": 1.2954666666666665, "grad_norm": 0.007551982998847961, "kl": 0.0068511962890625, "learning_rate": 2.100386946182431e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1215 }, { "completion_length": 711.59375, "epoch": 1.2965333333333333, "grad_norm": 0.011358737014234066, "kl": 0.017705917358398438, "learning_rate": 2.0986789131613847e-06, "loss": 0.0007, "reward": 0.3125, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 1216 }, { "completion_length": 564.625, "epoch": 1.2976, "grad_norm": 0.010777955874800682, "kl": 0.018337249755859375, "learning_rate": 2.0969699563019764e-06, "loss": 0.0007, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1217 }, { "completion_length": 589.875, "epoch": 1.2986666666666666, "grad_norm": 0.026637423783540726, "kl": 0.00608062744140625, "learning_rate": 2.0952600782413454e-06, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1218 }, { "completion_length": 593.6875, "epoch": 1.2997333333333334, "grad_norm": 0.006972908973693848, "kl": 0.010997772216796875, "learning_rate": 2.0935492816180523e-06, "loss": 0.0004, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1219 }, { "completion_length": 589.125, "epoch": 1.3008, "grad_norm": 0.009530383162200451, "kl": 0.00754547119140625, "learning_rate": 2.0918375690720764e-06, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1220 }, { "completion_length": 578.03125, "epoch": 1.3018666666666667, "grad_norm": 0.016814162954688072, "kl": 0.021961212158203125, "learning_rate": 2.090124943244809e-06, "loss": 0.0009, "reward": 0.375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 1221 }, { "completion_length": 563.0625, "epoch": 1.3029333333333333, "grad_norm": 0.01609862968325615, "kl": 0.009042739868164062, "learning_rate": 2.088411406779053e-06, "loss": 0.0004, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1222 }, { "completion_length": 705.0625, "epoch": 1.304, "grad_norm": 0.0006519664893858135, "kl": 0.0073108673095703125, "learning_rate": 2.0866969623190134e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1223 }, { "completion_length": 753.34375, "epoch": 1.3050666666666666, "grad_norm": 0.016677116975188255, "kl": 0.013153076171875, "learning_rate": 2.084981612510298e-06, "loss": 0.0005, "reward": 0.34375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 1224 }, { "completion_length": 599.28125, "epoch": 1.3061333333333334, "grad_norm": 0.0006548812380060554, "kl": 0.012792587280273438, "learning_rate": 2.083265359999913e-06, "loss": 0.0005, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1225 }, { "completion_length": 611.375, "epoch": 1.3072, "grad_norm": 0.0004412749840412289, "kl": 0.01251220703125, "learning_rate": 2.0815482074362554e-06, "loss": 0.0005, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1226 }, { "completion_length": 749.15625, "epoch": 1.3082666666666667, "grad_norm": 0.01523663755506277, "kl": 0.013538360595703125, "learning_rate": 2.0798301574691106e-06, "loss": 0.0005, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1227 }, { "completion_length": 559.46875, "epoch": 1.3093333333333335, "grad_norm": 0.00039678296889178455, "kl": 0.01326751708984375, "learning_rate": 2.078111212749652e-06, "loss": 0.0005, "reward": 0.75, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1228 }, { "completion_length": 659.15625, "epoch": 1.3104, "grad_norm": 0.015950201079249382, "kl": 0.01483917236328125, "learning_rate": 2.0763913759304313e-06, "loss": 0.0006, "reward": 0.53125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1229 }, { "completion_length": 641.5, "epoch": 1.3114666666666666, "grad_norm": 0.02292380854487419, "kl": 0.016448974609375, "learning_rate": 2.0746706496653765e-06, "loss": 0.0007, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1230 }, { "completion_length": 494.40625, "epoch": 1.3125333333333333, "grad_norm": 0.0006118149613030255, "kl": 0.008365631103515625, "learning_rate": 2.07294903660979e-06, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 1231 }, { "completion_length": 676.71875, "epoch": 1.3136, "grad_norm": 0.00035010784631595016, "kl": 0.008274078369140625, "learning_rate": 2.0712265394203414e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1232 }, { "completion_length": 576.46875, "epoch": 1.3146666666666667, "grad_norm": 0.013233615085482597, "kl": 0.011444091796875, "learning_rate": 2.069503160755064e-06, "loss": 0.0005, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1233 }, { "completion_length": 494.1875, "epoch": 1.3157333333333332, "grad_norm": 0.012978498823940754, "kl": 0.019924163818359375, "learning_rate": 2.0677789032733534e-06, "loss": 0.0008, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1234 }, { "completion_length": 554.6875, "epoch": 1.3168, "grad_norm": 0.029986368492245674, "kl": 0.0070972442626953125, "learning_rate": 2.0660537696359586e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1235 }, { "completion_length": 500.15625, "epoch": 1.3178666666666667, "grad_norm": 0.0003826612082775682, "kl": 0.0075321197509765625, "learning_rate": 2.0643277625049832e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1236 }, { "completion_length": 558.9375, "epoch": 1.3189333333333333, "grad_norm": 0.019280698150396347, "kl": 0.00949859619140625, "learning_rate": 2.0626008845438784e-06, "loss": 0.0004, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1237 }, { "completion_length": 495.78125, "epoch": 1.32, "grad_norm": 0.013442954048514366, "kl": 0.0186767578125, "learning_rate": 2.0608731384174363e-06, "loss": 0.0007, "reward": 0.65625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1238 }, { "completion_length": 550.5, "epoch": 1.3210666666666666, "grad_norm": 0.020929250866174698, "kl": 0.009416580200195312, "learning_rate": 2.0591445267917923e-06, "loss": 0.0004, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1239 }, { "completion_length": 597.0625, "epoch": 1.3221333333333334, "grad_norm": 0.001763841020874679, "kl": 0.012866973876953125, "learning_rate": 2.0574150523344153e-06, "loss": 0.0005, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1240 }, { "completion_length": 673.625, "epoch": 1.3232, "grad_norm": 0.013303659856319427, "kl": 0.009708404541015625, "learning_rate": 2.055684717714107e-06, "loss": 0.0004, "reward": 0.65625, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1241 }, { "completion_length": 652.0, "epoch": 1.3242666666666667, "grad_norm": 0.013913939706981182, "kl": 0.010906219482421875, "learning_rate": 2.053953525600994e-06, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1242 }, { "completion_length": 546.3125, "epoch": 1.3253333333333333, "grad_norm": 0.0009187787654809654, "kl": 0.014469146728515625, "learning_rate": 2.05222147866653e-06, "loss": 0.0006, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1243 }, { "completion_length": 608.21875, "epoch": 1.3264, "grad_norm": 0.040844209492206573, "kl": 0.018489837646484375, "learning_rate": 2.0504885795834836e-06, "loss": 0.0007, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1244 }, { "completion_length": 660.09375, "epoch": 1.3274666666666666, "grad_norm": 0.013724476099014282, "kl": 0.007602691650390625, "learning_rate": 2.048754831025942e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1245 }, { "completion_length": 692.0, "epoch": 1.3285333333333333, "grad_norm": 0.009446753188967705, "kl": 0.005649566650390625, "learning_rate": 2.047020235669301e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1246 }, { "completion_length": 708.53125, "epoch": 1.3296000000000001, "grad_norm": 0.017415180802345276, "kl": 0.010114669799804688, "learning_rate": 2.0452847961902635e-06, "loss": 0.0004, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1247 }, { "completion_length": 550.03125, "epoch": 1.3306666666666667, "grad_norm": 0.24827085435390472, "kl": 0.01944732666015625, "learning_rate": 2.0435485152668356e-06, "loss": 0.0008, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1248 }, { "completion_length": 735.40625, "epoch": 1.3317333333333332, "grad_norm": 0.011748087592422962, "kl": 0.011259078979492188, "learning_rate": 2.041811395578321e-06, "loss": 0.0005, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1249 }, { "completion_length": 541.0625, "epoch": 1.3328, "grad_norm": 0.024518538266420364, "kl": 0.009057998657226562, "learning_rate": 2.0400734398053188e-06, "loss": 0.0004, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1250 }, { "completion_length": 611.15625, "epoch": 1.3338666666666668, "grad_norm": 0.02655027247965336, "kl": 0.011753082275390625, "learning_rate": 2.038334650629718e-06, "loss": 0.0005, "reward": 0.71875, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1251 }, { "completion_length": 519.84375, "epoch": 1.3349333333333333, "grad_norm": 0.0010946511756628752, "kl": 0.01204681396484375, "learning_rate": 2.036595030734692e-06, "loss": 0.0005, "reward": 0.6875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1252 }, { "completion_length": 582.78125, "epoch": 1.336, "grad_norm": 0.00034822183079086244, "kl": 0.0162811279296875, "learning_rate": 2.0348545828046986e-06, "loss": 0.0007, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1253 }, { "completion_length": 516.625, "epoch": 1.3370666666666666, "grad_norm": 0.0020573532674461603, "kl": 0.00916290283203125, "learning_rate": 2.033113309525472e-06, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1254 }, { "completion_length": 658.3125, "epoch": 1.3381333333333334, "grad_norm": 0.0005985244642943144, "kl": 0.012958526611328125, "learning_rate": 2.0313712135840197e-06, "loss": 0.0005, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1255 }, { "completion_length": 544.28125, "epoch": 1.3392, "grad_norm": 0.019510671496391296, "kl": 0.01319122314453125, "learning_rate": 2.0296282976686203e-06, "loss": 0.0005, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1256 }, { "completion_length": 610.78125, "epoch": 1.3402666666666667, "grad_norm": 0.023317163810133934, "kl": 0.011280059814453125, "learning_rate": 2.027884564468816e-06, "loss": 0.0005, "reward": 0.75, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1257 }, { "completion_length": 568.1875, "epoch": 1.3413333333333333, "grad_norm": 0.0005698777385987341, "kl": 0.00786590576171875, "learning_rate": 2.026140016675411e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1258 }, { "completion_length": 484.0, "epoch": 1.3424, "grad_norm": 0.0007171474280767143, "kl": 0.0077152252197265625, "learning_rate": 2.0243946569804653e-06, "loss": 0.0003, "reward": 0.875, "reward_std": 0.125, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 1259 }, { "completion_length": 611.15625, "epoch": 1.3434666666666666, "grad_norm": 0.0004840837209485471, "kl": 0.00815582275390625, "learning_rate": 2.0226484880772943e-06, "loss": 0.0003, "reward": 0.625, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1260 }, { "completion_length": 606.1875, "epoch": 1.3445333333333334, "grad_norm": 0.048089370131492615, "kl": 0.020303726196289062, "learning_rate": 2.02090151266046e-06, "loss": 0.0008, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1261 }, { "completion_length": 515.0625, "epoch": 1.3456000000000001, "grad_norm": 0.011416108347475529, "kl": 0.01287841796875, "learning_rate": 2.019153733425769e-06, "loss": 0.0005, "reward": 0.6875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1262 }, { "completion_length": 580.96875, "epoch": 1.3466666666666667, "grad_norm": 0.005677978508174419, "kl": 0.007781982421875, "learning_rate": 2.01740515307027e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1263 }, { "completion_length": 624.40625, "epoch": 1.3477333333333332, "grad_norm": 0.0007445538067258894, "kl": 0.0128326416015625, "learning_rate": 2.015655774292246e-06, "loss": 0.0005, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1264 }, { "completion_length": 527.8125, "epoch": 1.3488, "grad_norm": 0.02182154171168804, "kl": 0.0132904052734375, "learning_rate": 2.0139055997912135e-06, "loss": 0.0005, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1265 }, { "completion_length": 765.90625, "epoch": 1.3498666666666668, "grad_norm": 0.014725574292242527, "kl": 0.01105499267578125, "learning_rate": 2.012154632267915e-06, "loss": 0.0004, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1266 }, { "completion_length": 632.75, "epoch": 1.3509333333333333, "grad_norm": 0.0010104362154379487, "kl": 0.010271072387695312, "learning_rate": 2.01040287442432e-06, "loss": 0.0004, "reward": 0.78125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1267 }, { "completion_length": 704.9375, "epoch": 1.3519999999999999, "grad_norm": 0.001194022479467094, "kl": 0.015293121337890625, "learning_rate": 2.008650328963614e-06, "loss": 0.0006, "reward": 0.53125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1268 }, { "completion_length": 777.71875, "epoch": 1.3530666666666666, "grad_norm": 0.0011057378724217415, "kl": 0.023052215576171875, "learning_rate": 2.0068969985901996e-06, "loss": 0.0009, "reward": 0.4375, "reward_std": 0.25, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1269 }, { "completion_length": 684.65625, "epoch": 1.3541333333333334, "grad_norm": 0.017669258639216423, "kl": 0.011249542236328125, "learning_rate": 2.0051428860096913e-06, "loss": 0.0005, "reward": 0.46875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1270 }, { "completion_length": 717.875, "epoch": 1.3552, "grad_norm": 0.013455036096274853, "kl": 0.008882522583007812, "learning_rate": 2.003387993928909e-06, "loss": 0.0004, "reward": 0.59375, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1271 }, { "completion_length": 525.21875, "epoch": 1.3562666666666667, "grad_norm": 0.016936862841248512, "kl": 0.01305389404296875, "learning_rate": 2.0016323250558765e-06, "loss": 0.0005, "reward": 0.5625, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1272 }, { "completion_length": 498.46875, "epoch": 1.3573333333333333, "grad_norm": 0.0005599724245257676, "kl": 0.017583847045898438, "learning_rate": 1.9998758820998166e-06, "loss": 0.0007, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1273 }, { "completion_length": 533.875, "epoch": 1.3584, "grad_norm": 0.012048149481415749, "kl": 0.008640289306640625, "learning_rate": 1.998118667771145e-06, "loss": 0.0003, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1274 }, { "completion_length": 641.0, "epoch": 1.3594666666666666, "grad_norm": 0.022434744983911514, "kl": 0.0062274932861328125, "learning_rate": 1.9963606847814702e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1275 }, { "completion_length": 637.09375, "epoch": 1.3605333333333334, "grad_norm": 0.000766363984439522, "kl": 0.019725799560546875, "learning_rate": 1.994601935843585e-06, "loss": 0.0008, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1276 }, { "completion_length": 544.59375, "epoch": 1.3616, "grad_norm": 0.008639483712613583, "kl": 0.015056610107421875, "learning_rate": 1.9928424236714642e-06, "loss": 0.0006, "reward": 0.84375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1277 }, { "completion_length": 593.59375, "epoch": 1.3626666666666667, "grad_norm": 0.007746206130832434, "kl": 0.015810012817382812, "learning_rate": 1.991082150980261e-06, "loss": 0.0006, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1278 }, { "completion_length": 557.46875, "epoch": 1.3637333333333332, "grad_norm": 0.0007692532381042838, "kl": 0.010711669921875, "learning_rate": 1.989321120486302e-06, "loss": 0.0004, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1279 }, { "completion_length": 446.84375, "epoch": 1.3648, "grad_norm": 0.0007566125714220107, "kl": 0.011074066162109375, "learning_rate": 1.9875593349070833e-06, "loss": 0.0004, "reward": 0.9375, "reward_std": 0.125, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 1280 }, { "completion_length": 708.21875, "epoch": 1.3658666666666668, "grad_norm": 0.011906236410140991, "kl": 0.011730194091796875, "learning_rate": 1.9857967969612654e-06, "loss": 0.0005, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1281 }, { "completion_length": 619.15625, "epoch": 1.3669333333333333, "grad_norm": 0.011300171725451946, "kl": 0.017187118530273438, "learning_rate": 1.984033509368672e-06, "loss": 0.0007, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1282 }, { "completion_length": 720.125, "epoch": 1.3679999999999999, "grad_norm": 0.018757710233330727, "kl": 0.0055446624755859375, "learning_rate": 1.98226947485028e-06, "loss": 0.0002, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1283 }, { "completion_length": 422.1875, "epoch": 1.3690666666666667, "grad_norm": 0.002224042546004057, "kl": 0.018520355224609375, "learning_rate": 1.9805046961282226e-06, "loss": 0.0007, "reward": 0.875, "reward_std": 0.125, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 1284 }, { "completion_length": 605.25, "epoch": 1.3701333333333334, "grad_norm": 0.0007783490000292659, "kl": 0.020938873291015625, "learning_rate": 1.978739175925779e-06, "loss": 0.0008, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1285 }, { "completion_length": 485.75, "epoch": 1.3712, "grad_norm": 0.009571416303515434, "kl": 0.008405685424804688, "learning_rate": 1.9769729169673738e-06, "loss": 0.0003, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 1286 }, { "completion_length": 583.15625, "epoch": 1.3722666666666667, "grad_norm": 0.044460829347372055, "kl": 0.01244354248046875, "learning_rate": 1.9752059219785703e-06, "loss": 0.0005, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1287 }, { "completion_length": 571.90625, "epoch": 1.3733333333333333, "grad_norm": 0.012697877362370491, "kl": 0.012880325317382812, "learning_rate": 1.9734381936860696e-06, "loss": 0.0005, "reward": 0.8125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1288 }, { "completion_length": 623.40625, "epoch": 1.3744, "grad_norm": 0.019717169925570488, "kl": 0.009954452514648438, "learning_rate": 1.971669734817702e-06, "loss": 0.0004, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1289 }, { "completion_length": 667.40625, "epoch": 1.3754666666666666, "grad_norm": 0.0003561181656550616, "kl": 0.009439468383789062, "learning_rate": 1.9699005481024273e-06, "loss": 0.0004, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1290 }, { "completion_length": 612.5, "epoch": 1.3765333333333334, "grad_norm": 0.031214017421007156, "kl": 0.0143280029296875, "learning_rate": 1.9681306362703276e-06, "loss": 0.0006, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1291 }, { "completion_length": 642.53125, "epoch": 1.3776, "grad_norm": 0.02671513333916664, "kl": 0.0169677734375, "learning_rate": 1.966360002052603e-06, "loss": 0.0007, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1292 }, { "completion_length": 692.875, "epoch": 1.3786666666666667, "grad_norm": 0.013825293630361557, "kl": 0.026973724365234375, "learning_rate": 1.96458864818157e-06, "loss": 0.0011, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1293 }, { "completion_length": 698.125, "epoch": 1.3797333333333333, "grad_norm": 0.022821778431534767, "kl": 0.006565093994140625, "learning_rate": 1.9628165773906545e-06, "loss": 0.0003, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1294 }, { "completion_length": 593.46875, "epoch": 1.3808, "grad_norm": 0.01035065297037363, "kl": 0.014873504638671875, "learning_rate": 1.9610437924143893e-06, "loss": 0.0006, "reward": 0.65625, "reward_std": 0.3125, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1295 }, { "completion_length": 633.3125, "epoch": 1.3818666666666668, "grad_norm": 0.016227442771196365, "kl": 0.016222000122070312, "learning_rate": 1.9592702959884095e-06, "loss": 0.0006, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1296 }, { "completion_length": 580.1875, "epoch": 1.3829333333333333, "grad_norm": 0.02255415916442871, "kl": 0.01001739501953125, "learning_rate": 1.9574960908494465e-06, "loss": 0.0004, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1297 }, { "completion_length": 693.59375, "epoch": 1.384, "grad_norm": 0.0004973778850398958, "kl": 0.024404525756835938, "learning_rate": 1.955721179735327e-06, "loss": 0.001, "reward": 0.46875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1298 }, { "completion_length": 553.3125, "epoch": 1.3850666666666667, "grad_norm": 0.015492220409214497, "kl": 0.0059356689453125, "learning_rate": 1.953945565384967e-06, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1299 }, { "completion_length": 546.875, "epoch": 1.3861333333333334, "grad_norm": 0.013452638871967793, "kl": 0.0196380615234375, "learning_rate": 1.952169250538366e-06, "loss": 0.0008, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1300 }, { "completion_length": 605.1875, "epoch": 1.3872, "grad_norm": 0.00961318425834179, "kl": 0.010824203491210938, "learning_rate": 1.9503922379366068e-06, "loss": 0.0004, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 1301 }, { "completion_length": 589.75, "epoch": 1.3882666666666665, "grad_norm": 0.017894010990858078, "kl": 0.023101806640625, "learning_rate": 1.948614530321848e-06, "loss": 0.0009, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1302 }, { "completion_length": 554.96875, "epoch": 1.3893333333333333, "grad_norm": 0.011067014187574387, "kl": 0.009433746337890625, "learning_rate": 1.9468361304373186e-06, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1303 }, { "completion_length": 509.0625, "epoch": 1.3904, "grad_norm": 0.020050108432769775, "kl": 0.01576995849609375, "learning_rate": 1.94505704102732e-06, "loss": 0.0006, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1304 }, { "completion_length": 615.9375, "epoch": 1.3914666666666666, "grad_norm": 0.011843348853290081, "kl": 0.0137939453125, "learning_rate": 1.943277264837214e-06, "loss": 0.0006, "reward": 0.6875, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1305 }, { "completion_length": 756.8125, "epoch": 1.3925333333333334, "grad_norm": 0.008673295378684998, "kl": 0.006519317626953125, "learning_rate": 1.941496804613424e-06, "loss": 0.0003, "reward": 0.5, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1306 }, { "completion_length": 734.3125, "epoch": 1.3936, "grad_norm": 0.016515472903847694, "kl": 0.015672683715820312, "learning_rate": 1.939715663103429e-06, "loss": 0.0006, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1307 }, { "completion_length": 738.75, "epoch": 1.3946666666666667, "grad_norm": 0.005936862900853157, "kl": 0.019901275634765625, "learning_rate": 1.9379338430557582e-06, "loss": 0.0008, "reward": 0.40625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 1308 }, { "completion_length": 530.0625, "epoch": 1.3957333333333333, "grad_norm": 0.021389536559581757, "kl": 0.009786605834960938, "learning_rate": 1.9361513472199884e-06, "loss": 0.0004, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 1309 }, { "completion_length": 578.9375, "epoch": 1.3968, "grad_norm": 0.017694955691695213, "kl": 0.018035888671875, "learning_rate": 1.93436817834674e-06, "loss": 0.0007, "reward": 0.6875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1310 }, { "completion_length": 567.03125, "epoch": 1.3978666666666666, "grad_norm": 0.0008810822037048638, "kl": 0.022846221923828125, "learning_rate": 1.932584339187671e-06, "loss": 0.0009, "reward": 0.78125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1311 }, { "completion_length": 607.375, "epoch": 1.3989333333333334, "grad_norm": 0.0343339741230011, "kl": 0.01934814453125, "learning_rate": 1.930799832495474e-06, "loss": 0.0008, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1312 }, { "completion_length": 595.9375, "epoch": 1.4, "grad_norm": 0.010732077062129974, "kl": 0.022186279296875, "learning_rate": 1.9290146610238717e-06, "loss": 0.0009, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1313 }, { "completion_length": 634.09375, "epoch": 1.4010666666666667, "grad_norm": 0.010342524386942387, "kl": 0.013332366943359375, "learning_rate": 1.927228827527612e-06, "loss": 0.0005, "reward": 0.59375, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1314 }, { "completion_length": 730.8125, "epoch": 1.4021333333333335, "grad_norm": 0.011542940512299538, "kl": 0.01552581787109375, "learning_rate": 1.9254423347624667e-06, "loss": 0.0006, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1315 }, { "completion_length": 673.5625, "epoch": 1.4032, "grad_norm": 0.005588240455836058, "kl": 0.014713287353515625, "learning_rate": 1.9236551854852227e-06, "loss": 0.0006, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1316 }, { "completion_length": 580.71875, "epoch": 1.4042666666666666, "grad_norm": 0.009844362735748291, "kl": 0.013799667358398438, "learning_rate": 1.921867382453679e-06, "loss": 0.0006, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1317 }, { "completion_length": 576.0, "epoch": 1.4053333333333333, "grad_norm": 0.007794824428856373, "kl": 0.010326385498046875, "learning_rate": 1.9200789284266474e-06, "loss": 0.0004, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1318 }, { "completion_length": 577.125, "epoch": 1.4064, "grad_norm": 0.02245236560702324, "kl": 0.0317230224609375, "learning_rate": 1.918289826163941e-06, "loss": 0.0013, "reward": 0.53125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1319 }, { "completion_length": 468.6875, "epoch": 1.4074666666666666, "grad_norm": 0.04803838953375816, "kl": 0.04036712646484375, "learning_rate": 1.9165000784263734e-06, "loss": 0.0016, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1320 }, { "completion_length": 632.21875, "epoch": 1.4085333333333334, "grad_norm": 0.0186783354729414, "kl": 0.009044647216796875, "learning_rate": 1.9147096879757554e-06, "loss": 0.0004, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1321 }, { "completion_length": 693.59375, "epoch": 1.4096, "grad_norm": 0.028158975765109062, "kl": 0.027454376220703125, "learning_rate": 1.9129186575748895e-06, "loss": 0.0011, "reward": 0.375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 1322 }, { "completion_length": 606.625, "epoch": 1.4106666666666667, "grad_norm": 0.02249998413026333, "kl": 0.03122711181640625, "learning_rate": 1.911126989987565e-06, "loss": 0.0012, "reward": 0.65625, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1323 }, { "completion_length": 586.5, "epoch": 1.4117333333333333, "grad_norm": 0.010378973558545113, "kl": 0.023153305053710938, "learning_rate": 1.909334687978555e-06, "loss": 0.0009, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1324 }, { "completion_length": 656.6875, "epoch": 1.4128, "grad_norm": 0.0014452401082962751, "kl": 0.01795196533203125, "learning_rate": 1.907541754313611e-06, "loss": 0.0007, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1325 }, { "completion_length": 629.40625, "epoch": 1.4138666666666666, "grad_norm": 0.000347377936122939, "kl": 0.0404815673828125, "learning_rate": 1.9057481917594604e-06, "loss": 0.0016, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1326 }, { "completion_length": 567.1875, "epoch": 1.4149333333333334, "grad_norm": 0.0036816864740103483, "kl": 0.009647369384765625, "learning_rate": 1.9039540030837997e-06, "loss": 0.0004, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1327 }, { "completion_length": 534.8125, "epoch": 1.416, "grad_norm": 0.014557529240846634, "kl": 0.017322540283203125, "learning_rate": 1.9021591910552923e-06, "loss": 0.0007, "reward": 0.71875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1328 }, { "completion_length": 618.1875, "epoch": 1.4170666666666667, "grad_norm": 0.024338219314813614, "kl": 0.00597381591796875, "learning_rate": 1.9003637584435633e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1329 }, { "completion_length": 537.84375, "epoch": 1.4181333333333335, "grad_norm": 0.0041288211941719055, "kl": 0.03261566162109375, "learning_rate": 1.8985677080191962e-06, "loss": 0.0013, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1330 }, { "completion_length": 552.625, "epoch": 1.4192, "grad_norm": 0.01921827532351017, "kl": 0.040752410888671875, "learning_rate": 1.8967710425537262e-06, "loss": 0.0016, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1331 }, { "completion_length": 604.53125, "epoch": 1.4202666666666666, "grad_norm": 0.011252343654632568, "kl": 0.027706146240234375, "learning_rate": 1.8949737648196395e-06, "loss": 0.0011, "reward": 0.65625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1332 }, { "completion_length": 664.09375, "epoch": 1.4213333333333333, "grad_norm": 0.013579328544437885, "kl": 0.028978347778320312, "learning_rate": 1.8931758775903656e-06, "loss": 0.0012, "reward": 0.4375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1333 }, { "completion_length": 712.6875, "epoch": 1.4224, "grad_norm": 0.009799477644264698, "kl": 0.022504806518554688, "learning_rate": 1.8913773836402752e-06, "loss": 0.0009, "reward": 0.40625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 1334 }, { "completion_length": 589.8125, "epoch": 1.4234666666666667, "grad_norm": 0.022877521812915802, "kl": 0.03330230712890625, "learning_rate": 1.8895782857446754e-06, "loss": 0.0013, "reward": 0.5625, "reward_std": 0.46650634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1335 }, { "completion_length": 549.21875, "epoch": 1.4245333333333332, "grad_norm": 0.01079490128904581, "kl": 0.010389328002929688, "learning_rate": 1.887778586679805e-06, "loss": 0.0004, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1336 }, { "completion_length": 544.28125, "epoch": 1.4256, "grad_norm": 0.019543325528502464, "kl": 0.022090911865234375, "learning_rate": 1.8859782892228308e-06, "loss": 0.0009, "reward": 0.75, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1337 }, { "completion_length": 626.15625, "epoch": 1.4266666666666667, "grad_norm": 0.0007591038011014462, "kl": 0.02873992919921875, "learning_rate": 1.8841773961518417e-06, "loss": 0.0012, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1338 }, { "completion_length": 511.6875, "epoch": 1.4277333333333333, "grad_norm": 0.016519272699952126, "kl": 0.021564483642578125, "learning_rate": 1.8823759102458478e-06, "loss": 0.0009, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1339 }, { "completion_length": 585.375, "epoch": 1.4288, "grad_norm": 0.008748607710003853, "kl": 0.028093338012695312, "learning_rate": 1.880573834284773e-06, "loss": 0.0011, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 1340 }, { "completion_length": 615.59375, "epoch": 1.4298666666666666, "grad_norm": 0.04205981642007828, "kl": 0.017368316650390625, "learning_rate": 1.8787711710494509e-06, "loss": 0.0007, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1341 }, { "completion_length": 674.0, "epoch": 1.4309333333333334, "grad_norm": 0.0013201347319409251, "kl": 0.01293182373046875, "learning_rate": 1.876967923321622e-06, "loss": 0.0005, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1342 }, { "completion_length": 677.96875, "epoch": 1.432, "grad_norm": 0.009985373355448246, "kl": 0.026182174682617188, "learning_rate": 1.8751640938839303e-06, "loss": 0.001, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1343 }, { "completion_length": 495.34375, "epoch": 1.4330666666666667, "grad_norm": 0.031064679846167564, "kl": 0.02967071533203125, "learning_rate": 1.8733596855199147e-06, "loss": 0.0012, "reward": 0.4375, "reward_std": 0.25, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1344 }, { "completion_length": 646.0, "epoch": 1.4341333333333333, "grad_norm": 0.019942356273531914, "kl": 0.0207061767578125, "learning_rate": 1.8715547010140088e-06, "loss": 0.0008, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1345 }, { "completion_length": 728.0625, "epoch": 1.4352, "grad_norm": 0.012913760729134083, "kl": 0.021907806396484375, "learning_rate": 1.8697491431515358e-06, "loss": 0.0009, "reward": 0.375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 1346 }, { "completion_length": 650.5, "epoch": 1.4362666666666666, "grad_norm": 0.014465796761214733, "kl": 0.02573394775390625, "learning_rate": 1.8679430147187031e-06, "loss": 0.001, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1347 }, { "completion_length": 682.84375, "epoch": 1.4373333333333334, "grad_norm": 0.0017618249403312802, "kl": 0.04277801513671875, "learning_rate": 1.866136318502598e-06, "loss": 0.0017, "reward": 0.40625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 1348 }, { "completion_length": 679.84375, "epoch": 1.4384000000000001, "grad_norm": 0.0018813092028722167, "kl": 0.04508209228515625, "learning_rate": 1.864329057291185e-06, "loss": 0.0018, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1349 }, { "completion_length": 510.3125, "epoch": 1.4394666666666667, "grad_norm": 0.0033061266876757145, "kl": 0.0207061767578125, "learning_rate": 1.8625212338733005e-06, "loss": 0.0008, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1350 }, { "completion_length": 570.8125, "epoch": 1.4405333333333332, "grad_norm": 0.00847130548208952, "kl": 0.018888473510742188, "learning_rate": 1.8607128510386465e-06, "loss": 0.0008, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1351 }, { "completion_length": 662.40625, "epoch": 1.4416, "grad_norm": 0.010312633588910103, "kl": 0.024829864501953125, "learning_rate": 1.8589039115777908e-06, "loss": 0.001, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1352 }, { "completion_length": 637.34375, "epoch": 1.4426666666666668, "grad_norm": 0.0018144723726436496, "kl": 0.0323486328125, "learning_rate": 1.8570944182821588e-06, "loss": 0.0013, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1353 }, { "completion_length": 648.9375, "epoch": 1.4437333333333333, "grad_norm": 0.01905154623091221, "kl": 0.031040191650390625, "learning_rate": 1.8552843739440307e-06, "loss": 0.0012, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1354 }, { "completion_length": 637.875, "epoch": 1.4447999999999999, "grad_norm": 0.030851272866129875, "kl": 0.028045654296875, "learning_rate": 1.8534737813565371e-06, "loss": 0.0011, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1355 }, { "completion_length": 563.65625, "epoch": 1.4458666666666666, "grad_norm": 0.022492146119475365, "kl": 0.033077239990234375, "learning_rate": 1.8516626433136547e-06, "loss": 0.0013, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1356 }, { "completion_length": 665.96875, "epoch": 1.4469333333333334, "grad_norm": 0.0007085526012815535, "kl": 0.04555511474609375, "learning_rate": 1.849850962610202e-06, "loss": 0.0018, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1357 }, { "completion_length": 823.15625, "epoch": 1.448, "grad_norm": 0.019539983943104744, "kl": 0.03342628479003906, "learning_rate": 1.8480387420418344e-06, "loss": 0.0013, "reward": 0.25, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 1358 }, { "completion_length": 588.6875, "epoch": 1.4490666666666667, "grad_norm": 0.012796362861990929, "kl": 0.027883529663085938, "learning_rate": 1.8462259844050408e-06, "loss": 0.0011, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1359 }, { "completion_length": 755.0, "epoch": 1.4501333333333333, "grad_norm": 0.035776279866695404, "kl": 0.049571990966796875, "learning_rate": 1.8444126924971386e-06, "loss": 0.002, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1360 }, { "completion_length": 560.25, "epoch": 1.4512, "grad_norm": 0.000291418720735237, "kl": 0.0182952880859375, "learning_rate": 1.8425988691162707e-06, "loss": 0.0007, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1361 }, { "completion_length": 659.75, "epoch": 1.4522666666666666, "grad_norm": 0.02978499047458172, "kl": 0.043651580810546875, "learning_rate": 1.840784517061398e-06, "loss": 0.0017, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1362 }, { "completion_length": 454.625, "epoch": 1.4533333333333334, "grad_norm": 0.0244918130338192, "kl": 0.0131683349609375, "learning_rate": 1.8389696391322995e-06, "loss": 0.0005, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1363 }, { "completion_length": 658.53125, "epoch": 1.4544000000000001, "grad_norm": 0.016039330512285233, "kl": 0.05540657043457031, "learning_rate": 1.8371542381295638e-06, "loss": 0.0022, "reward": 0.40625, "reward_std": 0.3125, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 1364 }, { "completion_length": 603.4375, "epoch": 1.4554666666666667, "grad_norm": 0.011020014062523842, "kl": 0.013645172119140625, "learning_rate": 1.835338316854588e-06, "loss": 0.0005, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1365 }, { "completion_length": 515.5625, "epoch": 1.4565333333333332, "grad_norm": 0.02389819361269474, "kl": 0.031261444091796875, "learning_rate": 1.833521878109572e-06, "loss": 0.0012, "reward": 0.65625, "reward_std": 0.3125, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1366 }, { "completion_length": 555.9375, "epoch": 1.4576, "grad_norm": 0.026525167748332024, "kl": 0.041683197021484375, "learning_rate": 1.831704924697513e-06, "loss": 0.0017, "reward": 0.6875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1367 }, { "completion_length": 570.34375, "epoch": 1.4586666666666668, "grad_norm": 0.011611703783273697, "kl": 0.057392120361328125, "learning_rate": 1.8298874594222035e-06, "loss": 0.0023, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1368 }, { "completion_length": 567.65625, "epoch": 1.4597333333333333, "grad_norm": 0.02795572578907013, "kl": 0.03574562072753906, "learning_rate": 1.8280694850882262e-06, "loss": 0.0014, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1369 }, { "completion_length": 678.40625, "epoch": 1.4607999999999999, "grad_norm": 0.01761452853679657, "kl": 0.017726898193359375, "learning_rate": 1.8262510045009473e-06, "loss": 0.0007, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1370 }, { "completion_length": 610.875, "epoch": 1.4618666666666666, "grad_norm": 0.00029782854835502803, "kl": 0.017404556274414062, "learning_rate": 1.824432020466517e-06, "loss": 0.0007, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1371 }, { "completion_length": 510.1875, "epoch": 1.4629333333333334, "grad_norm": 0.00034329647314734757, "kl": 0.022062301635742188, "learning_rate": 1.8226125357918604e-06, "loss": 0.0009, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1372 }, { "completion_length": 644.5, "epoch": 1.464, "grad_norm": 0.02789568342268467, "kl": 0.0537567138671875, "learning_rate": 1.8207925532846753e-06, "loss": 0.0022, "reward": 0.5, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1373 }, { "completion_length": 785.15625, "epoch": 1.4650666666666667, "grad_norm": 0.0004297412815503776, "kl": 0.027627944946289062, "learning_rate": 1.8189720757534291e-06, "loss": 0.0011, "reward": 0.4375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1374 }, { "completion_length": 670.65625, "epoch": 1.4661333333333333, "grad_norm": 0.0003178965416736901, "kl": 0.01894378662109375, "learning_rate": 1.817151106007352e-06, "loss": 0.0008, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1375 }, { "completion_length": 653.09375, "epoch": 1.4672, "grad_norm": 0.023261070251464844, "kl": 0.03452110290527344, "learning_rate": 1.815329646856433e-06, "loss": 0.0014, "reward": 0.65625, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1376 }, { "completion_length": 591.8125, "epoch": 1.4682666666666666, "grad_norm": 0.015171222388744354, "kl": 0.02013397216796875, "learning_rate": 1.8135077011114185e-06, "loss": 0.0008, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1377 }, { "completion_length": 534.21875, "epoch": 1.4693333333333334, "grad_norm": 0.010529767721891403, "kl": 0.0072021484375, "learning_rate": 1.8116852715838037e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1378 }, { "completion_length": 703.125, "epoch": 1.4704, "grad_norm": 0.013977235183119774, "kl": 0.0384521484375, "learning_rate": 1.8098623610858315e-06, "loss": 0.0015, "reward": 0.4375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1379 }, { "completion_length": 761.53125, "epoch": 1.4714666666666667, "grad_norm": 0.02675720676779747, "kl": 0.0596466064453125, "learning_rate": 1.8080389724304863e-06, "loss": 0.0024, "reward": 0.25, "reward_std": 0.125, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 1380 }, { "completion_length": 733.03125, "epoch": 1.4725333333333332, "grad_norm": 0.009576570242643356, "kl": 0.031551361083984375, "learning_rate": 1.8062151084314908e-06, "loss": 0.0013, "reward": 0.46875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1381 }, { "completion_length": 567.34375, "epoch": 1.4736, "grad_norm": 0.01492581982165575, "kl": 0.04335975646972656, "learning_rate": 1.8043907719033015e-06, "loss": 0.0017, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1382 }, { "completion_length": 573.375, "epoch": 1.4746666666666668, "grad_norm": 0.002090984955430031, "kl": 0.036640167236328125, "learning_rate": 1.8025659656611033e-06, "loss": 0.0015, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1383 }, { "completion_length": 702.59375, "epoch": 1.4757333333333333, "grad_norm": 0.011985894292593002, "kl": 0.03422737121582031, "learning_rate": 1.8007406925208062e-06, "loss": 0.0014, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1384 }, { "completion_length": 698.21875, "epoch": 1.4768, "grad_norm": 0.05888623371720314, "kl": 0.02741241455078125, "learning_rate": 1.7989149552990414e-06, "loss": 0.0011, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1385 }, { "completion_length": 567.4375, "epoch": 1.4778666666666667, "grad_norm": 0.01015876978635788, "kl": 0.03511810302734375, "learning_rate": 1.797088756813155e-06, "loss": 0.0014, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1386 }, { "completion_length": 481.875, "epoch": 1.4789333333333334, "grad_norm": 0.03175974637269974, "kl": 0.039005279541015625, "learning_rate": 1.795262099881206e-06, "loss": 0.0016, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1387 }, { "completion_length": 718.40625, "epoch": 1.48, "grad_norm": 0.020660271868109703, "kl": 0.037410736083984375, "learning_rate": 1.79343498732196e-06, "loss": 0.0015, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1388 }, { "completion_length": 753.28125, "epoch": 1.4810666666666665, "grad_norm": 0.019862249493598938, "kl": 0.07538795471191406, "learning_rate": 1.7916074219548866e-06, "loss": 0.003, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1389 }, { "completion_length": 694.0625, "epoch": 1.4821333333333333, "grad_norm": 0.0029105409048497677, "kl": 0.020154953002929688, "learning_rate": 1.7897794066001527e-06, "loss": 0.0008, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1390 }, { "completion_length": 649.125, "epoch": 1.4832, "grad_norm": 0.0010029426775872707, "kl": 0.0656280517578125, "learning_rate": 1.7879509440786216e-06, "loss": 0.0026, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1391 }, { "completion_length": 717.3125, "epoch": 1.4842666666666666, "grad_norm": 0.0009193735895678401, "kl": 0.03810310363769531, "learning_rate": 1.7861220372118446e-06, "loss": 0.0015, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1392 }, { "completion_length": 707.9375, "epoch": 1.4853333333333334, "grad_norm": 0.037942416965961456, "kl": 0.0451812744140625, "learning_rate": 1.7842926888220596e-06, "loss": 0.0018, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1393 }, { "completion_length": 492.65625, "epoch": 1.4864, "grad_norm": 0.0008568963967263699, "kl": 0.03375244140625, "learning_rate": 1.7824629017321872e-06, "loss": 0.0014, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1394 }, { "completion_length": 499.625, "epoch": 1.4874666666666667, "grad_norm": 0.0016620937967672944, "kl": 0.02754974365234375, "learning_rate": 1.7806326787658219e-06, "loss": 0.0011, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 1395 }, { "completion_length": 653.125, "epoch": 1.4885333333333333, "grad_norm": 0.010237671434879303, "kl": 0.034046173095703125, "learning_rate": 1.7788020227472334e-06, "loss": 0.0014, "reward": 0.46875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1396 }, { "completion_length": 736.09375, "epoch": 1.4896, "grad_norm": 0.024222588166594505, "kl": 0.056118011474609375, "learning_rate": 1.7769709365013587e-06, "loss": 0.0022, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1397 }, { "completion_length": 646.21875, "epoch": 1.4906666666666666, "grad_norm": 0.010648822411894798, "kl": 0.025157928466796875, "learning_rate": 1.7751394228537989e-06, "loss": 0.001, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1398 }, { "completion_length": 667.59375, "epoch": 1.4917333333333334, "grad_norm": 0.002942193066701293, "kl": 0.03318214416503906, "learning_rate": 1.7733074846308137e-06, "loss": 0.0013, "reward": 0.375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 1399 }, { "completion_length": 842.21875, "epoch": 1.4928, "grad_norm": 0.019109157845377922, "kl": 0.04219818115234375, "learning_rate": 1.7714751246593197e-06, "loss": 0.0017, "reward": 0.25, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 1400 }, { "completion_length": 762.21875, "epoch": 1.4938666666666667, "grad_norm": 0.0057843877002596855, "kl": 0.06724929809570312, "learning_rate": 1.7696423457668832e-06, "loss": 0.0027, "reward": 0.46875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1401 }, { "completion_length": 477.40625, "epoch": 1.4949333333333334, "grad_norm": 0.004817749839276075, "kl": 0.0249481201171875, "learning_rate": 1.7678091507817172e-06, "loss": 0.001, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1402 }, { "completion_length": 627.8125, "epoch": 1.496, "grad_norm": 0.03637614846229553, "kl": 0.058666229248046875, "learning_rate": 1.7659755425326763e-06, "loss": 0.0023, "reward": 0.59375, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1403 }, { "completion_length": 593.8125, "epoch": 1.4970666666666665, "grad_norm": 0.01794705167412758, "kl": 0.02547454833984375, "learning_rate": 1.7641415238492536e-06, "loss": 0.001, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1404 }, { "completion_length": 557.3125, "epoch": 1.4981333333333333, "grad_norm": 0.01311718113720417, "kl": 0.056972503662109375, "learning_rate": 1.7623070975615749e-06, "loss": 0.0023, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1405 }, { "completion_length": 636.65625, "epoch": 1.4992, "grad_norm": 0.0048241158947348595, "kl": 0.028228759765625, "learning_rate": 1.7604722665003958e-06, "loss": 0.0011, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1406 }, { "completion_length": 622.625, "epoch": 1.5002666666666666, "grad_norm": 0.019638555124402046, "kl": 0.04961395263671875, "learning_rate": 1.7586370334970954e-06, "loss": 0.002, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1407 }, { "completion_length": 516.0625, "epoch": 1.5013333333333332, "grad_norm": 0.02490825578570366, "kl": 0.0501556396484375, "learning_rate": 1.756801401383674e-06, "loss": 0.002, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1408 }, { "completion_length": 506.84375, "epoch": 1.5024, "grad_norm": 0.013667884282767773, "kl": 0.02106475830078125, "learning_rate": 1.754965372992747e-06, "loss": 0.0008, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1409 }, { "completion_length": 578.15625, "epoch": 1.5034666666666667, "grad_norm": 0.0007950848084874451, "kl": 0.01242828369140625, "learning_rate": 1.7531289511575427e-06, "loss": 0.0005, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1410 }, { "completion_length": 697.75, "epoch": 1.5045333333333333, "grad_norm": 0.024423375725746155, "kl": 0.0772247314453125, "learning_rate": 1.751292138711895e-06, "loss": 0.0031, "reward": 0.53125, "reward_std": 0.47617512941360474, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1411 }, { "completion_length": 631.125, "epoch": 1.5056, "grad_norm": 0.02266407571732998, "kl": 0.031826019287109375, "learning_rate": 1.7494549384902404e-06, "loss": 0.0013, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1412 }, { "completion_length": 625.3125, "epoch": 1.5066666666666668, "grad_norm": 0.025954829528927803, "kl": 0.049846649169921875, "learning_rate": 1.747617353327616e-06, "loss": 0.002, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1413 }, { "completion_length": 644.28125, "epoch": 1.5077333333333334, "grad_norm": 0.02223033830523491, "kl": 0.0324249267578125, "learning_rate": 1.7457793860596502e-06, "loss": 0.0013, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1414 }, { "completion_length": 603.46875, "epoch": 1.5088, "grad_norm": 0.006684810388833284, "kl": 0.032306671142578125, "learning_rate": 1.7439410395225628e-06, "loss": 0.0013, "reward": 0.65625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1415 }, { "completion_length": 574.21875, "epoch": 1.5098666666666667, "grad_norm": 0.0004039084305986762, "kl": 0.0266876220703125, "learning_rate": 1.7421023165531584e-06, "loss": 0.0011, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1416 }, { "completion_length": 683.0, "epoch": 1.5109333333333335, "grad_norm": 0.015207846648991108, "kl": 0.026775360107421875, "learning_rate": 1.7402632199888222e-06, "loss": 0.0011, "reward": 0.40625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 1417 }, { "completion_length": 687.0, "epoch": 1.512, "grad_norm": 0.0013780429726466537, "kl": 0.03978538513183594, "learning_rate": 1.7384237526675159e-06, "loss": 0.0016, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1418 }, { "completion_length": 585.03125, "epoch": 1.5130666666666666, "grad_norm": 0.0009299692464992404, "kl": 0.02780914306640625, "learning_rate": 1.7365839174277743e-06, "loss": 0.0011, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1419 }, { "completion_length": 667.53125, "epoch": 1.5141333333333333, "grad_norm": 0.01673083007335663, "kl": 0.025875091552734375, "learning_rate": 1.7347437171086989e-06, "loss": 0.001, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1420 }, { "completion_length": 599.59375, "epoch": 1.5152, "grad_norm": 0.014979545958340168, "kl": 0.027021408081054688, "learning_rate": 1.732903154549954e-06, "loss": 0.0011, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1421 }, { "completion_length": 586.84375, "epoch": 1.5162666666666667, "grad_norm": 0.006706463173031807, "kl": 0.025247573852539062, "learning_rate": 1.7310622325917648e-06, "loss": 0.001, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1422 }, { "completion_length": 527.8125, "epoch": 1.5173333333333332, "grad_norm": 0.013668043538928032, "kl": 0.042774200439453125, "learning_rate": 1.7292209540749096e-06, "loss": 0.0017, "reward": 0.78125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1423 }, { "completion_length": 622.03125, "epoch": 1.5184, "grad_norm": 0.02074313722550869, "kl": 0.04017448425292969, "learning_rate": 1.7273793218407175e-06, "loss": 0.0016, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1424 }, { "completion_length": 616.28125, "epoch": 1.5194666666666667, "grad_norm": 0.016958341002464294, "kl": 0.034511566162109375, "learning_rate": 1.7255373387310633e-06, "loss": 0.0014, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1425 }, { "completion_length": 591.09375, "epoch": 1.5205333333333333, "grad_norm": 0.01404169574379921, "kl": 0.021209716796875, "learning_rate": 1.723695007588363e-06, "loss": 0.0008, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1426 }, { "completion_length": 652.71875, "epoch": 1.5215999999999998, "grad_norm": 0.024984318763017654, "kl": 0.0406036376953125, "learning_rate": 1.7218523312555701e-06, "loss": 0.0016, "reward": 0.46875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1427 }, { "completion_length": 667.0, "epoch": 1.5226666666666666, "grad_norm": 0.0015200409106910229, "kl": 0.030941009521484375, "learning_rate": 1.7200093125761706e-06, "loss": 0.0012, "reward": 0.4375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1428 }, { "completion_length": 547.5625, "epoch": 1.5237333333333334, "grad_norm": 0.02082795836031437, "kl": 0.039302825927734375, "learning_rate": 1.7181659543941785e-06, "loss": 0.0016, "reward": 0.78125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1429 }, { "completion_length": 609.625, "epoch": 1.5248, "grad_norm": 0.025898881256580353, "kl": 0.053680419921875, "learning_rate": 1.716322259554132e-06, "loss": 0.0022, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1430 }, { "completion_length": 646.6875, "epoch": 1.5258666666666667, "grad_norm": 0.01943087950348854, "kl": 0.04392242431640625, "learning_rate": 1.714478230901089e-06, "loss": 0.0018, "reward": 0.46875, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1431 }, { "completion_length": 566.34375, "epoch": 1.5269333333333335, "grad_norm": 0.010560310445725918, "kl": 0.03881072998046875, "learning_rate": 1.7126338712806223e-06, "loss": 0.0016, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1432 }, { "completion_length": 691.3125, "epoch": 1.528, "grad_norm": 0.0037186667323112488, "kl": 0.043277740478515625, "learning_rate": 1.7107891835388148e-06, "loss": 0.0017, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1433 }, { "completion_length": 651.75, "epoch": 1.5290666666666666, "grad_norm": 0.01896616816520691, "kl": 0.0470733642578125, "learning_rate": 1.7089441705222568e-06, "loss": 0.0019, "reward": 0.5, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1434 }, { "completion_length": 586.78125, "epoch": 1.5301333333333333, "grad_norm": 0.019276432693004608, "kl": 0.0659637451171875, "learning_rate": 1.7070988350780397e-06, "loss": 0.0026, "reward": 0.75, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1435 }, { "completion_length": 778.6875, "epoch": 1.5312000000000001, "grad_norm": 0.008374935016036034, "kl": 0.0363006591796875, "learning_rate": 1.7052531800537538e-06, "loss": 0.0015, "reward": 0.375, "reward_std": 0.125, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 1436 }, { "completion_length": 606.46875, "epoch": 1.5322666666666667, "grad_norm": 0.020756205543875694, "kl": 0.04238128662109375, "learning_rate": 1.7034072082974805e-06, "loss": 0.0017, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1437 }, { "completion_length": 572.71875, "epoch": 1.5333333333333332, "grad_norm": 0.02243969403207302, "kl": 0.02666473388671875, "learning_rate": 1.701560922657791e-06, "loss": 0.0011, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1438 }, { "completion_length": 618.03125, "epoch": 1.5344, "grad_norm": 0.02122250571846962, "kl": 0.034770965576171875, "learning_rate": 1.699714325983742e-06, "loss": 0.0014, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1439 }, { "completion_length": 687.1875, "epoch": 1.5354666666666668, "grad_norm": 0.014624211937189102, "kl": 0.0323944091796875, "learning_rate": 1.6978674211248676e-06, "loss": 0.0013, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1440 }, { "completion_length": 624.5, "epoch": 1.5365333333333333, "grad_norm": 0.007383113261312246, "kl": 0.03632354736328125, "learning_rate": 1.6960202109311801e-06, "loss": 0.0015, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1441 }, { "completion_length": 703.625, "epoch": 1.5375999999999999, "grad_norm": 0.02276577427983284, "kl": 0.0540008544921875, "learning_rate": 1.6941726982531617e-06, "loss": 0.0022, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1442 }, { "completion_length": 618.375, "epoch": 1.5386666666666666, "grad_norm": 0.0004837214946746826, "kl": 0.01825714111328125, "learning_rate": 1.69232488594176e-06, "loss": 0.0007, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1443 }, { "completion_length": 670.1875, "epoch": 1.5397333333333334, "grad_norm": 0.033442020416259766, "kl": 0.043182373046875, "learning_rate": 1.6904767768483886e-06, "loss": 0.0017, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1444 }, { "completion_length": 689.78125, "epoch": 1.5408, "grad_norm": 0.02027198299765587, "kl": 0.029737472534179688, "learning_rate": 1.6886283738249158e-06, "loss": 0.0012, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1445 }, { "completion_length": 686.40625, "epoch": 1.5418666666666667, "grad_norm": 0.0005447552539408207, "kl": 0.027944564819335938, "learning_rate": 1.6867796797236638e-06, "loss": 0.0011, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1446 }, { "completion_length": 673.71875, "epoch": 1.5429333333333335, "grad_norm": 0.02054557576775551, "kl": 0.08795166015625, "learning_rate": 1.6849306973974063e-06, "loss": 0.0035, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1447 }, { "completion_length": 651.6875, "epoch": 1.544, "grad_norm": 0.024634115397930145, "kl": 0.032947540283203125, "learning_rate": 1.6830814296993592e-06, "loss": 0.0013, "reward": 0.625, "reward_std": 0.4471687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1448 }, { "completion_length": 589.21875, "epoch": 1.5450666666666666, "grad_norm": 0.045302968472242355, "kl": 0.022871017456054688, "learning_rate": 1.6812318794831804e-06, "loss": 0.0009, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1449 }, { "completion_length": 590.03125, "epoch": 1.5461333333333334, "grad_norm": 0.0027858770918101072, "kl": 0.045928955078125, "learning_rate": 1.6793820496029625e-06, "loss": 0.0018, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1450 }, { "completion_length": 574.5, "epoch": 1.5472000000000001, "grad_norm": 0.0210605189204216, "kl": 0.042865753173828125, "learning_rate": 1.6775319429132305e-06, "loss": 0.0017, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1451 }, { "completion_length": 557.1875, "epoch": 1.5482666666666667, "grad_norm": 0.017802750691771507, "kl": 0.04443359375, "learning_rate": 1.6756815622689371e-06, "loss": 0.0018, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1452 }, { "completion_length": 630.8125, "epoch": 1.5493333333333332, "grad_norm": 0.029190413653850555, "kl": 0.040782928466796875, "learning_rate": 1.6738309105254561e-06, "loss": 0.0016, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1453 }, { "completion_length": 669.96875, "epoch": 1.5504, "grad_norm": 0.02164646051824093, "kl": 0.04670143127441406, "learning_rate": 1.6719799905385807e-06, "loss": 0.0019, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1454 }, { "completion_length": 696.5, "epoch": 1.5514666666666668, "grad_norm": 0.0022009345702826977, "kl": 0.03852081298828125, "learning_rate": 1.6701288051645182e-06, "loss": 0.0015, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1455 }, { "completion_length": 621.0, "epoch": 1.5525333333333333, "grad_norm": 0.011342490091919899, "kl": 0.023092269897460938, "learning_rate": 1.6682773572598849e-06, "loss": 0.0009, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1456 }, { "completion_length": 661.59375, "epoch": 1.5535999999999999, "grad_norm": 0.020909376442432404, "kl": 0.050685882568359375, "learning_rate": 1.6664256496817019e-06, "loss": 0.002, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1457 }, { "completion_length": 593.09375, "epoch": 1.5546666666666666, "grad_norm": 0.001505324151366949, "kl": 0.053348541259765625, "learning_rate": 1.664573685287393e-06, "loss": 0.0021, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1458 }, { "completion_length": 557.15625, "epoch": 1.5557333333333334, "grad_norm": 0.024634674191474915, "kl": 0.019670486450195312, "learning_rate": 1.6627214669347755e-06, "loss": 0.0008, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1459 }, { "completion_length": 649.78125, "epoch": 1.5568, "grad_norm": 0.026952771469950676, "kl": 0.027866363525390625, "learning_rate": 1.6608689974820601e-06, "loss": 0.0011, "reward": 0.6875, "reward_std": 0.4471687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1460 }, { "completion_length": 578.375, "epoch": 1.5578666666666665, "grad_norm": 0.035983748733997345, "kl": 0.050518035888671875, "learning_rate": 1.6590162797878457e-06, "loss": 0.002, "reward": 0.53125, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1461 }, { "completion_length": 542.125, "epoch": 1.5589333333333333, "grad_norm": 0.009795625694096088, "kl": 0.05585479736328125, "learning_rate": 1.6571633167111122e-06, "loss": 0.0022, "reward": 0.75, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1462 }, { "completion_length": 568.75, "epoch": 1.56, "grad_norm": 0.0007905043894425035, "kl": 0.05005645751953125, "learning_rate": 1.6553101111112199e-06, "loss": 0.002, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1463 }, { "completion_length": 478.4375, "epoch": 1.5610666666666666, "grad_norm": 0.03327744081616402, "kl": 0.037075042724609375, "learning_rate": 1.653456665847903e-06, "loss": 0.0015, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1464 }, { "completion_length": 563.4375, "epoch": 1.5621333333333334, "grad_norm": 0.0006013494567014277, "kl": 0.043670654296875, "learning_rate": 1.6516029837812648e-06, "loss": 0.0017, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1465 }, { "completion_length": 564.90625, "epoch": 1.5632000000000001, "grad_norm": 0.030525261536240578, "kl": 0.0509796142578125, "learning_rate": 1.6497490677717746e-06, "loss": 0.002, "reward": 0.6875, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1466 }, { "completion_length": 650.65625, "epoch": 1.5642666666666667, "grad_norm": 0.01932027004659176, "kl": 0.06327438354492188, "learning_rate": 1.6478949206802629e-06, "loss": 0.0025, "reward": 0.5625, "reward_std": 0.41367512941360474, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1467 }, { "completion_length": 651.125, "epoch": 1.5653333333333332, "grad_norm": 0.01164123136550188, "kl": 0.025360107421875, "learning_rate": 1.6460405453679164e-06, "loss": 0.001, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1468 }, { "completion_length": 699.8125, "epoch": 1.5664, "grad_norm": 0.0009262848761864007, "kl": 0.039337158203125, "learning_rate": 1.6441859446962743e-06, "loss": 0.0016, "reward": 0.4375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1469 }, { "completion_length": 585.03125, "epoch": 1.5674666666666668, "grad_norm": 0.018543489277362823, "kl": 0.0761566162109375, "learning_rate": 1.642331121527223e-06, "loss": 0.003, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1470 }, { "completion_length": 643.3125, "epoch": 1.5685333333333333, "grad_norm": 0.02948055788874626, "kl": 0.034320831298828125, "learning_rate": 1.6404760787229926e-06, "loss": 0.0014, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1471 }, { "completion_length": 624.375, "epoch": 1.5695999999999999, "grad_norm": 0.011122863739728928, "kl": 0.021846771240234375, "learning_rate": 1.638620819146152e-06, "loss": 0.0009, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1472 }, { "completion_length": 760.875, "epoch": 1.5706666666666667, "grad_norm": 0.021892521530389786, "kl": 0.07042312622070312, "learning_rate": 1.6367653456596054e-06, "loss": 0.0028, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1473 }, { "completion_length": 652.375, "epoch": 1.5717333333333334, "grad_norm": 0.015165319666266441, "kl": 0.041080474853515625, "learning_rate": 1.634909661126586e-06, "loss": 0.0016, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1474 }, { "completion_length": 512.125, "epoch": 1.5728, "grad_norm": 0.012899802066385746, "kl": 0.032440185546875, "learning_rate": 1.6330537684106529e-06, "loss": 0.0013, "reward": 0.59375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1475 }, { "completion_length": 586.0625, "epoch": 1.5738666666666665, "grad_norm": 0.018755193799734116, "kl": 0.031330108642578125, "learning_rate": 1.6311976703756868e-06, "loss": 0.0013, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1476 }, { "completion_length": 678.34375, "epoch": 1.5749333333333333, "grad_norm": 0.019284091889858246, "kl": 0.033054351806640625, "learning_rate": 1.6293413698858847e-06, "loss": 0.0013, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1477 }, { "completion_length": 816.53125, "epoch": 1.576, "grad_norm": 0.014310114085674286, "kl": 0.058139801025390625, "learning_rate": 1.6274848698057572e-06, "loss": 0.0023, "reward": 0.46875, "reward_std": 0.35117512941360474, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1478 }, { "completion_length": 739.03125, "epoch": 1.5770666666666666, "grad_norm": 0.00920061394572258, "kl": 0.093597412109375, "learning_rate": 1.6256281730001213e-06, "loss": 0.0037, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1479 }, { "completion_length": 578.65625, "epoch": 1.5781333333333334, "grad_norm": 0.022563645616173744, "kl": 0.07468795776367188, "learning_rate": 1.623771282334099e-06, "loss": 0.003, "reward": 0.71875, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1480 }, { "completion_length": 604.59375, "epoch": 1.5792000000000002, "grad_norm": 0.02152581699192524, "kl": 0.03551292419433594, "learning_rate": 1.6219142006731103e-06, "loss": 0.0014, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1481 }, { "completion_length": 621.40625, "epoch": 1.5802666666666667, "grad_norm": 0.026471363380551338, "kl": 0.0738677978515625, "learning_rate": 1.6200569308828705e-06, "loss": 0.003, "reward": 0.6875, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1482 }, { "completion_length": 459.75, "epoch": 1.5813333333333333, "grad_norm": 0.023553745821118355, "kl": 0.069488525390625, "learning_rate": 1.6181994758293854e-06, "loss": 0.0028, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1483 }, { "completion_length": 551.59375, "epoch": 1.5824, "grad_norm": 0.0016524199163541198, "kl": 0.050312042236328125, "learning_rate": 1.6163418383789465e-06, "loss": 0.002, "reward": 0.78125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1484 }, { "completion_length": 523.625, "epoch": 1.5834666666666668, "grad_norm": 0.0026003983803093433, "kl": 0.05419921875, "learning_rate": 1.6144840213981257e-06, "loss": 0.0022, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1485 }, { "completion_length": 683.84375, "epoch": 1.5845333333333333, "grad_norm": 0.004192454740405083, "kl": 0.030681610107421875, "learning_rate": 1.6126260277537743e-06, "loss": 0.0012, "reward": 0.46875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1486 }, { "completion_length": 610.6875, "epoch": 1.5856, "grad_norm": 0.01635925844311714, "kl": 0.0496978759765625, "learning_rate": 1.6107678603130144e-06, "loss": 0.002, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1487 }, { "completion_length": 673.5, "epoch": 1.5866666666666667, "grad_norm": 0.015295674093067646, "kl": 0.01412200927734375, "learning_rate": 1.6089095219432359e-06, "loss": 0.0006, "reward": 0.53125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1488 }, { "completion_length": 645.125, "epoch": 1.5877333333333334, "grad_norm": 0.0006534929852932692, "kl": 0.008670806884765625, "learning_rate": 1.6070510155120946e-06, "loss": 0.0003, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1489 }, { "completion_length": 567.65625, "epoch": 1.5888, "grad_norm": 0.009270443581044674, "kl": 0.03014373779296875, "learning_rate": 1.6051923438875037e-06, "loss": 0.0012, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1490 }, { "completion_length": 595.1875, "epoch": 1.5898666666666665, "grad_norm": 0.012912658043205738, "kl": 0.07326507568359375, "learning_rate": 1.6033335099376315e-06, "loss": 0.0029, "reward": 0.65625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1491 }, { "completion_length": 645.875, "epoch": 1.5909333333333333, "grad_norm": 0.0007485125097446144, "kl": 0.043392181396484375, "learning_rate": 1.6014745165308975e-06, "loss": 0.0017, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1492 }, { "completion_length": 663.8125, "epoch": 1.592, "grad_norm": 0.0005711975391022861, "kl": 0.02265167236328125, "learning_rate": 1.599615366535968e-06, "loss": 0.0009, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1493 }, { "completion_length": 616.75, "epoch": 1.5930666666666666, "grad_norm": 0.014764913357794285, "kl": 0.049991607666015625, "learning_rate": 1.5977560628217482e-06, "loss": 0.002, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1494 }, { "completion_length": 640.65625, "epoch": 1.5941333333333332, "grad_norm": 0.016853664070367813, "kl": 0.06753349304199219, "learning_rate": 1.5958966082573837e-06, "loss": 0.0027, "reward": 0.65625, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1495 }, { "completion_length": 737.125, "epoch": 1.5952, "grad_norm": 0.0004955663462169468, "kl": 0.045009613037109375, "learning_rate": 1.594037005712251e-06, "loss": 0.0018, "reward": 0.28125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 1496 }, { "completion_length": 629.21875, "epoch": 1.5962666666666667, "grad_norm": 0.0127304932102561, "kl": 0.0478057861328125, "learning_rate": 1.5921772580559549e-06, "loss": 0.0019, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1497 }, { "completion_length": 704.0, "epoch": 1.5973333333333333, "grad_norm": 0.018157044425606728, "kl": 0.040912628173828125, "learning_rate": 1.590317368158325e-06, "loss": 0.0016, "reward": 0.46875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1498 }, { "completion_length": 709.96875, "epoch": 1.5984, "grad_norm": 0.0010056915925815701, "kl": 0.05666351318359375, "learning_rate": 1.5884573388894102e-06, "loss": 0.0023, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1499 }, { "completion_length": 659.6875, "epoch": 1.5994666666666668, "grad_norm": 0.01128507498651743, "kl": 0.041672706604003906, "learning_rate": 1.5865971731194738e-06, "loss": 0.0017, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1500 }, { "completion_length": 625.3125, "epoch": 1.6005333333333334, "grad_norm": 0.024760091677308083, "kl": 0.009922027587890625, "learning_rate": 1.5847368737189907e-06, "loss": 0.0004, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1501 }, { "completion_length": 548.0625, "epoch": 1.6016, "grad_norm": 0.010713090188801289, "kl": 0.0348968505859375, "learning_rate": 1.582876443558641e-06, "loss": 0.0014, "reward": 0.78125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1502 }, { "completion_length": 531.8125, "epoch": 1.6026666666666667, "grad_norm": 0.01946599781513214, "kl": 0.009754180908203125, "learning_rate": 1.5810158855093075e-06, "loss": 0.0004, "reward": 0.59375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1503 }, { "completion_length": 662.96875, "epoch": 1.6037333333333335, "grad_norm": 0.022514402866363525, "kl": 0.056285858154296875, "learning_rate": 1.5791552024420699e-06, "loss": 0.0023, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1504 }, { "completion_length": 783.125, "epoch": 1.6048, "grad_norm": 0.00043888462823815644, "kl": 0.04312896728515625, "learning_rate": 1.5772943972282007e-06, "loss": 0.0017, "reward": 0.46875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1505 }, { "completion_length": 637.125, "epoch": 1.6058666666666666, "grad_norm": 0.007742941379547119, "kl": 0.11088180541992188, "learning_rate": 1.5754334727391613e-06, "loss": 0.0044, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1506 }, { "completion_length": 606.34375, "epoch": 1.6069333333333333, "grad_norm": 0.018433814868330956, "kl": 0.024415969848632812, "learning_rate": 1.5735724318465962e-06, "loss": 0.001, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1507 }, { "completion_length": 748.84375, "epoch": 1.608, "grad_norm": 0.013482904061675072, "kl": 0.06398773193359375, "learning_rate": 1.5717112774223307e-06, "loss": 0.0026, "reward": 0.375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 1508 }, { "completion_length": 501.9375, "epoch": 1.6090666666666666, "grad_norm": 0.021982554346323013, "kl": 0.03412628173828125, "learning_rate": 1.5698500123383657e-06, "loss": 0.0014, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1509 }, { "completion_length": 678.09375, "epoch": 1.6101333333333332, "grad_norm": 0.001298671355471015, "kl": 0.06557464599609375, "learning_rate": 1.567988639466871e-06, "loss": 0.0026, "reward": 0.71875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1510 }, { "completion_length": 632.09375, "epoch": 1.6112, "grad_norm": 0.028069788590073586, "kl": 0.014179229736328125, "learning_rate": 1.566127161680183e-06, "loss": 0.0006, "reward": 0.65625, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1511 }, { "completion_length": 653.78125, "epoch": 1.6122666666666667, "grad_norm": 0.010497380048036575, "kl": 0.07790374755859375, "learning_rate": 1.5642655818508029e-06, "loss": 0.0031, "reward": 0.4375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1512 }, { "completion_length": 723.03125, "epoch": 1.6133333333333333, "grad_norm": 0.019101565703749657, "kl": 0.07439231872558594, "learning_rate": 1.5624039028513848e-06, "loss": 0.003, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1513 }, { "completion_length": 521.59375, "epoch": 1.6143999999999998, "grad_norm": 0.0004506227851379663, "kl": 0.017333984375, "learning_rate": 1.56054212755474e-06, "loss": 0.0007, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1514 }, { "completion_length": 526.65625, "epoch": 1.6154666666666668, "grad_norm": 0.015125748701393604, "kl": 0.055675506591796875, "learning_rate": 1.5586802588338262e-06, "loss": 0.0022, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1515 }, { "completion_length": 673.0, "epoch": 1.6165333333333334, "grad_norm": 0.0016940500354394317, "kl": 0.061885833740234375, "learning_rate": 1.5568182995617447e-06, "loss": 0.0025, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1516 }, { "completion_length": 569.84375, "epoch": 1.6176, "grad_norm": 0.016915762796998024, "kl": 0.06128692626953125, "learning_rate": 1.5549562526117387e-06, "loss": 0.0025, "reward": 0.78125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1517 }, { "completion_length": 601.03125, "epoch": 1.6186666666666667, "grad_norm": 0.02699512429535389, "kl": 0.031673431396484375, "learning_rate": 1.553094120857185e-06, "loss": 0.0013, "reward": 0.5625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1518 }, { "completion_length": 549.65625, "epoch": 1.6197333333333335, "grad_norm": 0.0010540661169216037, "kl": 0.0199127197265625, "learning_rate": 1.5512319071715916e-06, "loss": 0.0008, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1519 }, { "completion_length": 632.28125, "epoch": 1.6208, "grad_norm": 0.0016561689553782344, "kl": 0.07463455200195312, "learning_rate": 1.5493696144285936e-06, "loss": 0.003, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1520 }, { "completion_length": 679.53125, "epoch": 1.6218666666666666, "grad_norm": 0.019187767058610916, "kl": 0.08098220825195312, "learning_rate": 1.547507245501947e-06, "loss": 0.0032, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1521 }, { "completion_length": 657.21875, "epoch": 1.6229333333333333, "grad_norm": 0.015852797776460648, "kl": 0.029356002807617188, "learning_rate": 1.5456448032655267e-06, "loss": 0.0012, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1522 }, { "completion_length": 484.25, "epoch": 1.624, "grad_norm": 0.019881270825862885, "kl": 0.031169891357421875, "learning_rate": 1.5437822905933191e-06, "loss": 0.0012, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 1523 }, { "completion_length": 666.0625, "epoch": 1.6250666666666667, "grad_norm": 0.0008816152694635093, "kl": 0.07939338684082031, "learning_rate": 1.5419197103594208e-06, "loss": 0.0032, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1524 }, { "completion_length": 558.375, "epoch": 1.6261333333333332, "grad_norm": 0.02647484838962555, "kl": 0.06945037841796875, "learning_rate": 1.540057065438032e-06, "loss": 0.0028, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1525 }, { "completion_length": 795.71875, "epoch": 1.6272, "grad_norm": 0.0008303086506202817, "kl": 0.06937026977539062, "learning_rate": 1.5381943587034526e-06, "loss": 0.0028, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1526 }, { "completion_length": 558.9375, "epoch": 1.6282666666666668, "grad_norm": 0.01216091401875019, "kl": 0.076202392578125, "learning_rate": 1.5363315930300777e-06, "loss": 0.003, "reward": 0.71875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1527 }, { "completion_length": 555.8125, "epoch": 1.6293333333333333, "grad_norm": 0.01898142322897911, "kl": 0.04193115234375, "learning_rate": 1.5344687712923947e-06, "loss": 0.0017, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1528 }, { "completion_length": 601.96875, "epoch": 1.6303999999999998, "grad_norm": 0.012601028196513653, "kl": 0.0226898193359375, "learning_rate": 1.5326058963649756e-06, "loss": 0.0009, "reward": 0.375, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 1529 }, { "completion_length": 663.9375, "epoch": 1.6314666666666666, "grad_norm": 0.0005362481460906565, "kl": 0.038280487060546875, "learning_rate": 1.5307429711224756e-06, "loss": 0.0015, "reward": 0.34375, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 1530 }, { "completion_length": 581.15625, "epoch": 1.6325333333333334, "grad_norm": 0.017124183475971222, "kl": 0.024250030517578125, "learning_rate": 1.5288799984396278e-06, "loss": 0.001, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1531 }, { "completion_length": 610.6875, "epoch": 1.6336, "grad_norm": 0.004973063711076975, "kl": 0.03246498107910156, "learning_rate": 1.5270169811912376e-06, "loss": 0.0013, "reward": 0.65625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1532 }, { "completion_length": 464.0, "epoch": 1.6346666666666667, "grad_norm": 0.002114866627380252, "kl": 0.047283172607421875, "learning_rate": 1.525153922252179e-06, "loss": 0.0019, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1533 }, { "completion_length": 516.4375, "epoch": 1.6357333333333335, "grad_norm": 0.0028758824337273836, "kl": 0.053737640380859375, "learning_rate": 1.5232908244973922e-06, "loss": 0.0021, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1534 }, { "completion_length": 569.4375, "epoch": 1.6368, "grad_norm": 0.020198123529553413, "kl": 0.036937713623046875, "learning_rate": 1.521427690801875e-06, "loss": 0.0015, "reward": 0.78125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1535 }, { "completion_length": 580.03125, "epoch": 1.6378666666666666, "grad_norm": 0.0032537884544581175, "kl": 0.0560455322265625, "learning_rate": 1.519564524040682e-06, "loss": 0.0022, "reward": 0.59375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1536 }, { "completion_length": 698.1875, "epoch": 1.6389333333333334, "grad_norm": 0.0005865933489985764, "kl": 0.026453018188476562, "learning_rate": 1.5177013270889191e-06, "loss": 0.0011, "reward": 0.4375, "reward_std": 0.25, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1537 }, { "completion_length": 532.25, "epoch": 1.6400000000000001, "grad_norm": 0.0004631151387002319, "kl": 0.02758026123046875, "learning_rate": 1.515838102821738e-06, "loss": 0.0011, "reward": 0.75, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1538 }, { "completion_length": 591.28125, "epoch": 1.6410666666666667, "grad_norm": 0.02823726460337639, "kl": 0.05489158630371094, "learning_rate": 1.5139748541143317e-06, "loss": 0.0022, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1539 }, { "completion_length": 576.34375, "epoch": 1.6421333333333332, "grad_norm": 0.000850582669954747, "kl": 0.030612945556640625, "learning_rate": 1.512111583841933e-06, "loss": 0.0012, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1540 }, { "completion_length": 614.8125, "epoch": 1.6432, "grad_norm": 0.01910511963069439, "kl": 0.043460845947265625, "learning_rate": 1.5102482948798072e-06, "loss": 0.0017, "reward": 0.65625, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1541 }, { "completion_length": 496.40625, "epoch": 1.6442666666666668, "grad_norm": 0.012242469936609268, "kl": 0.036838531494140625, "learning_rate": 1.5083849901032472e-06, "loss": 0.0015, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1542 }, { "completion_length": 549.5, "epoch": 1.6453333333333333, "grad_norm": 0.0012427056208252907, "kl": 0.016326904296875, "learning_rate": 1.5065216723875722e-06, "loss": 0.0007, "reward": 0.75, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1543 }, { "completion_length": 711.84375, "epoch": 1.6463999999999999, "grad_norm": 0.0026660265866667032, "kl": 0.07424545288085938, "learning_rate": 1.50465834460812e-06, "loss": 0.003, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1544 }, { "completion_length": 627.78125, "epoch": 1.6474666666666666, "grad_norm": 0.015772750601172447, "kl": 0.04477691650390625, "learning_rate": 1.5027950096402447e-06, "loss": 0.0018, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1545 }, { "completion_length": 630.53125, "epoch": 1.6485333333333334, "grad_norm": 0.012437895871698856, "kl": 0.060199737548828125, "learning_rate": 1.5009316703593107e-06, "loss": 0.0024, "reward": 0.65625, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1546 }, { "completion_length": 628.1875, "epoch": 1.6496, "grad_norm": 0.02269742265343666, "kl": 0.028026580810546875, "learning_rate": 1.4990683296406898e-06, "loss": 0.0011, "reward": 0.625, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1547 }, { "completion_length": 635.3125, "epoch": 1.6506666666666665, "grad_norm": 0.009310560300946236, "kl": 0.034252166748046875, "learning_rate": 1.4972049903597554e-06, "loss": 0.0014, "reward": 0.4375, "reward_std": 0.41367512941360474, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1548 }, { "completion_length": 578.0, "epoch": 1.6517333333333335, "grad_norm": 0.024972768500447273, "kl": 0.00760650634765625, "learning_rate": 1.4953416553918801e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1549 }, { "completion_length": 716.5625, "epoch": 1.6528, "grad_norm": 0.018674077466130257, "kl": 0.02581787109375, "learning_rate": 1.493478327612428e-06, "loss": 0.001, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1550 }, { "completion_length": 740.5, "epoch": 1.6538666666666666, "grad_norm": 0.009972807951271534, "kl": 0.036365509033203125, "learning_rate": 1.4916150098967525e-06, "loss": 0.0015, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1551 }, { "completion_length": 541.46875, "epoch": 1.6549333333333334, "grad_norm": 0.019257208332419395, "kl": 0.055423736572265625, "learning_rate": 1.4897517051201933e-06, "loss": 0.0022, "reward": 0.84375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1552 }, { "completion_length": 551.625, "epoch": 1.6560000000000001, "grad_norm": 0.025481054559350014, "kl": 0.038936614990234375, "learning_rate": 1.487888416158067e-06, "loss": 0.0016, "reward": 0.75, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1553 }, { "completion_length": 450.125, "epoch": 1.6570666666666667, "grad_norm": 0.008357531391084194, "kl": 0.03130340576171875, "learning_rate": 1.4860251458856683e-06, "loss": 0.0013, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1554 }, { "completion_length": 749.90625, "epoch": 1.6581333333333332, "grad_norm": 0.013690453954041004, "kl": 0.030416488647460938, "learning_rate": 1.4841618971782626e-06, "loss": 0.0012, "reward": 0.4375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1555 }, { "completion_length": 562.375, "epoch": 1.6592, "grad_norm": 0.0021870688069611788, "kl": 0.02298736572265625, "learning_rate": 1.4822986729110812e-06, "loss": 0.0009, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1556 }, { "completion_length": 742.90625, "epoch": 1.6602666666666668, "grad_norm": 0.023663608357310295, "kl": 0.046772003173828125, "learning_rate": 1.4804354759593176e-06, "loss": 0.0019, "reward": 0.46875, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1557 }, { "completion_length": 531.59375, "epoch": 1.6613333333333333, "grad_norm": 0.013872145675122738, "kl": 0.034847259521484375, "learning_rate": 1.4785723091981254e-06, "loss": 0.0014, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1558 }, { "completion_length": 577.78125, "epoch": 1.6623999999999999, "grad_norm": 0.0016107176197692752, "kl": 0.045391082763671875, "learning_rate": 1.4767091755026079e-06, "loss": 0.0018, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1559 }, { "completion_length": 725.96875, "epoch": 1.6634666666666666, "grad_norm": 0.012816647067666054, "kl": 0.07960128784179688, "learning_rate": 1.474846077747821e-06, "loss": 0.0032, "reward": 0.5, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1560 }, { "completion_length": 616.6875, "epoch": 1.6645333333333334, "grad_norm": 0.026476765051484108, "kl": 0.048603057861328125, "learning_rate": 1.472983018808763e-06, "loss": 0.0019, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1561 }, { "completion_length": 570.5625, "epoch": 1.6656, "grad_norm": 0.02020035684108734, "kl": 0.025279998779296875, "learning_rate": 1.4711200015603724e-06, "loss": 0.001, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1562 }, { "completion_length": 557.4375, "epoch": 1.6666666666666665, "grad_norm": 0.041316475719213486, "kl": 0.023715972900390625, "learning_rate": 1.4692570288775243e-06, "loss": 0.0009, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1563 }, { "completion_length": 674.34375, "epoch": 1.6677333333333333, "grad_norm": 0.0006495251436717808, "kl": 0.010868072509765625, "learning_rate": 1.4673941036350246e-06, "loss": 0.0004, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1564 }, { "completion_length": 469.625, "epoch": 1.6688, "grad_norm": 0.0012729080626741052, "kl": 0.06691169738769531, "learning_rate": 1.4655312287076054e-06, "loss": 0.0027, "reward": 0.84375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1565 }, { "completion_length": 624.75, "epoch": 1.6698666666666666, "grad_norm": 0.016060469672083855, "kl": 0.022197723388671875, "learning_rate": 1.4636684069699222e-06, "loss": 0.0009, "reward": 0.46875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1566 }, { "completion_length": 561.65625, "epoch": 1.6709333333333334, "grad_norm": 0.00043610608554445207, "kl": 0.033111572265625, "learning_rate": 1.4618056412965479e-06, "loss": 0.0013, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1567 }, { "completion_length": 593.5625, "epoch": 1.6720000000000002, "grad_norm": 0.02100343070924282, "kl": 0.025970458984375, "learning_rate": 1.4599429345619683e-06, "loss": 0.001, "reward": 0.71875, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1568 }, { "completion_length": 587.6875, "epoch": 1.6730666666666667, "grad_norm": 0.011760890483856201, "kl": 0.010395050048828125, "learning_rate": 1.4580802896405793e-06, "loss": 0.0004, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1569 }, { "completion_length": 528.09375, "epoch": 1.6741333333333333, "grad_norm": 0.0013335815165191889, "kl": 0.03508758544921875, "learning_rate": 1.4562177094066814e-06, "loss": 0.0014, "reward": 0.9375, "reward_std": 0.125, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 1570 }, { "completion_length": 635.75, "epoch": 1.6752, "grad_norm": 0.0005951053462922573, "kl": 0.035068511962890625, "learning_rate": 1.4543551967344738e-06, "loss": 0.0014, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1571 }, { "completion_length": 677.5625, "epoch": 1.6762666666666668, "grad_norm": 0.010533375665545464, "kl": 0.024740219116210938, "learning_rate": 1.452492754498053e-06, "loss": 0.001, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1572 }, { "completion_length": 667.6875, "epoch": 1.6773333333333333, "grad_norm": 0.001281802193261683, "kl": 0.040241241455078125, "learning_rate": 1.450630385571407e-06, "loss": 0.0016, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1573 }, { "completion_length": 475.09375, "epoch": 1.6784, "grad_norm": 0.00047662341967225075, "kl": 0.042934417724609375, "learning_rate": 1.4487680928284087e-06, "loss": 0.0017, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1574 }, { "completion_length": 717.6875, "epoch": 1.6794666666666667, "grad_norm": 0.008792465552687645, "kl": 0.047210693359375, "learning_rate": 1.4469058791428154e-06, "loss": 0.0019, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1575 }, { "completion_length": 671.03125, "epoch": 1.6805333333333334, "grad_norm": 0.002760164672508836, "kl": 0.05798912048339844, "learning_rate": 1.4450437473882612e-06, "loss": 0.0023, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1576 }, { "completion_length": 550.65625, "epoch": 1.6816, "grad_norm": 0.0005546698230318725, "kl": 0.041839599609375, "learning_rate": 1.4431817004382556e-06, "loss": 0.0017, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1577 }, { "completion_length": 709.4375, "epoch": 1.6826666666666665, "grad_norm": 0.008587568998336792, "kl": 0.0571441650390625, "learning_rate": 1.4413197411661739e-06, "loss": 0.0023, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1578 }, { "completion_length": 638.9375, "epoch": 1.6837333333333333, "grad_norm": 0.02309885434806347, "kl": 0.04254150390625, "learning_rate": 1.4394578724452598e-06, "loss": 0.0017, "reward": 0.78125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1579 }, { "completion_length": 756.8125, "epoch": 1.6848, "grad_norm": 0.02285153977572918, "kl": 0.057361602783203125, "learning_rate": 1.4375960971486153e-06, "loss": 0.0023, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1580 }, { "completion_length": 670.5, "epoch": 1.6858666666666666, "grad_norm": 0.002098619472235441, "kl": 0.044414520263671875, "learning_rate": 1.4357344181491972e-06, "loss": 0.0018, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1581 }, { "completion_length": 565.1875, "epoch": 1.6869333333333332, "grad_norm": 0.023538053035736084, "kl": 0.039157867431640625, "learning_rate": 1.4338728383198167e-06, "loss": 0.0016, "reward": 0.625, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1582 }, { "completion_length": 652.5625, "epoch": 1.688, "grad_norm": 0.004857766907662153, "kl": 0.09571075439453125, "learning_rate": 1.4320113605331297e-06, "loss": 0.0038, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1583 }, { "completion_length": 722.25, "epoch": 1.6890666666666667, "grad_norm": 0.011051468551158905, "kl": 0.05461883544921875, "learning_rate": 1.4301499876616344e-06, "loss": 0.0022, "reward": 0.5, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1584 }, { "completion_length": 596.21875, "epoch": 1.6901333333333333, "grad_norm": 0.017571991309523582, "kl": 0.06249046325683594, "learning_rate": 1.4282887225776692e-06, "loss": 0.0025, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1585 }, { "completion_length": 557.34375, "epoch": 1.6912, "grad_norm": 0.0009220740175805986, "kl": 0.016246795654296875, "learning_rate": 1.4264275681534043e-06, "loss": 0.0006, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1586 }, { "completion_length": 633.0625, "epoch": 1.6922666666666668, "grad_norm": 0.010267502628266811, "kl": 0.07770347595214844, "learning_rate": 1.4245665272608392e-06, "loss": 0.0031, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1587 }, { "completion_length": 504.5625, "epoch": 1.6933333333333334, "grad_norm": 0.025533290579915047, "kl": 0.0499114990234375, "learning_rate": 1.4227056027717996e-06, "loss": 0.002, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1588 }, { "completion_length": 650.1875, "epoch": 1.6944, "grad_norm": 0.016953717917203903, "kl": 0.09880828857421875, "learning_rate": 1.4208447975579309e-06, "loss": 0.0039, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1589 }, { "completion_length": 383.96875, "epoch": 1.6954666666666667, "grad_norm": 0.03267298638820648, "kl": 0.07241058349609375, "learning_rate": 1.4189841144906928e-06, "loss": 0.0029, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1590 }, { "completion_length": 622.34375, "epoch": 1.6965333333333334, "grad_norm": 0.01067518163472414, "kl": 0.05390167236328125, "learning_rate": 1.4171235564413593e-06, "loss": 0.0022, "reward": 0.4375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1591 }, { "completion_length": 624.8125, "epoch": 1.6976, "grad_norm": 0.0009454087121412158, "kl": 0.02887725830078125, "learning_rate": 1.41526312628101e-06, "loss": 0.0012, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1592 }, { "completion_length": 693.5, "epoch": 1.6986666666666665, "grad_norm": 0.0015375764342024922, "kl": 0.034992218017578125, "learning_rate": 1.4134028268805265e-06, "loss": 0.0014, "reward": 0.46875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1593 }, { "completion_length": 688.34375, "epoch": 1.6997333333333333, "grad_norm": 0.0027157075237482786, "kl": 0.06697845458984375, "learning_rate": 1.4115426611105901e-06, "loss": 0.0027, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1594 }, { "completion_length": 556.96875, "epoch": 1.7008, "grad_norm": 0.0010823271004483104, "kl": 0.0771484375, "learning_rate": 1.4096826318416757e-06, "loss": 0.0031, "reward": 0.75, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1595 }, { "completion_length": 522.46875, "epoch": 1.7018666666666666, "grad_norm": 0.01653400994837284, "kl": 0.019155502319335938, "learning_rate": 1.4078227419440454e-06, "loss": 0.0008, "reward": 0.71875, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1596 }, { "completion_length": 568.59375, "epoch": 1.7029333333333332, "grad_norm": 0.027192506939172745, "kl": 0.049953460693359375, "learning_rate": 1.4059629942877491e-06, "loss": 0.002, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1597 }, { "completion_length": 517.84375, "epoch": 1.704, "grad_norm": 0.020030027255415916, "kl": 0.05312347412109375, "learning_rate": 1.4041033917426168e-06, "loss": 0.0021, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1598 }, { "completion_length": 651.28125, "epoch": 1.7050666666666667, "grad_norm": 0.008888927288353443, "kl": 0.007955551147460938, "learning_rate": 1.402243937178252e-06, "loss": 0.0003, "reward": 0.5, "reward_std": 0.25, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1599 }, { "completion_length": 612.375, "epoch": 1.7061333333333333, "grad_norm": 0.0034335388336330652, "kl": 0.1001129150390625, "learning_rate": 1.4003846334640324e-06, "loss": 0.004, "reward": 0.53125, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1600 }, { "completion_length": 580.75, "epoch": 1.7072, "grad_norm": 0.02341236174106598, "kl": 0.07538223266601562, "learning_rate": 1.3985254834691026e-06, "loss": 0.003, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1601 }, { "completion_length": 701.90625, "epoch": 1.7082666666666668, "grad_norm": 0.02087077684700489, "kl": 0.10430145263671875, "learning_rate": 1.396666490062369e-06, "loss": 0.0042, "reward": 0.4375, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1602 }, { "completion_length": 616.125, "epoch": 1.7093333333333334, "grad_norm": 0.0023236488923430443, "kl": 0.0323486328125, "learning_rate": 1.3948076561124966e-06, "loss": 0.0013, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1603 }, { "completion_length": 567.5625, "epoch": 1.7104, "grad_norm": 0.015865063294768333, "kl": 0.031093597412109375, "learning_rate": 1.3929489844879057e-06, "loss": 0.0012, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1604 }, { "completion_length": 638.8125, "epoch": 1.7114666666666667, "grad_norm": 0.020314138382673264, "kl": 0.020643234252929688, "learning_rate": 1.3910904780567642e-06, "loss": 0.0008, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1605 }, { "completion_length": 627.59375, "epoch": 1.7125333333333335, "grad_norm": 0.01804991066455841, "kl": 0.029102325439453125, "learning_rate": 1.3892321396869856e-06, "loss": 0.0012, "reward": 0.53125, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1606 }, { "completion_length": 577.78125, "epoch": 1.7136, "grad_norm": 0.01602832041680813, "kl": 0.0257110595703125, "learning_rate": 1.3873739722462256e-06, "loss": 0.001, "reward": 0.6875, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1607 }, { "completion_length": 592.71875, "epoch": 1.7146666666666666, "grad_norm": 0.010625237599015236, "kl": 0.09659194946289062, "learning_rate": 1.3855159786018744e-06, "loss": 0.0039, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1608 }, { "completion_length": 655.84375, "epoch": 1.7157333333333333, "grad_norm": 0.011065269820392132, "kl": 0.031696319580078125, "learning_rate": 1.3836581616210538e-06, "loss": 0.0013, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1609 }, { "completion_length": 635.59375, "epoch": 1.7168, "grad_norm": 0.025883419439196587, "kl": 0.050571441650390625, "learning_rate": 1.3818005241706147e-06, "loss": 0.002, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1610 }, { "completion_length": 572.46875, "epoch": 1.7178666666666667, "grad_norm": 0.017869915813207626, "kl": 0.026058197021484375, "learning_rate": 1.37994306911713e-06, "loss": 0.001, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1611 }, { "completion_length": 613.375, "epoch": 1.7189333333333332, "grad_norm": 0.01517536025494337, "kl": 0.0305938720703125, "learning_rate": 1.37808579932689e-06, "loss": 0.0012, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1612 }, { "completion_length": 515.875, "epoch": 1.72, "grad_norm": 0.009024427272379398, "kl": 0.033100128173828125, "learning_rate": 1.3762287176659012e-06, "loss": 0.0013, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1613 }, { "completion_length": 715.53125, "epoch": 1.7210666666666667, "grad_norm": 0.002077645855024457, "kl": 0.040218353271484375, "learning_rate": 1.374371826999879e-06, "loss": 0.0016, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 1614 }, { "completion_length": 668.65625, "epoch": 1.7221333333333333, "grad_norm": 0.000767464516684413, "kl": 0.07601261138916016, "learning_rate": 1.372515130194243e-06, "loss": 0.003, "reward": 0.5, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1615 }, { "completion_length": 669.1875, "epoch": 1.7231999999999998, "grad_norm": 0.025470398366451263, "kl": 0.07561492919921875, "learning_rate": 1.3706586301141154e-06, "loss": 0.003, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1616 }, { "completion_length": 622.0625, "epoch": 1.7242666666666666, "grad_norm": 0.015985995531082153, "kl": 0.0199127197265625, "learning_rate": 1.368802329624314e-06, "loss": 0.0008, "reward": 0.78125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1617 }, { "completion_length": 566.96875, "epoch": 1.7253333333333334, "grad_norm": 0.0010799318552017212, "kl": 0.02600860595703125, "learning_rate": 1.3669462315893474e-06, "loss": 0.001, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1618 }, { "completion_length": 626.15625, "epoch": 1.7264, "grad_norm": 0.014922739937901497, "kl": 0.061962127685546875, "learning_rate": 1.365090338873414e-06, "loss": 0.0025, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1619 }, { "completion_length": 653.40625, "epoch": 1.7274666666666667, "grad_norm": 0.20027899742126465, "kl": 0.049251556396484375, "learning_rate": 1.3632346543403946e-06, "loss": 0.002, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1620 }, { "completion_length": 678.875, "epoch": 1.7285333333333335, "grad_norm": 0.00113101361785084, "kl": 0.03680992126464844, "learning_rate": 1.361379180853848e-06, "loss": 0.0015, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1621 }, { "completion_length": 599.84375, "epoch": 1.7296, "grad_norm": 0.016107989475131035, "kl": 0.04268646240234375, "learning_rate": 1.3595239212770074e-06, "loss": 0.0017, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1622 }, { "completion_length": 572.25, "epoch": 1.7306666666666666, "grad_norm": 0.002070691669359803, "kl": 0.01381683349609375, "learning_rate": 1.3576688784727775e-06, "loss": 0.0006, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1623 }, { "completion_length": 562.71875, "epoch": 1.7317333333333333, "grad_norm": 0.011812915094196796, "kl": 0.042758941650390625, "learning_rate": 1.355814055303726e-06, "loss": 0.0017, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1624 }, { "completion_length": 576.4375, "epoch": 1.7328000000000001, "grad_norm": 0.021693751215934753, "kl": 0.017940521240234375, "learning_rate": 1.3539594546320835e-06, "loss": 0.0007, "reward": 0.75, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1625 }, { "completion_length": 618.46875, "epoch": 1.7338666666666667, "grad_norm": 0.008229823783040047, "kl": 0.03281402587890625, "learning_rate": 1.3521050793197374e-06, "loss": 0.0013, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1626 }, { "completion_length": 447.25, "epoch": 1.7349333333333332, "grad_norm": 0.013121076859533787, "kl": 0.049533843994140625, "learning_rate": 1.3502509322282257e-06, "loss": 0.002, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 1627 }, { "completion_length": 589.03125, "epoch": 1.736, "grad_norm": 0.02164619043469429, "kl": 0.06828689575195312, "learning_rate": 1.3483970162187353e-06, "loss": 0.0027, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1628 }, { "completion_length": 585.8125, "epoch": 1.7370666666666668, "grad_norm": 0.018452303484082222, "kl": 0.056690216064453125, "learning_rate": 1.3465433341520975e-06, "loss": 0.0023, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1629 }, { "completion_length": 471.5625, "epoch": 1.7381333333333333, "grad_norm": 0.020342737436294556, "kl": 0.09187698364257812, "learning_rate": 1.3446898888887806e-06, "loss": 0.0037, "reward": 0.78125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1630 }, { "completion_length": 689.6875, "epoch": 1.7391999999999999, "grad_norm": 0.023992931470274925, "kl": 0.07099151611328125, "learning_rate": 1.3428366832888878e-06, "loss": 0.0028, "reward": 0.53125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1631 }, { "completion_length": 634.40625, "epoch": 1.7402666666666666, "grad_norm": 0.008620304986834526, "kl": 0.0243988037109375, "learning_rate": 1.3409837202121548e-06, "loss": 0.001, "reward": 0.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 1632 }, { "completion_length": 686.9375, "epoch": 1.7413333333333334, "grad_norm": 0.010625900700688362, "kl": 0.07044601440429688, "learning_rate": 1.3391310025179401e-06, "loss": 0.0028, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1633 }, { "completion_length": 505.46875, "epoch": 1.7424, "grad_norm": 0.002077045850455761, "kl": 0.08072662353515625, "learning_rate": 1.3372785330652248e-06, "loss": 0.0032, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1634 }, { "completion_length": 448.40625, "epoch": 1.7434666666666667, "grad_norm": 0.0010098540224134922, "kl": 0.035022735595703125, "learning_rate": 1.335426314712607e-06, "loss": 0.0014, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1635 }, { "completion_length": 698.96875, "epoch": 1.7445333333333335, "grad_norm": 0.01168907806277275, "kl": 0.021877288818359375, "learning_rate": 1.3335743503182982e-06, "loss": 0.0009, "reward": 0.4375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1636 }, { "completion_length": 526.71875, "epoch": 1.7456, "grad_norm": 0.0008874751511029899, "kl": 0.03425407409667969, "learning_rate": 1.3317226427401154e-06, "loss": 0.0014, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1637 }, { "completion_length": 600.65625, "epoch": 1.7466666666666666, "grad_norm": 0.015389233827590942, "kl": 0.0801544189453125, "learning_rate": 1.3298711948354818e-06, "loss": 0.0032, "reward": 0.5625, "reward_std": 0.375, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1638 }, { "completion_length": 654.4375, "epoch": 1.7477333333333334, "grad_norm": 0.014036598615348339, "kl": 0.057514190673828125, "learning_rate": 1.3280200094614196e-06, "loss": 0.0023, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1639 }, { "completion_length": 698.84375, "epoch": 1.7488000000000001, "grad_norm": 0.018412377685308456, "kl": 0.043399810791015625, "learning_rate": 1.3261690894745444e-06, "loss": 0.0017, "reward": 0.4375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1640 }, { "completion_length": 717.40625, "epoch": 1.7498666666666667, "grad_norm": 0.02517620287835598, "kl": 0.0720977783203125, "learning_rate": 1.324318437731063e-06, "loss": 0.0029, "reward": 0.3125, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 1641 }, { "completion_length": 678.78125, "epoch": 1.7509333333333332, "grad_norm": 0.03769215941429138, "kl": 0.03663825988769531, "learning_rate": 1.3224680570867695e-06, "loss": 0.0015, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1642 }, { "completion_length": 638.53125, "epoch": 1.752, "grad_norm": 0.0005521145067177713, "kl": 0.027057647705078125, "learning_rate": 1.3206179503970378e-06, "loss": 0.0011, "reward": 0.5, "reward_std": 0.25, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1643 }, { "completion_length": 642.0, "epoch": 1.7530666666666668, "grad_norm": 0.009183046407997608, "kl": 0.08028221130371094, "learning_rate": 1.3187681205168196e-06, "loss": 0.0032, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1644 }, { "completion_length": 643.28125, "epoch": 1.7541333333333333, "grad_norm": 0.01761985383927822, "kl": 0.027067184448242188, "learning_rate": 1.3169185703006409e-06, "loss": 0.0011, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1645 }, { "completion_length": 627.6875, "epoch": 1.7551999999999999, "grad_norm": 0.02098446525633335, "kl": 0.05891227722167969, "learning_rate": 1.315069302602594e-06, "loss": 0.0023, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1646 }, { "completion_length": 634.21875, "epoch": 1.7562666666666666, "grad_norm": 0.01798068732023239, "kl": 0.0531158447265625, "learning_rate": 1.313220320276336e-06, "loss": 0.0021, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1647 }, { "completion_length": 545.0, "epoch": 1.7573333333333334, "grad_norm": 0.021751374006271362, "kl": 0.11995697021484375, "learning_rate": 1.311371626175085e-06, "loss": 0.0048, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1648 }, { "completion_length": 612.9375, "epoch": 1.7584, "grad_norm": 0.02430207096040249, "kl": 0.10671615600585938, "learning_rate": 1.3095232231516117e-06, "loss": 0.0043, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1649 }, { "completion_length": 609.53125, "epoch": 1.7594666666666665, "grad_norm": 0.02243250422179699, "kl": 0.08968353271484375, "learning_rate": 1.3076751140582396e-06, "loss": 0.0036, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1650 }, { "completion_length": 547.15625, "epoch": 1.7605333333333333, "grad_norm": 0.010916748084127903, "kl": 0.0363311767578125, "learning_rate": 1.305827301746839e-06, "loss": 0.0015, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1651 }, { "completion_length": 659.09375, "epoch": 1.7616, "grad_norm": 0.013421720825135708, "kl": 0.047611236572265625, "learning_rate": 1.3039797890688201e-06, "loss": 0.0019, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1652 }, { "completion_length": 652.90625, "epoch": 1.7626666666666666, "grad_norm": 0.0005918650422245264, "kl": 0.022838592529296875, "learning_rate": 1.3021325788751322e-06, "loss": 0.0009, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1653 }, { "completion_length": 601.96875, "epoch": 1.7637333333333334, "grad_norm": 0.019572753459215164, "kl": 0.08074951171875, "learning_rate": 1.3002856740162586e-06, "loss": 0.0032, "reward": 0.53125, "reward_std": 0.4375, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1654 }, { "completion_length": 589.0, "epoch": 1.7648000000000001, "grad_norm": 0.018073873594403267, "kl": 0.10721588134765625, "learning_rate": 1.2984390773422093e-06, "loss": 0.0043, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1655 }, { "completion_length": 656.03125, "epoch": 1.7658666666666667, "grad_norm": 0.02772505208849907, "kl": 0.054279327392578125, "learning_rate": 1.2965927917025198e-06, "loss": 0.0022, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1656 }, { "completion_length": 624.3125, "epoch": 1.7669333333333332, "grad_norm": 0.013359417207539082, "kl": 0.017635345458984375, "learning_rate": 1.2947468199462467e-06, "loss": 0.0007, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1657 }, { "completion_length": 601.0625, "epoch": 1.768, "grad_norm": 0.043348994106054306, "kl": 0.05635643005371094, "learning_rate": 1.2929011649219606e-06, "loss": 0.0022, "reward": 0.4375, "reward_std": 0.125, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1658 }, { "completion_length": 547.0625, "epoch": 1.7690666666666668, "grad_norm": 0.01852148026227951, "kl": 0.03917694091796875, "learning_rate": 1.2910558294777435e-06, "loss": 0.0016, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1659 }, { "completion_length": 711.28125, "epoch": 1.7701333333333333, "grad_norm": 0.002265875693410635, "kl": 0.0618133544921875, "learning_rate": 1.289210816461186e-06, "loss": 0.0025, "reward": 0.40625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 1660 }, { "completion_length": 672.03125, "epoch": 1.7711999999999999, "grad_norm": 0.020658576861023903, "kl": 0.07562065124511719, "learning_rate": 1.2873661287193782e-06, "loss": 0.003, "reward": 0.625, "reward_std": 0.46650634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1661 }, { "completion_length": 692.03125, "epoch": 1.7722666666666667, "grad_norm": 0.017030460759997368, "kl": 0.058696746826171875, "learning_rate": 1.285521769098911e-06, "loss": 0.0023, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1662 }, { "completion_length": 673.28125, "epoch": 1.7733333333333334, "grad_norm": 0.02617974393069744, "kl": 0.06052398681640625, "learning_rate": 1.283677740445868e-06, "loss": 0.0024, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1663 }, { "completion_length": 530.71875, "epoch": 1.7744, "grad_norm": 0.00040298368548974395, "kl": 0.0586700439453125, "learning_rate": 1.2818340456058218e-06, "loss": 0.0023, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1664 }, { "completion_length": 567.8125, "epoch": 1.7754666666666665, "grad_norm": 0.01853414997458458, "kl": 0.032665252685546875, "learning_rate": 1.2799906874238297e-06, "loss": 0.0013, "reward": 0.8125, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1665 }, { "completion_length": 490.28125, "epoch": 1.7765333333333333, "grad_norm": 0.0181675273925066, "kl": 0.039340972900390625, "learning_rate": 1.2781476687444297e-06, "loss": 0.0016, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1666 }, { "completion_length": 539.96875, "epoch": 1.7776, "grad_norm": 0.020889298990368843, "kl": 0.028690338134765625, "learning_rate": 1.2763049924116371e-06, "loss": 0.0011, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 1667 }, { "completion_length": 611.9375, "epoch": 1.7786666666666666, "grad_norm": 0.017619794234633446, "kl": 0.04247283935546875, "learning_rate": 1.2744626612689368e-06, "loss": 0.0017, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1668 }, { "completion_length": 576.59375, "epoch": 1.7797333333333332, "grad_norm": 0.02982979826629162, "kl": 0.06986045837402344, "learning_rate": 1.2726206781592824e-06, "loss": 0.0028, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1669 }, { "completion_length": 660.5625, "epoch": 1.7808000000000002, "grad_norm": 3.0927441120147705, "kl": 1.3267669677734375, "learning_rate": 1.2707790459250905e-06, "loss": 0.0532, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1670 }, { "completion_length": 640.0, "epoch": 1.7818666666666667, "grad_norm": 0.015379548072814941, "kl": 0.028484344482421875, "learning_rate": 1.2689377674082355e-06, "loss": 0.0011, "reward": 0.65625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1671 }, { "completion_length": 743.875, "epoch": 1.7829333333333333, "grad_norm": 0.010633885860443115, "kl": 0.06176948547363281, "learning_rate": 1.267096845450046e-06, "loss": 0.0025, "reward": 0.40625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 1672 }, { "completion_length": 683.8125, "epoch": 1.784, "grad_norm": 0.024792315438389778, "kl": 0.118865966796875, "learning_rate": 1.2652562828913019e-06, "loss": 0.0048, "reward": 0.46875, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1673 }, { "completion_length": 641.9375, "epoch": 1.7850666666666668, "grad_norm": 0.0010042012436315417, "kl": 0.020198822021484375, "learning_rate": 1.263416082572226e-06, "loss": 0.0008, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1674 }, { "completion_length": 677.375, "epoch": 1.7861333333333334, "grad_norm": 0.0009322188561782241, "kl": 0.07474136352539062, "learning_rate": 1.261576247332484e-06, "loss": 0.003, "reward": 0.53125, "reward_std": 0.4233439117670059, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1675 }, { "completion_length": 781.46875, "epoch": 1.7872, "grad_norm": 0.008486129343509674, "kl": 0.056095123291015625, "learning_rate": 1.2597367800111783e-06, "loss": 0.0022, "reward": 0.375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 1676 }, { "completion_length": 599.78125, "epoch": 1.7882666666666667, "grad_norm": 0.017604554072022438, "kl": 0.1006622314453125, "learning_rate": 1.257897683446842e-06, "loss": 0.004, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1677 }, { "completion_length": 711.1875, "epoch": 1.7893333333333334, "grad_norm": 0.018927862867712975, "kl": 0.07457733154296875, "learning_rate": 1.2560589604774372e-06, "loss": 0.003, "reward": 0.46875, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1678 }, { "completion_length": 614.75, "epoch": 1.7904, "grad_norm": 0.01261872984468937, "kl": 0.09139251708984375, "learning_rate": 1.25422061394035e-06, "loss": 0.0037, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1679 }, { "completion_length": 727.71875, "epoch": 1.7914666666666665, "grad_norm": 0.018505482003092766, "kl": 0.12089157104492188, "learning_rate": 1.2523826466723843e-06, "loss": 0.0048, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1680 }, { "completion_length": 716.5, "epoch": 1.7925333333333333, "grad_norm": 0.024239063262939453, "kl": 0.13699722290039062, "learning_rate": 1.2505450615097595e-06, "loss": 0.0055, "reward": 0.46875, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1681 }, { "completion_length": 649.65625, "epoch": 1.7936, "grad_norm": 0.01996826007962227, "kl": 0.12799072265625, "learning_rate": 1.2487078612881058e-06, "loss": 0.0051, "reward": 0.5, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1682 }, { "completion_length": 690.59375, "epoch": 1.7946666666666666, "grad_norm": 0.00041609170148149133, "kl": 0.12511444091796875, "learning_rate": 1.2468710488424574e-06, "loss": 0.005, "reward": 0.46875, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1683 }, { "completion_length": 581.21875, "epoch": 1.7957333333333332, "grad_norm": 0.013994589447975159, "kl": 0.053829193115234375, "learning_rate": 1.2450346270072528e-06, "loss": 0.0022, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1684 }, { "completion_length": 598.0, "epoch": 1.7968, "grad_norm": 0.020420458167791367, "kl": 0.0309906005859375, "learning_rate": 1.2431985986163266e-06, "loss": 0.0012, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1685 }, { "completion_length": 688.25, "epoch": 1.7978666666666667, "grad_norm": 0.019061345607042313, "kl": 0.07246017456054688, "learning_rate": 1.2413629665029049e-06, "loss": 0.0029, "reward": 0.5, "reward_std": 0.46650634706020355, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1686 }, { "completion_length": 694.90625, "epoch": 1.7989333333333333, "grad_norm": 0.020004941150546074, "kl": 0.0876007080078125, "learning_rate": 1.2395277334996047e-06, "loss": 0.0035, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1687 }, { "completion_length": 552.4375, "epoch": 1.8, "grad_norm": 0.016389042139053345, "kl": 0.05340576171875, "learning_rate": 1.2376929024384256e-06, "loss": 0.0021, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1688 }, { "completion_length": 566.875, "epoch": 1.8010666666666668, "grad_norm": 0.0413937009871006, "kl": 0.06042671203613281, "learning_rate": 1.2358584761507467e-06, "loss": 0.0024, "reward": 0.625, "reward_std": 0.4471687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1689 }, { "completion_length": 600.53125, "epoch": 1.8021333333333334, "grad_norm": 0.030884262174367905, "kl": 0.0659332275390625, "learning_rate": 1.2340244574673238e-06, "loss": 0.0026, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1690 }, { "completion_length": 621.875, "epoch": 1.8032, "grad_norm": 0.0003060042508877814, "kl": 0.04146575927734375, "learning_rate": 1.2321908492182833e-06, "loss": 0.0017, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1691 }, { "completion_length": 745.1875, "epoch": 1.8042666666666667, "grad_norm": 0.0013377133291214705, "kl": 0.10842132568359375, "learning_rate": 1.2303576542331168e-06, "loss": 0.0043, "reward": 0.4375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1692 }, { "completion_length": 684.0, "epoch": 1.8053333333333335, "grad_norm": 0.0015809130854904652, "kl": 0.06893157958984375, "learning_rate": 1.2285248753406804e-06, "loss": 0.0028, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1693 }, { "completion_length": 528.9375, "epoch": 1.8064, "grad_norm": 0.013378932140767574, "kl": 0.024234771728515625, "learning_rate": 1.226692515369186e-06, "loss": 0.001, "reward": 0.78125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1694 }, { "completion_length": 572.65625, "epoch": 1.8074666666666666, "grad_norm": 0.0017017939826473594, "kl": 0.06716156005859375, "learning_rate": 1.2248605771462016e-06, "loss": 0.0027, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1695 }, { "completion_length": 665.6875, "epoch": 1.8085333333333333, "grad_norm": 0.002141845179721713, "kl": 0.031970977783203125, "learning_rate": 1.2230290634986416e-06, "loss": 0.0013, "reward": 0.4375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1696 }, { "completion_length": 547.4375, "epoch": 1.8096, "grad_norm": 0.018682334572076797, "kl": 0.06816482543945312, "learning_rate": 1.2211979772527665e-06, "loss": 0.0027, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1697 }, { "completion_length": 597.28125, "epoch": 1.8106666666666666, "grad_norm": 0.014169125817716122, "kl": 0.07184600830078125, "learning_rate": 1.2193673212341784e-06, "loss": 0.0029, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1698 }, { "completion_length": 608.28125, "epoch": 1.8117333333333332, "grad_norm": 0.01853131502866745, "kl": 0.09654617309570312, "learning_rate": 1.217537098267813e-06, "loss": 0.0039, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1699 }, { "completion_length": 596.84375, "epoch": 1.8128, "grad_norm": 0.015249219723045826, "kl": 0.06634140014648438, "learning_rate": 1.21570731117794e-06, "loss": 0.0027, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1700 }, { "completion_length": 633.34375, "epoch": 1.8138666666666667, "grad_norm": 0.0011856269557029009, "kl": 0.09853363037109375, "learning_rate": 1.213877962788156e-06, "loss": 0.0039, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1701 }, { "completion_length": 587.71875, "epoch": 1.8149333333333333, "grad_norm": 0.0009347070590592921, "kl": 0.014644622802734375, "learning_rate": 1.212049055921379e-06, "loss": 0.0006, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1702 }, { "completion_length": 498.59375, "epoch": 1.8159999999999998, "grad_norm": 0.014975464902818203, "kl": 0.04799652099609375, "learning_rate": 1.2102205933998473e-06, "loss": 0.0019, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1703 }, { "completion_length": 653.21875, "epoch": 1.8170666666666668, "grad_norm": 0.001381723559461534, "kl": 0.06020355224609375, "learning_rate": 1.2083925780451142e-06, "loss": 0.0024, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1704 }, { "completion_length": 715.46875, "epoch": 1.8181333333333334, "grad_norm": 0.008056794293224812, "kl": 0.04378509521484375, "learning_rate": 1.20656501267804e-06, "loss": 0.0017, "reward": 0.21875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.0, "step": 1705 }, { "completion_length": 553.15625, "epoch": 1.8192, "grad_norm": 0.0024189534597098827, "kl": 0.06752777099609375, "learning_rate": 1.2047379001187942e-06, "loss": 0.0027, "reward": 0.75, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1706 }, { "completion_length": 586.84375, "epoch": 1.8202666666666667, "grad_norm": 0.02374538965523243, "kl": 0.05748748779296875, "learning_rate": 1.2029112431868455e-06, "loss": 0.0023, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1707 }, { "completion_length": 739.78125, "epoch": 1.8213333333333335, "grad_norm": 0.0026971548795700073, "kl": 0.06819534301757812, "learning_rate": 1.2010850447009587e-06, "loss": 0.0027, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1708 }, { "completion_length": 512.6875, "epoch": 1.8224, "grad_norm": 0.021221458911895752, "kl": 0.044521331787109375, "learning_rate": 1.1992593074791938e-06, "loss": 0.0018, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1709 }, { "completion_length": 558.4375, "epoch": 1.8234666666666666, "grad_norm": 0.015692779794335365, "kl": 0.05144500732421875, "learning_rate": 1.1974340343388974e-06, "loss": 0.0021, "reward": 0.53125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1710 }, { "completion_length": 618.34375, "epoch": 1.8245333333333333, "grad_norm": 0.001000017742626369, "kl": 0.0340728759765625, "learning_rate": 1.1956092280966986e-06, "loss": 0.0014, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1711 }, { "completion_length": 634.875, "epoch": 1.8256000000000001, "grad_norm": 0.021018600091338158, "kl": 0.07591629028320312, "learning_rate": 1.1937848915685093e-06, "loss": 0.003, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1712 }, { "completion_length": 630.90625, "epoch": 1.8266666666666667, "grad_norm": 0.01936923712491989, "kl": 0.06328582763671875, "learning_rate": 1.1919610275695144e-06, "loss": 0.0025, "reward": 0.65625, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1713 }, { "completion_length": 705.40625, "epoch": 1.8277333333333332, "grad_norm": 0.00044791566324420273, "kl": 0.046176910400390625, "learning_rate": 1.190137638914169e-06, "loss": 0.0018, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1714 }, { "completion_length": 684.875, "epoch": 1.8288, "grad_norm": 0.017162738367915154, "kl": 0.10918045043945312, "learning_rate": 1.1883147284161966e-06, "loss": 0.0044, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1715 }, { "completion_length": 527.71875, "epoch": 1.8298666666666668, "grad_norm": 0.0002735160232987255, "kl": 0.01238250732421875, "learning_rate": 1.186492298888582e-06, "loss": 0.0005, "reward": 0.65625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1716 }, { "completion_length": 667.625, "epoch": 1.8309333333333333, "grad_norm": 0.008081384003162384, "kl": 0.039306640625, "learning_rate": 1.184670353143567e-06, "loss": 0.0016, "reward": 0.5625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1717 }, { "completion_length": 620.96875, "epoch": 1.8319999999999999, "grad_norm": 0.001585213583894074, "kl": 0.05291748046875, "learning_rate": 1.1828488939926482e-06, "loss": 0.0021, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1718 }, { "completion_length": 681.375, "epoch": 1.8330666666666666, "grad_norm": 0.0006892421515658498, "kl": 0.054477691650390625, "learning_rate": 1.1810279242465714e-06, "loss": 0.0022, "reward": 0.5625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1719 }, { "completion_length": 652.21875, "epoch": 1.8341333333333334, "grad_norm": 0.010631871409714222, "kl": 0.032810211181640625, "learning_rate": 1.179207446715325e-06, "loss": 0.0013, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1720 }, { "completion_length": 581.375, "epoch": 1.8352, "grad_norm": 0.0005056529771536589, "kl": 0.026782989501953125, "learning_rate": 1.17738746420814e-06, "loss": 0.0011, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1721 }, { "completion_length": 553.125, "epoch": 1.8362666666666667, "grad_norm": 0.0006245485856197774, "kl": 0.0093536376953125, "learning_rate": 1.1755679795334832e-06, "loss": 0.0004, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1722 }, { "completion_length": 587.40625, "epoch": 1.8373333333333335, "grad_norm": 0.010481102392077446, "kl": 0.06780242919921875, "learning_rate": 1.1737489954990528e-06, "loss": 0.0027, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1723 }, { "completion_length": 692.4375, "epoch": 1.8384, "grad_norm": 0.012212998233735561, "kl": 0.08789634704589844, "learning_rate": 1.1719305149117741e-06, "loss": 0.0035, "reward": 0.4375, "reward_std": 0.5386751294136047, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1724 }, { "completion_length": 635.53125, "epoch": 1.8394666666666666, "grad_norm": 0.02750592865049839, "kl": 0.026302337646484375, "learning_rate": 1.1701125405777965e-06, "loss": 0.0011, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1725 }, { "completion_length": 645.75, "epoch": 1.8405333333333334, "grad_norm": 0.010738545097410679, "kl": 0.049365997314453125, "learning_rate": 1.1682950753024875e-06, "loss": 0.002, "reward": 0.71875, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1726 }, { "completion_length": 539.46875, "epoch": 1.8416000000000001, "grad_norm": 0.010605545714497566, "kl": 0.050571441650390625, "learning_rate": 1.166478121890428e-06, "loss": 0.002, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1727 }, { "completion_length": 588.5625, "epoch": 1.8426666666666667, "grad_norm": 0.0132752088829875, "kl": 0.0440673828125, "learning_rate": 1.164661683145412e-06, "loss": 0.0018, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1728 }, { "completion_length": 655.125, "epoch": 1.8437333333333332, "grad_norm": 0.008250069804489613, "kl": 0.11346054077148438, "learning_rate": 1.1628457618704367e-06, "loss": 0.0045, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1729 }, { "completion_length": 681.71875, "epoch": 1.8448, "grad_norm": 0.01104762777686119, "kl": 0.10578536987304688, "learning_rate": 1.161030360867701e-06, "loss": 0.0042, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1730 }, { "completion_length": 619.28125, "epoch": 1.8458666666666668, "grad_norm": 0.018258849158883095, "kl": 0.06209564208984375, "learning_rate": 1.1592154829386022e-06, "loss": 0.0025, "reward": 0.5, "reward_std": 0.25, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1731 }, { "completion_length": 741.59375, "epoch": 1.8469333333333333, "grad_norm": 0.023270804435014725, "kl": 0.049327850341796875, "learning_rate": 1.1574011308837302e-06, "loss": 0.002, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1732 }, { "completion_length": 669.21875, "epoch": 1.8479999999999999, "grad_norm": 0.02927088551223278, "kl": 0.051631927490234375, "learning_rate": 1.1555873075028614e-06, "loss": 0.0021, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1733 }, { "completion_length": 660.65625, "epoch": 1.8490666666666666, "grad_norm": 0.0008012840989977121, "kl": 0.033832550048828125, "learning_rate": 1.1537740155949595e-06, "loss": 0.0014, "reward": 0.5, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1734 }, { "completion_length": 727.28125, "epoch": 1.8501333333333334, "grad_norm": 0.019025232642889023, "kl": 0.09696197509765625, "learning_rate": 1.1519612579581663e-06, "loss": 0.0039, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1735 }, { "completion_length": 643.53125, "epoch": 1.8512, "grad_norm": 0.009031449444591999, "kl": 0.035480499267578125, "learning_rate": 1.1501490373897983e-06, "loss": 0.0014, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1736 }, { "completion_length": 436.84375, "epoch": 1.8522666666666665, "grad_norm": 0.01537216268479824, "kl": 0.054046630859375, "learning_rate": 1.1483373566863454e-06, "loss": 0.0022, "reward": 0.78125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1737 }, { "completion_length": 556.09375, "epoch": 1.8533333333333335, "grad_norm": 0.027655011042952538, "kl": 0.06374359130859375, "learning_rate": 1.1465262186434634e-06, "loss": 0.0026, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1738 }, { "completion_length": 632.65625, "epoch": 1.8544, "grad_norm": 0.02484734356403351, "kl": 0.15361785888671875, "learning_rate": 1.1447156260559696e-06, "loss": 0.0061, "reward": 0.625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1739 }, { "completion_length": 704.34375, "epoch": 1.8554666666666666, "grad_norm": 0.021861447021365166, "kl": 0.025569915771484375, "learning_rate": 1.142905581717841e-06, "loss": 0.001, "reward": 0.34375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 1740 }, { "completion_length": 571.96875, "epoch": 1.8565333333333334, "grad_norm": 0.0005531475180760026, "kl": 0.03763580322265625, "learning_rate": 1.1410960884222095e-06, "loss": 0.0015, "reward": 0.84375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1741 }, { "completion_length": 679.0625, "epoch": 1.8576000000000001, "grad_norm": 0.025043722242116928, "kl": 0.07675933837890625, "learning_rate": 1.1392871489613538e-06, "loss": 0.0031, "reward": 0.40625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 1742 }, { "completion_length": 478.4375, "epoch": 1.8586666666666667, "grad_norm": 0.0010701288701966405, "kl": 0.02941131591796875, "learning_rate": 1.1374787661266998e-06, "loss": 0.0012, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1743 }, { "completion_length": 681.65625, "epoch": 1.8597333333333332, "grad_norm": 0.013409364968538284, "kl": 0.037700653076171875, "learning_rate": 1.135670942708815e-06, "loss": 0.0015, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1744 }, { "completion_length": 613.4375, "epoch": 1.8608, "grad_norm": 0.02071770839393139, "kl": 0.09413909912109375, "learning_rate": 1.1338636814974022e-06, "loss": 0.0038, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1745 }, { "completion_length": 690.75, "epoch": 1.8618666666666668, "grad_norm": 0.02210245095193386, "kl": 0.07628822326660156, "learning_rate": 1.132056985281297e-06, "loss": 0.0031, "reward": 0.40625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 1746 }, { "completion_length": 731.21875, "epoch": 1.8629333333333333, "grad_norm": 0.0046680998057127, "kl": 0.061107635498046875, "learning_rate": 1.1302508568484645e-06, "loss": 0.0024, "reward": 0.4375, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1747 }, { "completion_length": 539.5625, "epoch": 1.8639999999999999, "grad_norm": 0.0005465564900077879, "kl": 0.00815582275390625, "learning_rate": 1.1284452989859915e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1748 }, { "completion_length": 684.9375, "epoch": 1.8650666666666667, "grad_norm": 0.01404372975230217, "kl": 0.0618743896484375, "learning_rate": 1.1266403144800856e-06, "loss": 0.0025, "reward": 0.5, "reward_std": 0.25, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1749 }, { "completion_length": 556.90625, "epoch": 1.8661333333333334, "grad_norm": 0.0013739976566284895, "kl": 0.030872344970703125, "learning_rate": 1.12483590611607e-06, "loss": 0.0012, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1750 }, { "completion_length": 477.59375, "epoch": 1.8672, "grad_norm": 0.020655468106269836, "kl": 0.04721832275390625, "learning_rate": 1.123032076678378e-06, "loss": 0.0019, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1751 }, { "completion_length": 610.15625, "epoch": 1.8682666666666665, "grad_norm": 0.013100720010697842, "kl": 0.05876922607421875, "learning_rate": 1.1212288289505494e-06, "loss": 0.0024, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1752 }, { "completion_length": 735.40625, "epoch": 1.8693333333333333, "grad_norm": 0.0012002745643258095, "kl": 0.048519134521484375, "learning_rate": 1.1194261657152274e-06, "loss": 0.0019, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1753 }, { "completion_length": 706.71875, "epoch": 1.8704, "grad_norm": 0.0005996609688736498, "kl": 0.022802352905273438, "learning_rate": 1.1176240897541525e-06, "loss": 0.0009, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1754 }, { "completion_length": 669.3125, "epoch": 1.8714666666666666, "grad_norm": 0.014643040485680103, "kl": 0.07394790649414062, "learning_rate": 1.1158226038481584e-06, "loss": 0.003, "reward": 0.65625, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1755 }, { "completion_length": 781.0, "epoch": 1.8725333333333334, "grad_norm": 0.02552828937768936, "kl": 0.046062469482421875, "learning_rate": 1.1140217107771697e-06, "loss": 0.0018, "reward": 0.28125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 1756 }, { "completion_length": 628.9375, "epoch": 1.8736000000000002, "grad_norm": 0.01609920524060726, "kl": 0.05692863464355469, "learning_rate": 1.1122214133201954e-06, "loss": 0.0023, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1757 }, { "completion_length": 656.21875, "epoch": 1.8746666666666667, "grad_norm": 0.025619085878133774, "kl": 0.10176277160644531, "learning_rate": 1.1104217142553247e-06, "loss": 0.0041, "reward": 0.40625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 1758 }, { "completion_length": 590.15625, "epoch": 1.8757333333333333, "grad_norm": 0.008942621760070324, "kl": 0.03241729736328125, "learning_rate": 1.1086226163597249e-06, "loss": 0.0013, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1759 }, { "completion_length": 610.78125, "epoch": 1.8768, "grad_norm": 0.007973313331604004, "kl": 0.04097938537597656, "learning_rate": 1.1068241224096349e-06, "loss": 0.0016, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1760 }, { "completion_length": 617.125, "epoch": 1.8778666666666668, "grad_norm": 0.011099555529654026, "kl": 0.07863235473632812, "learning_rate": 1.105026235180361e-06, "loss": 0.0031, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1761 }, { "completion_length": 571.03125, "epoch": 1.8789333333333333, "grad_norm": 0.02060103975236416, "kl": 0.022808074951171875, "learning_rate": 1.1032289574462737e-06, "loss": 0.0009, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1762 }, { "completion_length": 594.90625, "epoch": 1.88, "grad_norm": 0.027057351544499397, "kl": 0.020496368408203125, "learning_rate": 1.1014322919808043e-06, "loss": 0.0008, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1763 }, { "completion_length": 698.875, "epoch": 1.8810666666666667, "grad_norm": 0.01110667735338211, "kl": 0.05197906494140625, "learning_rate": 1.099636241556437e-06, "loss": 0.0021, "reward": 0.625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1764 }, { "completion_length": 548.71875, "epoch": 1.8821333333333334, "grad_norm": 0.015816988423466682, "kl": 0.022663116455078125, "learning_rate": 1.0978408089447078e-06, "loss": 0.0009, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1765 }, { "completion_length": 574.09375, "epoch": 1.8832, "grad_norm": 0.019237153232097626, "kl": 0.0445098876953125, "learning_rate": 1.0960459969162008e-06, "loss": 0.0018, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1766 }, { "completion_length": 586.3125, "epoch": 1.8842666666666665, "grad_norm": 0.0018311678431928158, "kl": 0.065216064453125, "learning_rate": 1.0942518082405401e-06, "loss": 0.0026, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1767 }, { "completion_length": 584.65625, "epoch": 1.8853333333333333, "grad_norm": 0.010896485298871994, "kl": 0.03183746337890625, "learning_rate": 1.0924582456863889e-06, "loss": 0.0013, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1768 }, { "completion_length": 619.4375, "epoch": 1.8864, "grad_norm": 0.01717567630112171, "kl": 0.04140472412109375, "learning_rate": 1.0906653120214456e-06, "loss": 0.0017, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1769 }, { "completion_length": 553.5625, "epoch": 1.8874666666666666, "grad_norm": 0.009164064191281796, "kl": 0.10186004638671875, "learning_rate": 1.0888730100124355e-06, "loss": 0.0041, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1770 }, { "completion_length": 408.28125, "epoch": 1.8885333333333332, "grad_norm": 0.0009144353098236024, "kl": 0.01335906982421875, "learning_rate": 1.0870813424251104e-06, "loss": 0.0005, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.0, "step": 1771 }, { "completion_length": 601.34375, "epoch": 1.8896, "grad_norm": 0.0007937567424960434, "kl": 0.058139801025390625, "learning_rate": 1.0852903120242449e-06, "loss": 0.0023, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1772 }, { "completion_length": 594.03125, "epoch": 1.8906666666666667, "grad_norm": 0.013473143801093102, "kl": 0.05825042724609375, "learning_rate": 1.0834999215736271e-06, "loss": 0.0023, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1773 }, { "completion_length": 526.8125, "epoch": 1.8917333333333333, "grad_norm": 0.02073027938604355, "kl": 0.04424285888671875, "learning_rate": 1.0817101738360593e-06, "loss": 0.0018, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 1774 }, { "completion_length": 607.875, "epoch": 1.8928, "grad_norm": 0.0027431724593043327, "kl": 0.06110382080078125, "learning_rate": 1.079921071573353e-06, "loss": 0.0024, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1775 }, { "completion_length": 592.84375, "epoch": 1.8938666666666668, "grad_norm": 0.002120625926181674, "kl": 0.016567230224609375, "learning_rate": 1.0781326175463212e-06, "loss": 0.0007, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1776 }, { "completion_length": 484.03125, "epoch": 1.8949333333333334, "grad_norm": 0.021015668287873268, "kl": 0.0084991455078125, "learning_rate": 1.0763448145147778e-06, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 0.0, "step": 1777 }, { "completion_length": 730.96875, "epoch": 1.896, "grad_norm": 0.0016972849844023585, "kl": 0.04518699645996094, "learning_rate": 1.0745576652375336e-06, "loss": 0.0018, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1778 }, { "completion_length": 656.84375, "epoch": 1.8970666666666667, "grad_norm": 0.01941998116672039, "kl": 0.054622650146484375, "learning_rate": 1.0727711724723881e-06, "loss": 0.0022, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1779 }, { "completion_length": 600.0, "epoch": 1.8981333333333335, "grad_norm": 0.009836725890636444, "kl": 0.04534912109375, "learning_rate": 1.0709853389761288e-06, "loss": 0.0018, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1780 }, { "completion_length": 608.15625, "epoch": 1.8992, "grad_norm": 0.002337727462872863, "kl": 0.12274169921875, "learning_rate": 1.069200167504526e-06, "loss": 0.0049, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1781 }, { "completion_length": 453.5, "epoch": 1.9002666666666665, "grad_norm": 0.01833314448595047, "kl": 0.06450271606445312, "learning_rate": 1.0674156608123294e-06, "loss": 0.0026, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 1782 }, { "completion_length": 589.65625, "epoch": 1.9013333333333333, "grad_norm": 0.012342681176960468, "kl": 0.08782196044921875, "learning_rate": 1.06563182165326e-06, "loss": 0.0035, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1783 }, { "completion_length": 718.28125, "epoch": 1.9024, "grad_norm": 0.0017266606446355581, "kl": 0.04189872741699219, "learning_rate": 1.0638486527800112e-06, "loss": 0.0017, "reward": 0.5625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1784 }, { "completion_length": 505.75, "epoch": 1.9034666666666666, "grad_norm": 0.02014044113457203, "kl": 0.04974937438964844, "learning_rate": 1.062066156944242e-06, "loss": 0.002, "reward": 0.75, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1785 }, { "completion_length": 591.65625, "epoch": 1.9045333333333332, "grad_norm": 0.014723171480000019, "kl": 0.0278167724609375, "learning_rate": 1.0602843368965712e-06, "loss": 0.0011, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1786 }, { "completion_length": 630.96875, "epoch": 1.9056, "grad_norm": 0.001145340851508081, "kl": 0.07723617553710938, "learning_rate": 1.0585031953865756e-06, "loss": 0.0031, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1787 }, { "completion_length": 710.40625, "epoch": 1.9066666666666667, "grad_norm": 0.0015806120354682207, "kl": 0.061069488525390625, "learning_rate": 1.0567227351627864e-06, "loss": 0.0024, "reward": 0.28125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 1788 }, { "completion_length": 612.6875, "epoch": 1.9077333333333333, "grad_norm": 0.008174749091267586, "kl": 0.03057098388671875, "learning_rate": 1.0549429589726806e-06, "loss": 0.0012, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1789 }, { "completion_length": 703.25, "epoch": 1.9088, "grad_norm": 0.0007341207237914205, "kl": 0.03948211669921875, "learning_rate": 1.0531638695626813e-06, "loss": 0.0016, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1790 }, { "completion_length": 714.5625, "epoch": 1.9098666666666668, "grad_norm": 0.018321482464671135, "kl": 0.08687973022460938, "learning_rate": 1.0513854696781531e-06, "loss": 0.0035, "reward": 0.34375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 1791 }, { "completion_length": 496.59375, "epoch": 1.9109333333333334, "grad_norm": 0.015493631362915039, "kl": 0.030487060546875, "learning_rate": 1.0496077620633935e-06, "loss": 0.0012, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1792 }, { "completion_length": 618.4375, "epoch": 1.912, "grad_norm": 0.0119639215990901, "kl": 0.041599273681640625, "learning_rate": 1.047830749461634e-06, "loss": 0.0017, "reward": 0.46875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1793 }, { "completion_length": 654.09375, "epoch": 1.9130666666666667, "grad_norm": 0.0017514241626486182, "kl": 0.10969352722167969, "learning_rate": 1.0460544346150335e-06, "loss": 0.0044, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1794 }, { "completion_length": 765.84375, "epoch": 1.9141333333333335, "grad_norm": 0.004961746279150248, "kl": 0.04263877868652344, "learning_rate": 1.0442788202646734e-06, "loss": 0.0017, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1795 }, { "completion_length": 685.125, "epoch": 1.9152, "grad_norm": 0.011909143067896366, "kl": 0.08194351196289062, "learning_rate": 1.0425039091505536e-06, "loss": 0.0033, "reward": 0.53125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1796 }, { "completion_length": 483.96875, "epoch": 1.9162666666666666, "grad_norm": 0.017328748479485512, "kl": 0.034496307373046875, "learning_rate": 1.040729704011591e-06, "loss": 0.0014, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 1797 }, { "completion_length": 498.625, "epoch": 1.9173333333333333, "grad_norm": 0.024165945127606392, "kl": 0.00821685791015625, "learning_rate": 1.0389562075856108e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1798 }, { "completion_length": 538.5625, "epoch": 1.9184, "grad_norm": 0.00045364705147221684, "kl": 0.022830963134765625, "learning_rate": 1.0371834226093455e-06, "loss": 0.0009, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1799 }, { "completion_length": 606.0625, "epoch": 1.9194666666666667, "grad_norm": 0.0167265422642231, "kl": 0.059848785400390625, "learning_rate": 1.0354113518184304e-06, "loss": 0.0024, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1800 }, { "completion_length": 554.90625, "epoch": 1.9205333333333332, "grad_norm": 0.027906320989131927, "kl": 0.0291748046875, "learning_rate": 1.0336399979473973e-06, "loss": 0.0012, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1801 }, { "completion_length": 559.15625, "epoch": 1.9216, "grad_norm": 0.020552046597003937, "kl": 0.06573486328125, "learning_rate": 1.031869363729673e-06, "loss": 0.0026, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1802 }, { "completion_length": 474.125, "epoch": 1.9226666666666667, "grad_norm": 0.01079515926539898, "kl": 0.05089569091796875, "learning_rate": 1.0300994518975732e-06, "loss": 0.002, "reward": 0.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 1803 }, { "completion_length": 617.78125, "epoch": 1.9237333333333333, "grad_norm": 0.01145075261592865, "kl": 0.05829620361328125, "learning_rate": 1.0283302651822982e-06, "loss": 0.0023, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1804 }, { "completion_length": 598.59375, "epoch": 1.9247999999999998, "grad_norm": 0.02685941383242607, "kl": 0.09052658081054688, "learning_rate": 1.026561806313931e-06, "loss": 0.0036, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1805 }, { "completion_length": 707.15625, "epoch": 1.9258666666666666, "grad_norm": 0.009653367102146149, "kl": 0.03741455078125, "learning_rate": 1.0247940780214302e-06, "loss": 0.0015, "reward": 0.5, "reward_std": 0.25, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1806 }, { "completion_length": 768.15625, "epoch": 1.9269333333333334, "grad_norm": 0.012921427376568317, "kl": 0.05721855163574219, "learning_rate": 1.0230270830326267e-06, "loss": 0.0023, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1807 }, { "completion_length": 582.03125, "epoch": 1.928, "grad_norm": 0.0008199686417356133, "kl": 0.06005859375, "learning_rate": 1.021260824074221e-06, "loss": 0.0024, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1808 }, { "completion_length": 656.4375, "epoch": 1.9290666666666667, "grad_norm": 0.013128257356584072, "kl": 0.06781768798828125, "learning_rate": 1.0194953038717773e-06, "loss": 0.0027, "reward": 0.4375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1809 }, { "completion_length": 653.78125, "epoch": 1.9301333333333335, "grad_norm": 0.013504628092050552, "kl": 0.043659210205078125, "learning_rate": 1.0177305251497202e-06, "loss": 0.0017, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1810 }, { "completion_length": 640.90625, "epoch": 1.9312, "grad_norm": 0.01668492704629898, "kl": 0.0759735107421875, "learning_rate": 1.0159664906313285e-06, "loss": 0.003, "reward": 0.4375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1811 }, { "completion_length": 659.65625, "epoch": 1.9322666666666666, "grad_norm": 0.01658673770725727, "kl": 0.05279541015625, "learning_rate": 1.0142032030387342e-06, "loss": 0.0021, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1812 }, { "completion_length": 605.96875, "epoch": 1.9333333333333333, "grad_norm": 0.011013980954885483, "kl": 0.0063495635986328125, "learning_rate": 1.012440665092917e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1813 }, { "completion_length": 539.28125, "epoch": 1.9344000000000001, "grad_norm": 0.0008573230588808656, "kl": 0.033885955810546875, "learning_rate": 1.0106788795136984e-06, "loss": 0.0014, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 1814 }, { "completion_length": 625.875, "epoch": 1.9354666666666667, "grad_norm": 0.025199277326464653, "kl": 0.06131744384765625, "learning_rate": 1.008917849019739e-06, "loss": 0.0025, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1815 }, { "completion_length": 644.46875, "epoch": 1.9365333333333332, "grad_norm": 0.012009568512439728, "kl": 0.08676528930664062, "learning_rate": 1.0071575763285363e-06, "loss": 0.0035, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1816 }, { "completion_length": 523.125, "epoch": 1.9376, "grad_norm": 0.0007379131857305765, "kl": 0.04244232177734375, "learning_rate": 1.0053980641564155e-06, "loss": 0.0017, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1817 }, { "completion_length": 564.75, "epoch": 1.9386666666666668, "grad_norm": 0.017814045771956444, "kl": 0.017848968505859375, "learning_rate": 1.0036393152185294e-06, "loss": 0.0007, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1818 }, { "completion_length": 586.96875, "epoch": 1.9397333333333333, "grad_norm": 0.014063499867916107, "kl": 0.0723114013671875, "learning_rate": 1.001881332228855e-06, "loss": 0.0029, "reward": 0.65625, "reward_std": 0.35117512941360474, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1819 }, { "completion_length": 820.03125, "epoch": 1.9407999999999999, "grad_norm": 0.01547823566943407, "kl": 0.07358932495117188, "learning_rate": 1.0001241179001837e-06, "loss": 0.0029, "reward": 0.4375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1820 }, { "completion_length": 496.28125, "epoch": 1.9418666666666666, "grad_norm": 0.015017380937933922, "kl": 0.03594970703125, "learning_rate": 9.983676749441236e-07, "loss": 0.0014, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1821 }, { "completion_length": 788.5, "epoch": 1.9429333333333334, "grad_norm": 0.017919030040502548, "kl": 0.047908782958984375, "learning_rate": 9.966120060710915e-07, "loss": 0.0019, "reward": 0.40625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 1822 }, { "completion_length": 700.40625, "epoch": 1.944, "grad_norm": 0.00038410077104344964, "kl": 0.06400299072265625, "learning_rate": 9.94857113990309e-07, "loss": 0.0026, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1823 }, { "completion_length": 710.8125, "epoch": 1.9450666666666667, "grad_norm": 0.011783714406192303, "kl": 0.039371490478515625, "learning_rate": 9.931030014098005e-07, "loss": 0.0016, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1824 }, { "completion_length": 600.65625, "epoch": 1.9461333333333335, "grad_norm": 0.017887284979224205, "kl": 0.08120346069335938, "learning_rate": 9.913496710363866e-07, "loss": 0.0033, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1825 }, { "completion_length": 757.59375, "epoch": 1.9472, "grad_norm": 0.019361184909939766, "kl": 0.07088851928710938, "learning_rate": 9.895971255756803e-07, "loss": 0.0028, "reward": 0.53125, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1826 }, { "completion_length": 598.9375, "epoch": 1.9482666666666666, "grad_norm": 0.0007641441188752651, "kl": 0.02878570556640625, "learning_rate": 9.878453677320847e-07, "loss": 0.0012, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1827 }, { "completion_length": 686.6875, "epoch": 1.9493333333333334, "grad_norm": 0.011512529104948044, "kl": 0.04161834716796875, "learning_rate": 9.86094400208787e-07, "loss": 0.0017, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1828 }, { "completion_length": 570.09375, "epoch": 1.9504000000000001, "grad_norm": 0.0024719766806811094, "kl": 0.06899261474609375, "learning_rate": 9.843442257077541e-07, "loss": 0.0028, "reward": 0.53125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1829 }, { "completion_length": 577.25, "epoch": 1.9514666666666667, "grad_norm": 0.015315888449549675, "kl": 0.06580734252929688, "learning_rate": 9.825948469297303e-07, "loss": 0.0026, "reward": 0.6875, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1830 }, { "completion_length": 636.625, "epoch": 1.9525333333333332, "grad_norm": 0.007930384948849678, "kl": 0.0712890625, "learning_rate": 9.808462665742313e-07, "loss": 0.0029, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1831 }, { "completion_length": 522.71875, "epoch": 1.9536, "grad_norm": 0.010578946210443974, "kl": 0.02800750732421875, "learning_rate": 9.790984873395406e-07, "loss": 0.0011, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1832 }, { "completion_length": 656.125, "epoch": 1.9546666666666668, "grad_norm": 0.020463261753320694, "kl": 0.053531646728515625, "learning_rate": 9.77351511922706e-07, "loss": 0.0021, "reward": 0.375, "reward_std": 0.25, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 1833 }, { "completion_length": 559.0625, "epoch": 1.9557333333333333, "grad_norm": 0.0003530346730258316, "kl": 0.0602264404296875, "learning_rate": 9.756053430195354e-07, "loss": 0.0024, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1834 }, { "completion_length": 519.71875, "epoch": 1.9567999999999999, "grad_norm": 0.0032982397824525833, "kl": 0.0463409423828125, "learning_rate": 9.738599833245897e-07, "loss": 0.0019, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1835 }, { "completion_length": 596.34375, "epoch": 1.9578666666666666, "grad_norm": 0.013926470652222633, "kl": 0.08045196533203125, "learning_rate": 9.721154355311845e-07, "loss": 0.0032, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1836 }, { "completion_length": 714.90625, "epoch": 1.9589333333333334, "grad_norm": 0.012716345489025116, "kl": 0.10064697265625, "learning_rate": 9.703717023313802e-07, "loss": 0.004, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1837 }, { "completion_length": 723.59375, "epoch": 1.96, "grad_norm": 0.0013026215601712465, "kl": 0.08035659790039062, "learning_rate": 9.686287864159802e-07, "loss": 0.0032, "reward": 0.375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 1838 }, { "completion_length": 637.34375, "epoch": 1.9610666666666665, "grad_norm": 0.00045026023872196674, "kl": 0.0244140625, "learning_rate": 9.668866904745284e-07, "loss": 0.001, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1839 }, { "completion_length": 602.53125, "epoch": 1.9621333333333333, "grad_norm": 0.0011365619720891118, "kl": 0.08477783203125, "learning_rate": 9.651454171953012e-07, "loss": 0.0034, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1840 }, { "completion_length": 753.5, "epoch": 1.9632, "grad_norm": 0.002974217524752021, "kl": 0.0331878662109375, "learning_rate": 9.634049692653084e-07, "loss": 0.0013, "reward": 0.375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 1841 }, { "completion_length": 539.78125, "epoch": 1.9642666666666666, "grad_norm": 0.0015304103726521134, "kl": 0.0514068603515625, "learning_rate": 9.616653493702824e-07, "loss": 0.0021, "reward": 0.53125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1842 }, { "completion_length": 677.1875, "epoch": 1.9653333333333334, "grad_norm": 0.0004019858024548739, "kl": 0.04547119140625, "learning_rate": 9.599265601946811e-07, "loss": 0.0018, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1843 }, { "completion_length": 622.15625, "epoch": 1.9664000000000001, "grad_norm": 0.010283052921295166, "kl": 0.095489501953125, "learning_rate": 9.581886044216792e-07, "loss": 0.0038, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1844 }, { "completion_length": 447.46875, "epoch": 1.9674666666666667, "grad_norm": 0.008148681372404099, "kl": 0.02605438232421875, "learning_rate": 9.564514847331647e-07, "loss": 0.001, "reward": 0.90625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 1845 }, { "completion_length": 554.96875, "epoch": 1.9685333333333332, "grad_norm": 0.0006670691072940826, "kl": 0.0608673095703125, "learning_rate": 9.547152038097367e-07, "loss": 0.0024, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1846 }, { "completion_length": 740.21875, "epoch": 1.9696, "grad_norm": 0.013874493539333344, "kl": 0.1169891357421875, "learning_rate": 9.529797643306994e-07, "loss": 0.0047, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1847 }, { "completion_length": 633.25, "epoch": 1.9706666666666668, "grad_norm": 0.022534674033522606, "kl": 0.12578201293945312, "learning_rate": 9.512451689740579e-07, "loss": 0.005, "reward": 0.46875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1848 }, { "completion_length": 686.34375, "epoch": 1.9717333333333333, "grad_norm": 0.012623815797269344, "kl": 0.04884147644042969, "learning_rate": 9.495114204165162e-07, "loss": 0.002, "reward": 0.4375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1849 }, { "completion_length": 549.59375, "epoch": 1.9727999999999999, "grad_norm": 0.01708534173667431, "kl": 0.040920257568359375, "learning_rate": 9.477785213334707e-07, "loss": 0.0016, "reward": 0.75, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1850 }, { "completion_length": 585.0, "epoch": 1.9738666666666667, "grad_norm": 0.0214844923466444, "kl": 0.037120819091796875, "learning_rate": 9.460464743990059e-07, "loss": 0.0015, "reward": 0.875, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 1851 }, { "completion_length": 616.25, "epoch": 1.9749333333333334, "grad_norm": 0.0005504073924385011, "kl": 0.06568145751953125, "learning_rate": 9.443152822858937e-07, "loss": 0.0026, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1852 }, { "completion_length": 596.8125, "epoch": 1.976, "grad_norm": 0.01192399114370346, "kl": 0.06621360778808594, "learning_rate": 9.425849476655853e-07, "loss": 0.0026, "reward": 0.625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1853 }, { "completion_length": 605.5, "epoch": 1.9770666666666665, "grad_norm": 0.013574478216469288, "kl": 0.1439208984375, "learning_rate": 9.40855473208208e-07, "loss": 0.0057, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1854 }, { "completion_length": 645.6875, "epoch": 1.9781333333333333, "grad_norm": 0.08115159720182419, "kl": 0.06628799438476562, "learning_rate": 9.391268615825639e-07, "loss": 0.0027, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1855 }, { "completion_length": 696.1875, "epoch": 1.9792, "grad_norm": 0.011403839103877544, "kl": 0.08643341064453125, "learning_rate": 9.373991154561226e-07, "loss": 0.0035, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1856 }, { "completion_length": 672.46875, "epoch": 1.9802666666666666, "grad_norm": 0.0237040426582098, "kl": 0.0655059814453125, "learning_rate": 9.356722374950166e-07, "loss": 0.0026, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1857 }, { "completion_length": 781.09375, "epoch": 1.9813333333333332, "grad_norm": 0.00034571526339277625, "kl": 0.12713623046875, "learning_rate": 9.339462303640414e-07, "loss": 0.0051, "reward": 0.375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 1858 }, { "completion_length": 723.28125, "epoch": 1.9824000000000002, "grad_norm": 0.041074853390455246, "kl": 0.020969390869140625, "learning_rate": 9.322210967266476e-07, "loss": 0.0008, "reward": 0.46875, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1859 }, { "completion_length": 536.0625, "epoch": 1.9834666666666667, "grad_norm": 0.009406054392457008, "kl": 0.034759521484375, "learning_rate": 9.304968392449361e-07, "loss": 0.0014, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1860 }, { "completion_length": 600.5, "epoch": 1.9845333333333333, "grad_norm": 0.016075588762760162, "kl": 0.057949066162109375, "learning_rate": 9.287734605796588e-07, "loss": 0.0023, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1861 }, { "completion_length": 597.6875, "epoch": 1.9856, "grad_norm": 0.0007771374075673521, "kl": 0.07257080078125, "learning_rate": 9.270509633902102e-07, "loss": 0.0029, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1862 }, { "completion_length": 707.15625, "epoch": 1.9866666666666668, "grad_norm": 0.0006995261064730585, "kl": 0.09046554565429688, "learning_rate": 9.253293503346238e-07, "loss": 0.0036, "reward": 0.28125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 1863 }, { "completion_length": 656.375, "epoch": 1.9877333333333334, "grad_norm": 0.01904395967721939, "kl": 0.09235763549804688, "learning_rate": 9.236086240695689e-07, "loss": 0.0037, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1864 }, { "completion_length": 680.1875, "epoch": 1.9888, "grad_norm": 0.0017708048690110445, "kl": 0.060024261474609375, "learning_rate": 9.218887872503483e-07, "loss": 0.0024, "reward": 0.46875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1865 }, { "completion_length": 590.40625, "epoch": 1.9898666666666667, "grad_norm": 0.011493252590298653, "kl": 0.10119247436523438, "learning_rate": 9.201698425308896e-07, "loss": 0.004, "reward": 0.40625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 1866 }, { "completion_length": 691.125, "epoch": 1.9909333333333334, "grad_norm": 0.017861371859908104, "kl": 0.07587051391601562, "learning_rate": 9.184517925637452e-07, "loss": 0.003, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1867 }, { "completion_length": 704.96875, "epoch": 1.992, "grad_norm": 0.0068989284336566925, "kl": 0.08277511596679688, "learning_rate": 9.16734640000087e-07, "loss": 0.0033, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1868 }, { "completion_length": 564.5, "epoch": 1.9930666666666665, "grad_norm": 0.0008814999018795788, "kl": 0.1099395751953125, "learning_rate": 9.150183874897021e-07, "loss": 0.0044, "reward": 0.59375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1869 }, { "completion_length": 734.125, "epoch": 1.9941333333333333, "grad_norm": 0.020851830020546913, "kl": 0.1020965576171875, "learning_rate": 9.133030376809868e-07, "loss": 0.0041, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1870 }, { "completion_length": 685.96875, "epoch": 1.9952, "grad_norm": 0.009143313392996788, "kl": 0.06944656372070312, "learning_rate": 9.115885932209473e-07, "loss": 0.0028, "reward": 0.53125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1871 }, { "completion_length": 728.125, "epoch": 1.9962666666666666, "grad_norm": 0.027508912608027458, "kl": 0.09056663513183594, "learning_rate": 9.098750567551911e-07, "loss": 0.0036, "reward": 0.625, "reward_std": 0.41367512941360474, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1872 }, { "completion_length": 598.4375, "epoch": 1.9973333333333332, "grad_norm": 0.010616715997457504, "kl": 0.11339950561523438, "learning_rate": 9.08162430927924e-07, "loss": 0.0045, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1873 }, { "completion_length": 711.125, "epoch": 1.9984, "grad_norm": 0.017920054495334625, "kl": 0.1031036376953125, "learning_rate": 9.064507183819479e-07, "loss": 0.0041, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1874 }, { "completion_length": 635.4375, "epoch": 1.9994666666666667, "grad_norm": 0.02002818137407303, "kl": 0.07929611206054688, "learning_rate": 9.047399217586552e-07, "loss": 0.0032, "reward": 0.40625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 1875 }, { "completion_length": 632.125, "epoch": 2.0, "grad_norm": 0.0017487535951659083, "kl": 0.0802154541015625, "learning_rate": 9.030300436980236e-07, "loss": 0.0016, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1876 }, { "completion_length": 700.125, "epoch": 2.0010666666666665, "grad_norm": 0.019508857280015945, "kl": 0.029308319091796875, "learning_rate": 9.013210868386152e-07, "loss": 0.0012, "reward": 0.375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 1877 }, { "completion_length": 604.40625, "epoch": 2.0021333333333335, "grad_norm": 0.0013525551185011864, "kl": 0.06978607177734375, "learning_rate": 8.996130538175697e-07, "loss": 0.0028, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1878 }, { "completion_length": 657.125, "epoch": 2.0032, "grad_norm": 0.0008275755681097507, "kl": 0.0572052001953125, "learning_rate": 8.979059472706e-07, "loss": 0.0023, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1879 }, { "completion_length": 538.96875, "epoch": 2.0042666666666666, "grad_norm": 0.0006514614797197282, "kl": 0.0378875732421875, "learning_rate": 8.961997698319912e-07, "loss": 0.0015, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1880 }, { "completion_length": 660.34375, "epoch": 2.005333333333333, "grad_norm": 0.0005554803065024316, "kl": 0.024454116821289062, "learning_rate": 8.944945241345953e-07, "loss": 0.001, "reward": 0.59375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1881 }, { "completion_length": 465.90625, "epoch": 2.0064, "grad_norm": 0.0007363279582932591, "kl": 0.08020782470703125, "learning_rate": 8.927902128098226e-07, "loss": 0.0032, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1882 }, { "completion_length": 525.46875, "epoch": 2.0074666666666667, "grad_norm": 0.022320540621876717, "kl": 0.06804656982421875, "learning_rate": 8.910868384876455e-07, "loss": 0.0027, "reward": 0.78125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1883 }, { "completion_length": 679.5625, "epoch": 2.0085333333333333, "grad_norm": 0.013154823333024979, "kl": 0.10529136657714844, "learning_rate": 8.893844037965898e-07, "loss": 0.0042, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1884 }, { "completion_length": 622.4375, "epoch": 2.0096, "grad_norm": 0.015815800055861473, "kl": 0.09219741821289062, "learning_rate": 8.876829113637291e-07, "loss": 0.0037, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1885 }, { "completion_length": 551.5625, "epoch": 2.010666666666667, "grad_norm": 0.0016727399779483676, "kl": 0.0676422119140625, "learning_rate": 8.859823638146852e-07, "loss": 0.0027, "reward": 0.65625, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1886 }, { "completion_length": 710.9375, "epoch": 2.0117333333333334, "grad_norm": 0.007285389583557844, "kl": 0.091217041015625, "learning_rate": 8.842827637736218e-07, "loss": 0.0036, "reward": 0.375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 1887 }, { "completion_length": 593.5625, "epoch": 2.0128, "grad_norm": 0.03253095597028732, "kl": 0.14810562133789062, "learning_rate": 8.825841138632387e-07, "loss": 0.0059, "reward": 0.59375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1888 }, { "completion_length": 674.875, "epoch": 2.0138666666666665, "grad_norm": 0.012050745077431202, "kl": 0.07525253295898438, "learning_rate": 8.808864167047707e-07, "loss": 0.003, "reward": 0.53125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1889 }, { "completion_length": 581.90625, "epoch": 2.0149333333333335, "grad_norm": 0.010828932747244835, "kl": 0.02248382568359375, "learning_rate": 8.791896749179831e-07, "loss": 0.0009, "reward": 0.75, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1890 }, { "completion_length": 592.53125, "epoch": 2.016, "grad_norm": 0.004982750862836838, "kl": 0.12269973754882812, "learning_rate": 8.774938911211656e-07, "loss": 0.0049, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1891 }, { "completion_length": 527.90625, "epoch": 2.0170666666666666, "grad_norm": 0.018090875819325447, "kl": 0.0656585693359375, "learning_rate": 8.757990679311297e-07, "loss": 0.0026, "reward": 0.53125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1892 }, { "completion_length": 597.8125, "epoch": 2.018133333333333, "grad_norm": 0.0072383033111691475, "kl": 0.06668281555175781, "learning_rate": 8.741052079632063e-07, "loss": 0.0027, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1893 }, { "completion_length": 672.28125, "epoch": 2.0192, "grad_norm": 0.013639530166983604, "kl": 0.09104537963867188, "learning_rate": 8.724123138312368e-07, "loss": 0.0036, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1894 }, { "completion_length": 670.40625, "epoch": 2.0202666666666667, "grad_norm": 0.017494400963187218, "kl": 0.09602737426757812, "learning_rate": 8.707203881475757e-07, "loss": 0.0038, "reward": 0.59375, "reward_std": 0.35117512941360474, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1895 }, { "completion_length": 708.9375, "epoch": 2.021333333333333, "grad_norm": 0.018665403127670288, "kl": 0.1009368896484375, "learning_rate": 8.690294335230808e-07, "loss": 0.004, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1896 }, { "completion_length": 562.625, "epoch": 2.0224, "grad_norm": 0.02404053695499897, "kl": 0.07282066345214844, "learning_rate": 8.673394525671116e-07, "loss": 0.0029, "reward": 0.75, "reward_std": 0.375, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1897 }, { "completion_length": 743.875, "epoch": 2.0234666666666667, "grad_norm": 0.013052621856331825, "kl": 0.13982009887695312, "learning_rate": 8.65650447887526e-07, "loss": 0.0056, "reward": 0.28125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 1898 }, { "completion_length": 579.71875, "epoch": 2.0245333333333333, "grad_norm": 0.006074479781091213, "kl": 0.0793304443359375, "learning_rate": 8.639624220906747e-07, "loss": 0.0032, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1899 }, { "completion_length": 536.40625, "epoch": 2.0256, "grad_norm": 0.0030716941691935062, "kl": 0.0910491943359375, "learning_rate": 8.62275377781398e-07, "loss": 0.0036, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1900 }, { "completion_length": 700.21875, "epoch": 2.026666666666667, "grad_norm": 0.018662065267562866, "kl": 0.061664581298828125, "learning_rate": 8.605893175630218e-07, "loss": 0.0025, "reward": 0.6875, "reward_std": 0.4471687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1901 }, { "completion_length": 568.0625, "epoch": 2.0277333333333334, "grad_norm": 0.0012128077214583755, "kl": 0.10311698913574219, "learning_rate": 8.589042440373532e-07, "loss": 0.0041, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1902 }, { "completion_length": 670.875, "epoch": 2.0288, "grad_norm": 0.023221664130687714, "kl": 0.06980133056640625, "learning_rate": 8.572201598046768e-07, "loss": 0.0028, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1903 }, { "completion_length": 613.15625, "epoch": 2.0298666666666665, "grad_norm": 0.017150286585092545, "kl": 0.031291961669921875, "learning_rate": 8.555370674637509e-07, "loss": 0.0013, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1904 }, { "completion_length": 607.9375, "epoch": 2.0309333333333335, "grad_norm": 0.01606287620961666, "kl": 0.036876678466796875, "learning_rate": 8.538549696118023e-07, "loss": 0.0015, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1905 }, { "completion_length": 633.71875, "epoch": 2.032, "grad_norm": 0.023224404081702232, "kl": 0.053783416748046875, "learning_rate": 8.521738688445243e-07, "loss": 0.0022, "reward": 0.625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1906 }, { "completion_length": 566.78125, "epoch": 2.0330666666666666, "grad_norm": 0.002702908357605338, "kl": 0.057598114013671875, "learning_rate": 8.504937677560708e-07, "loss": 0.0023, "reward": 0.59375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1907 }, { "completion_length": 603.90625, "epoch": 2.034133333333333, "grad_norm": 0.002406406681984663, "kl": 0.0517730712890625, "learning_rate": 8.488146689390535e-07, "loss": 0.0021, "reward": 0.59375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1908 }, { "completion_length": 603.09375, "epoch": 2.0352, "grad_norm": 0.015284639783203602, "kl": 0.12369537353515625, "learning_rate": 8.47136574984537e-07, "loss": 0.005, "reward": 0.65625, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1909 }, { "completion_length": 676.375, "epoch": 2.0362666666666667, "grad_norm": 0.017549579963088036, "kl": 0.1283721923828125, "learning_rate": 8.45459488482036e-07, "loss": 0.0051, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1910 }, { "completion_length": 540.09375, "epoch": 2.037333333333333, "grad_norm": 0.1195211336016655, "kl": 0.09212875366210938, "learning_rate": 8.437834120195094e-07, "loss": 0.0037, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1911 }, { "completion_length": 744.5, "epoch": 2.0384, "grad_norm": 0.01717277802526951, "kl": 0.08430862426757812, "learning_rate": 8.421083481833588e-07, "loss": 0.0034, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1912 }, { "completion_length": 534.09375, "epoch": 2.0394666666666668, "grad_norm": 0.021539650857448578, "kl": 0.058940887451171875, "learning_rate": 8.404342995584222e-07, "loss": 0.0024, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1913 }, { "completion_length": 661.8125, "epoch": 2.0405333333333333, "grad_norm": 0.0008125397725962102, "kl": 0.020458221435546875, "learning_rate": 8.387612687279718e-07, "loss": 0.0008, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1914 }, { "completion_length": 699.15625, "epoch": 2.0416, "grad_norm": 0.024961626157164574, "kl": 0.2485485076904297, "learning_rate": 8.370892582737083e-07, "loss": 0.0099, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1915 }, { "completion_length": 576.625, "epoch": 2.042666666666667, "grad_norm": 0.01662103831768036, "kl": 0.052745819091796875, "learning_rate": 8.354182707757588e-07, "loss": 0.0021, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1916 }, { "completion_length": 578.28125, "epoch": 2.0437333333333334, "grad_norm": 0.024270175024867058, "kl": 0.046688079833984375, "learning_rate": 8.337483088126709e-07, "loss": 0.0019, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1917 }, { "completion_length": 533.3125, "epoch": 2.0448, "grad_norm": 0.0016245418228209019, "kl": 0.024509429931640625, "learning_rate": 8.320793749614104e-07, "loss": 0.001, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1918 }, { "completion_length": 752.28125, "epoch": 2.0458666666666665, "grad_norm": 0.021360954269766808, "kl": 0.12295341491699219, "learning_rate": 8.304114717973564e-07, "loss": 0.0049, "reward": 0.40625, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 1919 }, { "completion_length": 620.21875, "epoch": 2.0469333333333335, "grad_norm": 0.018049776554107666, "kl": 0.074798583984375, "learning_rate": 8.287446018942973e-07, "loss": 0.003, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1920 }, { "completion_length": 470.84375, "epoch": 2.048, "grad_norm": 0.0032431057188659906, "kl": 0.0432891845703125, "learning_rate": 8.270787678244272e-07, "loss": 0.0017, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 1921 }, { "completion_length": 638.46875, "epoch": 2.0490666666666666, "grad_norm": 0.016530051827430725, "kl": 0.053741455078125, "learning_rate": 8.254139721583417e-07, "loss": 0.0021, "reward": 0.40625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 1922 }, { "completion_length": 609.28125, "epoch": 2.050133333333333, "grad_norm": 0.01946486346423626, "kl": 0.05924224853515625, "learning_rate": 8.237502174650336e-07, "loss": 0.0024, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1923 }, { "completion_length": 603.9375, "epoch": 2.0512, "grad_norm": 0.001764497603289783, "kl": 0.010011672973632812, "learning_rate": 8.220875063118915e-07, "loss": 0.0004, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1924 }, { "completion_length": 596.875, "epoch": 2.0522666666666667, "grad_norm": 0.0137149877846241, "kl": 0.028961181640625, "learning_rate": 8.204258412646903e-07, "loss": 0.0012, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1925 }, { "completion_length": 646.5625, "epoch": 2.0533333333333332, "grad_norm": 0.011271415278315544, "kl": 0.0401611328125, "learning_rate": 8.187652248875924e-07, "loss": 0.0016, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1926 }, { "completion_length": 565.375, "epoch": 2.0544, "grad_norm": 0.00982715655118227, "kl": 0.06828689575195312, "learning_rate": 8.171056597431432e-07, "loss": 0.0027, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1927 }, { "completion_length": 564.3125, "epoch": 2.0554666666666668, "grad_norm": 0.016189567744731903, "kl": 0.020843505859375, "learning_rate": 8.15447148392264e-07, "loss": 0.0008, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1928 }, { "completion_length": 611.0, "epoch": 2.0565333333333333, "grad_norm": 0.018176957964897156, "kl": 0.07663345336914062, "learning_rate": 8.137896933942495e-07, "loss": 0.0031, "reward": 0.5, "reward_std": 0.375, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1929 }, { "completion_length": 630.75, "epoch": 2.0576, "grad_norm": 0.015364163555204868, "kl": 0.0448150634765625, "learning_rate": 8.121332973067666e-07, "loss": 0.0018, "reward": 0.65625, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1930 }, { "completion_length": 548.6875, "epoch": 2.058666666666667, "grad_norm": 0.024108892306685448, "kl": 0.03952789306640625, "learning_rate": 8.104779626858475e-07, "loss": 0.0016, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1931 }, { "completion_length": 625.1875, "epoch": 2.0597333333333334, "grad_norm": 0.028107309713959694, "kl": 0.0886077880859375, "learning_rate": 8.088236920858835e-07, "loss": 0.0036, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1932 }, { "completion_length": 711.96875, "epoch": 2.0608, "grad_norm": 0.019590385258197784, "kl": 0.09848785400390625, "learning_rate": 8.071704880596285e-07, "loss": 0.0039, "reward": 0.59375, "reward_std": 0.47617512941360474, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1933 }, { "completion_length": 618.40625, "epoch": 2.0618666666666665, "grad_norm": 0.06415349245071411, "kl": 0.08887481689453125, "learning_rate": 8.055183531581884e-07, "loss": 0.0036, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1934 }, { "completion_length": 528.78125, "epoch": 2.0629333333333335, "grad_norm": 0.02002379484474659, "kl": 0.061183929443359375, "learning_rate": 8.038672899310176e-07, "loss": 0.0024, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1935 }, { "completion_length": 642.71875, "epoch": 2.064, "grad_norm": 0.017971497029066086, "kl": 0.03209877014160156, "learning_rate": 8.022173009259199e-07, "loss": 0.0013, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1936 }, { "completion_length": 603.96875, "epoch": 2.0650666666666666, "grad_norm": 0.01857435517013073, "kl": 0.0549774169921875, "learning_rate": 8.005683886890402e-07, "loss": 0.0022, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1937 }, { "completion_length": 580.40625, "epoch": 2.066133333333333, "grad_norm": 0.005393706727772951, "kl": 0.10495758056640625, "learning_rate": 7.989205557648598e-07, "loss": 0.0042, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1938 }, { "completion_length": 628.625, "epoch": 2.0672, "grad_norm": 0.0016738245030865073, "kl": 0.041316986083984375, "learning_rate": 7.97273804696198e-07, "loss": 0.0016, "reward": 0.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1939 }, { "completion_length": 538.1875, "epoch": 2.0682666666666667, "grad_norm": 0.02729238197207451, "kl": 0.029520034790039062, "learning_rate": 7.95628138024203e-07, "loss": 0.0012, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1940 }, { "completion_length": 714.0, "epoch": 2.0693333333333332, "grad_norm": 0.0009982845513150096, "kl": 0.060199737548828125, "learning_rate": 7.939835582883478e-07, "loss": 0.0024, "reward": 0.375, "reward_std": 0.125, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 1941 }, { "completion_length": 499.09375, "epoch": 2.0704, "grad_norm": 0.01719576120376587, "kl": 0.038066864013671875, "learning_rate": 7.923400680264318e-07, "loss": 0.0015, "reward": 0.78125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1942 }, { "completion_length": 605.59375, "epoch": 2.071466666666667, "grad_norm": 0.013058249838650227, "kl": 0.09356307983398438, "learning_rate": 7.906976697745706e-07, "loss": 0.0037, "reward": 0.78125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1943 }, { "completion_length": 598.96875, "epoch": 2.0725333333333333, "grad_norm": 0.002982640638947487, "kl": 0.031475067138671875, "learning_rate": 7.890563660671952e-07, "loss": 0.0013, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 1944 }, { "completion_length": 565.8125, "epoch": 2.0736, "grad_norm": 0.0009026309126056731, "kl": 0.016021728515625, "learning_rate": 7.874161594370482e-07, "loss": 0.0006, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1945 }, { "completion_length": 679.90625, "epoch": 2.074666666666667, "grad_norm": 0.010146631859242916, "kl": 0.057300567626953125, "learning_rate": 7.857770524151785e-07, "loss": 0.0023, "reward": 0.34375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 1946 }, { "completion_length": 633.53125, "epoch": 2.0757333333333334, "grad_norm": 0.012399670667946339, "kl": 0.09084320068359375, "learning_rate": 7.841390475309386e-07, "loss": 0.0036, "reward": 0.5625, "reward_std": 0.3608439117670059, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1947 }, { "completion_length": 545.0625, "epoch": 2.0768, "grad_norm": 0.014547216705977917, "kl": 0.08000946044921875, "learning_rate": 7.825021473119803e-07, "loss": 0.0032, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1948 }, { "completion_length": 601.875, "epoch": 2.0778666666666665, "grad_norm": 0.02396492101252079, "kl": 0.0849151611328125, "learning_rate": 7.808663542842504e-07, "loss": 0.0034, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1949 }, { "completion_length": 610.6875, "epoch": 2.0789333333333335, "grad_norm": 0.016419852152466774, "kl": 0.05005645751953125, "learning_rate": 7.792316709719875e-07, "loss": 0.002, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 1950 }, { "completion_length": 625.96875, "epoch": 2.08, "grad_norm": 0.021005641669034958, "kl": 0.107574462890625, "learning_rate": 7.775980998977175e-07, "loss": 0.0043, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1951 }, { "completion_length": 663.34375, "epoch": 2.0810666666666666, "grad_norm": 0.017854899168014526, "kl": 0.08767318725585938, "learning_rate": 7.759656435822504e-07, "loss": 0.0035, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1952 }, { "completion_length": 576.53125, "epoch": 2.082133333333333, "grad_norm": 0.015729812905192375, "kl": 0.063507080078125, "learning_rate": 7.743343045446756e-07, "loss": 0.0025, "reward": 0.375, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 1953 }, { "completion_length": 526.15625, "epoch": 2.0832, "grad_norm": 0.0009508842485956848, "kl": 0.0368194580078125, "learning_rate": 7.727040853023583e-07, "loss": 0.0015, "reward": 0.84375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1954 }, { "completion_length": 645.625, "epoch": 2.0842666666666667, "grad_norm": 0.01026894524693489, "kl": 0.024105072021484375, "learning_rate": 7.710749883709362e-07, "loss": 0.001, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1955 }, { "completion_length": 632.0625, "epoch": 2.0853333333333333, "grad_norm": 0.0003338318783789873, "kl": 0.07326507568359375, "learning_rate": 7.694470162643147e-07, "loss": 0.0029, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1956 }, { "completion_length": 529.34375, "epoch": 2.0864, "grad_norm": 0.02250298298895359, "kl": 0.051815032958984375, "learning_rate": 7.678201714946636e-07, "loss": 0.0021, "reward": 0.75, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1957 }, { "completion_length": 614.1875, "epoch": 2.087466666666667, "grad_norm": 0.007156983949244022, "kl": 0.038677215576171875, "learning_rate": 7.661944565724131e-07, "loss": 0.0015, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 1958 }, { "completion_length": 552.875, "epoch": 2.0885333333333334, "grad_norm": 0.012430202215909958, "kl": 0.037197113037109375, "learning_rate": 7.6456987400625e-07, "loss": 0.0015, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1959 }, { "completion_length": 569.25, "epoch": 2.0896, "grad_norm": 0.019467774778604507, "kl": 0.0978240966796875, "learning_rate": 7.629464263031134e-07, "loss": 0.0039, "reward": 0.65625, "reward_std": 0.3125, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1960 }, { "completion_length": 540.5625, "epoch": 2.0906666666666665, "grad_norm": 0.011039895005524158, "kl": 0.048614501953125, "learning_rate": 7.613241159681909e-07, "loss": 0.0019, "reward": 0.59375, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1961 }, { "completion_length": 606.5, "epoch": 2.0917333333333334, "grad_norm": 0.01363005954772234, "kl": 0.08280181884765625, "learning_rate": 7.59702945504917e-07, "loss": 0.0033, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1962 }, { "completion_length": 582.8125, "epoch": 2.0928, "grad_norm": 0.012038967572152615, "kl": 0.07292938232421875, "learning_rate": 7.580829174149643e-07, "loss": 0.0029, "reward": 0.71875, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1963 }, { "completion_length": 504.5, "epoch": 2.0938666666666665, "grad_norm": 0.01814015582203865, "kl": 0.025407791137695312, "learning_rate": 7.564640341982439e-07, "loss": 0.001, "reward": 0.8125, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1964 }, { "completion_length": 623.21875, "epoch": 2.0949333333333335, "grad_norm": 0.01649683713912964, "kl": 0.036769866943359375, "learning_rate": 7.548462983529016e-07, "loss": 0.0015, "reward": 0.625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1965 }, { "completion_length": 718.25, "epoch": 2.096, "grad_norm": 0.01707681082189083, "kl": 0.046630859375, "learning_rate": 7.5322971237531e-07, "loss": 0.0019, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1966 }, { "completion_length": 495.25, "epoch": 2.0970666666666666, "grad_norm": 0.01047737430781126, "kl": 0.030704498291015625, "learning_rate": 7.516142787600684e-07, "loss": 0.0012, "reward": 0.90625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 1967 }, { "completion_length": 522.34375, "epoch": 2.098133333333333, "grad_norm": 0.007037690840661526, "kl": 0.025257110595703125, "learning_rate": 7.500000000000003e-07, "loss": 0.001, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 1968 }, { "completion_length": 640.0, "epoch": 2.0992, "grad_norm": 0.0018350585596635938, "kl": 0.007598876953125, "learning_rate": 7.483868785861427e-07, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1969 }, { "completion_length": 495.1875, "epoch": 2.1002666666666667, "grad_norm": 0.01706516183912754, "kl": 0.028766632080078125, "learning_rate": 7.467749170077489e-07, "loss": 0.0012, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1970 }, { "completion_length": 653.3125, "epoch": 2.1013333333333333, "grad_norm": 0.027117181569337845, "kl": 0.08440399169921875, "learning_rate": 7.451641177522844e-07, "loss": 0.0034, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 1971 }, { "completion_length": 631.625, "epoch": 2.1024, "grad_norm": 0.01408805325627327, "kl": 0.07710075378417969, "learning_rate": 7.435544833054173e-07, "loss": 0.0031, "reward": 0.75, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1972 }, { "completion_length": 823.21875, "epoch": 2.103466666666667, "grad_norm": 0.012404069304466248, "kl": 0.04499053955078125, "learning_rate": 7.419460161510197e-07, "loss": 0.0018, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1973 }, { "completion_length": 590.84375, "epoch": 2.1045333333333334, "grad_norm": 0.02353692427277565, "kl": 0.06691741943359375, "learning_rate": 7.40338718771165e-07, "loss": 0.0027, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1974 }, { "completion_length": 704.0, "epoch": 2.1056, "grad_norm": 0.00989667046815157, "kl": 0.06922531127929688, "learning_rate": 7.38732593646117e-07, "loss": 0.0028, "reward": 0.4375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 1975 }, { "completion_length": 707.46875, "epoch": 2.1066666666666665, "grad_norm": 0.012740718200802803, "kl": 0.12015533447265625, "learning_rate": 7.371276432543332e-07, "loss": 0.0048, "reward": 0.375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 1976 }, { "completion_length": 707.28125, "epoch": 2.1077333333333335, "grad_norm": 0.014468363486230373, "kl": 0.023418426513671875, "learning_rate": 7.355238700724594e-07, "loss": 0.0009, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1977 }, { "completion_length": 448.59375, "epoch": 2.1088, "grad_norm": 0.013929460197687149, "kl": 0.047977447509765625, "learning_rate": 7.339212765753219e-07, "loss": 0.0019, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1978 }, { "completion_length": 702.34375, "epoch": 2.1098666666666666, "grad_norm": 0.016519131138920784, "kl": 0.019775390625, "learning_rate": 7.323198652359278e-07, "loss": 0.0008, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1979 }, { "completion_length": 586.34375, "epoch": 2.1109333333333336, "grad_norm": 0.011201190762221813, "kl": 0.00962066650390625, "learning_rate": 7.307196385254621e-07, "loss": 0.0004, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 1980 }, { "completion_length": 623.46875, "epoch": 2.112, "grad_norm": 0.00043860776349902153, "kl": 0.048069000244140625, "learning_rate": 7.291205989132781e-07, "loss": 0.0019, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1981 }, { "completion_length": 495.84375, "epoch": 2.1130666666666666, "grad_norm": 0.0017920794198289514, "kl": 0.03320884704589844, "learning_rate": 7.275227488668991e-07, "loss": 0.0013, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 1982 }, { "completion_length": 420.28125, "epoch": 2.114133333333333, "grad_norm": 0.00048562398296780884, "kl": 0.016204833984375, "learning_rate": 7.259260908520137e-07, "loss": 0.0006, "reward": 0.875, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 1983 }, { "completion_length": 555.34375, "epoch": 2.1152, "grad_norm": 0.02023596502840519, "kl": 0.060848236083984375, "learning_rate": 7.243306273324697e-07, "loss": 0.0024, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1984 }, { "completion_length": 670.25, "epoch": 2.1162666666666667, "grad_norm": 0.01335116196423769, "kl": 0.06827163696289062, "learning_rate": 7.227363607702717e-07, "loss": 0.0027, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 1985 }, { "completion_length": 625.96875, "epoch": 2.1173333333333333, "grad_norm": 0.001544561586342752, "kl": 0.047149658203125, "learning_rate": 7.211432936255779e-07, "loss": 0.0019, "reward": 0.59375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1986 }, { "completion_length": 566.21875, "epoch": 2.1184, "grad_norm": 0.013956356793642044, "kl": 0.0327301025390625, "learning_rate": 7.195514283566949e-07, "loss": 0.0013, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1987 }, { "completion_length": 666.375, "epoch": 2.119466666666667, "grad_norm": 0.02291758544743061, "kl": 0.05626678466796875, "learning_rate": 7.179607674200752e-07, "loss": 0.0023, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 1988 }, { "completion_length": 586.65625, "epoch": 2.1205333333333334, "grad_norm": 0.0264887735247612, "kl": 0.07562255859375, "learning_rate": 7.163713132703127e-07, "loss": 0.003, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 1989 }, { "completion_length": 683.375, "epoch": 2.1216, "grad_norm": 0.0024723114911466837, "kl": 0.045467376708984375, "learning_rate": 7.147830683601389e-07, "loss": 0.0018, "reward": 0.6875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1990 }, { "completion_length": 640.25, "epoch": 2.1226666666666665, "grad_norm": 0.0014011841267347336, "kl": 0.03096771240234375, "learning_rate": 7.131960351404196e-07, "loss": 0.0012, "reward": 0.75, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 1991 }, { "completion_length": 668.375, "epoch": 2.1237333333333335, "grad_norm": 0.013325673528015614, "kl": 0.0616912841796875, "learning_rate": 7.116102160601505e-07, "loss": 0.0025, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1992 }, { "completion_length": 547.84375, "epoch": 2.1248, "grad_norm": 0.001634187065064907, "kl": 0.06683158874511719, "learning_rate": 7.10025613566454e-07, "loss": 0.0027, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 1993 }, { "completion_length": 685.0, "epoch": 2.1258666666666666, "grad_norm": 0.015324427746236324, "kl": 0.06212615966796875, "learning_rate": 7.084422301045748e-07, "loss": 0.0025, "reward": 0.46875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1994 }, { "completion_length": 510.34375, "epoch": 2.1269333333333336, "grad_norm": 0.0015225412789732218, "kl": 0.07210922241210938, "learning_rate": 7.068600681178772e-07, "loss": 0.0029, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 1995 }, { "completion_length": 808.8125, "epoch": 2.128, "grad_norm": 0.019564252346754074, "kl": 0.10538101196289062, "learning_rate": 7.052791300478395e-07, "loss": 0.0042, "reward": 0.3125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 1996 }, { "completion_length": 630.125, "epoch": 2.1290666666666667, "grad_norm": 0.015777727589011192, "kl": 0.0670318603515625, "learning_rate": 7.036994183340524e-07, "loss": 0.0027, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 1997 }, { "completion_length": 712.34375, "epoch": 2.130133333333333, "grad_norm": 0.023247461766004562, "kl": 0.04443359375, "learning_rate": 7.021209354142133e-07, "loss": 0.0018, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 1998 }, { "completion_length": 557.09375, "epoch": 2.1312, "grad_norm": 0.010560627095401287, "kl": 0.037677764892578125, "learning_rate": 7.005436837241243e-07, "loss": 0.0015, "reward": 0.65625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 1999 }, { "completion_length": 811.1875, "epoch": 2.1322666666666668, "grad_norm": 0.01854090578854084, "kl": 0.028774261474609375, "learning_rate": 6.989676656976869e-07, "loss": 0.0012, "reward": 0.34375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 2000 }, { "completion_length": 569.6875, "epoch": 2.1333333333333333, "grad_norm": 0.010162638500332832, "kl": 0.056385040283203125, "learning_rate": 6.97392883766899e-07, "loss": 0.0023, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2001 }, { "completion_length": 552.6875, "epoch": 2.1344, "grad_norm": 0.026134977117180824, "kl": 0.08598136901855469, "learning_rate": 6.958193403618512e-07, "loss": 0.0034, "reward": 0.65625, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2002 }, { "completion_length": 509.28125, "epoch": 2.135466666666667, "grad_norm": 0.0010461258934810758, "kl": 0.0553436279296875, "learning_rate": 6.942470379107228e-07, "loss": 0.0022, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2003 }, { "completion_length": 690.125, "epoch": 2.1365333333333334, "grad_norm": 0.002644994528964162, "kl": 0.08839797973632812, "learning_rate": 6.926759788397783e-07, "loss": 0.0035, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2004 }, { "completion_length": 536.8125, "epoch": 2.1376, "grad_norm": 0.0174573827534914, "kl": 0.027057647705078125, "learning_rate": 6.911061655733632e-07, "loss": 0.0011, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2005 }, { "completion_length": 622.25, "epoch": 2.1386666666666665, "grad_norm": 0.004577376414090395, "kl": 0.0710601806640625, "learning_rate": 6.895376005339008e-07, "loss": 0.0028, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2006 }, { "completion_length": 495.4375, "epoch": 2.1397333333333335, "grad_norm": 0.012449057772755623, "kl": 0.020259857177734375, "learning_rate": 6.879702861418883e-07, "loss": 0.0008, "reward": 0.96875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 0.0, "step": 2007 }, { "completion_length": 659.96875, "epoch": 2.1408, "grad_norm": 0.020517118275165558, "kl": 0.0451202392578125, "learning_rate": 6.864042248158926e-07, "loss": 0.0018, "reward": 0.65625, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2008 }, { "completion_length": 620.96875, "epoch": 2.1418666666666666, "grad_norm": 0.01909751631319523, "kl": 0.07104110717773438, "learning_rate": 6.848394189725476e-07, "loss": 0.0028, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2009 }, { "completion_length": 688.03125, "epoch": 2.142933333333333, "grad_norm": 0.01899668388068676, "kl": 0.06608963012695312, "learning_rate": 6.832758710265492e-07, "loss": 0.0026, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2010 }, { "completion_length": 581.90625, "epoch": 2.144, "grad_norm": 0.007131984457373619, "kl": 0.032207489013671875, "learning_rate": 6.817135833906527e-07, "loss": 0.0013, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2011 }, { "completion_length": 645.46875, "epoch": 2.1450666666666667, "grad_norm": 0.021794619038701057, "kl": 0.07492828369140625, "learning_rate": 6.801525584756679e-07, "loss": 0.003, "reward": 0.8125, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2012 }, { "completion_length": 499.5625, "epoch": 2.1461333333333332, "grad_norm": 0.0035133950877934694, "kl": 0.0305328369140625, "learning_rate": 6.785927986904567e-07, "loss": 0.0012, "reward": 0.9375, "reward_std": 0.125, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 2013 }, { "completion_length": 600.84375, "epoch": 2.1471999999999998, "grad_norm": 0.02489549294114113, "kl": 0.056163787841796875, "learning_rate": 6.770343064419282e-07, "loss": 0.0022, "reward": 0.53125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2014 }, { "completion_length": 647.28125, "epoch": 2.1482666666666668, "grad_norm": 0.007649117149412632, "kl": 0.03464508056640625, "learning_rate": 6.754770841350373e-07, "loss": 0.0014, "reward": 0.78125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2015 }, { "completion_length": 644.84375, "epoch": 2.1493333333333333, "grad_norm": 0.010297021828591824, "kl": 0.0701904296875, "learning_rate": 6.739211341727761e-07, "loss": 0.0028, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2016 }, { "completion_length": 568.25, "epoch": 2.1504, "grad_norm": 0.0046885814517736435, "kl": 0.011646270751953125, "learning_rate": 6.723664589561753e-07, "loss": 0.0005, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2017 }, { "completion_length": 580.59375, "epoch": 2.151466666666667, "grad_norm": 0.002042775508016348, "kl": 0.0342559814453125, "learning_rate": 6.708130608842994e-07, "loss": 0.0014, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2018 }, { "completion_length": 589.15625, "epoch": 2.1525333333333334, "grad_norm": 0.003409054595977068, "kl": 0.06600379943847656, "learning_rate": 6.692609423542393e-07, "loss": 0.0026, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2019 }, { "completion_length": 584.90625, "epoch": 2.1536, "grad_norm": 0.017396187409758568, "kl": 0.059246063232421875, "learning_rate": 6.677101057611133e-07, "loss": 0.0024, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2020 }, { "completion_length": 533.4375, "epoch": 2.1546666666666665, "grad_norm": 0.0008060990367084742, "kl": 0.011322021484375, "learning_rate": 6.661605534980629e-07, "loss": 0.0005, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2021 }, { "completion_length": 471.625, "epoch": 2.1557333333333335, "grad_norm": 0.016658125445246696, "kl": 0.02442169189453125, "learning_rate": 6.646122879562435e-07, "loss": 0.001, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2022 }, { "completion_length": 589.40625, "epoch": 2.1568, "grad_norm": 0.011611389927566051, "kl": 0.05383110046386719, "learning_rate": 6.630653115248291e-07, "loss": 0.0022, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2023 }, { "completion_length": 727.09375, "epoch": 2.1578666666666666, "grad_norm": 0.021810879930853844, "kl": 0.09678840637207031, "learning_rate": 6.615196265910031e-07, "loss": 0.0039, "reward": 0.5625, "reward_std": 0.375, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2024 }, { "completion_length": 585.9375, "epoch": 2.158933333333333, "grad_norm": 0.010224547237157822, "kl": 0.06444931030273438, "learning_rate": 6.599752355399538e-07, "loss": 0.0026, "reward": 0.6875, "reward_std": 0.375, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2025 }, { "completion_length": 624.3125, "epoch": 2.16, "grad_norm": 0.023372555151581764, "kl": 0.06512451171875, "learning_rate": 6.584321407548762e-07, "loss": 0.0026, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2026 }, { "completion_length": 534.3125, "epoch": 2.1610666666666667, "grad_norm": 0.0004954792093485594, "kl": 0.057888031005859375, "learning_rate": 6.568903446169638e-07, "loss": 0.0023, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2027 }, { "completion_length": 730.0, "epoch": 2.1621333333333332, "grad_norm": 0.11261662095785141, "kl": 0.13980484008789062, "learning_rate": 6.55349849505404e-07, "loss": 0.0056, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2028 }, { "completion_length": 590.90625, "epoch": 2.1632, "grad_norm": 0.023660976439714432, "kl": 0.08354949951171875, "learning_rate": 6.538106577973801e-07, "loss": 0.0033, "reward": 0.71875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2029 }, { "completion_length": 597.125, "epoch": 2.164266666666667, "grad_norm": 0.021441912278532982, "kl": 0.04004096984863281, "learning_rate": 6.522727718680623e-07, "loss": 0.0016, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2030 }, { "completion_length": 700.84375, "epoch": 2.1653333333333333, "grad_norm": 0.020621415227651596, "kl": 0.07669830322265625, "learning_rate": 6.507361940906042e-07, "loss": 0.0031, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2031 }, { "completion_length": 722.5, "epoch": 2.1664, "grad_norm": 0.013002869673073292, "kl": 0.024126052856445312, "learning_rate": 6.492009268361442e-07, "loss": 0.001, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2032 }, { "completion_length": 564.65625, "epoch": 2.167466666666667, "grad_norm": 0.009066318161785603, "kl": 0.03766441345214844, "learning_rate": 6.476669724737964e-07, "loss": 0.0015, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2033 }, { "completion_length": 585.21875, "epoch": 2.1685333333333334, "grad_norm": 0.02602987550199032, "kl": 0.08016204833984375, "learning_rate": 6.461343333706476e-07, "loss": 0.0032, "reward": 0.4375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2034 }, { "completion_length": 814.1875, "epoch": 2.1696, "grad_norm": 0.014953980222344398, "kl": 0.08742523193359375, "learning_rate": 6.446030118917585e-07, "loss": 0.0035, "reward": 0.28125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 2035 }, { "completion_length": 568.9375, "epoch": 2.1706666666666665, "grad_norm": 0.016886603087186813, "kl": 0.07769012451171875, "learning_rate": 6.430730104001541e-07, "loss": 0.0031, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2036 }, { "completion_length": 688.3125, "epoch": 2.1717333333333335, "grad_norm": 0.0003020955191459507, "kl": 0.022556304931640625, "learning_rate": 6.415443312568216e-07, "loss": 0.0009, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2037 }, { "completion_length": 577.59375, "epoch": 2.1728, "grad_norm": 0.0010787001810967922, "kl": 0.12613677978515625, "learning_rate": 6.400169768207107e-07, "loss": 0.005, "reward": 0.71875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2038 }, { "completion_length": 565.3125, "epoch": 2.1738666666666666, "grad_norm": 0.01608763262629509, "kl": 0.059360504150390625, "learning_rate": 6.38490949448725e-07, "loss": 0.0024, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2039 }, { "completion_length": 630.75, "epoch": 2.174933333333333, "grad_norm": 0.007542843464761972, "kl": 0.048282623291015625, "learning_rate": 6.369662514957191e-07, "loss": 0.0019, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2040 }, { "completion_length": 685.65625, "epoch": 2.176, "grad_norm": 0.01599849760532379, "kl": 0.1205596923828125, "learning_rate": 6.354428853144991e-07, "loss": 0.0048, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2041 }, { "completion_length": 635.90625, "epoch": 2.1770666666666667, "grad_norm": 0.01153373159468174, "kl": 0.05179405212402344, "learning_rate": 6.339208532558138e-07, "loss": 0.0021, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2042 }, { "completion_length": 618.9375, "epoch": 2.1781333333333333, "grad_norm": 0.014074288308620453, "kl": 0.104888916015625, "learning_rate": 6.324001576683539e-07, "loss": 0.0042, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2043 }, { "completion_length": 619.25, "epoch": 2.1792, "grad_norm": 0.005974677391350269, "kl": 0.0869293212890625, "learning_rate": 6.30880800898748e-07, "loss": 0.0035, "reward": 0.6875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2044 }, { "completion_length": 492.53125, "epoch": 2.180266666666667, "grad_norm": 0.01703925058245659, "kl": 0.07147979736328125, "learning_rate": 6.293627852915581e-07, "loss": 0.0029, "reward": 0.8125, "reward_std": 0.375, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2045 }, { "completion_length": 533.375, "epoch": 2.1813333333333333, "grad_norm": 0.01842154935002327, "kl": 0.040149688720703125, "learning_rate": 6.278461131892775e-07, "loss": 0.0016, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2046 }, { "completion_length": 653.90625, "epoch": 2.1824, "grad_norm": 0.02115774154663086, "kl": 0.05898284912109375, "learning_rate": 6.263307869323255e-07, "loss": 0.0024, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2047 }, { "completion_length": 578.875, "epoch": 2.183466666666667, "grad_norm": 0.0011919833486899734, "kl": 0.08094406127929688, "learning_rate": 6.24816808859045e-07, "loss": 0.0032, "reward": 0.75, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2048 }, { "completion_length": 617.1875, "epoch": 2.1845333333333334, "grad_norm": 0.00041096913628280163, "kl": 0.050838470458984375, "learning_rate": 6.233041813056982e-07, "loss": 0.002, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2049 }, { "completion_length": 641.59375, "epoch": 2.1856, "grad_norm": 0.01655735820531845, "kl": 0.06173515319824219, "learning_rate": 6.217929066064637e-07, "loss": 0.0025, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2050 }, { "completion_length": 562.3125, "epoch": 2.1866666666666665, "grad_norm": 0.0007908053230494261, "kl": 0.08275794982910156, "learning_rate": 6.202829870934321e-07, "loss": 0.0033, "reward": 0.75, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2051 }, { "completion_length": 736.4375, "epoch": 2.1877333333333335, "grad_norm": 0.01164803933352232, "kl": 0.0519256591796875, "learning_rate": 6.187744250966031e-07, "loss": 0.0021, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2052 }, { "completion_length": 619.6875, "epoch": 2.1888, "grad_norm": 0.025260908529162407, "kl": 0.08294296264648438, "learning_rate": 6.172672229438812e-07, "loss": 0.0033, "reward": 0.5625, "reward_std": 0.41367512941360474, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2053 }, { "completion_length": 619.71875, "epoch": 2.1898666666666666, "grad_norm": 0.013836993835866451, "kl": 0.07696342468261719, "learning_rate": 6.157613829610726e-07, "loss": 0.0031, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2054 }, { "completion_length": 567.03125, "epoch": 2.190933333333333, "grad_norm": 0.0046409121714532375, "kl": 0.07569122314453125, "learning_rate": 6.142569074718818e-07, "loss": 0.003, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2055 }, { "completion_length": 629.375, "epoch": 2.192, "grad_norm": 0.02401569113135338, "kl": 0.10495758056640625, "learning_rate": 6.127537987979073e-07, "loss": 0.0042, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2056 }, { "completion_length": 535.5, "epoch": 2.1930666666666667, "grad_norm": 0.018413692712783813, "kl": 0.0273590087890625, "learning_rate": 6.112520592586385e-07, "loss": 0.0011, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2057 }, { "completion_length": 728.5, "epoch": 2.1941333333333333, "grad_norm": 0.021117646247148514, "kl": 0.1362476348876953, "learning_rate": 6.097516911714523e-07, "loss": 0.0054, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2058 }, { "completion_length": 734.3125, "epoch": 2.1952, "grad_norm": 0.0046619544737041, "kl": 0.12375259399414062, "learning_rate": 6.082526968516094e-07, "loss": 0.005, "reward": 0.375, "reward_std": 0.25, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 2059 }, { "completion_length": 638.65625, "epoch": 2.196266666666667, "grad_norm": 0.0006809482583776116, "kl": 0.09923171997070312, "learning_rate": 6.067550786122498e-07, "loss": 0.004, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2060 }, { "completion_length": 651.90625, "epoch": 2.1973333333333334, "grad_norm": 0.0019434057176113129, "kl": 0.056003570556640625, "learning_rate": 6.052588387643908e-07, "loss": 0.0022, "reward": 0.65625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2061 }, { "completion_length": 545.375, "epoch": 2.1984, "grad_norm": 0.015626437962055206, "kl": 0.06577682495117188, "learning_rate": 6.037639796169225e-07, "loss": 0.0026, "reward": 0.8125, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2062 }, { "completion_length": 524.21875, "epoch": 2.1994666666666665, "grad_norm": 0.001667922711931169, "kl": 0.056720733642578125, "learning_rate": 6.022705034766038e-07, "loss": 0.0023, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2063 }, { "completion_length": 617.59375, "epoch": 2.2005333333333335, "grad_norm": 0.0007419260800816119, "kl": 0.08794784545898438, "learning_rate": 6.007784126480615e-07, "loss": 0.0035, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2064 }, { "completion_length": 625.15625, "epoch": 2.2016, "grad_norm": 0.01868678815662861, "kl": 0.0691680908203125, "learning_rate": 5.992877094337816e-07, "loss": 0.0028, "reward": 0.78125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2065 }, { "completion_length": 793.03125, "epoch": 2.2026666666666666, "grad_norm": 0.0012951483950018883, "kl": 0.049327850341796875, "learning_rate": 5.977983961341102e-07, "loss": 0.002, "reward": 0.46875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2066 }, { "completion_length": 563.8125, "epoch": 2.203733333333333, "grad_norm": 0.01988961733877659, "kl": 0.10664749145507812, "learning_rate": 5.963104750472507e-07, "loss": 0.0043, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2067 }, { "completion_length": 646.6875, "epoch": 2.2048, "grad_norm": 0.00035836632014252245, "kl": 0.0406341552734375, "learning_rate": 5.948239484692543e-07, "loss": 0.0016, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2068 }, { "completion_length": 628.15625, "epoch": 2.2058666666666666, "grad_norm": 0.013858212158083916, "kl": 0.12830734252929688, "learning_rate": 5.933388186940224e-07, "loss": 0.0051, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2069 }, { "completion_length": 703.8125, "epoch": 2.206933333333333, "grad_norm": 0.09565028548240662, "kl": 0.06672286987304688, "learning_rate": 5.918550880133018e-07, "loss": 0.0027, "reward": 0.5, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2070 }, { "completion_length": 675.5, "epoch": 2.208, "grad_norm": 0.0011596918338909745, "kl": 0.04909515380859375, "learning_rate": 5.903727587166792e-07, "loss": 0.002, "reward": 0.40625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 2071 }, { "completion_length": 622.8125, "epoch": 2.2090666666666667, "grad_norm": 0.023356711491942406, "kl": 0.06995201110839844, "learning_rate": 5.888918330915772e-07, "loss": 0.0028, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2072 }, { "completion_length": 608.28125, "epoch": 2.2101333333333333, "grad_norm": 0.018439898267388344, "kl": 0.11505889892578125, "learning_rate": 5.874123134232558e-07, "loss": 0.0046, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2073 }, { "completion_length": 574.46875, "epoch": 2.2112, "grad_norm": 0.017145849764347076, "kl": 0.058490753173828125, "learning_rate": 5.859342019948036e-07, "loss": 0.0023, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2074 }, { "completion_length": 572.3125, "epoch": 2.212266666666667, "grad_norm": 0.027143189683556557, "kl": 0.1164703369140625, "learning_rate": 5.844575010871346e-07, "loss": 0.0047, "reward": 0.59375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2075 }, { "completion_length": 676.40625, "epoch": 2.2133333333333334, "grad_norm": 0.01826479658484459, "kl": 0.13582611083984375, "learning_rate": 5.829822129789891e-07, "loss": 0.0054, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2076 }, { "completion_length": 606.125, "epoch": 2.2144, "grad_norm": 0.018110129982233047, "kl": 0.085968017578125, "learning_rate": 5.81508339946926e-07, "loss": 0.0034, "reward": 0.625, "reward_std": 0.375, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2077 }, { "completion_length": 550.75, "epoch": 2.2154666666666665, "grad_norm": 0.023531127721071243, "kl": 0.06703567504882812, "learning_rate": 5.80035884265319e-07, "loss": 0.0027, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2078 }, { "completion_length": 596.1875, "epoch": 2.2165333333333335, "grad_norm": 0.01991463452577591, "kl": 0.06376075744628906, "learning_rate": 5.785648482063575e-07, "loss": 0.0026, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2079 }, { "completion_length": 652.59375, "epoch": 2.2176, "grad_norm": 0.0030256491154432297, "kl": 0.07985305786132812, "learning_rate": 5.770952340400391e-07, "loss": 0.0032, "reward": 0.5, "reward_std": 0.25, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2080 }, { "completion_length": 622.9375, "epoch": 2.2186666666666666, "grad_norm": 0.00980803556740284, "kl": 0.07931137084960938, "learning_rate": 5.756270440341652e-07, "loss": 0.0032, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2081 }, { "completion_length": 525.1875, "epoch": 2.219733333333333, "grad_norm": 0.016950588673353195, "kl": 0.1049041748046875, "learning_rate": 5.741602804543429e-07, "loss": 0.0042, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2082 }, { "completion_length": 777.3125, "epoch": 2.2208, "grad_norm": 0.014057106338441372, "kl": 0.04787445068359375, "learning_rate": 5.726949455639767e-07, "loss": 0.0019, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2083 }, { "completion_length": 696.0, "epoch": 2.2218666666666667, "grad_norm": 0.0015735687920823693, "kl": 0.06812286376953125, "learning_rate": 5.712310416242644e-07, "loss": 0.0027, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2084 }, { "completion_length": 764.5, "epoch": 2.222933333333333, "grad_norm": 0.01887371763586998, "kl": 0.09856796264648438, "learning_rate": 5.697685708941996e-07, "loss": 0.0039, "reward": 0.34375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 2085 }, { "completion_length": 675.28125, "epoch": 2.224, "grad_norm": 0.0008114817901514471, "kl": 0.05791473388671875, "learning_rate": 5.683075356305615e-07, "loss": 0.0023, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2086 }, { "completion_length": 625.5, "epoch": 2.2250666666666667, "grad_norm": 0.017909200862050056, "kl": 0.06923294067382812, "learning_rate": 5.668479380879151e-07, "loss": 0.0028, "reward": 0.375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 2087 }, { "completion_length": 672.8125, "epoch": 2.2261333333333333, "grad_norm": 0.010559565387666225, "kl": 0.0802459716796875, "learning_rate": 5.653897805186062e-07, "loss": 0.0032, "reward": 0.59375, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2088 }, { "completion_length": 633.78125, "epoch": 2.2272, "grad_norm": 0.007978220470249653, "kl": 0.06562042236328125, "learning_rate": 5.639330651727595e-07, "loss": 0.0026, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2089 }, { "completion_length": 686.5625, "epoch": 2.228266666666667, "grad_norm": 0.015757746994495392, "kl": 0.101409912109375, "learning_rate": 5.624777942982735e-07, "loss": 0.0041, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2090 }, { "completion_length": 743.84375, "epoch": 2.2293333333333334, "grad_norm": 0.011603654362261295, "kl": 0.04717826843261719, "learning_rate": 5.610239701408176e-07, "loss": 0.0019, "reward": 0.4375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2091 }, { "completion_length": 603.21875, "epoch": 2.2304, "grad_norm": 0.01789730042219162, "kl": 0.10071945190429688, "learning_rate": 5.595715949438291e-07, "loss": 0.004, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2092 }, { "completion_length": 574.4375, "epoch": 2.2314666666666665, "grad_norm": 0.0027359838131815195, "kl": 0.030447006225585938, "learning_rate": 5.581206709485094e-07, "loss": 0.0012, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2093 }, { "completion_length": 757.46875, "epoch": 2.2325333333333335, "grad_norm": 0.007074303459376097, "kl": 0.15423583984375, "learning_rate": 5.566712003938203e-07, "loss": 0.0062, "reward": 0.40625, "reward_std": 0.3125, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 2094 }, { "completion_length": 566.65625, "epoch": 2.2336, "grad_norm": 0.0012123648775741458, "kl": 0.03566741943359375, "learning_rate": 5.552231855164807e-07, "loss": 0.0014, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2095 }, { "completion_length": 605.75, "epoch": 2.2346666666666666, "grad_norm": 0.0011263254564255476, "kl": 0.1050262451171875, "learning_rate": 5.537766285509632e-07, "loss": 0.0042, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2096 }, { "completion_length": 629.8125, "epoch": 2.235733333333333, "grad_norm": 0.007621960714459419, "kl": 0.01739501953125, "learning_rate": 5.52331531729491e-07, "loss": 0.0007, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2097 }, { "completion_length": 657.625, "epoch": 2.2368, "grad_norm": 0.015060768462717533, "kl": 0.089569091796875, "learning_rate": 5.508878972820339e-07, "loss": 0.0036, "reward": 0.53125, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2098 }, { "completion_length": 663.5625, "epoch": 2.2378666666666667, "grad_norm": 0.017996182665228844, "kl": 0.038997650146484375, "learning_rate": 5.494457274363049e-07, "loss": 0.0016, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2099 }, { "completion_length": 634.65625, "epoch": 2.238933333333333, "grad_norm": 0.0218709334731102, "kl": 0.105438232421875, "learning_rate": 5.480050244177573e-07, "loss": 0.0042, "reward": 0.4375, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2100 }, { "completion_length": 611.09375, "epoch": 2.24, "grad_norm": 0.010996553115546703, "kl": 0.0853118896484375, "learning_rate": 5.465657904495802e-07, "loss": 0.0034, "reward": 0.4375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2101 }, { "completion_length": 668.9375, "epoch": 2.2410666666666668, "grad_norm": 0.019651144742965698, "kl": 0.09289932250976562, "learning_rate": 5.451280277526967e-07, "loss": 0.0037, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2102 }, { "completion_length": 586.6875, "epoch": 2.2421333333333333, "grad_norm": 0.014685137197375298, "kl": 0.05780792236328125, "learning_rate": 5.436917385457589e-07, "loss": 0.0023, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2103 }, { "completion_length": 614.6875, "epoch": 2.2432, "grad_norm": 0.010203918442130089, "kl": 0.054351806640625, "learning_rate": 5.422569250451447e-07, "loss": 0.0022, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2104 }, { "completion_length": 595.90625, "epoch": 2.244266666666667, "grad_norm": 0.019808350130915642, "kl": 0.07369232177734375, "learning_rate": 5.408235894649571e-07, "loss": 0.0029, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2105 }, { "completion_length": 668.0625, "epoch": 2.2453333333333334, "grad_norm": 0.029009150341153145, "kl": 0.08265304565429688, "learning_rate": 5.393917340170151e-07, "loss": 0.0033, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2106 }, { "completion_length": 635.71875, "epoch": 2.2464, "grad_norm": 0.020047293975949287, "kl": 0.090057373046875, "learning_rate": 5.379613609108555e-07, "loss": 0.0036, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2107 }, { "completion_length": 610.84375, "epoch": 2.2474666666666665, "grad_norm": 0.019431183114647865, "kl": 0.031935691833496094, "learning_rate": 5.365324723537288e-07, "loss": 0.0013, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2108 }, { "completion_length": 706.96875, "epoch": 2.2485333333333335, "grad_norm": 0.0187976136803627, "kl": 0.1204833984375, "learning_rate": 5.351050705505919e-07, "loss": 0.0048, "reward": 0.53125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2109 }, { "completion_length": 616.96875, "epoch": 2.2496, "grad_norm": 0.0062036458402872086, "kl": 0.08541107177734375, "learning_rate": 5.33679157704109e-07, "loss": 0.0034, "reward": 0.46875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2110 }, { "completion_length": 631.46875, "epoch": 2.2506666666666666, "grad_norm": 0.01947176828980446, "kl": 0.09366989135742188, "learning_rate": 5.322547360146482e-07, "loss": 0.0037, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2111 }, { "completion_length": 589.4375, "epoch": 2.251733333333333, "grad_norm": 0.0013168803416192532, "kl": 0.040729522705078125, "learning_rate": 5.308318076802728e-07, "loss": 0.0016, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2112 }, { "completion_length": 588.46875, "epoch": 2.2528, "grad_norm": 0.010877847671508789, "kl": 0.07098388671875, "learning_rate": 5.294103748967444e-07, "loss": 0.0028, "reward": 0.78125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2113 }, { "completion_length": 664.0, "epoch": 2.2538666666666667, "grad_norm": 0.0019960689824074507, "kl": 0.03514671325683594, "learning_rate": 5.279904398575172e-07, "loss": 0.0014, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2114 }, { "completion_length": 569.3125, "epoch": 2.2549333333333332, "grad_norm": 0.015117140486836433, "kl": 0.07825851440429688, "learning_rate": 5.265720047537318e-07, "loss": 0.0031, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2115 }, { "completion_length": 525.75, "epoch": 2.2560000000000002, "grad_norm": 0.01660499908030033, "kl": 0.110107421875, "learning_rate": 5.251550717742156e-07, "loss": 0.0044, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2116 }, { "completion_length": 691.65625, "epoch": 2.2570666666666668, "grad_norm": 0.010251498781144619, "kl": 0.053562164306640625, "learning_rate": 5.237396431054793e-07, "loss": 0.0021, "reward": 0.5625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2117 }, { "completion_length": 548.25, "epoch": 2.2581333333333333, "grad_norm": 0.0006882054731249809, "kl": 0.07526016235351562, "learning_rate": 5.223257209317092e-07, "loss": 0.003, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2118 }, { "completion_length": 670.03125, "epoch": 2.2592, "grad_norm": 0.01723530702292919, "kl": 0.08110427856445312, "learning_rate": 5.209133074347693e-07, "loss": 0.0032, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2119 }, { "completion_length": 677.40625, "epoch": 2.2602666666666664, "grad_norm": 0.017184393480420113, "kl": 0.04179954528808594, "learning_rate": 5.195024047941956e-07, "loss": 0.0017, "reward": 0.5625, "reward_std": 0.46650634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2120 }, { "completion_length": 771.78125, "epoch": 2.2613333333333334, "grad_norm": 0.011202690191566944, "kl": 0.029842376708984375, "learning_rate": 5.180930151871906e-07, "loss": 0.0012, "reward": 0.3125, "reward_std": 0.125, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 2121 }, { "completion_length": 523.0625, "epoch": 2.2624, "grad_norm": 0.022349286824464798, "kl": 0.063232421875, "learning_rate": 5.166851407886234e-07, "loss": 0.0025, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2122 }, { "completion_length": 723.78125, "epoch": 2.2634666666666665, "grad_norm": 0.00143665901850909, "kl": 0.07886886596679688, "learning_rate": 5.152787837710262e-07, "loss": 0.0032, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2123 }, { "completion_length": 537.75, "epoch": 2.2645333333333335, "grad_norm": 0.013692643493413925, "kl": 0.033359527587890625, "learning_rate": 5.138739463045863e-07, "loss": 0.0013, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2124 }, { "completion_length": 698.875, "epoch": 2.2656, "grad_norm": 0.014208831824362278, "kl": 0.058544158935546875, "learning_rate": 5.124706305571495e-07, "loss": 0.0023, "reward": 0.4375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2125 }, { "completion_length": 733.875, "epoch": 2.2666666666666666, "grad_norm": 0.014818862080574036, "kl": 0.09597396850585938, "learning_rate": 5.110688386942123e-07, "loss": 0.0038, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2126 }, { "completion_length": 728.5625, "epoch": 2.267733333333333, "grad_norm": 0.01602022349834442, "kl": 0.07097244262695312, "learning_rate": 5.096685728789175e-07, "loss": 0.0028, "reward": 0.46875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2127 }, { "completion_length": 528.5625, "epoch": 2.2688, "grad_norm": 0.015910515561699867, "kl": 0.0631256103515625, "learning_rate": 5.082698352720566e-07, "loss": 0.0025, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2128 }, { "completion_length": 671.6875, "epoch": 2.2698666666666667, "grad_norm": 0.002439353847876191, "kl": 0.07932281494140625, "learning_rate": 5.068726280320607e-07, "loss": 0.0032, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2129 }, { "completion_length": 679.28125, "epoch": 2.2709333333333332, "grad_norm": 0.009350123815238476, "kl": 0.055301666259765625, "learning_rate": 5.054769533149999e-07, "loss": 0.0022, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2130 }, { "completion_length": 573.4375, "epoch": 2.2720000000000002, "grad_norm": 0.009756210260093212, "kl": 0.048099517822265625, "learning_rate": 5.04082813274579e-07, "loss": 0.0019, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2131 }, { "completion_length": 692.15625, "epoch": 2.273066666666667, "grad_norm": 0.022170143201947212, "kl": 0.1054534912109375, "learning_rate": 5.026902100621351e-07, "loss": 0.0042, "reward": 0.375, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 2132 }, { "completion_length": 540.5, "epoch": 2.2741333333333333, "grad_norm": 0.0011208717478439212, "kl": 0.059261322021484375, "learning_rate": 5.012991458266337e-07, "loss": 0.0024, "reward": 0.59375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2133 }, { "completion_length": 619.03125, "epoch": 2.2752, "grad_norm": 0.0009625537786632776, "kl": 0.0477752685546875, "learning_rate": 4.999096227146651e-07, "loss": 0.0019, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2134 }, { "completion_length": 511.375, "epoch": 2.2762666666666664, "grad_norm": 0.0009831058559939265, "kl": 0.019565582275390625, "learning_rate": 4.985216428704421e-07, "loss": 0.0008, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2135 }, { "completion_length": 488.75, "epoch": 2.2773333333333334, "grad_norm": 0.01791790872812271, "kl": 0.0615997314453125, "learning_rate": 4.971352084357953e-07, "loss": 0.0025, "reward": 0.9375, "reward_std": 0.125, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 2136 }, { "completion_length": 597.75, "epoch": 2.2784, "grad_norm": 0.0004034796729683876, "kl": 0.0360107421875, "learning_rate": 4.957503215501711e-07, "loss": 0.0014, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2137 }, { "completion_length": 597.6875, "epoch": 2.2794666666666665, "grad_norm": 0.017749540507793427, "kl": 0.0825347900390625, "learning_rate": 4.943669843506272e-07, "loss": 0.0033, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2138 }, { "completion_length": 537.25, "epoch": 2.2805333333333335, "grad_norm": 0.0010168690932914615, "kl": 0.061004638671875, "learning_rate": 4.92985198971831e-07, "loss": 0.0024, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2139 }, { "completion_length": 574.375, "epoch": 2.2816, "grad_norm": 0.015018833801150322, "kl": 0.050167083740234375, "learning_rate": 4.916049675460543e-07, "loss": 0.002, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2140 }, { "completion_length": 600.21875, "epoch": 2.2826666666666666, "grad_norm": 0.017269205302000046, "kl": 0.033599853515625, "learning_rate": 4.902262922031711e-07, "loss": 0.0013, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2141 }, { "completion_length": 527.875, "epoch": 2.283733333333333, "grad_norm": 0.0008207141072489321, "kl": 0.0261383056640625, "learning_rate": 4.888491750706547e-07, "loss": 0.001, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2142 }, { "completion_length": 505.125, "epoch": 2.2848, "grad_norm": 0.0017537340754643083, "kl": 0.02823638916015625, "learning_rate": 4.87473618273573e-07, "loss": 0.0011, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2143 }, { "completion_length": 597.15625, "epoch": 2.2858666666666667, "grad_norm": 0.0005765341920778155, "kl": 0.023036956787109375, "learning_rate": 4.860996239345868e-07, "loss": 0.0009, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2144 }, { "completion_length": 630.84375, "epoch": 2.2869333333333333, "grad_norm": 0.012731056660413742, "kl": 0.02410125732421875, "learning_rate": 4.847271941739458e-07, "loss": 0.001, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2145 }, { "completion_length": 523.9375, "epoch": 2.288, "grad_norm": 0.0007834271527826786, "kl": 0.07898330688476562, "learning_rate": 4.83356331109485e-07, "loss": 0.0032, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2146 }, { "completion_length": 597.9375, "epoch": 2.289066666666667, "grad_norm": 0.00044715090189129114, "kl": 0.04759025573730469, "learning_rate": 4.81987036856622e-07, "loss": 0.0019, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2147 }, { "completion_length": 627.5, "epoch": 2.2901333333333334, "grad_norm": 0.02010367438197136, "kl": 0.10334396362304688, "learning_rate": 4.806193135283535e-07, "loss": 0.0041, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2148 }, { "completion_length": 521.34375, "epoch": 2.2912, "grad_norm": 0.003398078726604581, "kl": 0.07711410522460938, "learning_rate": 4.792531632352521e-07, "loss": 0.0031, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2149 }, { "completion_length": 648.6875, "epoch": 2.2922666666666665, "grad_norm": 0.01460427325218916, "kl": 0.09116363525390625, "learning_rate": 4.778885880854628e-07, "loss": 0.0036, "reward": 0.65625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2150 }, { "completion_length": 635.53125, "epoch": 2.2933333333333334, "grad_norm": 0.02500210329890251, "kl": 0.07270431518554688, "learning_rate": 4.765255901847003e-07, "loss": 0.0029, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2151 }, { "completion_length": 624.03125, "epoch": 2.2944, "grad_norm": 0.020991496741771698, "kl": 0.06443405151367188, "learning_rate": 4.75164171636245e-07, "loss": 0.0026, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2152 }, { "completion_length": 556.78125, "epoch": 2.2954666666666665, "grad_norm": 0.02695426344871521, "kl": 0.051990509033203125, "learning_rate": 4.738043345409406e-07, "loss": 0.0021, "reward": 0.4375, "reward_std": 0.125, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2153 }, { "completion_length": 508.5625, "epoch": 2.2965333333333335, "grad_norm": 0.028033124282956123, "kl": 0.06622695922851562, "learning_rate": 4.7244608099719e-07, "loss": 0.0026, "reward": 0.5625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2154 }, { "completion_length": 527.625, "epoch": 2.2976, "grad_norm": 0.00995529443025589, "kl": 0.05931854248046875, "learning_rate": 4.710894131009527e-07, "loss": 0.0024, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2155 }, { "completion_length": 621.0, "epoch": 2.2986666666666666, "grad_norm": 0.012830071151256561, "kl": 0.05042266845703125, "learning_rate": 4.697343329457413e-07, "loss": 0.002, "reward": 0.46875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2156 }, { "completion_length": 655.8125, "epoch": 2.299733333333333, "grad_norm": 0.018272949382662773, "kl": 0.0478973388671875, "learning_rate": 4.6838084262261776e-07, "loss": 0.0019, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2157 }, { "completion_length": 529.28125, "epoch": 2.3008, "grad_norm": 0.014158574864268303, "kl": 0.0185699462890625, "learning_rate": 4.67028944220193e-07, "loss": 0.0007, "reward": 0.71875, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2158 }, { "completion_length": 676.9375, "epoch": 2.3018666666666667, "grad_norm": 0.013508127070963383, "kl": 0.05728912353515625, "learning_rate": 4.6567863982461787e-07, "loss": 0.0023, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2159 }, { "completion_length": 598.21875, "epoch": 2.3029333333333333, "grad_norm": 0.0006444290629588068, "kl": 0.07453155517578125, "learning_rate": 4.643299315195855e-07, "loss": 0.003, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2160 }, { "completion_length": 621.40625, "epoch": 2.304, "grad_norm": 0.0116100599989295, "kl": 0.05297660827636719, "learning_rate": 4.629828213863272e-07, "loss": 0.0021, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2161 }, { "completion_length": 713.59375, "epoch": 2.305066666666667, "grad_norm": 0.0006779617979191244, "kl": 0.075714111328125, "learning_rate": 4.616373115036054e-07, "loss": 0.003, "reward": 0.5625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2162 }, { "completion_length": 683.9375, "epoch": 2.3061333333333334, "grad_norm": 0.00876159779727459, "kl": 0.12277412414550781, "learning_rate": 4.6029340394771426e-07, "loss": 0.0049, "reward": 0.46875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2163 }, { "completion_length": 580.28125, "epoch": 2.3072, "grad_norm": 0.020862413570284843, "kl": 0.040771484375, "learning_rate": 4.589511007924769e-07, "loss": 0.0016, "reward": 0.53125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2164 }, { "completion_length": 719.28125, "epoch": 2.3082666666666665, "grad_norm": 0.009349261410534382, "kl": 0.0337371826171875, "learning_rate": 4.576104041092377e-07, "loss": 0.0013, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2165 }, { "completion_length": 564.28125, "epoch": 2.3093333333333335, "grad_norm": 0.016642054542899132, "kl": 0.079833984375, "learning_rate": 4.562713159668648e-07, "loss": 0.0032, "reward": 0.75, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2166 }, { "completion_length": 630.78125, "epoch": 2.3104, "grad_norm": 0.014729680493474007, "kl": 0.060733795166015625, "learning_rate": 4.5493383843174303e-07, "loss": 0.0024, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2167 }, { "completion_length": 636.25, "epoch": 2.3114666666666666, "grad_norm": 0.0015915848780423403, "kl": 0.025226593017578125, "learning_rate": 4.535979735677705e-07, "loss": 0.001, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2168 }, { "completion_length": 654.40625, "epoch": 2.3125333333333336, "grad_norm": 0.0028376556001603603, "kl": 0.03638267517089844, "learning_rate": 4.522637234363593e-07, "loss": 0.0015, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2169 }, { "completion_length": 609.8125, "epoch": 2.3136, "grad_norm": 0.013316385447978973, "kl": 0.09151840209960938, "learning_rate": 4.509310900964286e-07, "loss": 0.0037, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2170 }, { "completion_length": 497.8125, "epoch": 2.3146666666666667, "grad_norm": 0.01735689304769039, "kl": 0.04454803466796875, "learning_rate": 4.496000756044014e-07, "loss": 0.0018, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2171 }, { "completion_length": 719.8125, "epoch": 2.315733333333333, "grad_norm": 0.0065665519796311855, "kl": 0.031494140625, "learning_rate": 4.4827068201420486e-07, "loss": 0.0013, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2172 }, { "completion_length": 674.21875, "epoch": 2.3168, "grad_norm": 0.007322839926928282, "kl": 0.04660797119140625, "learning_rate": 4.4694291137726395e-07, "loss": 0.0019, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2173 }, { "completion_length": 666.34375, "epoch": 2.3178666666666667, "grad_norm": 0.013289954513311386, "kl": 0.08988666534423828, "learning_rate": 4.456167657424977e-07, "loss": 0.0036, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2174 }, { "completion_length": 683.75, "epoch": 2.3189333333333333, "grad_norm": 0.016672005876898766, "kl": 0.09946441650390625, "learning_rate": 4.442922471563205e-07, "loss": 0.004, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2175 }, { "completion_length": 658.25, "epoch": 2.32, "grad_norm": 0.005101365968585014, "kl": 0.03385162353515625, "learning_rate": 4.4296935766263435e-07, "loss": 0.0014, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2176 }, { "completion_length": 626.375, "epoch": 2.321066666666667, "grad_norm": 0.0180037934333086, "kl": 0.043304443359375, "learning_rate": 4.41648099302826e-07, "loss": 0.0017, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2177 }, { "completion_length": 638.15625, "epoch": 2.3221333333333334, "grad_norm": 0.01001822017133236, "kl": 0.04396247863769531, "learning_rate": 4.4032847411576785e-07, "loss": 0.0018, "reward": 0.59375, "reward_std": 0.4233439117670059, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2178 }, { "completion_length": 605.40625, "epoch": 2.3232, "grad_norm": 0.022641809657216072, "kl": 0.13177108764648438, "learning_rate": 4.390104841378112e-07, "loss": 0.0053, "reward": 0.53125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2179 }, { "completion_length": 664.03125, "epoch": 2.3242666666666665, "grad_norm": 0.0021685180254280567, "kl": 0.04067420959472656, "learning_rate": 4.376941314027819e-07, "loss": 0.0016, "reward": 0.4375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2180 }, { "completion_length": 600.6875, "epoch": 2.3253333333333335, "grad_norm": 0.0007840783218853176, "kl": 0.05089569091796875, "learning_rate": 4.3637941794198264e-07, "loss": 0.002, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2181 }, { "completion_length": 639.6875, "epoch": 2.3264, "grad_norm": 0.015643198043107986, "kl": 0.16925048828125, "learning_rate": 4.350663457841851e-07, "loss": 0.0068, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2182 }, { "completion_length": 581.96875, "epoch": 2.3274666666666666, "grad_norm": 0.001965449657291174, "kl": 0.046192169189453125, "learning_rate": 4.3375491695562635e-07, "loss": 0.0018, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2183 }, { "completion_length": 603.03125, "epoch": 2.3285333333333336, "grad_norm": 0.016819044947624207, "kl": 0.045352935791015625, "learning_rate": 4.3244513348001104e-07, "loss": 0.0018, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2184 }, { "completion_length": 618.625, "epoch": 2.3296, "grad_norm": 0.017988119274377823, "kl": 0.11515617370605469, "learning_rate": 4.311369973785028e-07, "loss": 0.0046, "reward": 0.59375, "reward_std": 0.35117512941360474, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2185 }, { "completion_length": 601.84375, "epoch": 2.3306666666666667, "grad_norm": 0.009893830865621567, "kl": 0.044513702392578125, "learning_rate": 4.298305106697222e-07, "loss": 0.0018, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2186 }, { "completion_length": 573.71875, "epoch": 2.331733333333333, "grad_norm": 0.013860546052455902, "kl": 0.039936065673828125, "learning_rate": 4.2852567536974705e-07, "loss": 0.0016, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2187 }, { "completion_length": 651.03125, "epoch": 2.3327999999999998, "grad_norm": 0.013822460547089577, "kl": 0.043025970458984375, "learning_rate": 4.2722249349210536e-07, "loss": 0.0017, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2188 }, { "completion_length": 491.9375, "epoch": 2.3338666666666668, "grad_norm": 0.0012499033473432064, "kl": 0.0225067138671875, "learning_rate": 4.259209670477739e-07, "loss": 0.0009, "reward": 0.96875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 0.0, "step": 2189 }, { "completion_length": 543.09375, "epoch": 2.3349333333333333, "grad_norm": 0.0015462357550859451, "kl": 0.0545806884765625, "learning_rate": 4.24621098045175e-07, "loss": 0.0022, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2190 }, { "completion_length": 656.21875, "epoch": 2.336, "grad_norm": 0.016638096421957016, "kl": 0.04559326171875, "learning_rate": 4.2332288849017305e-07, "loss": 0.0018, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2191 }, { "completion_length": 602.90625, "epoch": 2.337066666666667, "grad_norm": 0.020382041111588478, "kl": 0.0440521240234375, "learning_rate": 4.2202634038607194e-07, "loss": 0.0018, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2192 }, { "completion_length": 636.0625, "epoch": 2.3381333333333334, "grad_norm": 0.0006204188102856278, "kl": 0.07056808471679688, "learning_rate": 4.2073145573361197e-07, "loss": 0.0028, "reward": 0.5, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2193 }, { "completion_length": 556.5625, "epoch": 2.3392, "grad_norm": 0.017112508416175842, "kl": 0.07307815551757812, "learning_rate": 4.1943823653096605e-07, "loss": 0.0029, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2194 }, { "completion_length": 650.40625, "epoch": 2.3402666666666665, "grad_norm": 0.01613982766866684, "kl": 0.0528717041015625, "learning_rate": 4.1814668477373756e-07, "loss": 0.0021, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2195 }, { "completion_length": 631.53125, "epoch": 2.3413333333333335, "grad_norm": 0.0209406316280365, "kl": 0.07150650024414062, "learning_rate": 4.168568024549562e-07, "loss": 0.0029, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2196 }, { "completion_length": 663.15625, "epoch": 2.3424, "grad_norm": 0.013577315025031567, "kl": 0.09820556640625, "learning_rate": 4.1556859156507615e-07, "loss": 0.0039, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2197 }, { "completion_length": 647.53125, "epoch": 2.3434666666666666, "grad_norm": 0.01659979857504368, "kl": 0.050388336181640625, "learning_rate": 4.142820540919719e-07, "loss": 0.002, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2198 }, { "completion_length": 717.5, "epoch": 2.3445333333333336, "grad_norm": 0.024715404957532883, "kl": 0.10224342346191406, "learning_rate": 4.129971920209359e-07, "loss": 0.0041, "reward": 0.59375, "reward_std": 0.4375, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2199 }, { "completion_length": 603.34375, "epoch": 2.3456, "grad_norm": 0.016731662675738335, "kl": 0.076995849609375, "learning_rate": 4.1171400733467495e-07, "loss": 0.0031, "reward": 0.71875, "reward_std": 0.4375, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2200 }, { "completion_length": 526.9375, "epoch": 2.3466666666666667, "grad_norm": 0.0009536818834021688, "kl": 0.06396865844726562, "learning_rate": 4.10432502013308e-07, "loss": 0.0026, "reward": 0.5625, "reward_std": 0.375, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2201 }, { "completion_length": 739.40625, "epoch": 2.3477333333333332, "grad_norm": 0.0005082301213406026, "kl": 0.038814544677734375, "learning_rate": 4.0915267803436186e-07, "loss": 0.0016, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2202 }, { "completion_length": 618.96875, "epoch": 2.3487999999999998, "grad_norm": 0.013092854991555214, "kl": 0.07708358764648438, "learning_rate": 4.0787453737276904e-07, "loss": 0.0031, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2203 }, { "completion_length": 595.8125, "epoch": 2.3498666666666668, "grad_norm": 0.023173442110419273, "kl": 0.09809112548828125, "learning_rate": 4.0659808200086485e-07, "loss": 0.0039, "reward": 0.625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2204 }, { "completion_length": 643.8125, "epoch": 2.3509333333333333, "grad_norm": 0.011082352139055729, "kl": 0.08430862426757812, "learning_rate": 4.053233138883835e-07, "loss": 0.0034, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2205 }, { "completion_length": 550.46875, "epoch": 2.352, "grad_norm": 0.0055898274295032024, "kl": 0.0743408203125, "learning_rate": 4.0405023500245515e-07, "loss": 0.003, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2206 }, { "completion_length": 599.0625, "epoch": 2.353066666666667, "grad_norm": 0.016915544867515564, "kl": 0.045352935791015625, "learning_rate": 4.027788473076054e-07, "loss": 0.0018, "reward": 0.84375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2207 }, { "completion_length": 527.8125, "epoch": 2.3541333333333334, "grad_norm": 0.009917398914694786, "kl": 0.0161285400390625, "learning_rate": 4.015091527657472e-07, "loss": 0.0006, "reward": 0.65625, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2208 }, { "completion_length": 508.15625, "epoch": 2.3552, "grad_norm": 0.017238784581422806, "kl": 0.049640655517578125, "learning_rate": 4.0024115333618207e-07, "loss": 0.002, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2209 }, { "completion_length": 615.375, "epoch": 2.3562666666666665, "grad_norm": 0.0013481955975294113, "kl": 0.04793548583984375, "learning_rate": 3.989748509755969e-07, "loss": 0.0019, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2210 }, { "completion_length": 568.90625, "epoch": 2.3573333333333335, "grad_norm": 0.00046289435704238713, "kl": 0.020900726318359375, "learning_rate": 3.977102476380576e-07, "loss": 0.0008, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2211 }, { "completion_length": 568.15625, "epoch": 2.3584, "grad_norm": 0.009715728461742401, "kl": 0.04659271240234375, "learning_rate": 3.964473452750093e-07, "loss": 0.0019, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2212 }, { "completion_length": 718.5, "epoch": 2.3594666666666666, "grad_norm": 0.02679680846631527, "kl": 0.12282562255859375, "learning_rate": 3.9518614583527367e-07, "loss": 0.0049, "reward": 0.4375, "reward_std": 0.46650634706020355, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2213 }, { "completion_length": 660.25, "epoch": 2.360533333333333, "grad_norm": 0.01618972420692444, "kl": 0.0476837158203125, "learning_rate": 3.9392665126504196e-07, "loss": 0.0019, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2214 }, { "completion_length": 670.71875, "epoch": 2.3616, "grad_norm": 0.02193146012723446, "kl": 0.07258224487304688, "learning_rate": 3.926688635078758e-07, "loss": 0.0029, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2215 }, { "completion_length": 680.96875, "epoch": 2.3626666666666667, "grad_norm": 0.016013996675610542, "kl": 0.03777313232421875, "learning_rate": 3.9141278450470376e-07, "loss": 0.0015, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2216 }, { "completion_length": 658.0625, "epoch": 2.3637333333333332, "grad_norm": 0.007426870986819267, "kl": 0.08719635009765625, "learning_rate": 3.901584161938172e-07, "loss": 0.0035, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2217 }, { "completion_length": 645.3125, "epoch": 2.3648, "grad_norm": 0.015045268461108208, "kl": 0.08474349975585938, "learning_rate": 3.8890576051086577e-07, "loss": 0.0034, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2218 }, { "completion_length": 596.3125, "epoch": 2.365866666666667, "grad_norm": 0.020122934132814407, "kl": 0.044582366943359375, "learning_rate": 3.8765481938885945e-07, "loss": 0.0018, "reward": 0.75, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2219 }, { "completion_length": 640.65625, "epoch": 2.3669333333333333, "grad_norm": 0.018084537237882614, "kl": 0.0477294921875, "learning_rate": 3.864055947581605e-07, "loss": 0.0019, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2220 }, { "completion_length": 689.40625, "epoch": 2.368, "grad_norm": 0.01862640678882599, "kl": 0.07045173645019531, "learning_rate": 3.851580885464818e-07, "loss": 0.0028, "reward": 0.65625, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2221 }, { "completion_length": 724.90625, "epoch": 2.369066666666667, "grad_norm": 0.01590430550277233, "kl": 0.07815933227539062, "learning_rate": 3.839123026788867e-07, "loss": 0.0031, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2222 }, { "completion_length": 703.71875, "epoch": 2.3701333333333334, "grad_norm": 0.02355983667075634, "kl": 0.045406341552734375, "learning_rate": 3.8266823907778244e-07, "loss": 0.0018, "reward": 0.5625, "reward_std": 0.4471687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2223 }, { "completion_length": 688.40625, "epoch": 2.3712, "grad_norm": 0.01692921854555607, "kl": 0.0505523681640625, "learning_rate": 3.8142589966291747e-07, "loss": 0.002, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2224 }, { "completion_length": 687.34375, "epoch": 2.3722666666666665, "grad_norm": 0.010254896245896816, "kl": 0.06327438354492188, "learning_rate": 3.801852863513824e-07, "loss": 0.0025, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2225 }, { "completion_length": 571.34375, "epoch": 2.3733333333333335, "grad_norm": 0.016076968982815742, "kl": 0.08425521850585938, "learning_rate": 3.7894640105760217e-07, "loss": 0.0034, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2226 }, { "completion_length": 629.3125, "epoch": 2.3744, "grad_norm": 0.02014952339231968, "kl": 0.046970367431640625, "learning_rate": 3.7770924569333567e-07, "loss": 0.0019, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2227 }, { "completion_length": 724.1875, "epoch": 2.3754666666666666, "grad_norm": 0.012694806791841984, "kl": 0.08026123046875, "learning_rate": 3.7647382216767216e-07, "loss": 0.0032, "reward": 0.1875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 2228 }, { "completion_length": 567.6875, "epoch": 2.376533333333333, "grad_norm": 0.017221694812178612, "kl": 0.0377044677734375, "learning_rate": 3.7524013238702907e-07, "loss": 0.0015, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2229 }, { "completion_length": 449.28125, "epoch": 2.3776, "grad_norm": 0.016596324741840363, "kl": 0.048854827880859375, "learning_rate": 3.740081782551477e-07, "loss": 0.002, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2230 }, { "completion_length": 560.84375, "epoch": 2.3786666666666667, "grad_norm": 0.018105005845427513, "kl": 0.033443450927734375, "learning_rate": 3.727779616730912e-07, "loss": 0.0013, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2231 }, { "completion_length": 667.90625, "epoch": 2.3797333333333333, "grad_norm": 0.01010214351117611, "kl": 0.041576385498046875, "learning_rate": 3.715494845392418e-07, "loss": 0.0017, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2232 }, { "completion_length": 599.8125, "epoch": 2.3808, "grad_norm": 0.01984730362892151, "kl": 0.0657501220703125, "learning_rate": 3.703227487492973e-07, "loss": 0.0026, "reward": 0.75, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2233 }, { "completion_length": 681.90625, "epoch": 2.381866666666667, "grad_norm": 0.02122659981250763, "kl": 0.10028457641601562, "learning_rate": 3.6909775619626834e-07, "loss": 0.004, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2234 }, { "completion_length": 629.03125, "epoch": 2.3829333333333333, "grad_norm": 0.014359474182128906, "kl": 0.060977935791015625, "learning_rate": 3.6787450877047543e-07, "loss": 0.0024, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2235 }, { "completion_length": 595.1875, "epoch": 2.384, "grad_norm": 0.0007591079920530319, "kl": 0.07712554931640625, "learning_rate": 3.666530083595464e-07, "loss": 0.0031, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2236 }, { "completion_length": 757.09375, "epoch": 2.385066666666667, "grad_norm": 0.010188625194132328, "kl": 0.06427860260009766, "learning_rate": 3.654332568484132e-07, "loss": 0.0026, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2237 }, { "completion_length": 647.8125, "epoch": 2.3861333333333334, "grad_norm": 0.010705986060202122, "kl": 0.0425872802734375, "learning_rate": 3.6421525611930873e-07, "loss": 0.0017, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2238 }, { "completion_length": 602.96875, "epoch": 2.3872, "grad_norm": 0.007055577822029591, "kl": 0.041957855224609375, "learning_rate": 3.6299900805176426e-07, "loss": 0.0017, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2239 }, { "completion_length": 580.375, "epoch": 2.3882666666666665, "grad_norm": 0.01320534385740757, "kl": 0.06409835815429688, "learning_rate": 3.6178451452260675e-07, "loss": 0.0026, "reward": 0.78125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2240 }, { "completion_length": 564.84375, "epoch": 2.389333333333333, "grad_norm": 0.018752342090010643, "kl": 0.0508270263671875, "learning_rate": 3.6057177740595546e-07, "loss": 0.002, "reward": 0.65625, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2241 }, { "completion_length": 826.3125, "epoch": 2.3904, "grad_norm": 0.018736500293016434, "kl": 0.17529678344726562, "learning_rate": 3.5936079857321953e-07, "loss": 0.007, "reward": 0.1875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 2242 }, { "completion_length": 691.96875, "epoch": 2.3914666666666666, "grad_norm": 0.002337356563657522, "kl": 0.06691741943359375, "learning_rate": 3.581515798930942e-07, "loss": 0.0027, "reward": 0.40625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 2243 }, { "completion_length": 649.9375, "epoch": 2.392533333333333, "grad_norm": 0.011858872137963772, "kl": 0.093231201171875, "learning_rate": 3.569441232315594e-07, "loss": 0.0037, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2244 }, { "completion_length": 690.28125, "epoch": 2.3936, "grad_norm": 0.004926718771457672, "kl": 0.07904052734375, "learning_rate": 3.557384304518756e-07, "loss": 0.0032, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2245 }, { "completion_length": 706.0, "epoch": 2.3946666666666667, "grad_norm": 0.016207538545131683, "kl": 0.1161346435546875, "learning_rate": 3.545345034145812e-07, "loss": 0.0046, "reward": 0.53125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2246 }, { "completion_length": 566.03125, "epoch": 2.3957333333333333, "grad_norm": 0.0009537880541756749, "kl": 0.0362396240234375, "learning_rate": 3.5333234397748987e-07, "loss": 0.0015, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2247 }, { "completion_length": 551.5625, "epoch": 2.3968, "grad_norm": 0.0036946360487490892, "kl": 0.01471710205078125, "learning_rate": 3.52131953995689e-07, "loss": 0.0006, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2248 }, { "completion_length": 649.8125, "epoch": 2.397866666666667, "grad_norm": 0.0008184541366063058, "kl": 0.02851104736328125, "learning_rate": 3.5093333532153313e-07, "loss": 0.0011, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2249 }, { "completion_length": 567.71875, "epoch": 2.3989333333333334, "grad_norm": 0.018680837005376816, "kl": 0.0887451171875, "learning_rate": 3.4973648980464454e-07, "loss": 0.0036, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2250 }, { "completion_length": 745.0625, "epoch": 2.4, "grad_norm": 0.006964284460991621, "kl": 0.039104461669921875, "learning_rate": 3.4854141929191067e-07, "loss": 0.0016, "reward": 0.53125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2251 }, { "completion_length": 611.71875, "epoch": 2.401066666666667, "grad_norm": 0.0012596148299053311, "kl": 0.0655670166015625, "learning_rate": 3.4734812562747753e-07, "loss": 0.0026, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2252 }, { "completion_length": 425.625, "epoch": 2.4021333333333335, "grad_norm": 0.0006867061601951718, "kl": 0.041706085205078125, "learning_rate": 3.4615661065275007e-07, "loss": 0.0017, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2253 }, { "completion_length": 617.0, "epoch": 2.4032, "grad_norm": 0.013728809542953968, "kl": 0.05896759033203125, "learning_rate": 3.4496687620639016e-07, "loss": 0.0024, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2254 }, { "completion_length": 496.0625, "epoch": 2.4042666666666666, "grad_norm": 0.022962205111980438, "kl": 0.036956787109375, "learning_rate": 3.4377892412430927e-07, "loss": 0.0015, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 2255 }, { "completion_length": 448.3125, "epoch": 2.405333333333333, "grad_norm": 0.011327630840241909, "kl": 0.04634857177734375, "learning_rate": 3.425927562396702e-07, "loss": 0.0019, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2256 }, { "completion_length": 514.4375, "epoch": 2.4064, "grad_norm": 0.0008234147098846734, "kl": 0.03238677978515625, "learning_rate": 3.4140837438288305e-07, "loss": 0.0013, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2257 }, { "completion_length": 626.4375, "epoch": 2.4074666666666666, "grad_norm": 0.010788279585540295, "kl": 0.05522918701171875, "learning_rate": 3.402257803815997e-07, "loss": 0.0022, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2258 }, { "completion_length": 548.84375, "epoch": 2.408533333333333, "grad_norm": 0.0016174869379028678, "kl": 0.035335540771484375, "learning_rate": 3.3904497606071473e-07, "loss": 0.0014, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2259 }, { "completion_length": 637.3125, "epoch": 2.4096, "grad_norm": 0.004002600908279419, "kl": 0.09290313720703125, "learning_rate": 3.3786596324236155e-07, "loss": 0.0037, "reward": 0.53125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2260 }, { "completion_length": 574.96875, "epoch": 2.4106666666666667, "grad_norm": 0.024468066170811653, "kl": 0.09682846069335938, "learning_rate": 3.3668874374590724e-07, "loss": 0.0039, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2261 }, { "completion_length": 708.28125, "epoch": 2.4117333333333333, "grad_norm": 0.017216714099049568, "kl": 0.05381965637207031, "learning_rate": 3.3551331938795246e-07, "loss": 0.0022, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2262 }, { "completion_length": 586.84375, "epoch": 2.4128, "grad_norm": 0.01631547324359417, "kl": 0.050510406494140625, "learning_rate": 3.343396919823289e-07, "loss": 0.002, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2263 }, { "completion_length": 470.6875, "epoch": 2.413866666666667, "grad_norm": 0.02309548854827881, "kl": 0.02154541015625, "learning_rate": 3.3316786334009293e-07, "loss": 0.0009, "reward": 0.84375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2264 }, { "completion_length": 599.1875, "epoch": 2.4149333333333334, "grad_norm": 0.005699659697711468, "kl": 0.05039215087890625, "learning_rate": 3.3199783526952656e-07, "loss": 0.002, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2265 }, { "completion_length": 672.25, "epoch": 2.416, "grad_norm": 0.012036792002618313, "kl": 0.08833694458007812, "learning_rate": 3.308296095761345e-07, "loss": 0.0035, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2266 }, { "completion_length": 589.9375, "epoch": 2.4170666666666665, "grad_norm": 0.012574342079460621, "kl": 0.06688117980957031, "learning_rate": 3.2966318806263695e-07, "loss": 0.0027, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2267 }, { "completion_length": 547.0625, "epoch": 2.4181333333333335, "grad_norm": 0.021228071302175522, "kl": 0.06869888305664062, "learning_rate": 3.284985725289734e-07, "loss": 0.0028, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2268 }, { "completion_length": 580.5625, "epoch": 2.4192, "grad_norm": 0.020362380892038345, "kl": 0.05072021484375, "learning_rate": 3.2733576477229516e-07, "loss": 0.002, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2269 }, { "completion_length": 628.96875, "epoch": 2.4202666666666666, "grad_norm": 0.014629225246608257, "kl": 0.058017730712890625, "learning_rate": 3.2617476658696217e-07, "loss": 0.0023, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2270 }, { "completion_length": 695.53125, "epoch": 2.421333333333333, "grad_norm": 0.015002531930804253, "kl": 0.06270599365234375, "learning_rate": 3.25015579764545e-07, "loss": 0.0025, "reward": 0.4375, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2271 }, { "completion_length": 601.34375, "epoch": 2.4224, "grad_norm": 0.02967243827879429, "kl": 0.08664894104003906, "learning_rate": 3.2385820609381747e-07, "loss": 0.0035, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2272 }, { "completion_length": 561.28125, "epoch": 2.4234666666666667, "grad_norm": 0.0007299423450604081, "kl": 0.019603729248046875, "learning_rate": 3.2270264736075474e-07, "loss": 0.0008, "reward": 0.5, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2273 }, { "completion_length": 602.0, "epoch": 2.424533333333333, "grad_norm": 0.014933968894183636, "kl": 0.0707244873046875, "learning_rate": 3.2154890534853295e-07, "loss": 0.0028, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2274 }, { "completion_length": 637.53125, "epoch": 2.4256, "grad_norm": 0.007416430860757828, "kl": 0.008052825927734375, "learning_rate": 3.2039698183752393e-07, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2275 }, { "completion_length": 757.34375, "epoch": 2.4266666666666667, "grad_norm": 0.012609961442649364, "kl": 0.10573577880859375, "learning_rate": 3.192468786052933e-07, "loss": 0.0042, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2276 }, { "completion_length": 539.25, "epoch": 2.4277333333333333, "grad_norm": 0.020324833691120148, "kl": 0.08559036254882812, "learning_rate": 3.1809859742659784e-07, "loss": 0.0034, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2277 }, { "completion_length": 553.84375, "epoch": 2.4288, "grad_norm": 0.017054397612810135, "kl": 0.08509063720703125, "learning_rate": 3.1695214007338293e-07, "loss": 0.0034, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2278 }, { "completion_length": 569.3125, "epoch": 2.429866666666667, "grad_norm": 0.01680150255560875, "kl": 0.0732421875, "learning_rate": 3.1580750831477913e-07, "loss": 0.0029, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2279 }, { "completion_length": 639.9375, "epoch": 2.4309333333333334, "grad_norm": 0.002944020554423332, "kl": 0.07390594482421875, "learning_rate": 3.146647039171002e-07, "loss": 0.003, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2280 }, { "completion_length": 677.9375, "epoch": 2.432, "grad_norm": 0.021760886535048485, "kl": 0.077728271484375, "learning_rate": 3.1352372864383995e-07, "loss": 0.0031, "reward": 0.5625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2281 }, { "completion_length": 745.09375, "epoch": 2.4330666666666665, "grad_norm": 0.05992412567138672, "kl": 0.07614517211914062, "learning_rate": 3.123845842556693e-07, "loss": 0.003, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2282 }, { "completion_length": 571.3125, "epoch": 2.4341333333333335, "grad_norm": 0.014913070946931839, "kl": 0.0540008544921875, "learning_rate": 3.112472725104345e-07, "loss": 0.0022, "reward": 0.65625, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2283 }, { "completion_length": 524.65625, "epoch": 2.4352, "grad_norm": 0.013228124938905239, "kl": 0.01262664794921875, "learning_rate": 3.1011179516315333e-07, "loss": 0.0005, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2284 }, { "completion_length": 662.21875, "epoch": 2.4362666666666666, "grad_norm": 0.013313538394868374, "kl": 0.08560943603515625, "learning_rate": 3.0897815396601315e-07, "loss": 0.0034, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2285 }, { "completion_length": 712.53125, "epoch": 2.437333333333333, "grad_norm": 0.0016948715783655643, "kl": 0.05872917175292969, "learning_rate": 3.078463506683674e-07, "loss": 0.0023, "reward": 0.4375, "reward_std": 0.125, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2286 }, { "completion_length": 539.6875, "epoch": 2.4384, "grad_norm": 0.013490189798176289, "kl": 0.11095428466796875, "learning_rate": 3.067163870167341e-07, "loss": 0.0044, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2287 }, { "completion_length": 503.09375, "epoch": 2.4394666666666667, "grad_norm": 0.02178187295794487, "kl": 0.06855010986328125, "learning_rate": 3.055882647547921e-07, "loss": 0.0027, "reward": 0.625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2288 }, { "completion_length": 649.21875, "epoch": 2.440533333333333, "grad_norm": 0.02666303887963295, "kl": 0.0865936279296875, "learning_rate": 3.0446198562337857e-07, "loss": 0.0035, "reward": 0.59375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2289 }, { "completion_length": 604.59375, "epoch": 2.4416, "grad_norm": 0.013952727429568768, "kl": 0.0337066650390625, "learning_rate": 3.033375513604867e-07, "loss": 0.0014, "reward": 0.78125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2290 }, { "completion_length": 652.8125, "epoch": 2.4426666666666668, "grad_norm": 0.015621324069797993, "kl": 0.0810394287109375, "learning_rate": 3.022149637012629e-07, "loss": 0.0032, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2291 }, { "completion_length": 484.75, "epoch": 2.4437333333333333, "grad_norm": 0.00202406314201653, "kl": 0.04327392578125, "learning_rate": 3.0109422437800415e-07, "loss": 0.0017, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2292 }, { "completion_length": 522.0, "epoch": 2.4448, "grad_norm": 0.0191813837736845, "kl": 0.032955169677734375, "learning_rate": 2.9997533512015457e-07, "loss": 0.0013, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2293 }, { "completion_length": 635.59375, "epoch": 2.445866666666667, "grad_norm": 0.017111238092184067, "kl": 0.12247848510742188, "learning_rate": 2.988582976543041e-07, "loss": 0.0049, "reward": 0.75, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2294 }, { "completion_length": 537.46875, "epoch": 2.4469333333333334, "grad_norm": 0.01503616664558649, "kl": 0.10065460205078125, "learning_rate": 2.977431137041848e-07, "loss": 0.004, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2295 }, { "completion_length": 568.03125, "epoch": 2.448, "grad_norm": 0.021369468420743942, "kl": 0.06440353393554688, "learning_rate": 2.966297849906685e-07, "loss": 0.0026, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2296 }, { "completion_length": 532.875, "epoch": 2.4490666666666665, "grad_norm": 0.020602652803063393, "kl": 0.0723419189453125, "learning_rate": 2.9551831323176416e-07, "loss": 0.0029, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2297 }, { "completion_length": 628.875, "epoch": 2.4501333333333335, "grad_norm": 0.022180499508976936, "kl": 0.09804534912109375, "learning_rate": 2.944087001426154e-07, "loss": 0.0039, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2298 }, { "completion_length": 525.84375, "epoch": 2.4512, "grad_norm": 0.0010076725156977773, "kl": 0.040172576904296875, "learning_rate": 2.9330094743549776e-07, "loss": 0.0016, "reward": 0.78125, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2299 }, { "completion_length": 528.375, "epoch": 2.4522666666666666, "grad_norm": 0.016985423862934113, "kl": 0.057422637939453125, "learning_rate": 2.921950568198153e-07, "loss": 0.0023, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2300 }, { "completion_length": 506.25, "epoch": 2.453333333333333, "grad_norm": 0.003932615742087364, "kl": 0.031169891357421875, "learning_rate": 2.9109103000209945e-07, "loss": 0.0012, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2301 }, { "completion_length": 727.71875, "epoch": 2.4544, "grad_norm": 0.009172700345516205, "kl": 0.08584213256835938, "learning_rate": 2.89988868686005e-07, "loss": 0.0034, "reward": 0.4375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2302 }, { "completion_length": 624.34375, "epoch": 2.4554666666666667, "grad_norm": 0.028692344203591347, "kl": 0.07608795166015625, "learning_rate": 2.8888857457230824e-07, "loss": 0.003, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2303 }, { "completion_length": 489.4375, "epoch": 2.4565333333333332, "grad_norm": 0.02457267977297306, "kl": 0.051235198974609375, "learning_rate": 2.877901493589048e-07, "loss": 0.0021, "reward": 0.84375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2304 }, { "completion_length": 585.21875, "epoch": 2.4576000000000002, "grad_norm": 0.017978275194764137, "kl": 0.02324676513671875, "learning_rate": 2.8669359474080453e-07, "loss": 0.0009, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2305 }, { "completion_length": 629.875, "epoch": 2.458666666666667, "grad_norm": 0.0009064972400665283, "kl": 0.07686614990234375, "learning_rate": 2.855989124101327e-07, "loss": 0.0031, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2306 }, { "completion_length": 590.09375, "epoch": 2.4597333333333333, "grad_norm": 0.002017345279455185, "kl": 0.04828643798828125, "learning_rate": 2.8450610405612504e-07, "loss": 0.0019, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2307 }, { "completion_length": 678.5625, "epoch": 2.4608, "grad_norm": 0.016239533200860023, "kl": 0.06024932861328125, "learning_rate": 2.834151713651233e-07, "loss": 0.0024, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2308 }, { "completion_length": 686.6875, "epoch": 2.4618666666666664, "grad_norm": 0.0010368989314883947, "kl": 0.022769927978515625, "learning_rate": 2.823261160205782e-07, "loss": 0.0009, "reward": 0.53125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2309 }, { "completion_length": 758.875, "epoch": 2.4629333333333334, "grad_norm": 0.017095020040869713, "kl": 0.037242889404296875, "learning_rate": 2.8123893970304154e-07, "loss": 0.0015, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2310 }, { "completion_length": 662.8125, "epoch": 2.464, "grad_norm": 0.013802355155348778, "kl": 0.11032867431640625, "learning_rate": 2.801536440901649e-07, "loss": 0.0044, "reward": 0.46875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2311 }, { "completion_length": 582.15625, "epoch": 2.4650666666666665, "grad_norm": 0.0013744814787060022, "kl": 0.047943115234375, "learning_rate": 2.790702308566996e-07, "loss": 0.0019, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2312 }, { "completion_length": 560.96875, "epoch": 2.4661333333333335, "grad_norm": 0.013737165369093418, "kl": 0.055652618408203125, "learning_rate": 2.779887016744915e-07, "loss": 0.0022, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2313 }, { "completion_length": 544.90625, "epoch": 2.4672, "grad_norm": 0.01378327514976263, "kl": 0.0841064453125, "learning_rate": 2.769090582124774e-07, "loss": 0.0034, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2314 }, { "completion_length": 578.84375, "epoch": 2.4682666666666666, "grad_norm": 0.003236262360587716, "kl": 0.0538177490234375, "learning_rate": 2.758313021366871e-07, "loss": 0.0022, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2315 }, { "completion_length": 553.28125, "epoch": 2.469333333333333, "grad_norm": 0.015764085575938225, "kl": 0.03786468505859375, "learning_rate": 2.7475543511023627e-07, "loss": 0.0015, "reward": 0.90625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 2316 }, { "completion_length": 538.65625, "epoch": 2.4704, "grad_norm": 0.010712438262999058, "kl": 0.04691314697265625, "learning_rate": 2.736814587933247e-07, "loss": 0.0019, "reward": 0.84375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2317 }, { "completion_length": 604.46875, "epoch": 2.4714666666666667, "grad_norm": 0.0012365283910185099, "kl": 0.07255935668945312, "learning_rate": 2.7260937484323666e-07, "loss": 0.0029, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2318 }, { "completion_length": 753.96875, "epoch": 2.4725333333333332, "grad_norm": 0.0028201304376125336, "kl": 0.049465179443359375, "learning_rate": 2.715391849143354e-07, "loss": 0.002, "reward": 0.4375, "reward_std": 0.125, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2319 }, { "completion_length": 542.75, "epoch": 2.4736000000000002, "grad_norm": 0.016745468601584435, "kl": 0.07066726684570312, "learning_rate": 2.704708906580598e-07, "loss": 0.0028, "reward": 0.6875, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2320 }, { "completion_length": 670.0, "epoch": 2.474666666666667, "grad_norm": 0.009731484577059746, "kl": 0.039875030517578125, "learning_rate": 2.694044937229266e-07, "loss": 0.0016, "reward": 0.40625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 2321 }, { "completion_length": 659.65625, "epoch": 2.4757333333333333, "grad_norm": 0.018105562776327133, "kl": 0.06458282470703125, "learning_rate": 2.6833999575452256e-07, "loss": 0.0026, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2322 }, { "completion_length": 634.84375, "epoch": 2.4768, "grad_norm": 0.0009235786274075508, "kl": 0.039257049560546875, "learning_rate": 2.672773983955036e-07, "loss": 0.0016, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2323 }, { "completion_length": 527.28125, "epoch": 2.4778666666666664, "grad_norm": 0.01102540735155344, "kl": 0.07849502563476562, "learning_rate": 2.6621670328559517e-07, "loss": 0.0031, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2324 }, { "completion_length": 823.25, "epoch": 2.4789333333333334, "grad_norm": 0.010027418844401836, "kl": 0.026996612548828125, "learning_rate": 2.651579120615855e-07, "loss": 0.0011, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2325 }, { "completion_length": 537.5625, "epoch": 2.48, "grad_norm": 0.0008955160155892372, "kl": 0.009510040283203125, "learning_rate": 2.6410102635732435e-07, "loss": 0.0004, "reward": 0.75, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2326 }, { "completion_length": 650.84375, "epoch": 2.4810666666666665, "grad_norm": 0.009847068227827549, "kl": 0.09490203857421875, "learning_rate": 2.630460478037231e-07, "loss": 0.0038, "reward": 0.46875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2327 }, { "completion_length": 765.6875, "epoch": 2.4821333333333335, "grad_norm": 0.014361974783241749, "kl": 0.07525253295898438, "learning_rate": 2.6199297802874865e-07, "loss": 0.003, "reward": 0.46875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2328 }, { "completion_length": 711.28125, "epoch": 2.4832, "grad_norm": 0.01535746082663536, "kl": 0.03496551513671875, "learning_rate": 2.609418186574225e-07, "loss": 0.0014, "reward": 0.46875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2329 }, { "completion_length": 687.8125, "epoch": 2.4842666666666666, "grad_norm": 0.022059815004467964, "kl": 0.06434440612792969, "learning_rate": 2.5989257131181876e-07, "loss": 0.0026, "reward": 0.84375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2330 }, { "completion_length": 545.375, "epoch": 2.485333333333333, "grad_norm": 0.0030767752323299646, "kl": 0.08573532104492188, "learning_rate": 2.5884523761106026e-07, "loss": 0.0034, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2331 }, { "completion_length": 604.0625, "epoch": 2.4864, "grad_norm": 0.0031233776826411486, "kl": 0.07221603393554688, "learning_rate": 2.5779981917131757e-07, "loss": 0.0029, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2332 }, { "completion_length": 675.25, "epoch": 2.4874666666666667, "grad_norm": 0.0023954915814101696, "kl": 0.11196136474609375, "learning_rate": 2.567563176058054e-07, "loss": 0.0045, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2333 }, { "completion_length": 480.875, "epoch": 2.4885333333333333, "grad_norm": 0.01600964367389679, "kl": 0.04315948486328125, "learning_rate": 2.5571473452478045e-07, "loss": 0.0017, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 2334 }, { "completion_length": 517.15625, "epoch": 2.4896, "grad_norm": 0.02196871116757393, "kl": 0.046314239501953125, "learning_rate": 2.546750715355391e-07, "loss": 0.0018, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2335 }, { "completion_length": 643.5625, "epoch": 2.490666666666667, "grad_norm": 0.0175961684435606, "kl": 0.1240386962890625, "learning_rate": 2.5363733024241484e-07, "loss": 0.005, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2336 }, { "completion_length": 602.03125, "epoch": 2.4917333333333334, "grad_norm": 0.017406243830919266, "kl": 0.07131004333496094, "learning_rate": 2.526015122467751e-07, "loss": 0.0029, "reward": 0.4375, "reward_std": 0.125, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2337 }, { "completion_length": 669.3125, "epoch": 2.4928, "grad_norm": 0.010671515949070454, "kl": 0.1059112548828125, "learning_rate": 2.5156761914702063e-07, "loss": 0.0042, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2338 }, { "completion_length": 669.0625, "epoch": 2.4938666666666665, "grad_norm": 0.0163334421813488, "kl": 0.04964447021484375, "learning_rate": 2.5053565253858047e-07, "loss": 0.002, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2339 }, { "completion_length": 602.3125, "epoch": 2.4949333333333334, "grad_norm": 0.01557721197605133, "kl": 0.05706787109375, "learning_rate": 2.495056140139119e-07, "loss": 0.0023, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2340 }, { "completion_length": 695.09375, "epoch": 2.496, "grad_norm": 0.001797364209778607, "kl": 0.06215667724609375, "learning_rate": 2.484775051624964e-07, "loss": 0.0025, "reward": 0.65625, "reward_std": 0.3125, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2341 }, { "completion_length": 690.59375, "epoch": 2.4970666666666665, "grad_norm": 0.019698206335306168, "kl": 0.025501251220703125, "learning_rate": 2.4745132757083785e-07, "loss": 0.001, "reward": 0.6875, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2342 }, { "completion_length": 707.5, "epoch": 2.4981333333333335, "grad_norm": 0.015202712267637253, "kl": 0.071319580078125, "learning_rate": 2.464270828224597e-07, "loss": 0.0028, "reward": 0.34375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 2343 }, { "completion_length": 530.65625, "epoch": 2.4992, "grad_norm": 0.013497039675712585, "kl": 0.052669525146484375, "learning_rate": 2.454047724979032e-07, "loss": 0.0021, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2344 }, { "completion_length": 631.625, "epoch": 2.5002666666666666, "grad_norm": 0.017940932884812355, "kl": 0.06837081909179688, "learning_rate": 2.443843981747242e-07, "loss": 0.0027, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2345 }, { "completion_length": 668.53125, "epoch": 2.501333333333333, "grad_norm": 0.015152668580412865, "kl": 0.05610084533691406, "learning_rate": 2.433659614274909e-07, "loss": 0.0022, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2346 }, { "completion_length": 558.875, "epoch": 2.5023999999999997, "grad_norm": 0.0011001983657479286, "kl": 0.06468582153320312, "learning_rate": 2.4234946382778286e-07, "loss": 0.0026, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2347 }, { "completion_length": 672.09375, "epoch": 2.5034666666666667, "grad_norm": 0.017405999824404716, "kl": 0.0644378662109375, "learning_rate": 2.4133490694418526e-07, "loss": 0.0026, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2348 }, { "completion_length": 570.65625, "epoch": 2.5045333333333333, "grad_norm": 0.0013511290308088064, "kl": 0.02481842041015625, "learning_rate": 2.403222923422895e-07, "loss": 0.001, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2349 }, { "completion_length": 603.375, "epoch": 2.5056000000000003, "grad_norm": 0.02514742501080036, "kl": 0.07216262817382812, "learning_rate": 2.393116215846909e-07, "loss": 0.0029, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2350 }, { "completion_length": 451.65625, "epoch": 2.506666666666667, "grad_norm": 0.017530666664242744, "kl": 0.08519744873046875, "learning_rate": 2.383028962309831e-07, "loss": 0.0034, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2351 }, { "completion_length": 554.0625, "epoch": 2.5077333333333334, "grad_norm": 0.0010180514072999358, "kl": 0.044307708740234375, "learning_rate": 2.372961178377585e-07, "loss": 0.0018, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2352 }, { "completion_length": 667.59375, "epoch": 2.5088, "grad_norm": 0.02565375529229641, "kl": 0.12060928344726562, "learning_rate": 2.362912879586065e-07, "loss": 0.0048, "reward": 0.375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 2353 }, { "completion_length": 737.9375, "epoch": 2.5098666666666665, "grad_norm": 0.015580583363771439, "kl": 0.0784759521484375, "learning_rate": 2.3528840814410742e-07, "loss": 0.0031, "reward": 0.5, "reward_std": 0.25, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2354 }, { "completion_length": 686.1875, "epoch": 2.5109333333333335, "grad_norm": 0.010340064764022827, "kl": 0.025716781616210938, "learning_rate": 2.3428747994183364e-07, "loss": 0.001, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2355 }, { "completion_length": 519.9375, "epoch": 2.512, "grad_norm": 0.0006680566584691405, "kl": 0.053455352783203125, "learning_rate": 2.332885048963465e-07, "loss": 0.0021, "reward": 0.90625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 2356 }, { "completion_length": 632.9375, "epoch": 2.5130666666666666, "grad_norm": 0.01713837869465351, "kl": 0.07182884216308594, "learning_rate": 2.3229148454919192e-07, "loss": 0.0029, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2357 }, { "completion_length": 583.34375, "epoch": 2.5141333333333336, "grad_norm": 0.02888021431863308, "kl": 0.04372406005859375, "learning_rate": 2.312964204389e-07, "loss": 0.0017, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2358 }, { "completion_length": 649.0, "epoch": 2.5152, "grad_norm": 0.0005855423514731228, "kl": 0.06637191772460938, "learning_rate": 2.3030331410098326e-07, "loss": 0.0027, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2359 }, { "completion_length": 509.25, "epoch": 2.5162666666666667, "grad_norm": 0.014952368102967739, "kl": 0.07249832153320312, "learning_rate": 2.293121670679314e-07, "loss": 0.0029, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2360 }, { "completion_length": 585.6875, "epoch": 2.517333333333333, "grad_norm": 0.024307064712047577, "kl": 0.0704803466796875, "learning_rate": 2.2832298086921127e-07, "loss": 0.0028, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2361 }, { "completion_length": 610.71875, "epoch": 2.5183999999999997, "grad_norm": 0.0007893489091657102, "kl": 0.0833892822265625, "learning_rate": 2.2733575703126487e-07, "loss": 0.0033, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2362 }, { "completion_length": 626.90625, "epoch": 2.5194666666666667, "grad_norm": 0.0007869472028687596, "kl": 0.041332244873046875, "learning_rate": 2.2635049707750522e-07, "loss": 0.0016, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2363 }, { "completion_length": 620.59375, "epoch": 2.5205333333333333, "grad_norm": 0.010178355500102043, "kl": 0.05568504333496094, "learning_rate": 2.2536720252831367e-07, "loss": 0.0022, "reward": 0.5625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2364 }, { "completion_length": 644.1875, "epoch": 2.5216, "grad_norm": 0.013564160093665123, "kl": 0.017887115478515625, "learning_rate": 2.243858749010408e-07, "loss": 0.0007, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2365 }, { "completion_length": 527.8125, "epoch": 2.522666666666667, "grad_norm": 0.02603117749094963, "kl": 0.050151824951171875, "learning_rate": 2.2340651571000125e-07, "loss": 0.002, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2366 }, { "completion_length": 637.25, "epoch": 2.5237333333333334, "grad_norm": 0.011614619754254818, "kl": 0.064239501953125, "learning_rate": 2.2242912646647086e-07, "loss": 0.0026, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2367 }, { "completion_length": 616.09375, "epoch": 2.5248, "grad_norm": 0.01696772687137127, "kl": 0.02098846435546875, "learning_rate": 2.2145370867868742e-07, "loss": 0.0008, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2368 }, { "completion_length": 607.875, "epoch": 2.5258666666666665, "grad_norm": 0.009022177197039127, "kl": 0.06517410278320312, "learning_rate": 2.204802638518456e-07, "loss": 0.0026, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2369 }, { "completion_length": 611.8125, "epoch": 2.5269333333333335, "grad_norm": 0.023828573524951935, "kl": 0.049259185791015625, "learning_rate": 2.1950879348809548e-07, "loss": 0.002, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2370 }, { "completion_length": 537.90625, "epoch": 2.528, "grad_norm": 0.0011715841246768832, "kl": 0.051151275634765625, "learning_rate": 2.1853929908654058e-07, "loss": 0.002, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2371 }, { "completion_length": 692.15625, "epoch": 2.5290666666666666, "grad_norm": 0.016483264043927193, "kl": 0.044677734375, "learning_rate": 2.1757178214323498e-07, "loss": 0.0018, "reward": 0.4375, "reward_std": 0.375, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2372 }, { "completion_length": 596.5, "epoch": 2.5301333333333336, "grad_norm": 0.018208816647529602, "kl": 0.01792144775390625, "learning_rate": 2.1660624415118158e-07, "loss": 0.0007, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2373 }, { "completion_length": 714.40625, "epoch": 2.5312, "grad_norm": 0.02035815455019474, "kl": 0.029537200927734375, "learning_rate": 2.1564268660032922e-07, "loss": 0.0012, "reward": 0.5625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2374 }, { "completion_length": 562.625, "epoch": 2.5322666666666667, "grad_norm": 0.014843503944575787, "kl": 0.02713775634765625, "learning_rate": 2.1468111097757099e-07, "loss": 0.0011, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2375 }, { "completion_length": 663.375, "epoch": 2.533333333333333, "grad_norm": 0.014582827687263489, "kl": 0.054813385009765625, "learning_rate": 2.1372151876674112e-07, "loss": 0.0022, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2376 }, { "completion_length": 686.6875, "epoch": 2.5343999999999998, "grad_norm": 0.017265524715185165, "kl": 0.02364349365234375, "learning_rate": 2.1276391144861396e-07, "loss": 0.0009, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2377 }, { "completion_length": 651.0, "epoch": 2.5354666666666668, "grad_norm": 0.0016690950142219663, "kl": 0.04128265380859375, "learning_rate": 2.1180829050089995e-07, "loss": 0.0017, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2378 }, { "completion_length": 548.5, "epoch": 2.5365333333333333, "grad_norm": 0.0012524580815806985, "kl": 0.028156280517578125, "learning_rate": 2.1085465739824516e-07, "loss": 0.0011, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2379 }, { "completion_length": 633.78125, "epoch": 2.5376, "grad_norm": 0.0007863541250117123, "kl": 0.07545280456542969, "learning_rate": 2.0990301361222762e-07, "loss": 0.003, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2380 }, { "completion_length": 539.1875, "epoch": 2.538666666666667, "grad_norm": 0.0054992386139929295, "kl": 0.08245468139648438, "learning_rate": 2.0895336061135584e-07, "loss": 0.0033, "reward": 0.6875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2381 }, { "completion_length": 729.5, "epoch": 2.5397333333333334, "grad_norm": 0.021195316687226295, "kl": 0.08097457885742188, "learning_rate": 2.080056998610662e-07, "loss": 0.0032, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2382 }, { "completion_length": 543.90625, "epoch": 2.5408, "grad_norm": 0.017610035836696625, "kl": 0.07686614990234375, "learning_rate": 2.0706003282372081e-07, "loss": 0.0031, "reward": 0.71875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2383 }, { "completion_length": 591.8125, "epoch": 2.5418666666666665, "grad_norm": 0.0011838851496577263, "kl": 0.03737640380859375, "learning_rate": 2.0611636095860537e-07, "loss": 0.0015, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 2384 }, { "completion_length": 623.625, "epoch": 2.5429333333333335, "grad_norm": 0.0008845278644002974, "kl": 0.013248443603515625, "learning_rate": 2.0517468572192632e-07, "loss": 0.0005, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2385 }, { "completion_length": 652.09375, "epoch": 2.544, "grad_norm": 0.011158399283885956, "kl": 0.054447174072265625, "learning_rate": 2.042350085668095e-07, "loss": 0.0022, "reward": 0.4375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2386 }, { "completion_length": 508.0625, "epoch": 2.5450666666666666, "grad_norm": 0.015134338289499283, "kl": 0.04573822021484375, "learning_rate": 2.0329733094329727e-07, "loss": 0.0018, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2387 }, { "completion_length": 568.625, "epoch": 2.5461333333333336, "grad_norm": 0.07116440683603287, "kl": 0.12482452392578125, "learning_rate": 2.023616542983466e-07, "loss": 0.005, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2388 }, { "completion_length": 596.46875, "epoch": 2.5472, "grad_norm": 0.0012367215240374207, "kl": 0.05995941162109375, "learning_rate": 2.0142798007582658e-07, "loss": 0.0024, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2389 }, { "completion_length": 699.3125, "epoch": 2.5482666666666667, "grad_norm": 0.0012706229463219643, "kl": 0.06733322143554688, "learning_rate": 2.004963097165158e-07, "loss": 0.0027, "reward": 0.40625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 2390 }, { "completion_length": 601.5, "epoch": 2.5493333333333332, "grad_norm": 0.022447336465120316, "kl": 0.06244087219238281, "learning_rate": 1.995666446581023e-07, "loss": 0.0025, "reward": 0.65625, "reward_std": 0.3125, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2391 }, { "completion_length": 580.78125, "epoch": 2.5504, "grad_norm": 0.0015624066581949592, "kl": 0.0424041748046875, "learning_rate": 1.9863898633517746e-07, "loss": 0.0017, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2392 }, { "completion_length": 651.84375, "epoch": 2.5514666666666668, "grad_norm": 0.012660082429647446, "kl": 0.06472396850585938, "learning_rate": 1.9771333617923698e-07, "loss": 0.0026, "reward": 0.53125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2393 }, { "completion_length": 631.3125, "epoch": 2.5525333333333333, "grad_norm": 0.002050988143309951, "kl": 0.045337677001953125, "learning_rate": 1.9678969561867894e-07, "loss": 0.0018, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2394 }, { "completion_length": 434.1875, "epoch": 2.5536, "grad_norm": 0.038082972168922424, "kl": 0.06644058227539062, "learning_rate": 1.9586806607879838e-07, "loss": 0.0027, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2395 }, { "completion_length": 746.71875, "epoch": 2.554666666666667, "grad_norm": 0.0006386293680407107, "kl": 0.054691314697265625, "learning_rate": 1.949484489817875e-07, "loss": 0.0022, "reward": 0.46875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2396 }, { "completion_length": 637.875, "epoch": 2.5557333333333334, "grad_norm": 0.02277209609746933, "kl": 0.09952163696289062, "learning_rate": 1.9403084574673463e-07, "loss": 0.004, "reward": 0.5, "reward_std": 0.46650634706020355, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2397 }, { "completion_length": 578.03125, "epoch": 2.5568, "grad_norm": 0.009267466142773628, "kl": 0.021785736083984375, "learning_rate": 1.931152577896183e-07, "loss": 0.0009, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2398 }, { "completion_length": 591.78125, "epoch": 2.5578666666666665, "grad_norm": 0.016697945073246956, "kl": 0.060428619384765625, "learning_rate": 1.9220168652330833e-07, "loss": 0.0024, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2399 }, { "completion_length": 615.75, "epoch": 2.558933333333333, "grad_norm": 0.017161909490823746, "kl": 0.05701255798339844, "learning_rate": 1.9129013335756317e-07, "loss": 0.0023, "reward": 0.75, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2400 }, { "completion_length": 788.0625, "epoch": 2.56, "grad_norm": 0.0027442758437246084, "kl": 0.08225250244140625, "learning_rate": 1.9038059969902515e-07, "loss": 0.0033, "reward": 0.375, "reward_std": 0.125, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 2401 }, { "completion_length": 587.4375, "epoch": 2.5610666666666666, "grad_norm": 0.01824698969721794, "kl": 0.0788421630859375, "learning_rate": 1.8947308695122172e-07, "loss": 0.0032, "reward": 0.65625, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2402 }, { "completion_length": 612.375, "epoch": 2.5621333333333336, "grad_norm": 0.009445586241781712, "kl": 0.04264068603515625, "learning_rate": 1.8856759651456234e-07, "loss": 0.0017, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2403 }, { "completion_length": 654.21875, "epoch": 2.5632, "grad_norm": 0.019851647317409515, "kl": 0.056610107421875, "learning_rate": 1.8766412978633395e-07, "loss": 0.0023, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2404 }, { "completion_length": 680.4375, "epoch": 2.5642666666666667, "grad_norm": 0.02063467539846897, "kl": 0.1519775390625, "learning_rate": 1.8676268816070152e-07, "loss": 0.0061, "reward": 0.4375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2405 }, { "completion_length": 578.75, "epoch": 2.5653333333333332, "grad_norm": 0.02450595051050186, "kl": 0.0606536865234375, "learning_rate": 1.8586327302870599e-07, "loss": 0.0024, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2406 }, { "completion_length": 696.375, "epoch": 2.5664, "grad_norm": 0.00280088372528553, "kl": 0.07262420654296875, "learning_rate": 1.849658857782593e-07, "loss": 0.0029, "reward": 0.53125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2407 }, { "completion_length": 583.5, "epoch": 2.567466666666667, "grad_norm": 0.01828412525355816, "kl": 0.05078125, "learning_rate": 1.8407052779414573e-07, "loss": 0.002, "reward": 0.6875, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2408 }, { "completion_length": 638.40625, "epoch": 2.5685333333333333, "grad_norm": 0.002162623219192028, "kl": 0.08055877685546875, "learning_rate": 1.8317720045801778e-07, "loss": 0.0032, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2409 }, { "completion_length": 589.0625, "epoch": 2.5696, "grad_norm": 0.018809877336025238, "kl": 0.008504867553710938, "learning_rate": 1.822859051483932e-07, "loss": 0.0003, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2410 }, { "completion_length": 538.5, "epoch": 2.570666666666667, "grad_norm": 0.018150508403778076, "kl": 0.03220367431640625, "learning_rate": 1.813966432406557e-07, "loss": 0.0013, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2411 }, { "completion_length": 723.03125, "epoch": 2.5717333333333334, "grad_norm": 0.02054457552731037, "kl": 0.07772254943847656, "learning_rate": 1.8050941610705053e-07, "loss": 0.0031, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2412 }, { "completion_length": 526.71875, "epoch": 2.5728, "grad_norm": 0.0332336463034153, "kl": 0.055301666259765625, "learning_rate": 1.7962422511668203e-07, "loss": 0.0022, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2413 }, { "completion_length": 596.9375, "epoch": 2.5738666666666665, "grad_norm": 0.0024721066001802683, "kl": 0.05430412292480469, "learning_rate": 1.7874107163551457e-07, "loss": 0.0022, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2414 }, { "completion_length": 674.5, "epoch": 2.574933333333333, "grad_norm": 0.018954096361994743, "kl": 0.059833526611328125, "learning_rate": 1.7785995702636698e-07, "loss": 0.0024, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2415 }, { "completion_length": 566.53125, "epoch": 2.576, "grad_norm": 0.024358632043004036, "kl": 0.06533050537109375, "learning_rate": 1.769808826489115e-07, "loss": 0.0026, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2416 }, { "completion_length": 717.59375, "epoch": 2.5770666666666666, "grad_norm": 0.0005784044042229652, "kl": 0.024427413940429688, "learning_rate": 1.761038498596735e-07, "loss": 0.001, "reward": 0.46875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2417 }, { "completion_length": 692.375, "epoch": 2.5781333333333336, "grad_norm": 0.018673265352845192, "kl": 0.07275772094726562, "learning_rate": 1.7522886001202687e-07, "loss": 0.0029, "reward": 0.6875, "reward_std": 0.375, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2418 }, { "completion_length": 684.125, "epoch": 2.5792, "grad_norm": 0.0010016262531280518, "kl": 0.05313873291015625, "learning_rate": 1.7435591445619258e-07, "loss": 0.0021, "reward": 0.34375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 2419 }, { "completion_length": 733.6875, "epoch": 2.5802666666666667, "grad_norm": 0.00894667487591505, "kl": 0.10435104370117188, "learning_rate": 1.7348501453923827e-07, "loss": 0.0042, "reward": 0.4375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2420 }, { "completion_length": 496.5, "epoch": 2.5813333333333333, "grad_norm": 0.015844231471419334, "kl": 0.022216796875, "learning_rate": 1.7261616160507403e-07, "loss": 0.0009, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2421 }, { "completion_length": 784.03125, "epoch": 2.5824, "grad_norm": 0.0005288978572934866, "kl": 0.0644073486328125, "learning_rate": 1.7174935699445144e-07, "loss": 0.0026, "reward": 0.40625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 2422 }, { "completion_length": 528.875, "epoch": 2.583466666666667, "grad_norm": 0.016005760058760643, "kl": 0.06322479248046875, "learning_rate": 1.7088460204496133e-07, "loss": 0.0025, "reward": 0.78125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2423 }, { "completion_length": 562.15625, "epoch": 2.5845333333333333, "grad_norm": 0.014705426059663296, "kl": 0.0568389892578125, "learning_rate": 1.700218980910311e-07, "loss": 0.0023, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2424 }, { "completion_length": 462.9375, "epoch": 2.5856, "grad_norm": 0.0006450843065977097, "kl": 0.05966949462890625, "learning_rate": 1.69161246463924e-07, "loss": 0.0024, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2425 }, { "completion_length": 561.40625, "epoch": 2.586666666666667, "grad_norm": 0.000569248863030225, "kl": 0.039703369140625, "learning_rate": 1.683026484917357e-07, "loss": 0.0016, "reward": 0.71875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2426 }, { "completion_length": 583.9375, "epoch": 2.5877333333333334, "grad_norm": 0.01182971615344286, "kl": 0.04990386962890625, "learning_rate": 1.6744610549939322e-07, "loss": 0.002, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2427 }, { "completion_length": 649.625, "epoch": 2.5888, "grad_norm": 0.01485409215092659, "kl": 0.09173965454101562, "learning_rate": 1.6659161880865204e-07, "loss": 0.0037, "reward": 0.59375, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2428 }, { "completion_length": 824.5625, "epoch": 2.5898666666666665, "grad_norm": 0.010102675296366215, "kl": 0.09135246276855469, "learning_rate": 1.6573918973809477e-07, "loss": 0.0037, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2429 }, { "completion_length": 751.75, "epoch": 2.590933333333333, "grad_norm": 0.01372555922716856, "kl": 0.09735107421875, "learning_rate": 1.64888819603129e-07, "loss": 0.0039, "reward": 0.40625, "reward_std": 0.4233439117670059, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 2430 }, { "completion_length": 528.0, "epoch": 2.592, "grad_norm": 0.013819290325045586, "kl": 0.026073455810546875, "learning_rate": 1.6404050971598477e-07, "loss": 0.001, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2431 }, { "completion_length": 605.4375, "epoch": 2.5930666666666666, "grad_norm": 0.014722039923071861, "kl": 0.08646774291992188, "learning_rate": 1.631942613857131e-07, "loss": 0.0035, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2432 }, { "completion_length": 697.75, "epoch": 2.594133333333333, "grad_norm": 0.023450037464499474, "kl": 0.1224822998046875, "learning_rate": 1.6235007591818385e-07, "loss": 0.0049, "reward": 0.59375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2433 }, { "completion_length": 478.625, "epoch": 2.5952, "grad_norm": 0.030401447787880898, "kl": 0.0520477294921875, "learning_rate": 1.615079546160832e-07, "loss": 0.0021, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2434 }, { "completion_length": 574.25, "epoch": 2.5962666666666667, "grad_norm": 0.004185417201370001, "kl": 0.1473388671875, "learning_rate": 1.6066789877891265e-07, "loss": 0.0059, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2435 }, { "completion_length": 438.5625, "epoch": 2.5973333333333333, "grad_norm": 0.00262792082503438, "kl": 0.0409088134765625, "learning_rate": 1.598299097029859e-07, "loss": 0.0016, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2436 }, { "completion_length": 790.6875, "epoch": 2.5984, "grad_norm": 0.01424426306039095, "kl": 0.032558441162109375, "learning_rate": 1.5899398868142772e-07, "loss": 0.0013, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2437 }, { "completion_length": 704.4375, "epoch": 2.599466666666667, "grad_norm": 0.0010237727547064424, "kl": 0.024730682373046875, "learning_rate": 1.5816013700417148e-07, "loss": 0.001, "reward": 0.4375, "reward_std": 0.125, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2438 }, { "completion_length": 635.28125, "epoch": 2.6005333333333334, "grad_norm": 0.008448068052530289, "kl": 0.044246673583984375, "learning_rate": 1.573283559579572e-07, "loss": 0.0018, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2439 }, { "completion_length": 612.875, "epoch": 2.6016, "grad_norm": 0.01721864752471447, "kl": 0.012615203857421875, "learning_rate": 1.564986468263298e-07, "loss": 0.0005, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2440 }, { "completion_length": 731.75, "epoch": 2.602666666666667, "grad_norm": 0.023010659962892532, "kl": 0.046062469482421875, "learning_rate": 1.5567101088963699e-07, "loss": 0.0018, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2441 }, { "completion_length": 637.1875, "epoch": 2.6037333333333335, "grad_norm": 0.017322838306427002, "kl": 0.0792236328125, "learning_rate": 1.5484544942502694e-07, "loss": 0.0032, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2442 }, { "completion_length": 631.78125, "epoch": 2.6048, "grad_norm": 0.010939340107142925, "kl": 0.0680999755859375, "learning_rate": 1.540219637064471e-07, "loss": 0.0027, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2443 }, { "completion_length": 893.15625, "epoch": 2.6058666666666666, "grad_norm": 0.011113664135336876, "kl": 0.10790634155273438, "learning_rate": 1.5320055500464153e-07, "loss": 0.0043, "reward": 0.125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 2444 }, { "completion_length": 494.21875, "epoch": 2.606933333333333, "grad_norm": 0.0015196837484836578, "kl": 0.03081512451171875, "learning_rate": 1.5238122458714925e-07, "loss": 0.0012, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2445 }, { "completion_length": 796.0625, "epoch": 2.608, "grad_norm": 0.012137518264353275, "kl": 0.044063568115234375, "learning_rate": 1.515639737183021e-07, "loss": 0.0018, "reward": 0.40625, "reward_std": 0.3125, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 2446 }, { "completion_length": 597.90625, "epoch": 2.6090666666666666, "grad_norm": 0.0012508333893492818, "kl": 0.052677154541015625, "learning_rate": 1.5074880365922327e-07, "loss": 0.0021, "reward": 0.65625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2447 }, { "completion_length": 526.875, "epoch": 2.610133333333333, "grad_norm": 0.010428422130644321, "kl": 0.04254150390625, "learning_rate": 1.4993571566782404e-07, "loss": 0.0017, "reward": 0.65625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2448 }, { "completion_length": 547.59375, "epoch": 2.6112, "grad_norm": 0.017791621387004852, "kl": 0.05524444580078125, "learning_rate": 1.4912471099880427e-07, "loss": 0.0022, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2449 }, { "completion_length": 620.0, "epoch": 2.6122666666666667, "grad_norm": 0.022364625707268715, "kl": 0.11650848388671875, "learning_rate": 1.4831579090364833e-07, "loss": 0.0047, "reward": 0.65625, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2450 }, { "completion_length": 548.8125, "epoch": 2.6133333333333333, "grad_norm": 0.015004497021436691, "kl": 0.042514801025390625, "learning_rate": 1.475089566306226e-07, "loss": 0.0017, "reward": 0.59375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2451 }, { "completion_length": 449.625, "epoch": 2.6144, "grad_norm": 0.0006467122002504766, "kl": 0.047809600830078125, "learning_rate": 1.4670420942477692e-07, "loss": 0.0019, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2452 }, { "completion_length": 642.84375, "epoch": 2.615466666666667, "grad_norm": 0.01284162700176239, "kl": 0.09527587890625, "learning_rate": 1.459015505279393e-07, "loss": 0.0038, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2453 }, { "completion_length": 628.96875, "epoch": 2.6165333333333334, "grad_norm": 0.006713234819471836, "kl": 0.1029205322265625, "learning_rate": 1.4510098117871462e-07, "loss": 0.0041, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2454 }, { "completion_length": 680.625, "epoch": 2.6176, "grad_norm": 0.003250311128795147, "kl": 0.11217498779296875, "learning_rate": 1.4430250261248485e-07, "loss": 0.0045, "reward": 0.4375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2455 }, { "completion_length": 516.59375, "epoch": 2.618666666666667, "grad_norm": 0.0019857638981193304, "kl": 0.0321197509765625, "learning_rate": 1.43506116061405e-07, "loss": 0.0013, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2456 }, { "completion_length": 657.96875, "epoch": 2.6197333333333335, "grad_norm": 0.014358269982039928, "kl": 0.03253364562988281, "learning_rate": 1.4271182275440077e-07, "loss": 0.0013, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2457 }, { "completion_length": 517.3125, "epoch": 2.6208, "grad_norm": 0.013847565278410912, "kl": 0.03753089904785156, "learning_rate": 1.4191962391716944e-07, "loss": 0.0015, "reward": 0.90625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 2458 }, { "completion_length": 615.53125, "epoch": 2.6218666666666666, "grad_norm": 0.017652809619903564, "kl": 0.058261871337890625, "learning_rate": 1.4112952077217538e-07, "loss": 0.0023, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2459 }, { "completion_length": 641.03125, "epoch": 2.622933333333333, "grad_norm": 0.010322575457394123, "kl": 0.1135101318359375, "learning_rate": 1.4034151453864846e-07, "loss": 0.0045, "reward": 0.53125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2460 }, { "completion_length": 587.90625, "epoch": 2.624, "grad_norm": 0.00084843416698277, "kl": 0.06381607055664062, "learning_rate": 1.3955560643258397e-07, "loss": 0.0026, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2461 }, { "completion_length": 569.4375, "epoch": 2.6250666666666667, "grad_norm": 0.015852006152272224, "kl": 0.05877685546875, "learning_rate": 1.3877179766673915e-07, "loss": 0.0024, "reward": 0.75, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2462 }, { "completion_length": 616.21875, "epoch": 2.626133333333333, "grad_norm": 0.0216466523706913, "kl": 0.026765823364257812, "learning_rate": 1.3799008945063046e-07, "loss": 0.0011, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2463 }, { "completion_length": 729.4375, "epoch": 2.6272, "grad_norm": 0.017208600416779518, "kl": 0.039989471435546875, "learning_rate": 1.3721048299053502e-07, "loss": 0.0016, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2464 }, { "completion_length": 599.5, "epoch": 2.6282666666666668, "grad_norm": 0.015849456191062927, "kl": 0.07391357421875, "learning_rate": 1.364329794894853e-07, "loss": 0.003, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2465 }, { "completion_length": 540.5, "epoch": 2.6293333333333333, "grad_norm": 0.0013800945598632097, "kl": 0.07883453369140625, "learning_rate": 1.3565758014726843e-07, "loss": 0.0032, "reward": 0.75, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2466 }, { "completion_length": 657.34375, "epoch": 2.6304, "grad_norm": 0.00990377739071846, "kl": 0.12916946411132812, "learning_rate": 1.348842861604256e-07, "loss": 0.0052, "reward": 0.4375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2467 }, { "completion_length": 567.59375, "epoch": 2.6314666666666664, "grad_norm": 0.004122610203921795, "kl": 0.1041107177734375, "learning_rate": 1.3411309872224892e-07, "loss": 0.0042, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2468 }, { "completion_length": 730.0, "epoch": 2.6325333333333334, "grad_norm": 0.01646481268107891, "kl": 0.07445907592773438, "learning_rate": 1.3334401902277849e-07, "loss": 0.003, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2469 }, { "completion_length": 533.1875, "epoch": 2.6336, "grad_norm": 0.012218046002089977, "kl": 0.0793609619140625, "learning_rate": 1.3257704824880378e-07, "loss": 0.0032, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2470 }, { "completion_length": 600.40625, "epoch": 2.634666666666667, "grad_norm": 0.01777045987546444, "kl": 0.08768463134765625, "learning_rate": 1.3181218758385894e-07, "loss": 0.0035, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2471 }, { "completion_length": 612.53125, "epoch": 2.6357333333333335, "grad_norm": 0.0018606868106871843, "kl": 0.09758186340332031, "learning_rate": 1.3104943820822195e-07, "loss": 0.0039, "reward": 0.71875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2472 }, { "completion_length": 741.75, "epoch": 2.6368, "grad_norm": 0.016902711242437363, "kl": 0.12690353393554688, "learning_rate": 1.30288801298913e-07, "loss": 0.0051, "reward": 0.40625, "reward_std": 0.35117512941360474, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 2473 }, { "completion_length": 750.78125, "epoch": 2.6378666666666666, "grad_norm": 0.0072024730034172535, "kl": 0.043010711669921875, "learning_rate": 1.2953027802969253e-07, "loss": 0.0017, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2474 }, { "completion_length": 610.8125, "epoch": 2.638933333333333, "grad_norm": 0.016388045623898506, "kl": 0.05439186096191406, "learning_rate": 1.287738695710592e-07, "loss": 0.0022, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2475 }, { "completion_length": 595.4375, "epoch": 2.64, "grad_norm": 0.012899201363325119, "kl": 0.047039031982421875, "learning_rate": 1.2801957709024814e-07, "loss": 0.0019, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2476 }, { "completion_length": 648.46875, "epoch": 2.6410666666666667, "grad_norm": 0.016799405217170715, "kl": 0.0811614990234375, "learning_rate": 1.2726740175122965e-07, "loss": 0.0032, "reward": 0.59375, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2477 }, { "completion_length": 783.90625, "epoch": 2.6421333333333332, "grad_norm": 0.02212878316640854, "kl": 0.04855155944824219, "learning_rate": 1.265173447147064e-07, "loss": 0.0019, "reward": 0.4375, "reward_std": 0.375, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2478 }, { "completion_length": 526.03125, "epoch": 2.6432, "grad_norm": 0.00048113573575392365, "kl": 0.034122467041015625, "learning_rate": 1.2576940713811302e-07, "loss": 0.0014, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2479 }, { "completion_length": 558.0625, "epoch": 2.6442666666666668, "grad_norm": 0.019913898780941963, "kl": 0.08440017700195312, "learning_rate": 1.25023590175613e-07, "loss": 0.0034, "reward": 0.65625, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2480 }, { "completion_length": 576.0625, "epoch": 2.6453333333333333, "grad_norm": 0.024110250174999237, "kl": 0.0895843505859375, "learning_rate": 1.2427989497809733e-07, "loss": 0.0036, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2481 }, { "completion_length": 560.8125, "epoch": 2.6464, "grad_norm": 0.018074044957756996, "kl": 0.04228973388671875, "learning_rate": 1.2353832269318355e-07, "loss": 0.0017, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 2482 }, { "completion_length": 552.5, "epoch": 2.6474666666666664, "grad_norm": 0.0017653614049777389, "kl": 0.05585479736328125, "learning_rate": 1.227988744652126e-07, "loss": 0.0022, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2483 }, { "completion_length": 555.4375, "epoch": 2.6485333333333334, "grad_norm": 0.014315692707896233, "kl": 0.07793045043945312, "learning_rate": 1.220615514352479e-07, "loss": 0.0031, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2484 }, { "completion_length": 654.375, "epoch": 2.6496, "grad_norm": 0.019583098590373993, "kl": 0.05938720703125, "learning_rate": 1.2132635474107346e-07, "loss": 0.0024, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2485 }, { "completion_length": 617.40625, "epoch": 2.6506666666666665, "grad_norm": 0.011188296601176262, "kl": 0.050090789794921875, "learning_rate": 1.2059328551719228e-07, "loss": 0.002, "reward": 0.71875, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2486 }, { "completion_length": 742.625, "epoch": 2.6517333333333335, "grad_norm": 0.010929955169558525, "kl": 0.08128738403320312, "learning_rate": 1.19862344894824e-07, "loss": 0.0033, "reward": 0.5, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2487 }, { "completion_length": 620.53125, "epoch": 2.6528, "grad_norm": 0.019282514229416847, "kl": 0.08579635620117188, "learning_rate": 1.1913353400190386e-07, "loss": 0.0034, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2488 }, { "completion_length": 488.28125, "epoch": 2.6538666666666666, "grad_norm": 0.010498331859707832, "kl": 0.00975799560546875, "learning_rate": 1.1840685396308037e-07, "loss": 0.0004, "reward": 0.9375, "reward_std": 0.125, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 2489 }, { "completion_length": 634.8125, "epoch": 2.654933333333333, "grad_norm": 0.017697889357805252, "kl": 0.0558624267578125, "learning_rate": 1.1768230589971457e-07, "loss": 0.0022, "reward": 0.46875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2490 }, { "completion_length": 666.84375, "epoch": 2.656, "grad_norm": 0.00993855856359005, "kl": 0.03676605224609375, "learning_rate": 1.1695989092987675e-07, "loss": 0.0015, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2491 }, { "completion_length": 711.1875, "epoch": 2.6570666666666667, "grad_norm": 0.00046764439321123064, "kl": 0.11579132080078125, "learning_rate": 1.162396101683455e-07, "loss": 0.0046, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2492 }, { "completion_length": 683.90625, "epoch": 2.6581333333333332, "grad_norm": 0.0031391168013215065, "kl": 0.08011245727539062, "learning_rate": 1.1552146472660724e-07, "loss": 0.0032, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2493 }, { "completion_length": 526.78125, "epoch": 2.6592000000000002, "grad_norm": 0.014251752756536007, "kl": 0.06898117065429688, "learning_rate": 1.1480545571285184e-07, "loss": 0.0028, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2494 }, { "completion_length": 716.4375, "epoch": 2.660266666666667, "grad_norm": 0.013734602369368076, "kl": 0.04174041748046875, "learning_rate": 1.1409158423197313e-07, "loss": 0.0017, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2495 }, { "completion_length": 710.53125, "epoch": 2.6613333333333333, "grad_norm": 0.009768237359821796, "kl": 0.047328948974609375, "learning_rate": 1.1337985138556695e-07, "loss": 0.0019, "reward": 0.59375, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2496 }, { "completion_length": 634.15625, "epoch": 2.6624, "grad_norm": 0.0004908567061647773, "kl": 0.058544158935546875, "learning_rate": 1.1267025827192772e-07, "loss": 0.0023, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2497 }, { "completion_length": 624.65625, "epoch": 2.6634666666666664, "grad_norm": 0.010460629127919674, "kl": 0.047664642333984375, "learning_rate": 1.1196280598604874e-07, "loss": 0.0019, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2498 }, { "completion_length": 556.375, "epoch": 2.6645333333333334, "grad_norm": 0.000721078715287149, "kl": 0.07195663452148438, "learning_rate": 1.1125749561962023e-07, "loss": 0.0029, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2499 }, { "completion_length": 540.96875, "epoch": 2.6656, "grad_norm": 0.014180813916027546, "kl": 0.035381317138671875, "learning_rate": 1.105543282610259e-07, "loss": 0.0014, "reward": 0.78125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2500 }, { "completion_length": 666.65625, "epoch": 2.6666666666666665, "grad_norm": 0.014409059658646584, "kl": 0.07383155822753906, "learning_rate": 1.0985330499534312e-07, "loss": 0.003, "reward": 0.59375, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2501 }, { "completion_length": 779.28125, "epoch": 2.6677333333333335, "grad_norm": 0.0218245480209589, "kl": 0.07230186462402344, "learning_rate": 1.0915442690434158e-07, "loss": 0.0029, "reward": 0.46875, "reward_std": 0.3125, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2502 }, { "completion_length": 544.6875, "epoch": 2.6688, "grad_norm": 0.010221362113952637, "kl": 0.02817535400390625, "learning_rate": 1.084576950664789e-07, "loss": 0.0011, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2503 }, { "completion_length": 604.78125, "epoch": 2.6698666666666666, "grad_norm": 0.01884009875357151, "kl": 0.08475875854492188, "learning_rate": 1.0776311055690191e-07, "loss": 0.0034, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2504 }, { "completion_length": 619.09375, "epoch": 2.670933333333333, "grad_norm": 0.01870802976191044, "kl": 0.11053848266601562, "learning_rate": 1.0707067444744439e-07, "loss": 0.0044, "reward": 0.65625, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2505 }, { "completion_length": 646.375, "epoch": 2.672, "grad_norm": 0.0029957673978060484, "kl": 0.07209014892578125, "learning_rate": 1.0638038780662296e-07, "loss": 0.0029, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2506 }, { "completion_length": 629.8125, "epoch": 2.6730666666666667, "grad_norm": 0.0009833007352426648, "kl": 0.041744232177734375, "learning_rate": 1.0569225169963887e-07, "loss": 0.0017, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2507 }, { "completion_length": 691.25, "epoch": 2.6741333333333333, "grad_norm": 0.0027400171384215355, "kl": 0.098602294921875, "learning_rate": 1.0500626718837453e-07, "loss": 0.0039, "reward": 0.4375, "reward_std": 0.25, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2508 }, { "completion_length": 638.0625, "epoch": 2.6752000000000002, "grad_norm": 0.023218929767608643, "kl": 0.08023834228515625, "learning_rate": 1.0432243533139218e-07, "loss": 0.0032, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2509 }, { "completion_length": 475.3125, "epoch": 2.676266666666667, "grad_norm": 0.001358655164949596, "kl": 0.012697219848632812, "learning_rate": 1.0364075718393174e-07, "loss": 0.0005, "reward": 0.75, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2510 }, { "completion_length": 677.15625, "epoch": 2.6773333333333333, "grad_norm": 0.009678996168076992, "kl": 0.01161956787109375, "learning_rate": 1.0296123379791039e-07, "loss": 0.0005, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2511 }, { "completion_length": 623.09375, "epoch": 2.6784, "grad_norm": 0.014731419272720814, "kl": 0.08442306518554688, "learning_rate": 1.0228386622191954e-07, "loss": 0.0034, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2512 }, { "completion_length": 695.6875, "epoch": 2.6794666666666664, "grad_norm": 0.003916412126272917, "kl": 0.086090087890625, "learning_rate": 1.0160865550122439e-07, "loss": 0.0034, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2513 }, { "completion_length": 623.5, "epoch": 2.6805333333333334, "grad_norm": 0.010898669250309467, "kl": 0.06866455078125, "learning_rate": 1.009356026777618e-07, "loss": 0.0027, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2514 }, { "completion_length": 697.71875, "epoch": 2.6816, "grad_norm": 0.009574610739946365, "kl": 0.04537773132324219, "learning_rate": 1.0026470879013866e-07, "loss": 0.0018, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2515 }, { "completion_length": 535.78125, "epoch": 2.6826666666666665, "grad_norm": 0.019421054050326347, "kl": 0.06352615356445312, "learning_rate": 9.959597487362998e-08, "loss": 0.0025, "reward": 0.75, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2516 }, { "completion_length": 527.71875, "epoch": 2.6837333333333335, "grad_norm": 0.0007810814422555268, "kl": 0.038360595703125, "learning_rate": 9.89294019601783e-08, "loss": 0.0015, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2517 }, { "completion_length": 554.28125, "epoch": 2.6848, "grad_norm": 0.010551415383815765, "kl": 0.01618194580078125, "learning_rate": 9.826499107839099e-08, "loss": 0.0006, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2518 }, { "completion_length": 611.78125, "epoch": 2.6858666666666666, "grad_norm": 0.013863954693078995, "kl": 0.018068313598632812, "learning_rate": 9.76027432535394e-08, "loss": 0.0007, "reward": 0.65625, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2519 }, { "completion_length": 669.5625, "epoch": 2.686933333333333, "grad_norm": 0.026994729414582253, "kl": 0.05753326416015625, "learning_rate": 9.69426595075566e-08, "loss": 0.0023, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2520 }, { "completion_length": 638.9375, "epoch": 2.6879999999999997, "grad_norm": 0.03538212180137634, "kl": 0.06090545654296875, "learning_rate": 9.628474085903677e-08, "loss": 0.0024, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2521 }, { "completion_length": 586.78125, "epoch": 2.6890666666666667, "grad_norm": 0.018010685220360756, "kl": 0.059101104736328125, "learning_rate": 9.562898832323264e-08, "loss": 0.0024, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2522 }, { "completion_length": 693.5625, "epoch": 2.6901333333333333, "grad_norm": 0.002784036099910736, "kl": 0.036590576171875, "learning_rate": 9.497540291205459e-08, "loss": 0.0015, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2523 }, { "completion_length": 565.28125, "epoch": 2.6912000000000003, "grad_norm": 0.022125760093331337, "kl": 0.08909225463867188, "learning_rate": 9.432398563406868e-08, "loss": 0.0036, "reward": 0.6875, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2524 }, { "completion_length": 555.75, "epoch": 2.692266666666667, "grad_norm": 0.02246527560055256, "kl": 0.04051971435546875, "learning_rate": 9.367473749449534e-08, "loss": 0.0016, "reward": 0.5, "reward_std": 0.375, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2525 }, { "completion_length": 632.21875, "epoch": 2.6933333333333334, "grad_norm": 0.00068635493516922, "kl": 0.0402984619140625, "learning_rate": 9.302765949520765e-08, "loss": 0.0016, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2526 }, { "completion_length": 568.90625, "epoch": 2.6944, "grad_norm": 0.0008412073948420584, "kl": 0.054645538330078125, "learning_rate": 9.238275263473018e-08, "loss": 0.0022, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2527 }, { "completion_length": 580.0, "epoch": 2.6954666666666665, "grad_norm": 0.0030944219324737787, "kl": 0.11211395263671875, "learning_rate": 9.174001790823671e-08, "loss": 0.0045, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2528 }, { "completion_length": 514.96875, "epoch": 2.6965333333333334, "grad_norm": 0.015656255185604095, "kl": 0.05670928955078125, "learning_rate": 9.109945630754974e-08, "loss": 0.0023, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2529 }, { "completion_length": 611.59375, "epoch": 2.6976, "grad_norm": 0.0013732810039073229, "kl": 0.018213272094726562, "learning_rate": 9.046106882113752e-08, "loss": 0.0007, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2530 }, { "completion_length": 579.25, "epoch": 2.6986666666666665, "grad_norm": 0.002519357716664672, "kl": 0.053615570068359375, "learning_rate": 8.982485643411409e-08, "loss": 0.0021, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2531 }, { "completion_length": 670.28125, "epoch": 2.6997333333333335, "grad_norm": 0.013969697058200836, "kl": 0.055698394775390625, "learning_rate": 8.919082012823675e-08, "loss": 0.0022, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2532 }, { "completion_length": 532.53125, "epoch": 2.7008, "grad_norm": 0.016432391479611397, "kl": 0.06504440307617188, "learning_rate": 8.85589608819049e-08, "loss": 0.0026, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2533 }, { "completion_length": 428.5625, "epoch": 2.7018666666666666, "grad_norm": 0.0036597021389752626, "kl": 0.052154541015625, "learning_rate": 8.79292796701584e-08, "loss": 0.0021, "reward": 0.84375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2534 }, { "completion_length": 680.15625, "epoch": 2.702933333333333, "grad_norm": 0.022346165031194687, "kl": 0.08005142211914062, "learning_rate": 8.730177746467616e-08, "loss": 0.0032, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2535 }, { "completion_length": 772.0625, "epoch": 2.7039999999999997, "grad_norm": 0.01970984786748886, "kl": 0.1038970947265625, "learning_rate": 8.66764552337741e-08, "loss": 0.0042, "reward": 0.375, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 2536 }, { "completion_length": 708.21875, "epoch": 2.7050666666666667, "grad_norm": 0.021304503083229065, "kl": 0.053150177001953125, "learning_rate": 8.605331394240567e-08, "loss": 0.0021, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2537 }, { "completion_length": 693.9375, "epoch": 2.7061333333333333, "grad_norm": 0.01582360826432705, "kl": 0.08150863647460938, "learning_rate": 8.543235455215687e-08, "loss": 0.0033, "reward": 0.375, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 2538 }, { "completion_length": 523.1875, "epoch": 2.7072000000000003, "grad_norm": 0.000950214802287519, "kl": 0.036800384521484375, "learning_rate": 8.4813578021248e-08, "loss": 0.0015, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2539 }, { "completion_length": 471.8125, "epoch": 2.708266666666667, "grad_norm": 0.016317984089255333, "kl": 0.05741119384765625, "learning_rate": 8.419698530453096e-08, "loss": 0.0023, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2540 }, { "completion_length": 667.03125, "epoch": 2.7093333333333334, "grad_norm": 0.018810594454407692, "kl": 0.0572509765625, "learning_rate": 8.358257735348695e-08, "loss": 0.0023, "reward": 0.65625, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2541 }, { "completion_length": 566.125, "epoch": 2.7104, "grad_norm": 0.0003083272313233465, "kl": 0.024322509765625, "learning_rate": 8.297035511622609e-08, "loss": 0.001, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2542 }, { "completion_length": 524.71875, "epoch": 2.7114666666666665, "grad_norm": 0.012000400573015213, "kl": 0.028942108154296875, "learning_rate": 8.236031953748679e-08, "loss": 0.0012, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2543 }, { "completion_length": 691.65625, "epoch": 2.7125333333333335, "grad_norm": 0.021631889045238495, "kl": 0.104888916015625, "learning_rate": 8.175247155863124e-08, "loss": 0.0042, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2544 }, { "completion_length": 657.125, "epoch": 2.7136, "grad_norm": 0.022163404151797295, "kl": 0.025300979614257812, "learning_rate": 8.114681211764696e-08, "loss": 0.001, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2545 }, { "completion_length": 556.90625, "epoch": 2.7146666666666666, "grad_norm": 0.017116568982601166, "kl": 0.06678390502929688, "learning_rate": 8.054334214914483e-08, "loss": 0.0027, "reward": 0.78125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2546 }, { "completion_length": 783.4375, "epoch": 2.7157333333333336, "grad_norm": 0.006460799369961023, "kl": 0.0563507080078125, "learning_rate": 7.994206258435576e-08, "loss": 0.0023, "reward": 0.4375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2547 }, { "completion_length": 684.0, "epoch": 2.7168, "grad_norm": 0.026408949866890907, "kl": 0.09649276733398438, "learning_rate": 7.934297435113124e-08, "loss": 0.0039, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2548 }, { "completion_length": 612.09375, "epoch": 2.7178666666666667, "grad_norm": 0.00446344306692481, "kl": 0.06565475463867188, "learning_rate": 7.874607837394193e-08, "loss": 0.0026, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2549 }, { "completion_length": 691.5, "epoch": 2.718933333333333, "grad_norm": 0.000809385091997683, "kl": 0.043575286865234375, "learning_rate": 7.81513755738742e-08, "loss": 0.0017, "reward": 0.40625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 2550 }, { "completion_length": 571.1875, "epoch": 2.7199999999999998, "grad_norm": 0.013231039047241211, "kl": 0.01845550537109375, "learning_rate": 7.755886686863095e-08, "loss": 0.0007, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2551 }, { "completion_length": 594.34375, "epoch": 2.7210666666666667, "grad_norm": 0.013552096672356129, "kl": 0.0082244873046875, "learning_rate": 7.696855317252965e-08, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2552 }, { "completion_length": 741.53125, "epoch": 2.7221333333333333, "grad_norm": 0.01824931800365448, "kl": 0.056163787841796875, "learning_rate": 7.638043539649897e-08, "loss": 0.0022, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2553 }, { "completion_length": 670.5, "epoch": 2.7232, "grad_norm": 0.019320912659168243, "kl": 0.08879470825195312, "learning_rate": 7.579451444808111e-08, "loss": 0.0036, "reward": 0.53125, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2554 }, { "completion_length": 575.8125, "epoch": 2.724266666666667, "grad_norm": 0.0015135620487853885, "kl": 0.07924842834472656, "learning_rate": 7.521079123142698e-08, "loss": 0.0032, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2555 }, { "completion_length": 736.34375, "epoch": 2.7253333333333334, "grad_norm": 0.017615193501114845, "kl": 0.08385467529296875, "learning_rate": 7.462926664729592e-08, "loss": 0.0034, "reward": 0.46875, "reward_std": 0.35117512941360474, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2556 }, { "completion_length": 587.5, "epoch": 2.7264, "grad_norm": 0.022876137867569923, "kl": 0.039218902587890625, "learning_rate": 7.404994159305539e-08, "loss": 0.0016, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2557 }, { "completion_length": 705.09375, "epoch": 2.7274666666666665, "grad_norm": 0.020886600017547607, "kl": 0.0968780517578125, "learning_rate": 7.34728169626785e-08, "loss": 0.0039, "reward": 0.5625, "reward_std": 0.4471687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2558 }, { "completion_length": 676.09375, "epoch": 2.7285333333333335, "grad_norm": 0.010647960938513279, "kl": 0.121795654296875, "learning_rate": 7.289789364674165e-08, "loss": 0.0049, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2559 }, { "completion_length": 608.46875, "epoch": 2.7296, "grad_norm": 0.016141578555107117, "kl": 0.055423736572265625, "learning_rate": 7.232517253242638e-08, "loss": 0.0022, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2560 }, { "completion_length": 539.21875, "epoch": 2.7306666666666666, "grad_norm": 0.01865977793931961, "kl": 0.033672332763671875, "learning_rate": 7.175465450351437e-08, "loss": 0.0013, "reward": 0.875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 2561 }, { "completion_length": 612.125, "epoch": 2.7317333333333336, "grad_norm": 0.022269384935498238, "kl": 0.12361907958984375, "learning_rate": 7.118634044038774e-08, "loss": 0.005, "reward": 0.6875, "reward_std": 0.46650634706020355, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2562 }, { "completion_length": 667.15625, "epoch": 2.7328, "grad_norm": 0.0009651589789427817, "kl": 0.061992645263671875, "learning_rate": 7.062023122002864e-08, "loss": 0.0025, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2563 }, { "completion_length": 631.46875, "epoch": 2.7338666666666667, "grad_norm": 0.0011704674689099193, "kl": 0.0555267333984375, "learning_rate": 7.005632771601578e-08, "loss": 0.0022, "reward": 0.5625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2564 }, { "completion_length": 648.9375, "epoch": 2.734933333333333, "grad_norm": 0.0025312574580311775, "kl": 0.0285797119140625, "learning_rate": 6.949463079852491e-08, "loss": 0.0011, "reward": 0.625, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2565 }, { "completion_length": 485.03125, "epoch": 2.7359999999999998, "grad_norm": 0.0006320833344943821, "kl": 0.02101898193359375, "learning_rate": 6.893514133432622e-08, "loss": 0.0008, "reward": 0.78125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2566 }, { "completion_length": 604.1875, "epoch": 2.7370666666666668, "grad_norm": 0.006891170516610146, "kl": 0.044116973876953125, "learning_rate": 6.837786018678371e-08, "loss": 0.0018, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2567 }, { "completion_length": 575.34375, "epoch": 2.7381333333333333, "grad_norm": 0.0008722927886992693, "kl": 0.05833625793457031, "learning_rate": 6.782278821585386e-08, "loss": 0.0023, "reward": 0.84375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2568 }, { "completion_length": 632.71875, "epoch": 2.7392, "grad_norm": 0.007655451074242592, "kl": 0.0220184326171875, "learning_rate": 6.726992627808382e-08, "loss": 0.0009, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2569 }, { "completion_length": 575.9375, "epoch": 2.740266666666667, "grad_norm": 0.01629561185836792, "kl": 0.08401107788085938, "learning_rate": 6.671927522661048e-08, "loss": 0.0034, "reward": 0.65625, "reward_std": 0.3125, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2570 }, { "completion_length": 576.1875, "epoch": 2.7413333333333334, "grad_norm": 0.04106631875038147, "kl": 0.10319900512695312, "learning_rate": 6.617083591115897e-08, "loss": 0.0041, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2571 }, { "completion_length": 560.1875, "epoch": 2.7424, "grad_norm": 0.0070044356398284435, "kl": 0.047443389892578125, "learning_rate": 6.562460917804165e-08, "loss": 0.0019, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2572 }, { "completion_length": 556.6875, "epoch": 2.7434666666666665, "grad_norm": 0.016587432473897934, "kl": 0.03369903564453125, "learning_rate": 6.508059587015642e-08, "loss": 0.0013, "reward": 0.84375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2573 }, { "completion_length": 618.5625, "epoch": 2.7445333333333335, "grad_norm": 0.012011725455522537, "kl": 0.09574508666992188, "learning_rate": 6.453879682698543e-08, "loss": 0.0038, "reward": 0.65625, "reward_std": 0.47617512941360474, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2574 }, { "completion_length": 670.875, "epoch": 2.7456, "grad_norm": 0.014072652906179428, "kl": 0.0770111083984375, "learning_rate": 6.399921288459443e-08, "loss": 0.0031, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2575 }, { "completion_length": 477.40625, "epoch": 2.7466666666666666, "grad_norm": 0.020277870818972588, "kl": 0.051372528076171875, "learning_rate": 6.346184487563033e-08, "loss": 0.0021, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2576 }, { "completion_length": 587.34375, "epoch": 2.7477333333333336, "grad_norm": 0.017287997528910637, "kl": 0.07484245300292969, "learning_rate": 6.292669362932102e-08, "loss": 0.003, "reward": 0.75, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2577 }, { "completion_length": 551.3125, "epoch": 2.7488, "grad_norm": 0.0006126582156866789, "kl": 0.019962310791015625, "learning_rate": 6.239375997147373e-08, "loss": 0.0008, "reward": 0.8125, "reward_std": 0.25, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2578 }, { "completion_length": 708.34375, "epoch": 2.7498666666666667, "grad_norm": 0.0009086563950404525, "kl": 0.09235763549804688, "learning_rate": 6.186304472447313e-08, "loss": 0.0037, "reward": 0.4375, "reward_std": 0.125, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2579 }, { "completion_length": 609.21875, "epoch": 2.7509333333333332, "grad_norm": 0.01936659961938858, "kl": 0.0829925537109375, "learning_rate": 6.133454870728111e-08, "loss": 0.0033, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2580 }, { "completion_length": 664.84375, "epoch": 2.752, "grad_norm": 0.013367303647100925, "kl": 0.08235931396484375, "learning_rate": 6.080827273543483e-08, "loss": 0.0033, "reward": 0.3125, "reward_std": 0.125, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 2581 }, { "completion_length": 542.75, "epoch": 2.7530666666666668, "grad_norm": 0.008011281490325928, "kl": 0.06949234008789062, "learning_rate": 6.028421762104546e-08, "loss": 0.0028, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2582 }, { "completion_length": 586.53125, "epoch": 2.7541333333333333, "grad_norm": 0.023964274674654007, "kl": 0.07929229736328125, "learning_rate": 5.97623841727975e-08, "loss": 0.0032, "reward": 0.625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2583 }, { "completion_length": 543.75, "epoch": 2.7552, "grad_norm": 0.017342492938041687, "kl": 0.010305404663085938, "learning_rate": 5.9242773195946786e-08, "loss": 0.0004, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2584 }, { "completion_length": 740.59375, "epoch": 2.756266666666667, "grad_norm": 0.02302803099155426, "kl": 0.060001373291015625, "learning_rate": 5.872538549231976e-08, "loss": 0.0024, "reward": 0.4375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2585 }, { "completion_length": 574.75, "epoch": 2.7573333333333334, "grad_norm": 0.02334793470799923, "kl": 0.06538009643554688, "learning_rate": 5.8210221860311774e-08, "loss": 0.0026, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2586 }, { "completion_length": 499.25, "epoch": 2.7584, "grad_norm": 0.017600199207663536, "kl": 0.07265853881835938, "learning_rate": 5.769728309488675e-08, "loss": 0.0029, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2587 }, { "completion_length": 707.625, "epoch": 2.7594666666666665, "grad_norm": 0.012560177594423294, "kl": 0.08864974975585938, "learning_rate": 5.718656998757471e-08, "loss": 0.0035, "reward": 0.625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2588 }, { "completion_length": 620.3125, "epoch": 2.760533333333333, "grad_norm": 0.01771816797554493, "kl": 0.03890228271484375, "learning_rate": 5.6678083326472064e-08, "loss": 0.0016, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2589 }, { "completion_length": 626.375, "epoch": 2.7616, "grad_norm": 0.007040307391434908, "kl": 0.0414886474609375, "learning_rate": 5.617182389623815e-08, "loss": 0.0017, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2590 }, { "completion_length": 622.9375, "epoch": 2.7626666666666666, "grad_norm": 0.013327219523489475, "kl": 0.1311492919921875, "learning_rate": 5.5667792478096556e-08, "loss": 0.0052, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2591 }, { "completion_length": 612.59375, "epoch": 2.7637333333333336, "grad_norm": 0.000701750221196562, "kl": 0.08199310302734375, "learning_rate": 5.516598984983279e-08, "loss": 0.0033, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2592 }, { "completion_length": 502.21875, "epoch": 2.7648, "grad_norm": 0.017085058614611626, "kl": 0.030414581298828125, "learning_rate": 5.4666416785792096e-08, "loss": 0.0012, "reward": 0.78125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2593 }, { "completion_length": 564.6875, "epoch": 2.7658666666666667, "grad_norm": 0.0010318378917872906, "kl": 0.02610015869140625, "learning_rate": 5.416907405687999e-08, "loss": 0.001, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2594 }, { "completion_length": 676.28125, "epoch": 2.7669333333333332, "grad_norm": 0.0244564451277256, "kl": 0.048908233642578125, "learning_rate": 5.367396243056022e-08, "loss": 0.002, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2595 }, { "completion_length": 665.9375, "epoch": 2.768, "grad_norm": 0.011700129136443138, "kl": 0.07081985473632812, "learning_rate": 5.318108267085381e-08, "loss": 0.0028, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2596 }, { "completion_length": 488.59375, "epoch": 2.769066666666667, "grad_norm": 0.0012301853857934475, "kl": 0.01146697998046875, "learning_rate": 5.269043553833669e-08, "loss": 0.0005, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2597 }, { "completion_length": 488.6875, "epoch": 2.7701333333333333, "grad_norm": 0.0164400152862072, "kl": 0.06192779541015625, "learning_rate": 5.2202021790140884e-08, "loss": 0.0025, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2598 }, { "completion_length": 659.0625, "epoch": 2.7712, "grad_norm": 0.017174771055579185, "kl": 0.09818267822265625, "learning_rate": 5.171584217995168e-08, "loss": 0.0039, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2599 }, { "completion_length": 617.78125, "epoch": 2.772266666666667, "grad_norm": 0.035894520580768585, "kl": 0.0901947021484375, "learning_rate": 5.123189745800577e-08, "loss": 0.0036, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2600 }, { "completion_length": 586.34375, "epoch": 2.7733333333333334, "grad_norm": 0.025252070277929306, "kl": 0.07825469970703125, "learning_rate": 5.075018837109263e-08, "loss": 0.0031, "reward": 0.53125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2601 }, { "completion_length": 687.96875, "epoch": 2.7744, "grad_norm": 0.012589182704687119, "kl": 0.03816413879394531, "learning_rate": 5.027071566255115e-08, "loss": 0.0015, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2602 }, { "completion_length": 481.375, "epoch": 2.7754666666666665, "grad_norm": 0.0013624418061226606, "kl": 0.01682281494140625, "learning_rate": 4.9793480072268635e-08, "loss": 0.0007, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2603 }, { "completion_length": 599.3125, "epoch": 2.776533333333333, "grad_norm": 0.008714355528354645, "kl": 0.07294845581054688, "learning_rate": 4.9318482336681515e-08, "loss": 0.0029, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2604 }, { "completion_length": 671.46875, "epoch": 2.7776, "grad_norm": 0.021603859961032867, "kl": 0.06908798217773438, "learning_rate": 4.8845723188771953e-08, "loss": 0.0028, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2605 }, { "completion_length": 621.40625, "epoch": 2.7786666666666666, "grad_norm": 0.013143188320100307, "kl": 0.05199432373046875, "learning_rate": 4.8375203358067723e-08, "loss": 0.0021, "reward": 0.625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2606 }, { "completion_length": 549.75, "epoch": 2.779733333333333, "grad_norm": 0.0014355225721374154, "kl": 0.03973388671875, "learning_rate": 4.7906923570641695e-08, "loss": 0.0016, "reward": 0.84375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2607 }, { "completion_length": 697.75, "epoch": 2.7808, "grad_norm": 0.026630908250808716, "kl": 0.06422042846679688, "learning_rate": 4.74408845491095e-08, "loss": 0.0026, "reward": 0.34375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.0, "step": 2608 }, { "completion_length": 546.125, "epoch": 2.7818666666666667, "grad_norm": 0.01956663653254509, "kl": 0.058502197265625, "learning_rate": 4.697708701262887e-08, "loss": 0.0023, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2609 }, { "completion_length": 487.375, "epoch": 2.7829333333333333, "grad_norm": 0.0017733919667080045, "kl": 0.09303665161132812, "learning_rate": 4.6515531676899316e-08, "loss": 0.0037, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2610 }, { "completion_length": 620.1875, "epoch": 2.784, "grad_norm": 0.020433789119124413, "kl": 0.038364410400390625, "learning_rate": 4.6056219254159594e-08, "loss": 0.0015, "reward": 0.78125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2611 }, { "completion_length": 659.53125, "epoch": 2.785066666666667, "grad_norm": 0.020023595541715622, "kl": 0.07317733764648438, "learning_rate": 4.5599150453188076e-08, "loss": 0.0029, "reward": 0.625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2612 }, { "completion_length": 716.8125, "epoch": 2.7861333333333334, "grad_norm": 0.01321417186409235, "kl": 0.12512969970703125, "learning_rate": 4.514432597930007e-08, "loss": 0.005, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2613 }, { "completion_length": 709.15625, "epoch": 2.7872, "grad_norm": 0.020117582753300667, "kl": 0.04022407531738281, "learning_rate": 4.469174653434849e-08, "loss": 0.0016, "reward": 0.625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2614 }, { "completion_length": 690.75, "epoch": 2.788266666666667, "grad_norm": 0.014633359387516975, "kl": 0.036640167236328125, "learning_rate": 4.4241412816721335e-08, "loss": 0.0015, "reward": 0.46875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2615 }, { "completion_length": 618.59375, "epoch": 2.7893333333333334, "grad_norm": 0.025850553065538406, "kl": 0.07606887817382812, "learning_rate": 4.379332552134124e-08, "loss": 0.003, "reward": 0.59375, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2616 }, { "completion_length": 666.28125, "epoch": 2.7904, "grad_norm": 0.014451717026531696, "kl": 0.07080268859863281, "learning_rate": 4.334748533966443e-08, "loss": 0.0028, "reward": 0.75, "reward_std": 0.375, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2617 }, { "completion_length": 732.28125, "epoch": 2.7914666666666665, "grad_norm": 0.013626202940940857, "kl": 0.059078216552734375, "learning_rate": 4.290389295967956e-08, "loss": 0.0024, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2618 }, { "completion_length": 580.9375, "epoch": 2.792533333333333, "grad_norm": 0.0022774396929889917, "kl": 0.057102203369140625, "learning_rate": 4.246254906590641e-08, "loss": 0.0023, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2619 }, { "completion_length": 671.71875, "epoch": 2.7936, "grad_norm": 0.012587996199727058, "kl": 0.037200927734375, "learning_rate": 4.202345433939553e-08, "loss": 0.0015, "reward": 0.4375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2620 }, { "completion_length": 695.125, "epoch": 2.7946666666666666, "grad_norm": 0.011409686878323555, "kl": 0.018941879272460938, "learning_rate": 4.1586609457726075e-08, "loss": 0.0008, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2621 }, { "completion_length": 604.96875, "epoch": 2.795733333333333, "grad_norm": 0.020661456510424614, "kl": 0.06632232666015625, "learning_rate": 4.115201509500582e-08, "loss": 0.0027, "reward": 0.6875, "reward_std": 0.375, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2622 }, { "completion_length": 658.6875, "epoch": 2.7968, "grad_norm": 0.02947366237640381, "kl": 0.08559036254882812, "learning_rate": 4.071967192186982e-08, "loss": 0.0034, "reward": 0.4375, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2623 }, { "completion_length": 627.53125, "epoch": 2.7978666666666667, "grad_norm": 0.023277709260582924, "kl": 0.07098007202148438, "learning_rate": 4.0289580605478716e-08, "loss": 0.0028, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2624 }, { "completion_length": 623.9375, "epoch": 2.7989333333333333, "grad_norm": 0.009142681956291199, "kl": 0.03388214111328125, "learning_rate": 3.986174180951896e-08, "loss": 0.0014, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2625 }, { "completion_length": 586.125, "epoch": 2.8, "grad_norm": 0.011413462460041046, "kl": 0.07018661499023438, "learning_rate": 3.9436156194200435e-08, "loss": 0.0028, "reward": 0.71875, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2626 }, { "completion_length": 611.4375, "epoch": 2.801066666666667, "grad_norm": 0.00041894245077855885, "kl": 0.03600311279296875, "learning_rate": 3.901282441625631e-08, "loss": 0.0014, "reward": 0.5625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2627 }, { "completion_length": 400.875, "epoch": 2.8021333333333334, "grad_norm": 0.0007386899669654667, "kl": 0.05022430419921875, "learning_rate": 3.8591747128942033e-08, "loss": 0.002, "reward": 0.9375, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.0, "step": 2628 }, { "completion_length": 603.53125, "epoch": 2.8032, "grad_norm": 0.015120146796107292, "kl": 0.03248786926269531, "learning_rate": 3.8172924982033676e-08, "loss": 0.0013, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2629 }, { "completion_length": 638.46875, "epoch": 2.804266666666667, "grad_norm": 0.02208830416202545, "kl": 0.09305191040039062, "learning_rate": 3.7756358621827916e-08, "loss": 0.0037, "reward": 0.4375, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2630 }, { "completion_length": 572.6875, "epoch": 2.8053333333333335, "grad_norm": 0.023711489513516426, "kl": 0.05884552001953125, "learning_rate": 3.734204869113955e-08, "loss": 0.0024, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2631 }, { "completion_length": 772.4375, "epoch": 2.8064, "grad_norm": 0.0010789368534460664, "kl": 0.06385040283203125, "learning_rate": 3.6929995829302174e-08, "loss": 0.0025, "reward": 0.4375, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2632 }, { "completion_length": 541.71875, "epoch": 2.8074666666666666, "grad_norm": 0.015669984742999077, "kl": 0.09087371826171875, "learning_rate": 3.652020067216649e-08, "loss": 0.0036, "reward": 0.53125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2633 }, { "completion_length": 611.71875, "epoch": 2.808533333333333, "grad_norm": 0.01880403235554695, "kl": 0.07682037353515625, "learning_rate": 3.611266385209849e-08, "loss": 0.0031, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2634 }, { "completion_length": 684.21875, "epoch": 2.8096, "grad_norm": 0.000484222371596843, "kl": 0.07666206359863281, "learning_rate": 3.570738599797996e-08, "loss": 0.0031, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2635 }, { "completion_length": 511.0625, "epoch": 2.8106666666666666, "grad_norm": 0.02155846543610096, "kl": 0.09474563598632812, "learning_rate": 3.5304367735206954e-08, "loss": 0.0038, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2636 }, { "completion_length": 708.78125, "epoch": 2.811733333333333, "grad_norm": 0.014746192842721939, "kl": 0.0874786376953125, "learning_rate": 3.490360968568801e-08, "loss": 0.0035, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2637 }, { "completion_length": 624.03125, "epoch": 2.8128, "grad_norm": 0.012843047268688679, "kl": 0.0503082275390625, "learning_rate": 3.45051124678441e-08, "loss": 0.002, "reward": 0.6875, "reward_std": 0.41367512941360474, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2638 }, { "completion_length": 641.8125, "epoch": 2.8138666666666667, "grad_norm": 0.014666413888335228, "kl": 0.05039215087890625, "learning_rate": 3.410887669660801e-08, "loss": 0.002, "reward": 0.78125, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2639 }, { "completion_length": 556.84375, "epoch": 2.8149333333333333, "grad_norm": 0.0023764607030898333, "kl": 0.011859893798828125, "learning_rate": 3.3714902983421944e-08, "loss": 0.0005, "reward": 0.84375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2640 }, { "completion_length": 588.28125, "epoch": 2.816, "grad_norm": 0.011952158994972706, "kl": 0.08716583251953125, "learning_rate": 3.3323191936237783e-08, "loss": 0.0035, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2641 }, { "completion_length": 613.28125, "epoch": 2.817066666666667, "grad_norm": 0.0031737638637423515, "kl": 0.06583404541015625, "learning_rate": 3.29337441595165e-08, "loss": 0.0026, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2642 }, { "completion_length": 813.09375, "epoch": 2.8181333333333334, "grad_norm": 0.010877658613026142, "kl": 0.005191802978515625, "learning_rate": 3.254656025422553e-08, "loss": 0.0002, "reward": 0.4375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2643 }, { "completion_length": 681.96875, "epoch": 2.8192, "grad_norm": 0.014387295581400394, "kl": 0.11468505859375, "learning_rate": 3.2161640817839287e-08, "loss": 0.0046, "reward": 0.59375, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2644 }, { "completion_length": 593.15625, "epoch": 2.820266666666667, "grad_norm": 0.009527570568025112, "kl": 0.056545257568359375, "learning_rate": 3.177898644433813e-08, "loss": 0.0023, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2645 }, { "completion_length": 450.5625, "epoch": 2.8213333333333335, "grad_norm": 0.012878965586423874, "kl": 0.04514312744140625, "learning_rate": 3.1398597724206555e-08, "loss": 0.0018, "reward": 0.84375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2646 }, { "completion_length": 592.0, "epoch": 2.8224, "grad_norm": 0.010448310524225235, "kl": 0.0614776611328125, "learning_rate": 3.1020475244433186e-08, "loss": 0.0025, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2647 }, { "completion_length": 667.34375, "epoch": 2.8234666666666666, "grad_norm": 0.01531752198934555, "kl": 0.11257171630859375, "learning_rate": 3.064461958850995e-08, "loss": 0.0045, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2648 }, { "completion_length": 728.65625, "epoch": 2.824533333333333, "grad_norm": 0.0022281755227595568, "kl": 0.058349609375, "learning_rate": 3.027103133642972e-08, "loss": 0.0023, "reward": 0.375, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 2649 }, { "completion_length": 664.71875, "epoch": 2.8256, "grad_norm": 0.015773823484778404, "kl": 0.09637069702148438, "learning_rate": 2.9899711064687364e-08, "loss": 0.0039, "reward": 0.5625, "reward_std": 0.375, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2650 }, { "completion_length": 555.90625, "epoch": 2.8266666666666667, "grad_norm": 0.01561403926461935, "kl": 0.054508209228515625, "learning_rate": 2.9530659346277854e-08, "loss": 0.0022, "reward": 0.53125, "reward_std": 0.3125, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2651 }, { "completion_length": 564.6875, "epoch": 2.827733333333333, "grad_norm": 0.0011151190847158432, "kl": 0.00930023193359375, "learning_rate": 2.9163876750694986e-08, "loss": 0.0004, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2652 }, { "completion_length": 623.375, "epoch": 2.8288, "grad_norm": 0.015026366338133812, "kl": 0.0432891845703125, "learning_rate": 2.879936384393167e-08, "loss": 0.0017, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2653 }, { "completion_length": 588.875, "epoch": 2.8298666666666668, "grad_norm": 0.021989092230796814, "kl": 0.07146835327148438, "learning_rate": 2.8437121188477967e-08, "loss": 0.0029, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2654 }, { "completion_length": 614.375, "epoch": 2.8309333333333333, "grad_norm": 0.020286651328206062, "kl": 0.0862274169921875, "learning_rate": 2.807714934332073e-08, "loss": 0.0034, "reward": 0.5625, "reward_std": 0.41367512941360474, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2655 }, { "completion_length": 567.53125, "epoch": 2.832, "grad_norm": 0.014204192906618118, "kl": 0.0439910888671875, "learning_rate": 2.7719448863942788e-08, "loss": 0.0018, "reward": 0.5625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2656 }, { "completion_length": 624.125, "epoch": 2.8330666666666664, "grad_norm": 0.018164515495300293, "kl": 0.044490814208984375, "learning_rate": 2.7364020302321602e-08, "loss": 0.0018, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2657 }, { "completion_length": 532.34375, "epoch": 2.8341333333333334, "grad_norm": 0.0007067451952025294, "kl": 0.09256362915039062, "learning_rate": 2.7010864206929443e-08, "loss": 0.0037, "reward": 0.5, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2658 }, { "completion_length": 565.0625, "epoch": 2.8352, "grad_norm": 0.01644292287528515, "kl": 0.02362060546875, "learning_rate": 2.6659981122731215e-08, "loss": 0.0009, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2659 }, { "completion_length": 441.5, "epoch": 2.836266666666667, "grad_norm": 0.014165370725095272, "kl": 0.037750244140625, "learning_rate": 2.6311371591184464e-08, "loss": 0.0015, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2660 }, { "completion_length": 777.6875, "epoch": 2.8373333333333335, "grad_norm": 0.018665743991732597, "kl": 0.06728363037109375, "learning_rate": 2.5965036150238706e-08, "loss": 0.0027, "reward": 0.46875, "reward_std": 0.45683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2661 }, { "completion_length": 594.125, "epoch": 2.8384, "grad_norm": 0.010485416278243065, "kl": 0.03009033203125, "learning_rate": 2.5620975334333606e-08, "loss": 0.0012, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2662 }, { "completion_length": 685.6875, "epoch": 2.8394666666666666, "grad_norm": 0.0019697807729244232, "kl": 0.10276031494140625, "learning_rate": 2.527918967439946e-08, "loss": 0.0041, "reward": 0.4375, "reward_std": 0.25, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2663 }, { "completion_length": 562.0, "epoch": 2.840533333333333, "grad_norm": 0.024359412491321564, "kl": 0.05373382568359375, "learning_rate": 2.4939679697855212e-08, "loss": 0.0022, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2664 }, { "completion_length": 702.625, "epoch": 2.8416, "grad_norm": 0.017598221078515053, "kl": 0.04794502258300781, "learning_rate": 2.4602445928608275e-08, "loss": 0.0019, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2665 }, { "completion_length": 562.71875, "epoch": 2.8426666666666667, "grad_norm": 0.011080052703619003, "kl": 0.0391693115234375, "learning_rate": 2.4267488887053714e-08, "loss": 0.0016, "reward": 0.65625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2666 }, { "completion_length": 512.90625, "epoch": 2.8437333333333332, "grad_norm": 0.0012790568871423602, "kl": 0.09211349487304688, "learning_rate": 2.393480909007306e-08, "loss": 0.0037, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2667 }, { "completion_length": 655.9375, "epoch": 2.8448, "grad_norm": 0.01624205894768238, "kl": 0.0777435302734375, "learning_rate": 2.360440705103417e-08, "loss": 0.0031, "reward": 0.59375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2668 }, { "completion_length": 613.53125, "epoch": 2.8458666666666668, "grad_norm": 0.021298758685588837, "kl": 0.05404472351074219, "learning_rate": 2.3276283279789535e-08, "loss": 0.0022, "reward": 0.75, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2669 }, { "completion_length": 622.65625, "epoch": 2.8469333333333333, "grad_norm": 0.0006229294231161475, "kl": 0.03409767150878906, "learning_rate": 2.2950438282676455e-08, "loss": 0.0014, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2670 }, { "completion_length": 532.21875, "epoch": 2.848, "grad_norm": 0.015280277468264103, "kl": 0.07373428344726562, "learning_rate": 2.2626872562515388e-08, "loss": 0.003, "reward": 0.84375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2671 }, { "completion_length": 625.25, "epoch": 2.8490666666666664, "grad_norm": 0.0005578965647146106, "kl": 0.05849456787109375, "learning_rate": 2.2305586618609762e-08, "loss": 0.0023, "reward": 0.71875, "reward_std": 0.35117512941360474, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2672 }, { "completion_length": 564.65625, "epoch": 2.8501333333333334, "grad_norm": 0.01442085299640894, "kl": 0.08071517944335938, "learning_rate": 2.1986580946744993e-08, "loss": 0.0032, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2673 }, { "completion_length": 677.34375, "epoch": 2.8512, "grad_norm": 0.019534334540367126, "kl": 0.10707855224609375, "learning_rate": 2.166985603918781e-08, "loss": 0.0043, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2674 }, { "completion_length": 627.8125, "epoch": 2.8522666666666665, "grad_norm": 0.0018231312278658152, "kl": 0.028415679931640625, "learning_rate": 2.1355412384685413e-08, "loss": 0.0011, "reward": 0.53125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2675 }, { "completion_length": 574.6875, "epoch": 2.8533333333333335, "grad_norm": 0.0010985974222421646, "kl": 0.020442962646484375, "learning_rate": 2.104325046846467e-08, "loss": 0.0008, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2676 }, { "completion_length": 679.0, "epoch": 2.8544, "grad_norm": 0.018329786136746407, "kl": 0.08079719543457031, "learning_rate": 2.0733370772231253e-08, "loss": 0.0032, "reward": 0.71875, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2677 }, { "completion_length": 732.21875, "epoch": 2.8554666666666666, "grad_norm": 0.010112528689205647, "kl": 0.0770111083984375, "learning_rate": 2.0425773774169655e-08, "loss": 0.0031, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2678 }, { "completion_length": 639.9375, "epoch": 2.856533333333333, "grad_norm": 0.025499915704131126, "kl": 0.055149078369140625, "learning_rate": 2.012045994894135e-08, "loss": 0.0022, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2679 }, { "completion_length": 547.6875, "epoch": 2.8576, "grad_norm": 0.018890537321567535, "kl": 0.08809661865234375, "learning_rate": 1.9817429767684468e-08, "loss": 0.0035, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2680 }, { "completion_length": 522.5, "epoch": 2.8586666666666667, "grad_norm": 0.0010659287218004465, "kl": 0.02394866943359375, "learning_rate": 1.9516683698013958e-08, "loss": 0.001, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2681 }, { "completion_length": 548.09375, "epoch": 2.8597333333333332, "grad_norm": 0.02296508476138115, "kl": 0.06494140625, "learning_rate": 1.9218222204019087e-08, "loss": 0.0026, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2682 }, { "completion_length": 553.25, "epoch": 2.8608000000000002, "grad_norm": 0.017842955887317657, "kl": 0.02898406982421875, "learning_rate": 1.8922045746264604e-08, "loss": 0.0012, "reward": 0.84375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2683 }, { "completion_length": 618.53125, "epoch": 2.861866666666667, "grad_norm": 0.009016238152980804, "kl": 0.05484771728515625, "learning_rate": 1.8628154781788586e-08, "loss": 0.0022, "reward": 0.6875, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2684 }, { "completion_length": 600.625, "epoch": 2.8629333333333333, "grad_norm": 0.008477945812046528, "kl": 0.07346725463867188, "learning_rate": 1.8336549764102594e-08, "loss": 0.0029, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2685 }, { "completion_length": 577.84375, "epoch": 2.864, "grad_norm": 0.01007336750626564, "kl": 0.056427001953125, "learning_rate": 1.8047231143190513e-08, "loss": 0.0023, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2686 }, { "completion_length": 663.34375, "epoch": 2.8650666666666664, "grad_norm": 0.008610519580543041, "kl": 0.07395172119140625, "learning_rate": 1.7760199365508046e-08, "loss": 0.003, "reward": 0.53125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2687 }, { "completion_length": 668.75, "epoch": 2.8661333333333334, "grad_norm": 0.01935843750834465, "kl": 0.06838226318359375, "learning_rate": 1.7475454873982057e-08, "loss": 0.0027, "reward": 0.46875, "reward_std": 0.40400634706020355, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2688 }, { "completion_length": 600.0625, "epoch": 2.8672, "grad_norm": 0.017622286453843117, "kl": 0.050052642822265625, "learning_rate": 1.719299810801006e-08, "loss": 0.002, "reward": 0.5625, "reward_std": 0.41367512941360474, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2689 }, { "completion_length": 748.46875, "epoch": 2.8682666666666665, "grad_norm": 0.002158883260563016, "kl": 0.03536415100097656, "learning_rate": 1.6912829503458572e-08, "loss": 0.0014, "reward": 0.4375, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2690 }, { "completion_length": 517.9375, "epoch": 2.8693333333333335, "grad_norm": 0.016459640115499496, "kl": 0.027069091796875, "learning_rate": 1.6634949492664253e-08, "loss": 0.0011, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2691 }, { "completion_length": 549.46875, "epoch": 2.8704, "grad_norm": 0.011476346291601658, "kl": 0.048564910888671875, "learning_rate": 1.6359358504431264e-08, "loss": 0.0019, "reward": 0.65625, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2692 }, { "completion_length": 595.3125, "epoch": 2.8714666666666666, "grad_norm": 0.020195024088025093, "kl": 0.056667327880859375, "learning_rate": 1.6086056964031925e-08, "loss": 0.0023, "reward": 0.71875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2693 }, { "completion_length": 700.125, "epoch": 2.872533333333333, "grad_norm": 0.01619119755923748, "kl": 0.048717498779296875, "learning_rate": 1.5815045293205544e-08, "loss": 0.0019, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2694 }, { "completion_length": 627.53125, "epoch": 2.8736, "grad_norm": 0.0007577840588055551, "kl": 0.08325958251953125, "learning_rate": 1.5546323910158256e-08, "loss": 0.0033, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2695 }, { "completion_length": 630.8125, "epoch": 2.8746666666666667, "grad_norm": 0.001842444995418191, "kl": 0.06423187255859375, "learning_rate": 1.527989322956086e-08, "loss": 0.0026, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2696 }, { "completion_length": 723.78125, "epoch": 2.8757333333333333, "grad_norm": 0.010171917267143726, "kl": 0.0662841796875, "learning_rate": 1.5015753662550813e-08, "loss": 0.0027, "reward": 0.5625, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2697 }, { "completion_length": 529.03125, "epoch": 2.8768000000000002, "grad_norm": 0.010998489335179329, "kl": 0.07379150390625, "learning_rate": 1.4753905616729068e-08, "loss": 0.003, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2698 }, { "completion_length": 836.78125, "epoch": 2.877866666666667, "grad_norm": 0.002314636018127203, "kl": 0.09978675842285156, "learning_rate": 1.4494349496160575e-08, "loss": 0.004, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 2699 }, { "completion_length": 643.0625, "epoch": 2.8789333333333333, "grad_norm": 0.014680759981274605, "kl": 0.062358856201171875, "learning_rate": 1.4237085701374109e-08, "loss": 0.0025, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2700 }, { "completion_length": 551.03125, "epoch": 2.88, "grad_norm": 0.01226417813450098, "kl": 0.01374053955078125, "learning_rate": 1.3982114629360443e-08, "loss": 0.0005, "reward": 0.71875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2701 }, { "completion_length": 663.03125, "epoch": 2.8810666666666664, "grad_norm": 0.00043867866043001413, "kl": 0.0514984130859375, "learning_rate": 1.3729436673572514e-08, "loss": 0.0021, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2702 }, { "completion_length": 672.28125, "epoch": 2.8821333333333334, "grad_norm": 0.0005056888912804425, "kl": 0.056568145751953125, "learning_rate": 1.3479052223925259e-08, "loss": 0.0023, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2703 }, { "completion_length": 646.75, "epoch": 2.8832, "grad_norm": 0.017112791538238525, "kl": 0.0489044189453125, "learning_rate": 1.3230961666793606e-08, "loss": 0.002, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2704 }, { "completion_length": 567.6875, "epoch": 2.8842666666666665, "grad_norm": 0.01690925657749176, "kl": 0.06727218627929688, "learning_rate": 1.298516538501332e-08, "loss": 0.0027, "reward": 0.84375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2705 }, { "completion_length": 659.0625, "epoch": 2.8853333333333335, "grad_norm": 0.0095510957762599, "kl": 0.05568695068359375, "learning_rate": 1.2741663757879496e-08, "loss": 0.0022, "reward": 0.46875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2706 }, { "completion_length": 552.1875, "epoch": 2.8864, "grad_norm": 0.026270661503076553, "kl": 0.09368896484375, "learning_rate": 1.2500457161146562e-08, "loss": 0.0037, "reward": 0.5625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2707 }, { "completion_length": 666.375, "epoch": 2.8874666666666666, "grad_norm": 0.01568722538650036, "kl": 0.040882110595703125, "learning_rate": 1.2261545967026778e-08, "loss": 0.0016, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2708 }, { "completion_length": 550.5, "epoch": 2.888533333333333, "grad_norm": 0.008096754550933838, "kl": 0.03751373291015625, "learning_rate": 1.2024930544191237e-08, "loss": 0.0015, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2709 }, { "completion_length": 603.90625, "epoch": 2.8895999999999997, "grad_norm": 0.015529275871813297, "kl": 0.06035614013671875, "learning_rate": 1.179061125776787e-08, "loss": 0.0024, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2710 }, { "completion_length": 506.5, "epoch": 2.8906666666666667, "grad_norm": 0.013878684490919113, "kl": 0.07376861572265625, "learning_rate": 1.1558588469341102e-08, "loss": 0.0029, "reward": 0.84375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2711 }, { "completion_length": 521.4375, "epoch": 2.8917333333333333, "grad_norm": 0.0006677703931927681, "kl": 0.075836181640625, "learning_rate": 1.1328862536952033e-08, "loss": 0.003, "reward": 0.8125, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2712 }, { "completion_length": 642.1875, "epoch": 2.8928000000000003, "grad_norm": 0.016674185171723366, "kl": 0.1533222198486328, "learning_rate": 1.1101433815097261e-08, "loss": 0.0061, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2713 }, { "completion_length": 760.1875, "epoch": 2.893866666666667, "grad_norm": 0.021286219358444214, "kl": 0.08852386474609375, "learning_rate": 1.0876302654728554e-08, "loss": 0.0035, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2714 }, { "completion_length": 634.53125, "epoch": 2.8949333333333334, "grad_norm": 0.008003344759345055, "kl": 0.021520614624023438, "learning_rate": 1.0653469403252015e-08, "loss": 0.0009, "reward": 0.6875, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2715 }, { "completion_length": 638.09375, "epoch": 2.896, "grad_norm": 0.0013620659010484815, "kl": 0.07136726379394531, "learning_rate": 1.0432934404527916e-08, "loss": 0.0029, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2716 }, { "completion_length": 679.375, "epoch": 2.8970666666666665, "grad_norm": 0.012400667183101177, "kl": 0.08450698852539062, "learning_rate": 1.0214697998870204e-08, "loss": 0.0034, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2717 }, { "completion_length": 544.5625, "epoch": 2.8981333333333335, "grad_norm": 0.01666473038494587, "kl": 0.07448959350585938, "learning_rate": 9.998760523045492e-09, "loss": 0.003, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2718 }, { "completion_length": 596.5, "epoch": 2.8992, "grad_norm": 0.015595482662320137, "kl": 0.0603485107421875, "learning_rate": 9.785122310273064e-09, "loss": 0.0024, "reward": 0.625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2719 }, { "completion_length": 482.78125, "epoch": 2.9002666666666665, "grad_norm": 0.013247810304164886, "kl": 0.0106201171875, "learning_rate": 9.573783690224213e-09, "loss": 0.0004, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2720 }, { "completion_length": 613.75, "epoch": 2.9013333333333335, "grad_norm": 0.017377600073814392, "kl": 0.07080650329589844, "learning_rate": 9.3647449890214e-09, "loss": 0.0028, "reward": 0.6875, "reward_std": 0.34150634706020355, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2721 }, { "completion_length": 636.0625, "epoch": 2.9024, "grad_norm": 0.014618327841162682, "kl": 0.02264404296875, "learning_rate": 9.158006529238094e-09, "loss": 0.0009, "reward": 0.59375, "reward_std": 0.1875, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2722 }, { "completion_length": 640.71875, "epoch": 2.9034666666666666, "grad_norm": 0.01811981201171875, "kl": 0.1159811019897461, "learning_rate": 8.953568629898435e-09, "loss": 0.0047, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2723 }, { "completion_length": 603.0625, "epoch": 2.904533333333333, "grad_norm": 0.0009661962394602597, "kl": 0.0590667724609375, "learning_rate": 8.751431606476234e-09, "loss": 0.0024, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2724 }, { "completion_length": 502.375, "epoch": 2.9055999999999997, "grad_norm": 0.006542176473885775, "kl": 0.02970123291015625, "learning_rate": 8.551595770894981e-09, "loss": 0.0012, "reward": 0.8125, "reward_std": 0.125, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2725 }, { "completion_length": 680.5625, "epoch": 2.9066666666666667, "grad_norm": 0.013882980681955814, "kl": 0.06730270385742188, "learning_rate": 8.354061431526672e-09, "loss": 0.0027, "reward": 0.46875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2726 }, { "completion_length": 651.71875, "epoch": 2.9077333333333333, "grad_norm": 0.0009185501839965582, "kl": 0.022510528564453125, "learning_rate": 8.158828893192471e-09, "loss": 0.0009, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2727 }, { "completion_length": 492.25, "epoch": 2.9088000000000003, "grad_norm": 0.001655252301134169, "kl": 0.04694366455078125, "learning_rate": 7.965898457161225e-09, "loss": 0.0019, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2728 }, { "completion_length": 509.90625, "epoch": 2.909866666666667, "grad_norm": 0.01310690026730299, "kl": 0.03778076171875, "learning_rate": 7.77527042114895e-09, "loss": 0.0015, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2729 }, { "completion_length": 711.34375, "epoch": 2.9109333333333334, "grad_norm": 0.01646021567285061, "kl": 0.05783843994140625, "learning_rate": 7.586945079319673e-09, "loss": 0.0023, "reward": 0.46875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2730 }, { "completion_length": 606.1875, "epoch": 2.912, "grad_norm": 0.020569419488310814, "kl": 0.08432769775390625, "learning_rate": 7.400922722283099e-09, "loss": 0.0034, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2731 }, { "completion_length": 690.71875, "epoch": 2.9130666666666665, "grad_norm": 0.017735334113240242, "kl": 0.03438758850097656, "learning_rate": 7.217203637096104e-09, "loss": 0.0014, "reward": 0.5625, "reward_std": 0.46650634706020355, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2732 }, { "completion_length": 584.625, "epoch": 2.9141333333333335, "grad_norm": 0.016416404396295547, "kl": 0.008609771728515625, "learning_rate": 7.035788107260244e-09, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2733 }, { "completion_length": 653.4375, "epoch": 2.9152, "grad_norm": 0.009648832492530346, "kl": 0.08475875854492188, "learning_rate": 6.8566764127232505e-09, "loss": 0.0034, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2734 }, { "completion_length": 595.40625, "epoch": 2.9162666666666666, "grad_norm": 0.009954428300261497, "kl": 0.037517547607421875, "learning_rate": 6.6798688298777e-09, "loss": 0.0015, "reward": 0.78125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2735 }, { "completion_length": 672.5625, "epoch": 2.9173333333333336, "grad_norm": 0.0019154881592839956, "kl": 0.027299880981445312, "learning_rate": 6.5053656315598455e-09, "loss": 0.0011, "reward": 0.65625, "reward_std": 0.1875, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2736 }, { "completion_length": 800.25, "epoch": 2.9184, "grad_norm": 0.001926069031469524, "kl": 0.05010223388671875, "learning_rate": 6.333167087050617e-09, "loss": 0.002, "reward": 0.28125, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.0, "step": 2737 }, { "completion_length": 675.9375, "epoch": 2.9194666666666667, "grad_norm": 0.01890048198401928, "kl": 0.07226181030273438, "learning_rate": 6.163273462074792e-09, "loss": 0.0029, "reward": 0.5625, "reward_std": 0.25, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2738 }, { "completion_length": 562.65625, "epoch": 2.920533333333333, "grad_norm": 0.01706031523644924, "kl": 0.11294937133789062, "learning_rate": 5.9956850187998235e-09, "loss": 0.0045, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2739 }, { "completion_length": 517.75, "epoch": 2.9215999999999998, "grad_norm": 0.0007017714087851346, "kl": 0.07199859619140625, "learning_rate": 5.830402015836178e-09, "loss": 0.0029, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2740 }, { "completion_length": 566.625, "epoch": 2.9226666666666667, "grad_norm": 0.00422362657263875, "kl": 0.0922698974609375, "learning_rate": 5.6674247082366684e-09, "loss": 0.0037, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2741 }, { "completion_length": 511.53125, "epoch": 2.9237333333333333, "grad_norm": 0.010697025805711746, "kl": 0.052524566650390625, "learning_rate": 5.506753347496285e-09, "loss": 0.0021, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2742 }, { "completion_length": 841.25, "epoch": 2.9248, "grad_norm": 0.001479628961533308, "kl": 0.06577682495117188, "learning_rate": 5.348388181551534e-09, "loss": 0.0026, "reward": 0.21875, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.0, "step": 2743 }, { "completion_length": 654.34375, "epoch": 2.925866666666667, "grad_norm": 0.016959475353360176, "kl": 0.0950775146484375, "learning_rate": 5.192329454780098e-09, "loss": 0.0038, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2744 }, { "completion_length": 619.28125, "epoch": 2.9269333333333334, "grad_norm": 0.02637273631989956, "kl": 0.058441162109375, "learning_rate": 5.038577408000844e-09, "loss": 0.0023, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2745 }, { "completion_length": 633.96875, "epoch": 2.928, "grad_norm": 0.006844265852123499, "kl": 0.057147979736328125, "learning_rate": 4.887132278472483e-09, "loss": 0.0023, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2746 }, { "completion_length": 695.375, "epoch": 2.9290666666666665, "grad_norm": 0.02008754201233387, "kl": 0.08468246459960938, "learning_rate": 4.7379942998947436e-09, "loss": 0.0034, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2747 }, { "completion_length": 616.46875, "epoch": 2.9301333333333335, "grad_norm": 0.01418281439691782, "kl": 0.05891990661621094, "learning_rate": 4.591163702406531e-09, "loss": 0.0024, "reward": 0.625, "reward_std": 0.375, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2748 }, { "completion_length": 619.96875, "epoch": 2.9312, "grad_norm": 0.009788545779883862, "kl": 0.05768394470214844, "learning_rate": 4.4466407125859365e-09, "loss": 0.0023, "reward": 0.84375, "reward_std": 0.3125, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.0, "step": 2749 }, { "completion_length": 663.25, "epoch": 2.9322666666666666, "grad_norm": 0.0004938154597766697, "kl": 0.04300880432128906, "learning_rate": 4.304425553450897e-09, "loss": 0.0017, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2750 }, { "completion_length": 598.65625, "epoch": 2.9333333333333336, "grad_norm": 0.030150193721055984, "kl": 0.10564804077148438, "learning_rate": 4.1645184444575325e-09, "loss": 0.0042, "reward": 0.625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2751 }, { "completion_length": 723.28125, "epoch": 2.9344, "grad_norm": 0.014425206929445267, "kl": 0.08465003967285156, "learning_rate": 4.0269196015003115e-09, "loss": 0.0034, "reward": 0.5, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2752 }, { "completion_length": 644.3125, "epoch": 2.9354666666666667, "grad_norm": 0.020463353022933006, "kl": 0.03909492492675781, "learning_rate": 3.891629236912053e-09, "loss": 0.0016, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2753 }, { "completion_length": 739.59375, "epoch": 2.936533333333333, "grad_norm": 0.0010005763033404946, "kl": 0.023143768310546875, "learning_rate": 3.758647559463091e-09, "loss": 0.0009, "reward": 0.625, "reward_std": 0.125, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2754 }, { "completion_length": 650.84375, "epoch": 2.9375999999999998, "grad_norm": 0.010710088536143303, "kl": 0.11594009399414062, "learning_rate": 3.6279747743612756e-09, "loss": 0.0046, "reward": 0.65625, "reward_std": 0.3125, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2755 }, { "completion_length": 586.0625, "epoch": 2.9386666666666668, "grad_norm": 0.0024480989668518305, "kl": 0.07318496704101562, "learning_rate": 3.499611083251475e-09, "loss": 0.0029, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2756 }, { "completion_length": 669.65625, "epoch": 2.9397333333333333, "grad_norm": 0.0013654778013005853, "kl": 0.046848297119140625, "learning_rate": 3.37355668421524e-09, "loss": 0.0019, "reward": 0.75, "reward_std": 0.125, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2757 }, { "completion_length": 464.15625, "epoch": 2.9408, "grad_norm": 0.002113830065354705, "kl": 0.08474349975585938, "learning_rate": 3.2498117717706386e-09, "loss": 0.0034, "reward": 0.8125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.0, "step": 2758 }, { "completion_length": 600.53125, "epoch": 2.941866666666667, "grad_norm": 0.012774026021361351, "kl": 0.052707672119140625, "learning_rate": 3.1283765368720908e-09, "loss": 0.0021, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2759 }, { "completion_length": 416.1875, "epoch": 2.9429333333333334, "grad_norm": 0.01759140007197857, "kl": 0.035434722900390625, "learning_rate": 3.009251166909699e-09, "loss": 0.0014, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.0, "step": 2760 }, { "completion_length": 624.5, "epoch": 2.944, "grad_norm": 0.00039272833964787424, "kl": 0.02378082275390625, "learning_rate": 2.892435845709085e-09, "loss": 0.001, "reward": 0.59375, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2761 }, { "completion_length": 630.875, "epoch": 2.9450666666666665, "grad_norm": 0.018035514280200005, "kl": 0.05434226989746094, "learning_rate": 2.7779307535315545e-09, "loss": 0.0022, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2762 }, { "completion_length": 530.46875, "epoch": 2.9461333333333335, "grad_norm": 0.0008275994332507253, "kl": 0.040775299072265625, "learning_rate": 2.665736067072766e-09, "loss": 0.0016, "reward": 0.90625, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 0.0, "step": 2763 }, { "completion_length": 491.21875, "epoch": 2.9472, "grad_norm": 0.01252064760774374, "kl": 0.059051513671875, "learning_rate": 2.5558519594637285e-09, "loss": 0.0024, "reward": 0.78125, "reward_std": 0.13466878235340118, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2764 }, { "completion_length": 622.53125, "epoch": 2.9482666666666666, "grad_norm": 0.0008909264579415321, "kl": 0.05368804931640625, "learning_rate": 2.4482786002698045e-09, "loss": 0.0021, "reward": 0.4375, "reward_std": 0.25, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2765 }, { "completion_length": 572.375, "epoch": 2.9493333333333336, "grad_norm": 0.011480681598186493, "kl": 0.0275421142578125, "learning_rate": 2.343016155490374e-09, "loss": 0.0011, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2766 }, { "completion_length": 610.5625, "epoch": 2.9504, "grad_norm": 0.013924062252044678, "kl": 0.055820465087890625, "learning_rate": 2.2400647875588377e-09, "loss": 0.0022, "reward": 0.53125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2767 }, { "completion_length": 623.78125, "epoch": 2.9514666666666667, "grad_norm": 0.012425463646650314, "kl": 0.04392242431640625, "learning_rate": 2.139424655342448e-09, "loss": 0.0018, "reward": 0.46875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2768 }, { "completion_length": 576.5, "epoch": 2.9525333333333332, "grad_norm": 0.013931631110608578, "kl": 0.084014892578125, "learning_rate": 2.041095914141644e-09, "loss": 0.0034, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2769 }, { "completion_length": 709.5625, "epoch": 2.9536, "grad_norm": 0.014867409132421017, "kl": 0.0768890380859375, "learning_rate": 1.945078715690718e-09, "loss": 0.0031, "reward": 0.53125, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2770 }, { "completion_length": 686.21875, "epoch": 2.9546666666666668, "grad_norm": 0.0032743248157203197, "kl": 0.09490203857421875, "learning_rate": 1.8513732081561485e-09, "loss": 0.0038, "reward": 0.5, "reward_std": 0.25, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2771 }, { "completion_length": 682.875, "epoch": 2.9557333333333333, "grad_norm": 0.009439028799533844, "kl": 0.0702972412109375, "learning_rate": 1.7599795361376015e-09, "loss": 0.0028, "reward": 0.65625, "reward_std": 0.27900634706020355, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2772 }, { "completion_length": 842.5625, "epoch": 2.9568, "grad_norm": 0.016790110617876053, "kl": 0.07078933715820312, "learning_rate": 1.670897840667429e-09, "loss": 0.0028, "reward": 0.3125, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 2773 }, { "completion_length": 634.125, "epoch": 2.957866666666667, "grad_norm": 0.026547547429800034, "kl": 0.09511566162109375, "learning_rate": 1.5841282592098383e-09, "loss": 0.0038, "reward": 0.6875, "reward_std": 0.41367512941360474, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2774 }, { "completion_length": 660.0, "epoch": 2.9589333333333334, "grad_norm": 0.01078865397721529, "kl": 0.07564926147460938, "learning_rate": 1.4996709256617225e-09, "loss": 0.003, "reward": 0.59375, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2775 }, { "completion_length": 665.875, "epoch": 2.96, "grad_norm": 0.016364673152565956, "kl": 0.0831451416015625, "learning_rate": 1.4175259703513299e-09, "loss": 0.0033, "reward": 0.53125, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.0, "step": 2776 }, { "completion_length": 729.5625, "epoch": 2.9610666666666665, "grad_norm": 0.017690684646368027, "kl": 0.027774810791015625, "learning_rate": 1.337693520039096e-09, "loss": 0.0011, "reward": 0.46875, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2777 }, { "completion_length": 668.71875, "epoch": 2.962133333333333, "grad_norm": 0.0011717757442966104, "kl": 0.068695068359375, "learning_rate": 1.260173697916478e-09, "loss": 0.0028, "reward": 0.46875, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2778 }, { "completion_length": 507.125, "epoch": 2.9632, "grad_norm": 0.0019742059521377087, "kl": 0.055522918701171875, "learning_rate": 1.1849666236067868e-09, "loss": 0.0022, "reward": 0.71875, "reward_std": 0.1875, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2779 }, { "completion_length": 713.4375, "epoch": 2.9642666666666666, "grad_norm": 0.012202279642224312, "kl": 0.05328369140625, "learning_rate": 1.112072413163856e-09, "loss": 0.0021, "reward": 0.5625, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2780 }, { "completion_length": 610.8125, "epoch": 2.9653333333333336, "grad_norm": 0.01730169542133808, "kl": 0.10491180419921875, "learning_rate": 1.0414911790730397e-09, "loss": 0.0042, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2781 }, { "completion_length": 577.6875, "epoch": 2.9664, "grad_norm": 0.014249739237129688, "kl": 0.0150299072265625, "learning_rate": 9.732230302502142e-10, "loss": 0.0006, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2782 }, { "completion_length": 665.09375, "epoch": 2.9674666666666667, "grad_norm": 0.02129077911376953, "kl": 0.10720443725585938, "learning_rate": 9.072680720417781e-10, "loss": 0.0043, "reward": 0.4375, "reward_std": 0.375, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "step": 2783 }, { "completion_length": 639.6875, "epoch": 2.9685333333333332, "grad_norm": 0.0003063268377445638, "kl": 0.0078125, "learning_rate": 8.436264062248178e-10, "loss": 0.0003, "reward": 0.5, "reward_std": 0.125, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 2784 }, { "completion_length": 726.75, "epoch": 2.9696, "grad_norm": 0.01978844590485096, "kl": 0.12518310546875, "learning_rate": 7.82298131006609e-10, "loss": 0.005, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2785 }, { "completion_length": 552.21875, "epoch": 2.970666666666667, "grad_norm": 0.015558172017335892, "kl": 0.03337860107421875, "learning_rate": 7.2328334102445e-10, "loss": 0.0013, "reward": 0.75, "reward_std": 0.25, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2786 }, { "completion_length": 616.96875, "epoch": 2.9717333333333333, "grad_norm": 0.01589163765311241, "kl": 0.06644248962402344, "learning_rate": 6.665821273456607e-10, "loss": 0.0027, "reward": 0.5625, "reward_std": 0.125, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2787 }, { "completion_length": 709.8125, "epoch": 2.9728, "grad_norm": 0.016034545376896858, "kl": 0.0465545654296875, "learning_rate": 6.121945774677506e-10, "loss": 0.0019, "reward": 0.6875, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2788 }, { "completion_length": 515.9375, "epoch": 2.973866666666667, "grad_norm": 0.02554580755531788, "kl": 0.0452117919921875, "learning_rate": 5.601207753174186e-10, "loss": 0.0018, "reward": 0.65625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.0, "step": 2789 }, { "completion_length": 694.53125, "epoch": 2.9749333333333334, "grad_norm": 0.02293931134045124, "kl": 0.10198211669921875, "learning_rate": 5.103608012512195e-10, "loss": 0.0041, "reward": 0.59375, "reward_std": 0.5290063470602036, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2790 }, { "completion_length": 467.5, "epoch": 2.976, "grad_norm": 0.011194913648068905, "kl": 0.026641845703125, "learning_rate": 4.6291473205539724e-10, "loss": 0.0011, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2791 }, { "completion_length": 592.09375, "epoch": 2.9770666666666665, "grad_norm": 0.017945056781172752, "kl": 0.10370635986328125, "learning_rate": 4.1778264094521903e-10, "loss": 0.0041, "reward": 0.6875, "reward_std": 0.25, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2792 }, { "completion_length": 735.28125, "epoch": 2.978133333333333, "grad_norm": 0.012997356243431568, "kl": 0.03818321228027344, "learning_rate": 3.749645975653082e-10, "loss": 0.0015, "reward": 0.40625, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.0, "step": 2793 }, { "completion_length": 521.0, "epoch": 2.9792, "grad_norm": 0.0017607678892090917, "kl": 0.06329345703125, "learning_rate": 3.3446066798931143e-10, "loss": 0.0025, "reward": 0.59375, "reward_std": 0.0625, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2794 }, { "completion_length": 637.5, "epoch": 2.9802666666666666, "grad_norm": 0.023541472852230072, "kl": 0.07012939453125, "learning_rate": 2.9627091471989834e-10, "loss": 0.0028, "reward": 0.5625, "reward_std": 0.39433756470680237, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 2795 }, { "completion_length": 566.375, "epoch": 2.981333333333333, "grad_norm": 0.023457791656255722, "kl": 0.07598876953125, "learning_rate": 2.6039539668909486e-10, "loss": 0.003, "reward": 0.71875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2796 }, { "completion_length": 596.375, "epoch": 2.9824, "grad_norm": 0.0206616073846817, "kl": 0.08813095092773438, "learning_rate": 2.2683416925728396e-10, "loss": 0.0035, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2797 }, { "completion_length": 658.65625, "epoch": 2.9834666666666667, "grad_norm": 0.008072162047028542, "kl": 0.05171394348144531, "learning_rate": 1.9558728421353867e-10, "loss": 0.0021, "reward": 0.46875, "reward_std": 0.33183756470680237, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2798 }, { "completion_length": 474.6875, "epoch": 2.9845333333333333, "grad_norm": 0.030052172020077705, "kl": 0.06651687622070312, "learning_rate": 1.666547897761217e-10, "loss": 0.0027, "reward": 0.75, "reward_std": 0.19716878235340118, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "step": 2799 }, { "completion_length": 520.34375, "epoch": 2.9856, "grad_norm": 0.0006948498194105923, "kl": 0.045513153076171875, "learning_rate": 1.4003673059131973e-10, "loss": 0.0018, "reward": 0.78125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.0, "step": 2800 }, { "completion_length": 665.71875, "epoch": 2.986666666666667, "grad_norm": 0.018041161820292473, "kl": 0.08131217956542969, "learning_rate": 1.1573314773427601e-10, "loss": 0.0033, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2801 }, { "completion_length": 459.96875, "epoch": 2.9877333333333334, "grad_norm": 0.0006236971239559352, "kl": 0.02759552001953125, "learning_rate": 9.374407870882396e-11, "loss": 0.0011, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2802 }, { "completion_length": 652.625, "epoch": 2.9888, "grad_norm": 0.009059986099600792, "kl": 0.07361984252929688, "learning_rate": 7.40695574464878e-11, "loss": 0.0029, "reward": 0.59375, "reward_std": 0.3846687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2803 }, { "completion_length": 710.375, "epoch": 2.989866666666667, "grad_norm": 0.0030989977531135082, "kl": 0.08684539794921875, "learning_rate": 5.670961430781496e-11, "loss": 0.0035, "reward": 0.46875, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.0, "step": 2804 }, { "completion_length": 550.0, "epoch": 2.9909333333333334, "grad_norm": 0.0004030271084047854, "kl": 0.06357574462890625, "learning_rate": 4.1664276081376796e-11, "loss": 0.0025, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2805 }, { "completion_length": 578.21875, "epoch": 2.992, "grad_norm": 0.02012859843671322, "kl": 0.11603546142578125, "learning_rate": 2.8933565983935152e-11, "loss": 0.0046, "reward": 0.59375, "reward_std": 0.2596687823534012, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2806 }, { "completion_length": 569.4375, "epoch": 2.9930666666666665, "grad_norm": 0.01372765563428402, "kl": 0.033969879150390625, "learning_rate": 1.8517503660941958e-11, "loss": 0.0014, "reward": 0.625, "reward_std": 0.3221687823534012, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2807 }, { "completion_length": 608.4375, "epoch": 2.994133333333333, "grad_norm": 0.016153162345290184, "kl": 0.08599472045898438, "learning_rate": 1.0416105185373503e-11, "loss": 0.0034, "reward": 0.71875, "reward_std": 0.0625, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.0, "step": 2808 }, { "completion_length": 616.28125, "epoch": 2.9952, "grad_norm": 0.0006087830988690257, "kl": 0.08085250854492188, "learning_rate": 4.6293830588961526e-12, "loss": 0.0032, "reward": 0.59375, "reward_std": 0.20683756470680237, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.0, "step": 2809 }, { "completion_length": 593.0, "epoch": 2.9962666666666666, "grad_norm": 0.0005214029224589467, "kl": 0.059169769287109375, "learning_rate": 1.1573462112002276e-12, "loss": 0.0024, "reward": 0.6875, "reward_std": 0.125, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.0, "step": 2810 }, { "completion_length": 641.21875, "epoch": 2.997333333333333, "grad_norm": 0.011904069222509861, "kl": 0.04987335205078125, "learning_rate": 0.0, "loss": 0.002, "reward": 0.625, "reward_std": 0.26933756470680237, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.0, "step": 2811 }, { "epoch": 2.997333333333333, "step": 2811, "total_flos": 0.0, "train_loss": 0.0, "train_runtime": 2.529, "train_samples_per_second": 8896.716, "train_steps_per_second": 1111.496 } ], "logging_steps": 1, "max_steps": 2811, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }