{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 283, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 404.396875, "epoch": 0.0176678445229682, "grad_norm": 0.785922280854803, "kl": 0.00035384297370910645, "learning_rate": 3.448275862068966e-06, "loss": 0.0, "reward": 0.5984375, "reward_std": 0.31488348115235565, "rewards/accuracy_reward": 0.166015625, "rewards/format_reward": 0.432421875, "step": 5 }, { "completion_length": 410.702734375, "epoch": 0.0353356890459364, "grad_norm": 0.3143792157068426, "kl": 0.13354759216308593, "learning_rate": 6.896551724137932e-06, "loss": 0.0054, "reward": 0.57265625, "reward_std": 0.30052037397399545, "rewards/accuracy_reward": 0.160546875, "rewards/format_reward": 0.412109375, "step": 10 }, { "completion_length": 400.45234375, "epoch": 0.053003533568904596, "grad_norm": 0.6724373940493742, "kl": 0.024787521362304686, "learning_rate": 1.0344827586206898e-05, "loss": 0.001, "reward": 0.594921875, "reward_std": 0.3065970725379884, "rewards/accuracy_reward": 0.153515625, "rewards/format_reward": 0.44140625, "step": 15 }, { "completion_length": 395.24921875, "epoch": 0.0706713780918728, "grad_norm": 0.6026471879082274, "kl": 0.3488151550292969, "learning_rate": 1.3793103448275863e-05, "loss": 0.0139, "reward": 0.613671875, "reward_std": 0.3264844523742795, "rewards/accuracy_reward": 0.168359375, "rewards/format_reward": 0.4453125, "step": 20 }, { "completion_length": 398.496484375, "epoch": 0.08833922261484099, "grad_norm": 0.46593386880605636, "kl": 0.1805694580078125, "learning_rate": 1.7241379310344828e-05, "loss": 0.0072, "reward": 0.61484375, "reward_std": 0.2883669765666127, "rewards/accuracy_reward": 0.158203125, "rewards/format_reward": 0.456640625, "step": 25 }, { "completion_length": 401.319921875, "epoch": 0.10600706713780919, "grad_norm": 0.5247335452215227, "kl": 0.145611572265625, "learning_rate": 1.999923511388017e-05, "loss": 0.0058, "reward": 0.61328125, "reward_std": 0.2905766863375902, "rewards/accuracy_reward": 0.16484375, "rewards/format_reward": 0.4484375, "step": 30 }, { "completion_length": 402.28671875, "epoch": 0.12367491166077739, "grad_norm": 1.3201550928400456, "kl": 0.15986328125, "learning_rate": 1.9972476383747748e-05, "loss": 0.0064, "reward": 0.591796875, "reward_std": 0.29886309332214295, "rewards/accuracy_reward": 0.1515625, "rewards/format_reward": 0.440234375, "step": 35 }, { "completion_length": 400.734765625, "epoch": 0.1413427561837456, "grad_norm": 0.4543361752537872, "kl": 0.293060302734375, "learning_rate": 1.9907590277344582e-05, "loss": 0.0117, "reward": 0.60703125, "reward_std": 0.31156891826540234, "rewards/accuracy_reward": 0.166015625, "rewards/format_reward": 0.441015625, "step": 40 }, { "completion_length": 402.326953125, "epoch": 0.15901060070671377, "grad_norm": 1.8083450201171443, "kl": 0.539990234375, "learning_rate": 1.9804824871166254e-05, "loss": 0.0216, "reward": 0.59609375, "reward_std": 0.3016252293717116, "rewards/accuracy_reward": 0.159765625, "rewards/format_reward": 0.436328125, "step": 45 }, { "completion_length": 399.735546875, "epoch": 0.17667844522968199, "grad_norm": 0.6440098529655808, "kl": 36.3084228515625, "learning_rate": 1.9664573064143604e-05, "loss": 1.4539, "reward": 0.6109375, "reward_std": 0.3060446453746408, "rewards/accuracy_reward": 0.162109375, "rewards/format_reward": 0.448828125, "step": 50 }, { "completion_length": 396.709375, "epoch": 0.19434628975265017, "grad_norm": 0.949235195273066, "kl": 18740.40302734375, "learning_rate": 1.948737107548771e-05, "loss": 748.2533, "reward": 0.61640625, "reward_std": 0.3193028993904591, "rewards/accuracy_reward": 0.165625, "rewards/format_reward": 0.45078125, "step": 55 }, { "completion_length": 410.02734375, "epoch": 0.21201413427561838, "grad_norm": 2.007276336933817, "kl": 1.0031005859375, "learning_rate": 1.9273896394584103e-05, "loss": 0.0401, "reward": 0.59609375, "reward_std": 0.2949961026199162, "rewards/accuracy_reward": 0.148828125, "rewards/format_reward": 0.447265625, "step": 60 }, { "completion_length": 405.175390625, "epoch": 0.22968197879858657, "grad_norm": 0.7554540760672372, "kl": 1.448046875, "learning_rate": 1.9024965190774262e-05, "loss": 0.0579, "reward": 0.60546875, "reward_std": 0.297205812856555, "rewards/accuracy_reward": 0.15546875, "rewards/format_reward": 0.45, "step": 65 }, { "completion_length": 403.630859375, "epoch": 0.24734982332155478, "grad_norm": 0.7209308900049199, "kl": 19355.033984375, "learning_rate": 1.8741529192927528e-05, "loss": 776.0688, "reward": 0.6125, "reward_std": 0.30714950021356346, "rewards/accuracy_reward": 0.165234375, "rewards/format_reward": 0.447265625, "step": 70 }, { "completion_length": 398.434375, "epoch": 0.26501766784452296, "grad_norm": 0.5881139318633425, "kl": 1.331982421875, "learning_rate": 1.8424672050733577e-05, "loss": 0.0533, "reward": 0.61484375, "reward_std": 0.2994155207648873, "rewards/accuracy_reward": 0.158203125, "rewards/format_reward": 0.456640625, "step": 75 }, { "completion_length": 408.6515625, "epoch": 0.2826855123674912, "grad_norm": 2.6274469054608143, "kl": 1.180029296875, "learning_rate": 1.8075605191627242e-05, "loss": 0.0472, "reward": 0.607421875, "reward_std": 0.2811854241415858, "rewards/accuracy_reward": 0.164453125, "rewards/format_reward": 0.44296875, "step": 80 }, { "completion_length": 413.64296875, "epoch": 0.3003533568904594, "grad_norm": 0.6738788725405221, "kl": 11.340185546875, "learning_rate": 1.7695663189185703e-05, "loss": 0.4546, "reward": 0.601953125, "reward_std": 0.30549221779219804, "rewards/accuracy_reward": 0.179296875, "rewards/format_reward": 0.42265625, "step": 85 }, { "completion_length": 413.73359375, "epoch": 0.31802120141342755, "grad_norm": 1.3105277713530405, "kl": 1.447265625, "learning_rate": 1.7286298660705877e-05, "loss": 0.0579, "reward": 0.60390625, "reward_std": 0.30604464691132305, "rewards/accuracy_reward": 0.1609375, "rewards/format_reward": 0.44296875, "step": 90 }, { "completion_length": 404.303515625, "epoch": 0.33568904593639576, "grad_norm": 0.7623390157660208, "kl": 1.0364013671875, "learning_rate": 1.6849076713469914e-05, "loss": 0.0415, "reward": 0.601171875, "reward_std": 0.2955485317390412, "rewards/accuracy_reward": 0.16171875, "rewards/format_reward": 0.439453125, "step": 95 }, { "completion_length": 413.7109375, "epoch": 0.35335689045936397, "grad_norm": 1.0600792188331805, "kl": 2.1205810546875, "learning_rate": 1.6385668960932143e-05, "loss": 0.0849, "reward": 0.572265625, "reward_std": 0.3088067832402885, "rewards/accuracy_reward": 0.1546875, "rewards/format_reward": 0.417578125, "step": 100 }, { "epoch": 0.35335689045936397, "eval_completion_length": 405.1893997192383, "eval_kl": 1.361328125, "eval_loss": 0.05589722469449043, "eval_reward": 0.61328125, "eval_reward_std": 0.2927863895893097, "eval_rewards/accuracy_reward": 0.140625, "eval_rewards/format_reward": 0.47265625, "eval_runtime": 62.5177, "eval_samples_per_second": 1.584, "eval_steps_per_second": 0.032, "step": 100 }, { "completion_length": 397.241015625, "epoch": 0.3710247349823322, "grad_norm": 1.7180085275135002, "kl": 2.092529296875, "learning_rate": 1.5897847131705194e-05, "loss": 0.0837, "reward": 0.6046875, "reward_std": 0.3170931892469525, "rewards/accuracy_reward": 0.167578125, "rewards/format_reward": 0.437109375, "step": 105 }, { "completion_length": 403.58828125, "epoch": 0.38869257950530034, "grad_norm": 0.534578040818149, "kl": 1.238232421875, "learning_rate": 1.5387476295779737e-05, "loss": 0.0495, "reward": 0.597265625, "reward_std": 0.31322619933635, "rewards/accuracy_reward": 0.1671875, "rewards/format_reward": 0.430078125, "step": 110 }, { "completion_length": 408.934375, "epoch": 0.40636042402826855, "grad_norm": 0.41472109596238, "kl": 1.536083984375, "learning_rate": 1.4856507733875837e-05, "loss": 0.0614, "reward": 0.6015625, "reward_std": 0.3082543543539941, "rewards/accuracy_reward": 0.173828125, "rewards/format_reward": 0.427734375, "step": 115 }, { "completion_length": 393.684375, "epoch": 0.42402826855123676, "grad_norm": 1.0671198220931404, "kl": 1.219482421875, "learning_rate": 1.4306971477188223e-05, "loss": 0.0488, "reward": 0.602734375, "reward_std": 0.31101649152114985, "rewards/accuracy_reward": 0.15546875, "rewards/format_reward": 0.447265625, "step": 120 }, { "completion_length": 411.417578125, "epoch": 0.4416961130742049, "grad_norm": 0.46043384834399087, "kl": 1.910693359375, "learning_rate": 1.3740968546047935e-05, "loss": 0.0764, "reward": 0.60390625, "reward_std": 0.29941552053205667, "rewards/accuracy_reward": 0.16328125, "rewards/format_reward": 0.440625, "step": 125 }, { "completion_length": 397.962890625, "epoch": 0.45936395759717313, "grad_norm": 0.6111314907455481, "kl": 0.9618408203125, "learning_rate": 1.3160662917174045e-05, "loss": 0.0385, "reward": 0.585546875, "reward_std": 0.31212134528905155, "rewards/accuracy_reward": 0.14765625, "rewards/format_reward": 0.437890625, "step": 130 }, { "completion_length": 401.703515625, "epoch": 0.47703180212014135, "grad_norm": 1.4967183682261056, "kl": 1.99873046875, "learning_rate": 1.2568273250226681e-05, "loss": 0.08, "reward": 0.61171875, "reward_std": 0.2927863945718855, "rewards/accuracy_reward": 0.1671875, "rewards/format_reward": 0.44453125, "step": 135 }, { "completion_length": 406.8, "epoch": 0.49469964664310956, "grad_norm": 0.917377230805497, "kl": 1.26953125, "learning_rate": 1.1966064405292887e-05, "loss": 0.0508, "reward": 0.59765625, "reward_std": 0.31377862663939593, "rewards/accuracy_reward": 0.168359375, "rewards/format_reward": 0.429296875, "step": 140 }, { "completion_length": 391.3640625, "epoch": 0.5123674911660777, "grad_norm": 0.5259506143043062, "kl": 1.326171875, "learning_rate": 1.1356338783736256e-05, "loss": 0.053, "reward": 0.6125, "reward_std": 0.28615726907737554, "rewards/accuracy_reward": 0.166015625, "rewards/format_reward": 0.446484375, "step": 145 }, { "completion_length": 397.0296875, "epoch": 0.5300353356890459, "grad_norm": 0.6823442998587644, "kl": 1.138134765625, "learning_rate": 1.0741427525516463e-05, "loss": 0.0455, "reward": 0.6140625, "reward_std": 0.2828427059110254, "rewards/accuracy_reward": 0.168359375, "rewards/format_reward": 0.445703125, "step": 150 }, { "completion_length": 407.55390625, "epoch": 0.5477031802120141, "grad_norm": 5.1945886494720215, "kl": 1.53837890625, "learning_rate": 1.012368159663363e-05, "loss": 0.0615, "reward": 0.57109375, "reward_std": 0.3038349375128746, "rewards/accuracy_reward": 0.14375, "rewards/format_reward": 0.42734375, "step": 155 }, { "completion_length": 401.918359375, "epoch": 0.5653710247349824, "grad_norm": 0.4307285006305928, "kl": 1.49169921875, "learning_rate": 9.505462800772612e-06, "loss": 0.0597, "reward": 0.60546875, "reward_std": 0.3049397937953472, "rewards/accuracy_reward": 0.1625, "rewards/format_reward": 0.44296875, "step": 160 }, { "completion_length": 402.686328125, "epoch": 0.5830388692579506, "grad_norm": 0.49030947447366735, "kl": 1.36279296875, "learning_rate": 8.889134749511956e-06, "loss": 0.0545, "reward": 0.590625, "reward_std": 0.3038349383510649, "rewards/accuracy_reward": 0.153125, "rewards/format_reward": 0.4375, "step": 165 }, { "completion_length": 405.9875, "epoch": 0.6007067137809188, "grad_norm": 0.4352892915737104, "kl": 1.3828125, "learning_rate": 8.277053825620836e-06, "loss": 0.0553, "reward": 0.61328125, "reward_std": 0.3027300829067826, "rewards/accuracy_reward": 0.1546875, "rewards/format_reward": 0.45859375, "step": 170 }, { "completion_length": 401.63359375, "epoch": 0.6183745583038869, "grad_norm": 0.8900367662135797, "kl": 1.16708984375, "learning_rate": 7.671560173993588e-06, "loss": 0.0467, "reward": 0.615625, "reward_std": 0.3038349374197423, "rewards/accuracy_reward": 0.18046875, "rewards/format_reward": 0.43515625, "step": 175 }, { "completion_length": 399.348828125, "epoch": 0.6360424028268551, "grad_norm": 0.8123164664250848, "kl": 1.635009765625, "learning_rate": 7.07496875466589e-06, "loss": 0.0654, "reward": 0.61640625, "reward_std": 0.29720581048168243, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.45234375, "step": 180 }, { "completion_length": 401.88515625, "epoch": 0.6537102473498233, "grad_norm": 0.5468048622490435, "kl": 1.2939208984375, "learning_rate": 6.489560492119225e-06, "loss": 0.0518, "reward": 0.607421875, "reward_std": 0.3054922170005739, "rewards/accuracy_reward": 0.15546875, "rewards/format_reward": 0.451953125, "step": 185 }, { "completion_length": 397.83359375, "epoch": 0.6713780918727915, "grad_norm": 0.7244711255317006, "kl": 1.36591796875, "learning_rate": 5.9175735547120975e-06, "loss": 0.0546, "reward": 0.6109375, "reward_std": 0.3181980476714671, "rewards/accuracy_reward": 0.155859375, "rewards/format_reward": 0.455078125, "step": 190 }, { "completion_length": 413.570703125, "epoch": 0.6890459363957597, "grad_norm": 0.42585359101119413, "kl": 1.764208984375, "learning_rate": 5.361194797579108e-06, "loss": 0.0706, "reward": 0.6046875, "reward_std": 0.3093592093326151, "rewards/accuracy_reward": 0.1671875, "rewards/format_reward": 0.4375, "step": 195 }, { "completion_length": 399.438671875, "epoch": 0.7067137809187279, "grad_norm": 0.47550986420528907, "kl": 1.2271484375, "learning_rate": 4.8225514017138205e-06, "loss": 0.0491, "reward": 0.60546875, "reward_std": 0.3038349355570972, "rewards/accuracy_reward": 0.1578125, "rewards/format_reward": 0.44765625, "step": 200 }, { "epoch": 0.7067137809187279, "eval_completion_length": 383.06652069091797, "eval_kl": 1.0625, "eval_loss": 0.043225545436143875, "eval_reward": 0.6328125, "eval_reward_std": 0.2651650458574295, "eval_rewards/accuracy_reward": 0.16015625, "eval_rewards/format_reward": 0.47265625, "eval_runtime": 64.1406, "eval_samples_per_second": 1.543, "eval_steps_per_second": 0.031, "step": 200 }, { "completion_length": 410.916796875, "epoch": 0.7243816254416962, "grad_norm": 0.4152907308781029, "kl": 1.306787109375, "learning_rate": 4.303702741201431e-06, "loss": 0.0523, "reward": 0.605859375, "reward_std": 0.29554852955043315, "rewards/accuracy_reward": 0.162890625, "rewards/format_reward": 0.44296875, "step": 205 }, { "completion_length": 420.941796875, "epoch": 0.7420494699646644, "grad_norm": 0.3864466357905039, "kl": 1.347509765625, "learning_rate": 3.8066325096949153e-06, "loss": 0.0539, "reward": 0.594140625, "reward_std": 0.3010728007182479, "rewards/accuracy_reward": 0.16484375, "rewards/format_reward": 0.429296875, "step": 210 }, { "completion_length": 405.008984375, "epoch": 0.7597173144876325, "grad_norm": 0.4071374966900544, "kl": 1.3837890625, "learning_rate": 3.3332411362372063e-06, "loss": 0.0554, "reward": 0.5796875, "reward_std": 0.31156891863793135, "rewards/accuracy_reward": 0.155859375, "rewards/format_reward": 0.423828125, "step": 215 }, { "completion_length": 400.690234375, "epoch": 0.7773851590106007, "grad_norm": 0.48756085064617083, "kl": 1.277734375, "learning_rate": 2.8853385194256677e-06, "loss": 0.0511, "reward": 0.6046875, "reward_std": 0.31488348012790085, "rewards/accuracy_reward": 0.173046875, "rewards/format_reward": 0.431640625, "step": 220 }, { "completion_length": 418.2671875, "epoch": 0.7950530035335689, "grad_norm": 0.37777218952387104, "kl": 1.430517578125, "learning_rate": 2.464637107698046e-06, "loss": 0.0573, "reward": 0.59375, "reward_std": 0.312673770962283, "rewards/accuracy_reward": 0.169140625, "rewards/format_reward": 0.424609375, "step": 225 }, { "completion_length": 409.259375, "epoch": 0.8127208480565371, "grad_norm": 0.5380959008484876, "kl": 1.55126953125, "learning_rate": 2.072745352195794e-06, "loss": 0.0621, "reward": 0.5984375, "reward_std": 0.2938912484794855, "rewards/accuracy_reward": 0.17734375, "rewards/format_reward": 0.42109375, "step": 230 }, { "completion_length": 408.725, "epoch": 0.8303886925795053, "grad_norm": 0.6087277900031041, "kl": 1.276611328125, "learning_rate": 1.7111615572361628e-06, "loss": 0.0511, "reward": 0.608203125, "reward_std": 0.32096018167212603, "rewards/accuracy_reward": 0.166015625, "rewards/format_reward": 0.4421875, "step": 235 }, { "completion_length": 408.4484375, "epoch": 0.8480565371024735, "grad_norm": 0.40330618192163126, "kl": 1.2666015625, "learning_rate": 1.381268151904298e-06, "loss": 0.0507, "reward": 0.591015625, "reward_std": 0.2999679483473301, "rewards/accuracy_reward": 0.1625, "rewards/format_reward": 0.428515625, "step": 240 }, { "completion_length": 411.65390625, "epoch": 0.8657243816254417, "grad_norm": 0.5432070185726592, "kl": 1.262109375, "learning_rate": 1.0843264046665558e-06, "loss": 0.0505, "reward": 0.5953125, "reward_std": 0.3016252295579761, "rewards/accuracy_reward": 0.15703125, "rewards/format_reward": 0.43828125, "step": 245 }, { "completion_length": 409.023046875, "epoch": 0.8833922261484098, "grad_norm": 0.3703261485488551, "kl": 1.469970703125, "learning_rate": 8.214716012124491e-07, "loss": 0.0588, "reward": 0.605078125, "reward_std": 0.3021776580251753, "rewards/accuracy_reward": 0.16484375, "rewards/format_reward": 0.440234375, "step": 250 }, { "completion_length": 407.982421875, "epoch": 0.901060070671378, "grad_norm": 0.48718776424035376, "kl": 1.395263671875, "learning_rate": 5.937087039615619e-07, "loss": 0.0558, "reward": 0.61328125, "reward_std": 0.2861572677269578, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.44140625, "step": 255 }, { "completion_length": 399.203125, "epoch": 0.9187279151943463, "grad_norm": 0.4674025132985681, "kl": 1.384521484375, "learning_rate": 4.019085098303077e-07, "loss": 0.0554, "reward": 0.599609375, "reward_std": 0.30438736486248674, "rewards/accuracy_reward": 0.154296875, "rewards/format_reward": 0.4453125, "step": 260 }, { "completion_length": 407.852734375, "epoch": 0.9363957597173145, "grad_norm": 0.518993833517968, "kl": 1.352685546875, "learning_rate": 2.4680432094837394e-07, "loss": 0.0541, "reward": 0.5921875, "reward_std": 0.2861572689376771, "rewards/accuracy_reward": 0.1609375, "rewards/format_reward": 0.43125, "step": 265 }, { "completion_length": 418.2390625, "epoch": 0.9540636042402827, "grad_norm": 0.41910418773784364, "kl": 1.345751953125, "learning_rate": 1.289891410535593e-07, "loss": 0.0538, "reward": 0.598828125, "reward_std": 0.3209601787850261, "rewards/accuracy_reward": 0.168359375, "rewards/format_reward": 0.43046875, "step": 270 }, { "completion_length": 403.1046875, "epoch": 0.9717314487632509, "grad_norm": 0.5629306163631225, "kl": 1.38427734375, "learning_rate": 4.8913408283934874e-08, "loss": 0.0554, "reward": 0.587109375, "reward_std": 0.31101649068295956, "rewards/accuracy_reward": 0.155078125, "rewards/format_reward": 0.43203125, "step": 275 }, { "completion_length": 396.14375, "epoch": 0.9893992932862191, "grad_norm": 0.40077217392258757, "kl": 1.292138671875, "learning_rate": 6.883273035447335e-09, "loss": 0.0517, "reward": 0.61484375, "reward_std": 0.29610095573589207, "rewards/accuracy_reward": 0.16953125, "rewards/format_reward": 0.4453125, "step": 280 }, { "completion_length": 388.8455181121826, "epoch": 1.0, "kl": 1.29052734375, "reward": 0.6243489583333334, "reward_std": 0.3176456190024813, "rewards/accuracy_reward": 0.17838541666666666, "rewards/format_reward": 0.4459635416666667, "step": 283, "total_flos": 0.0, "train_loss": 27.009561450669516, "train_runtime": 47693.8258, "train_samples_per_second": 1.519, "train_steps_per_second": 0.006 } ], "logging_steps": 5, "max_steps": 283, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }