diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3108 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 205, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 71.69230651855469, + "epoch": 0.004878048780487805, + "grad_norm": 1.988409597361304, + "kl": 0.0002803802490234375, + "learning_rate": 5e-08, + "loss": -0.0332, + "reward": 0.38933777809143066, + "reward_std": 0.2539410889148712, + "rewards/format_reward": 0.11098899692296982, + "rewards/judgement_reward": 0.26093122363090515, + "rewards/token_reward": 0.017417579889297485, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.14835357666016, + "epoch": 0.00975609756097561, + "grad_norm": 2.131261717353683, + "kl": 0.000675201416015625, + "learning_rate": 1e-07, + "loss": -0.0789, + "reward": 0.3299258053302765, + "reward_std": 0.24830038845539093, + "rewards/format_reward": 0.09890110045671463, + "rewards/judgement_reward": 0.2083873748779297, + "rewards/token_reward": 0.02263736166059971, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.86813354492188, + "epoch": 0.014634146341463415, + "grad_norm": 1.9020051525729609, + "kl": 0.00048828125, + "learning_rate": 1.5e-07, + "loss": -0.0115, + "reward": 0.3796335756778717, + "reward_std": 0.2533051073551178, + "rewards/format_reward": 0.09340659528970718, + "rewards/judgement_reward": 0.26880943775177, + "rewards/token_reward": 0.017417583614587784, + "step": 3 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.22527313232422, + "epoch": 0.01951219512195122, + "grad_norm": 1.926669794956742, + "kl": 0.0002899169921875, + "learning_rate": 2e-07, + "loss": 0.002, + "reward": 0.30424928665161133, + "reward_std": 0.23502430319786072, + "rewards/format_reward": 0.08681320399045944, + "rewards/judgement_reward": 0.1996888816356659, + "rewards/token_reward": 0.01774725317955017, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.5824203491211, + "epoch": 0.024390243902439025, + "grad_norm": 1.931358237356469, + "kl": 0.000339508056640625, + "learning_rate": 2.5e-07, + "loss": -0.0284, + "reward": 0.27388590574264526, + "reward_std": 0.22234365344047546, + "rewards/format_reward": 0.09450550377368927, + "rewards/judgement_reward": 0.1600947231054306, + "rewards/token_reward": 0.019285714253783226, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.84066009521484, + "epoch": 0.02926829268292683, + "grad_norm": 2.102547432960352, + "kl": 0.000278472900390625, + "learning_rate": 3e-07, + "loss": 0.0118, + "reward": 0.5828344821929932, + "reward_std": 0.27698564529418945, + "rewards/format_reward": 0.11098900437355042, + "rewards/judgement_reward": 0.4510761499404907, + "rewards/token_reward": 0.020769229158759117, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.5769271850586, + "epoch": 0.03414634146341464, + "grad_norm": 2.187265324428969, + "kl": 0.00026702880859375, + "learning_rate": 3.5e-07, + "loss": -0.0371, + "reward": 0.38959935307502747, + "reward_std": 0.274240106344223, + "rewards/format_reward": 0.10659340023994446, + "rewards/judgement_reward": 0.25861039757728577, + "rewards/token_reward": 0.024395601823925972, + "step": 7 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.87362670898438, + "epoch": 0.03902439024390244, + "grad_norm": 1.7011251355501271, + "kl": 0.0003719329833984375, + "learning_rate": 4e-07, + "loss": 0.0269, + "reward": 0.26069721579551697, + "reward_std": 0.22703438997268677, + "rewards/format_reward": 0.0912087932229042, + "rewards/judgement_reward": 0.15185105800628662, + "rewards/token_reward": 0.017637362703680992, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.25274658203125, + "epoch": 0.04390243902439024, + "grad_norm": 2.0455198863312387, + "kl": 0.000316619873046875, + "learning_rate": 4.5e-07, + "loss": -0.0821, + "reward": 0.34196677803993225, + "reward_std": 0.25876346230506897, + "rewards/format_reward": 0.09670329838991165, + "rewards/judgement_reward": 0.22526347637176514, + "rewards/token_reward": 0.019999999552965164, + "step": 9 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.80769348144531, + "epoch": 0.04878048780487805, + "grad_norm": 1.8007886440567025, + "kl": 0.000514984130859375, + "learning_rate": 5e-07, + "loss": -0.0259, + "reward": 0.34238940477371216, + "reward_std": 0.2460326850414276, + "rewards/format_reward": 0.09560439735651016, + "rewards/judgement_reward": 0.23140040040016174, + "rewards/token_reward": 0.015384615398943424, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.19780731201172, + "epoch": 0.05365853658536585, + "grad_norm": 1.9673405461994626, + "kl": 0.000698089599609375, + "learning_rate": 5.5e-07, + "loss": -0.0162, + "reward": 0.18868696689605713, + "reward_std": 0.15467074513435364, + "rewards/format_reward": 0.07252748310565948, + "rewards/judgement_reward": 0.10967598855495453, + "rewards/token_reward": 0.006483516190201044, + "step": 11 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.06593322753906, + "epoch": 0.05853658536585366, + "grad_norm": 1.9439922450073313, + "kl": 0.000568389892578125, + "learning_rate": 6e-07, + "loss": -0.0065, + "reward": 0.23565447330474854, + "reward_std": 0.1971663385629654, + "rewards/format_reward": 0.0912088081240654, + "rewards/judgement_reward": 0.1310391128063202, + "rewards/token_reward": 0.013406592421233654, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.83516693115234, + "epoch": 0.06341463414634146, + "grad_norm": 1.7492995284129258, + "kl": 0.00131988525390625, + "learning_rate": 6.5e-07, + "loss": -0.0114, + "reward": 0.4377107322216034, + "reward_std": 0.2658415734767914, + "rewards/format_reward": 0.09670329838991165, + "rewards/judgement_reward": 0.320293128490448, + "rewards/token_reward": 0.02071428671479225, + "step": 13 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.25274658203125, + "epoch": 0.06829268292682927, + "grad_norm": 1.8852756312045653, + "kl": 0.0023956298828125, + "learning_rate": 7e-07, + "loss": -0.1104, + "reward": 0.25799688696861267, + "reward_std": 0.22852738201618195, + "rewards/format_reward": 0.08461539447307587, + "rewards/judgement_reward": 0.1457441747188568, + "rewards/token_reward": 0.027637362480163574, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.5934066772461, + "epoch": 0.07317073170731707, + "grad_norm": 1.8265395690821953, + "kl": 0.003021240234375, + "learning_rate": 7.5e-07, + "loss": 0.0091, + "reward": 0.3250506818294525, + "reward_std": 0.2030942291021347, + "rewards/format_reward": 0.08901099860668182, + "rewards/judgement_reward": 0.2235121876001358, + "rewards/token_reward": 0.012527472339570522, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.31318664550781, + "epoch": 0.07804878048780488, + "grad_norm": 1.8379653827038929, + "kl": 0.0087890625, + "learning_rate": 8e-07, + "loss": -0.0047, + "reward": 0.3267355263233185, + "reward_std": 0.24120216071605682, + "rewards/format_reward": 0.10109890252351761, + "rewards/judgement_reward": 0.2026696354150772, + "rewards/token_reward": 0.022967034950852394, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.23077392578125, + "epoch": 0.08292682926829269, + "grad_norm": 2.0529600972186666, + "kl": 0.005584716796875, + "learning_rate": 8.499999999999999e-07, + "loss": 0.043, + "reward": 0.3882504105567932, + "reward_std": 0.24895574152469635, + "rewards/format_reward": 0.1021978035569191, + "rewards/judgement_reward": 0.2652834951877594, + "rewards/token_reward": 0.020769229158759117, + "step": 17 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.64835357666016, + "epoch": 0.08780487804878048, + "grad_norm": 2.1163090543204053, + "kl": 0.01458740234375, + "learning_rate": 9e-07, + "loss": 0.04, + "reward": 0.36192721128463745, + "reward_std": 0.21096283197402954, + "rewards/format_reward": 0.10659340769052505, + "rewards/judgement_reward": 0.24775134027004242, + "rewards/token_reward": 0.007582417689263821, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.26374053955078, + "epoch": 0.09268292682926829, + "grad_norm": 1.8165905290949755, + "kl": 0.006988525390625, + "learning_rate": 9.499999999999999e-07, + "loss": 0.081, + "reward": 0.3227555453777313, + "reward_std": 0.20002064108848572, + "rewards/format_reward": 0.10989010334014893, + "rewards/judgement_reward": 0.20275558531284332, + "rewards/token_reward": 0.01010989025235176, + "step": 19 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.64835357666016, + "epoch": 0.0975609756097561, + "grad_norm": 1.967123469564035, + "kl": 0.0120849609375, + "learning_rate": 1e-06, + "loss": 0.0601, + "reward": 0.474505752325058, + "reward_std": 0.2401813268661499, + "rewards/format_reward": 0.11978019773960114, + "rewards/judgement_reward": 0.33972567319869995, + "rewards/token_reward": 0.015000000596046448, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.8956069946289, + "epoch": 0.1024390243902439, + "grad_norm": 1.7567747825280924, + "kl": 0.0108642578125, + "learning_rate": 1e-06, + "loss": 0.0215, + "reward": 0.27094441652297974, + "reward_std": 0.20202507078647614, + "rewards/format_reward": 0.10549449920654297, + "rewards/judgement_reward": 0.15775761008262634, + "rewards/token_reward": 0.007692308630794287, + "step": 21 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.54945373535156, + "epoch": 0.1073170731707317, + "grad_norm": 1.8581558141105476, + "kl": 0.08349609375, + "learning_rate": 1e-06, + "loss": 0.048, + "reward": 0.4122963845729828, + "reward_std": 0.24994409084320068, + "rewards/format_reward": 0.1230769008398056, + "rewards/judgement_reward": 0.27993375062942505, + "rewards/token_reward": 0.009285714477300644, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completion_length": 67.73626708984375, + "epoch": 0.11219512195121951, + "grad_norm": 1.8992395015405976, + "kl": 0.01470947265625, + "learning_rate": 1e-06, + "loss": -0.014, + "reward": 0.22567632794380188, + "reward_std": 0.1741848886013031, + "rewards/format_reward": 0.09560439735651016, + "rewards/judgement_reward": 0.11776423454284668, + "rewards/token_reward": 0.012307691387832165, + "step": 23 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.0054931640625, + "epoch": 0.11707317073170732, + "grad_norm": 2.333890318615974, + "kl": 0.01806640625, + "learning_rate": 1e-06, + "loss": 0.0425, + "reward": 0.3388529121875763, + "reward_std": 0.22322086989879608, + "rewards/format_reward": 0.12087910622358322, + "rewards/judgement_reward": 0.20572103559970856, + "rewards/token_reward": 0.012252748012542725, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.61538696289062, + "epoch": 0.12195121951219512, + "grad_norm": 2.5484341340291334, + "kl": 0.021728515625, + "learning_rate": 1e-06, + "loss": 0.0759, + "reward": 0.6952180862426758, + "reward_std": 0.2809818983078003, + "rewards/format_reward": 0.15054939687252045, + "rewards/judgement_reward": 0.5331300497055054, + "rewards/token_reward": 0.011538460850715637, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.8956069946289, + "epoch": 0.12682926829268293, + "grad_norm": 2.00022024625464, + "kl": 0.0308837890625, + "learning_rate": 1e-06, + "loss": -0.0145, + "reward": 0.4654199182987213, + "reward_std": 0.24022048711776733, + "rewards/format_reward": 0.13076920807361603, + "rewards/judgement_reward": 0.3248704969882965, + "rewards/token_reward": 0.0097802197560668, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.3846206665039, + "epoch": 0.13170731707317074, + "grad_norm": 2.133845945151958, + "kl": 0.028564453125, + "learning_rate": 1e-06, + "loss": 0.0379, + "reward": 0.6326283812522888, + "reward_std": 0.25742557644844055, + "rewards/format_reward": 0.14945051074028015, + "rewards/judgement_reward": 0.4757600724697113, + "rewards/token_reward": 0.0074175819754600525, + "step": 27 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.95604705810547, + "epoch": 0.13658536585365855, + "grad_norm": 2.1425979650194793, + "kl": 0.021484375, + "learning_rate": 1e-06, + "loss": 0.0393, + "reward": 0.42065250873565674, + "reward_std": 0.17824424803256989, + "rewards/format_reward": 0.1131868064403534, + "rewards/judgement_reward": 0.30175137519836426, + "rewards/token_reward": 0.0057142856530845165, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.22527313232422, + "epoch": 0.14146341463414633, + "grad_norm": 2.187328509574426, + "kl": 0.0264892578125, + "learning_rate": 1e-06, + "loss": -0.0145, + "reward": 0.5736148357391357, + "reward_std": 0.250982403755188, + "rewards/format_reward": 0.14175820350646973, + "rewards/judgement_reward": 0.4122960567474365, + "rewards/token_reward": 0.01956043764948845, + "step": 29 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.989013671875, + "epoch": 0.14634146341463414, + "grad_norm": 2.0631362920367984, + "kl": 0.0223388671875, + "learning_rate": 1e-06, + "loss": -0.0436, + "reward": 0.8461222648620605, + "reward_std": 0.25097036361694336, + "rewards/format_reward": 0.16153840720653534, + "rewards/judgement_reward": 0.6611222624778748, + "rewards/token_reward": 0.023461539298295975, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.67582702636719, + "epoch": 0.15121951219512195, + "grad_norm": 2.5463613589005516, + "kl": 0.037353515625, + "learning_rate": 1e-06, + "loss": -0.0348, + "reward": 0.6614670753479004, + "reward_std": 0.2477238029241562, + "rewards/format_reward": 0.1439560055732727, + "rewards/judgement_reward": 0.507401168346405, + "rewards/token_reward": 0.01010989025235176, + "step": 31 + }, + { + "clip_ratio": 0.0, + "completion_length": 68.18681335449219, + "epoch": 0.15609756097560976, + "grad_norm": 2.157033445164365, + "kl": 0.0264892578125, + "learning_rate": 1e-06, + "loss": -0.0477, + "reward": 0.6428108811378479, + "reward_std": 0.21757948398590088, + "rewards/format_reward": 0.14285710453987122, + "rewards/judgement_reward": 0.4923712909221649, + "rewards/token_reward": 0.007582417223602533, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.52747344970703, + "epoch": 0.16097560975609757, + "grad_norm": 2.5147644681680013, + "kl": 1.71875, + "learning_rate": 1e-06, + "loss": -0.0331, + "reward": 0.8551515340805054, + "reward_std": 0.20301613211631775, + "rewards/format_reward": 0.15934060513973236, + "rewards/judgement_reward": 0.6783931255340576, + "rewards/token_reward": 0.017417581751942635, + "step": 33 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.4120864868164, + "epoch": 0.16585365853658537, + "grad_norm": 2.1819457001917804, + "kl": 0.71484375, + "learning_rate": 1e-06, + "loss": -0.0018, + "reward": 0.48153403401374817, + "reward_std": 0.24979981780052185, + "rewards/format_reward": 0.12417580932378769, + "rewards/judgement_reward": 0.34016045928001404, + "rewards/token_reward": 0.017197802662849426, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.0824203491211, + "epoch": 0.17073170731707318, + "grad_norm": 2.0349402588988084, + "kl": 0.023193359375, + "learning_rate": 1e-06, + "loss": 0.0397, + "reward": 0.49935343861579895, + "reward_std": 0.25207024812698364, + "rewards/format_reward": 0.1340659111738205, + "rewards/judgement_reward": 0.3458918631076813, + "rewards/token_reward": 0.019395604729652405, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.0769271850586, + "epoch": 0.17560975609756097, + "grad_norm": 2.204221432744273, + "kl": 0.035400390625, + "learning_rate": 1e-06, + "loss": 0.0172, + "reward": 0.5841602087020874, + "reward_std": 0.2741175889968872, + "rewards/format_reward": 0.1450549066066742, + "rewards/judgement_reward": 0.4198743999004364, + "rewards/token_reward": 0.019230768084526062, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.20879364013672, + "epoch": 0.18048780487804877, + "grad_norm": 2.021038070522573, + "kl": 0.038330078125, + "learning_rate": 1e-06, + "loss": 0.0224, + "reward": 0.41530290246009827, + "reward_std": 0.2347797006368637, + "rewards/format_reward": 0.11758241057395935, + "rewards/judgement_reward": 0.2791491150856018, + "rewards/token_reward": 0.01857142709195614, + "step": 37 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.3846206665039, + "epoch": 0.18536585365853658, + "grad_norm": 2.2222191685212156, + "kl": 0.044189453125, + "learning_rate": 1e-06, + "loss": -0.0314, + "reward": 0.6318976879119873, + "reward_std": 0.26240062713623047, + "rewards/format_reward": 0.1351647973060608, + "rewards/judgement_reward": 0.4639304578304291, + "rewards/token_reward": 0.032802194356918335, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.52747344970703, + "epoch": 0.1902439024390244, + "grad_norm": 1.8969688577935941, + "kl": 0.138671875, + "learning_rate": 1e-06, + "loss": -0.0619, + "reward": 0.7222000360488892, + "reward_std": 0.268084853887558, + "rewards/format_reward": 0.15384609997272491, + "rewards/judgement_reward": 0.5535735487937927, + "rewards/token_reward": 0.014780220575630665, + "step": 39 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.75274658203125, + "epoch": 0.1951219512195122, + "grad_norm": 2.1456284767626763, + "kl": 0.046142578125, + "learning_rate": 1e-06, + "loss": -0.0734, + "reward": 0.8482385277748108, + "reward_std": 0.24352890253067017, + "rewards/format_reward": 0.1560439020395279, + "rewards/judgement_reward": 0.6564801335334778, + "rewards/token_reward": 0.0357142835855484, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.15933990478516, + "epoch": 0.2, + "grad_norm": 2.09640812853542, + "kl": 0.16796875, + "learning_rate": 1e-06, + "loss": 0.0156, + "reward": 0.8189014792442322, + "reward_std": 0.2852559983730316, + "rewards/format_reward": 0.15824170410633087, + "rewards/judgement_reward": 0.6289563775062561, + "rewards/token_reward": 0.031703293323516846, + "step": 41 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.30769348144531, + "epoch": 0.2048780487804878, + "grad_norm": 2.05137630399559, + "kl": 0.030029296875, + "learning_rate": 1e-06, + "loss": -0.0138, + "reward": 0.9518312215805054, + "reward_std": 0.21282121539115906, + "rewards/format_reward": 0.16923069953918457, + "rewards/judgement_reward": 0.759743332862854, + "rewards/token_reward": 0.022857142612338066, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.30220031738281, + "epoch": 0.2097560975609756, + "grad_norm": 2.212876795222977, + "kl": 0.03857421875, + "learning_rate": 1e-06, + "loss": 0.042, + "reward": 0.5973189473152161, + "reward_std": 0.2632991671562195, + "rewards/format_reward": 0.13186810910701752, + "rewards/judgement_reward": 0.4431980848312378, + "rewards/token_reward": 0.022252749651670456, + "step": 43 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.78022003173828, + "epoch": 0.2146341463414634, + "grad_norm": 1.909917279230211, + "kl": 0.02197265625, + "learning_rate": 1e-06, + "loss": -0.0097, + "reward": 0.6979678869247437, + "reward_std": 0.2731499671936035, + "rewards/format_reward": 0.1461537927389145, + "rewards/judgement_reward": 0.5200557112693787, + "rewards/token_reward": 0.03175824135541916, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.13736724853516, + "epoch": 0.21951219512195122, + "grad_norm": 2.00637254655722, + "kl": 0.031982421875, + "learning_rate": 1e-06, + "loss": 0.0395, + "reward": 0.5393092036247253, + "reward_std": 0.18171927332878113, + "rewards/format_reward": 0.12197799980640411, + "rewards/judgement_reward": 0.40848490595817566, + "rewards/token_reward": 0.008846154436469078, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.3901138305664, + "epoch": 0.22439024390243903, + "grad_norm": 3.1756908083902964, + "kl": 0.032958984375, + "learning_rate": 1e-06, + "loss": -0.0354, + "reward": 0.5674500465393066, + "reward_std": 0.21071720123291016, + "rewards/format_reward": 0.13186810910701752, + "rewards/judgement_reward": 0.4111863970756531, + "rewards/token_reward": 0.024395601823925972, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.52747344970703, + "epoch": 0.22926829268292684, + "grad_norm": 2.006041173831088, + "kl": 0.0302734375, + "learning_rate": 1e-06, + "loss": -0.0108, + "reward": 0.5255321860313416, + "reward_std": 0.24098682403564453, + "rewards/format_reward": 0.12747250497341156, + "rewards/judgement_reward": 0.3749277889728546, + "rewards/token_reward": 0.02313186600804329, + "step": 47 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.30220031738281, + "epoch": 0.23414634146341465, + "grad_norm": 2.080512783491811, + "kl": 0.0257568359375, + "learning_rate": 1e-06, + "loss": -0.0174, + "reward": 0.6429842710494995, + "reward_std": 0.2190992385149002, + "rewards/format_reward": 0.14615380764007568, + "rewards/judgement_reward": 0.47715994715690613, + "rewards/token_reward": 0.019670329988002777, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.19780731201172, + "epoch": 0.23902439024390243, + "grad_norm": 2.0510274772907042, + "kl": 0.0220947265625, + "learning_rate": 1e-06, + "loss": -0.0791, + "reward": 0.8461357355117798, + "reward_std": 0.2294541895389557, + "rewards/format_reward": 0.1648351103067398, + "rewards/judgement_reward": 0.6526740789413452, + "rewards/token_reward": 0.028626371175050735, + "step": 49 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.05494689941406, + "epoch": 0.24390243902439024, + "grad_norm": 2.1607651542772115, + "kl": 0.0244140625, + "learning_rate": 1e-06, + "loss": -0.0118, + "reward": 0.7917323708534241, + "reward_std": 0.2613984942436218, + "rewards/format_reward": 0.15714280307292938, + "rewards/judgement_reward": 0.6060182452201843, + "rewards/token_reward": 0.02857142873108387, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.94505310058594, + "epoch": 0.24878048780487805, + "grad_norm": 2.4101187996457467, + "kl": 0.06787109375, + "learning_rate": 1e-06, + "loss": -0.0128, + "reward": 0.6344149112701416, + "reward_std": 0.2056046426296234, + "rewards/format_reward": 0.1340659111738205, + "rewards/judgement_reward": 0.4821070730686188, + "rewards/token_reward": 0.01824175752699375, + "step": 51 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.18132019042969, + "epoch": 0.25365853658536586, + "grad_norm": 2.0276407359997446, + "kl": 0.205078125, + "learning_rate": 1e-06, + "loss": -0.0588, + "reward": 0.9314945340156555, + "reward_std": 0.17086516320705414, + "rewards/format_reward": 0.1659339964389801, + "rewards/judgement_reward": 0.7470438480377197, + "rewards/token_reward": 0.018516482785344124, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.30769348144531, + "epoch": 0.25853658536585367, + "grad_norm": 2.17785136004001, + "kl": 0.031982421875, + "learning_rate": 1e-06, + "loss": -0.0705, + "reward": 0.7751470804214478, + "reward_std": 0.2271140068769455, + "rewards/format_reward": 0.1549450010061264, + "rewards/judgement_reward": 0.5904769897460938, + "rewards/token_reward": 0.029725274071097374, + "step": 53 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.989013671875, + "epoch": 0.2634146341463415, + "grad_norm": 2.6022497270890868, + "kl": 0.0400390625, + "learning_rate": 1e-06, + "loss": 0.0112, + "reward": 0.7827267646789551, + "reward_std": 0.2629249691963196, + "rewards/format_reward": 0.1549450010061264, + "rewards/judgement_reward": 0.6101441979408264, + "rewards/token_reward": 0.017637362703680992, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.96703338623047, + "epoch": 0.2682926829268293, + "grad_norm": 1.9363620619923776, + "kl": 0.032958984375, + "learning_rate": 1e-06, + "loss": 0.0014, + "reward": 0.579794704914093, + "reward_std": 0.26253968477249146, + "rewards/format_reward": 0.13846150040626526, + "rewards/judgement_reward": 0.41259682178497314, + "rewards/token_reward": 0.028736261650919914, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.9945068359375, + "epoch": 0.2731707317073171, + "grad_norm": 2.2800506149734696, + "kl": 0.06005859375, + "learning_rate": 1e-06, + "loss": 0.035, + "reward": 0.7825473546981812, + "reward_std": 0.2477075755596161, + "rewards/format_reward": 0.15934060513973236, + "rewards/judgement_reward": 0.6038658618927002, + "rewards/token_reward": 0.01934065856039524, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.19230651855469, + "epoch": 0.2780487804878049, + "grad_norm": 1.9316889087097737, + "kl": 0.03369140625, + "learning_rate": 1e-06, + "loss": 0.0151, + "reward": 0.7461085319519043, + "reward_std": 0.22608627378940582, + "rewards/format_reward": 0.14835159480571747, + "rewards/judgement_reward": 0.5816579461097717, + "rewards/token_reward": 0.016098899766802788, + "step": 57 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.45604705810547, + "epoch": 0.28292682926829266, + "grad_norm": 2.891000369424163, + "kl": 0.09228515625, + "learning_rate": 1e-06, + "loss": 0.0333, + "reward": 0.7758737206459045, + "reward_std": 0.27175047993659973, + "rewards/format_reward": 0.15714281797409058, + "rewards/judgement_reward": 0.5963131785392761, + "rewards/token_reward": 0.02241758443415165, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.02198028564453, + "epoch": 0.28780487804878047, + "grad_norm": 2.08130255800055, + "kl": 0.0277099609375, + "learning_rate": 1e-06, + "loss": -0.0011, + "reward": 0.7745002508163452, + "reward_std": 0.24488268792629242, + "rewards/format_reward": 0.16153840720653534, + "rewards/judgement_reward": 0.5817528367042542, + "rewards/token_reward": 0.031208788976073265, + "step": 59 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.19230651855469, + "epoch": 0.2926829268292683, + "grad_norm": 2.325364169755121, + "kl": 0.048583984375, + "learning_rate": 1e-06, + "loss": -0.054, + "reward": 0.7218204736709595, + "reward_std": 0.22538162767887115, + "rewards/format_reward": 0.1439560055732727, + "rewards/judgement_reward": 0.5538533926010132, + "rewards/token_reward": 0.02401098981499672, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.14835357666016, + "epoch": 0.2975609756097561, + "grad_norm": 2.199589014022037, + "kl": 0.0289306640625, + "learning_rate": 1e-06, + "loss": -0.0617, + "reward": 0.6991139054298401, + "reward_std": 0.22941938042640686, + "rewards/format_reward": 0.1450549066066742, + "rewards/judgement_reward": 0.5255423784255981, + "rewards/token_reward": 0.028516482561826706, + "step": 61 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.95055389404297, + "epoch": 0.3024390243902439, + "grad_norm": 2.0508974708861576, + "kl": 0.02587890625, + "learning_rate": 1e-06, + "loss": -0.04, + "reward": 0.6912637948989868, + "reward_std": 0.22075381875038147, + "rewards/format_reward": 0.15274719893932343, + "rewards/judgement_reward": 0.5231318473815918, + "rewards/token_reward": 0.015384615398943424, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.80769348144531, + "epoch": 0.3073170731707317, + "grad_norm": 2.2205627029567467, + "kl": 0.035888671875, + "learning_rate": 1e-06, + "loss": -0.0696, + "reward": 0.7073810696601868, + "reward_std": 0.22409050166606903, + "rewards/format_reward": 0.15714280307292938, + "rewards/judgement_reward": 0.514194130897522, + "rewards/token_reward": 0.036043956875801086, + "step": 63 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.52747344970703, + "epoch": 0.3121951219512195, + "grad_norm": 2.0916293721018544, + "kl": 0.032958984375, + "learning_rate": 1e-06, + "loss": -0.0537, + "reward": 0.998910129070282, + "reward_std": 0.2128116488456726, + "rewards/format_reward": 0.17032960057258606, + "rewards/judgement_reward": 0.800338625907898, + "rewards/token_reward": 0.028241755440831184, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.0934066772461, + "epoch": 0.3170731707317073, + "grad_norm": 3.671459315517818, + "kl": 0.056640625, + "learning_rate": 1e-06, + "loss": -0.0158, + "reward": 0.8368141651153564, + "reward_std": 0.24064061045646667, + "rewards/format_reward": 0.16153840720653534, + "rewards/judgement_reward": 0.6522535681724548, + "rewards/token_reward": 0.02302197553217411, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.41758728027344, + "epoch": 0.32195121951219513, + "grad_norm": 2.0126846364586273, + "kl": 0.026123046875, + "learning_rate": 1e-06, + "loss": -0.0154, + "reward": 0.809525728225708, + "reward_std": 0.2356753945350647, + "rewards/format_reward": 0.16373620927333832, + "rewards/judgement_reward": 0.6260641813278198, + "rewards/token_reward": 0.019725274294614792, + "step": 66 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.15933990478516, + "epoch": 0.32682926829268294, + "grad_norm": 1.851476238680362, + "kl": 0.0267333984375, + "learning_rate": 1e-06, + "loss": 0.0033, + "reward": 0.6307772397994995, + "reward_std": 0.23441998660564423, + "rewards/format_reward": 0.14835159480571747, + "rewards/judgement_reward": 0.45868924260139465, + "rewards/token_reward": 0.023736262694001198, + "step": 67 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.77472686767578, + "epoch": 0.33170731707317075, + "grad_norm": 1.9067629127088346, + "kl": 0.027587890625, + "learning_rate": 1e-06, + "loss": -0.0166, + "reward": 0.6367327570915222, + "reward_std": 0.207670658826828, + "rewards/format_reward": 0.15054941177368164, + "rewards/judgement_reward": 0.46409520506858826, + "rewards/token_reward": 0.022087913006544113, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.63186645507812, + "epoch": 0.33658536585365856, + "grad_norm": 2.192665819859577, + "kl": 0.058349609375, + "learning_rate": 1e-06, + "loss": -0.0979, + "reward": 0.8123928904533386, + "reward_std": 0.23223978281021118, + "rewards/format_reward": 0.16263730823993683, + "rewards/judgement_reward": 0.6219531297683716, + "rewards/token_reward": 0.027802197262644768, + "step": 69 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.26374053955078, + "epoch": 0.34146341463414637, + "grad_norm": 2.339003134381284, + "kl": 0.02392578125, + "learning_rate": 1e-06, + "loss": -0.0611, + "reward": 0.9417235255241394, + "reward_std": 0.18349522352218628, + "rewards/format_reward": 0.16813181340694427, + "rewards/judgement_reward": 0.7577126622200012, + "rewards/token_reward": 0.01587912067770958, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.05494689941406, + "epoch": 0.3463414634146341, + "grad_norm": 2.3107542796374916, + "kl": 0.0244140625, + "learning_rate": 1e-06, + "loss": 0.0253, + "reward": 0.8421667218208313, + "reward_std": 0.22592146694660187, + "rewards/format_reward": 0.15384609997272491, + "rewards/judgement_reward": 0.6690347790718079, + "rewards/token_reward": 0.019285714253783226, + "step": 71 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.9120864868164, + "epoch": 0.35121951219512193, + "grad_norm": 1.8500979548941974, + "kl": 0.0191650390625, + "learning_rate": 1e-06, + "loss": 0.0408, + "reward": 0.8825385570526123, + "reward_std": 0.23543551564216614, + "rewards/format_reward": 0.17362630367279053, + "rewards/judgement_reward": 0.6919341087341309, + "rewards/token_reward": 0.01697802171111107, + "step": 72 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.62088012695312, + "epoch": 0.35609756097560974, + "grad_norm": 1.7964524052061408, + "kl": 0.0262451171875, + "learning_rate": 1e-06, + "loss": -0.0512, + "reward": 0.7030321359634399, + "reward_std": 0.20544546842575073, + "rewards/format_reward": 0.14615380764007568, + "rewards/judgement_reward": 0.5344606637954712, + "rewards/token_reward": 0.0224175825715065, + "step": 73 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.87911987304688, + "epoch": 0.36097560975609755, + "grad_norm": 1.9935947664757243, + "kl": 0.19921875, + "learning_rate": 1e-06, + "loss": -0.055, + "reward": 0.9933971762657166, + "reward_std": 0.22045965492725372, + "rewards/format_reward": 0.17252740263938904, + "rewards/judgement_reward": 0.7981772422790527, + "rewards/token_reward": 0.022692309692502022, + "step": 74 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.4011001586914, + "epoch": 0.36585365853658536, + "grad_norm": 2.001748403246846, + "kl": 0.0308837890625, + "learning_rate": 1e-06, + "loss": -0.0664, + "reward": 0.6825388073921204, + "reward_std": 0.2821784019470215, + "rewards/format_reward": 0.1560439020395279, + "rewards/judgement_reward": 0.49479159712791443, + "rewards/token_reward": 0.031703293323516846, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.41758728027344, + "epoch": 0.37073170731707317, + "grad_norm": 2.2248051545634824, + "kl": 0.0302734375, + "learning_rate": 1e-06, + "loss": -0.0628, + "reward": 0.7976081371307373, + "reward_std": 0.24464865028858185, + "rewards/format_reward": 0.16263730823993683, + "rewards/judgement_reward": 0.611564040184021, + "rewards/token_reward": 0.02340659312903881, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.8901138305664, + "epoch": 0.375609756097561, + "grad_norm": 2.0499939459920835, + "kl": 0.03662109375, + "learning_rate": 1e-06, + "loss": -0.0634, + "reward": 0.6835858821868896, + "reward_std": 0.21102023124694824, + "rewards/format_reward": 0.1351647973060608, + "rewards/judgement_reward": 0.5263329744338989, + "rewards/token_reward": 0.022087913006544113, + "step": 77 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.62637329101562, + "epoch": 0.3804878048780488, + "grad_norm": 2.266766438478251, + "kl": 0.02880859375, + "learning_rate": 1e-06, + "loss": 0.0109, + "reward": 0.8073420524597168, + "reward_std": 0.23677751421928406, + "rewards/format_reward": 0.1560439020395279, + "rewards/judgement_reward": 0.6212978959083557, + "rewards/token_reward": 0.029999997466802597, + "step": 78 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.93955993652344, + "epoch": 0.3853658536585366, + "grad_norm": 2.4759945771013867, + "kl": 0.06640625, + "learning_rate": 1e-06, + "loss": 0.0217, + "reward": 0.6546903252601624, + "reward_std": 0.26877561211586, + "rewards/format_reward": 0.15164829790592194, + "rewards/judgement_reward": 0.4823826551437378, + "rewards/token_reward": 0.020659340545535088, + "step": 79 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.65384674072266, + "epoch": 0.3902439024390244, + "grad_norm": 1.945518643660678, + "kl": 0.047119140625, + "learning_rate": 1e-06, + "loss": -0.0002, + "reward": 0.7612788081169128, + "reward_std": 0.1894712746143341, + "rewards/format_reward": 0.13736259937286377, + "rewards/judgement_reward": 0.6034765243530273, + "rewards/token_reward": 0.02043955959379673, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.78571319580078, + "epoch": 0.3951219512195122, + "grad_norm": 2.0901126990153127, + "kl": 0.11474609375, + "learning_rate": 1e-06, + "loss": -0.0731, + "reward": 1.0548338890075684, + "reward_std": 0.21029122173786163, + "rewards/format_reward": 0.1769230216741562, + "rewards/judgement_reward": 0.8410976529121399, + "rewards/token_reward": 0.03681318834424019, + "step": 81 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.8956069946289, + "epoch": 0.4, + "grad_norm": 1.7461151621773074, + "kl": 0.03271484375, + "learning_rate": 1e-06, + "loss": -0.0471, + "reward": 0.7646914720535278, + "reward_std": 0.2543509900569916, + "rewards/format_reward": 0.15714281797409058, + "rewards/judgement_reward": 0.5825485587120056, + "rewards/token_reward": 0.02499999850988388, + "step": 82 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.9065933227539, + "epoch": 0.40487804878048783, + "grad_norm": 2.0106379862864663, + "kl": 0.0311279296875, + "learning_rate": 1e-06, + "loss": 0.0125, + "reward": 0.7661330699920654, + "reward_std": 0.2671906650066376, + "rewards/format_reward": 0.15934060513973236, + "rewards/judgement_reward": 0.5814077854156494, + "rewards/token_reward": 0.02538461610674858, + "step": 83 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.64835357666016, + "epoch": 0.4097560975609756, + "grad_norm": 1.9108709969888935, + "kl": 0.283203125, + "learning_rate": 1e-06, + "loss": 0.0065, + "reward": 0.8318064212799072, + "reward_std": 0.23601563274860382, + "rewards/format_reward": 0.15934060513973236, + "rewards/judgement_reward": 0.6468064785003662, + "rewards/token_reward": 0.025659339502453804, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.02747344970703, + "epoch": 0.4146341463414634, + "grad_norm": 1.890024675225077, + "kl": 0.1630859375, + "learning_rate": 1e-06, + "loss": -0.0754, + "reward": 0.9830853343009949, + "reward_std": 0.22543151676654816, + "rewards/format_reward": 0.17362630367279053, + "rewards/judgement_reward": 0.7781403064727783, + "rewards/token_reward": 0.031318679451942444, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.0824203491211, + "epoch": 0.4195121951219512, + "grad_norm": 2.47037218372719, + "kl": 0.044921875, + "learning_rate": 1e-06, + "loss": 0.0005, + "reward": 0.6921790242195129, + "reward_std": 0.26250430941581726, + "rewards/format_reward": 0.14725270867347717, + "rewards/judgement_reward": 0.5239920616149902, + "rewards/token_reward": 0.02093406394124031, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.28571319580078, + "epoch": 0.424390243902439, + "grad_norm": 2.079365060925443, + "kl": 0.0267333984375, + "learning_rate": 1e-06, + "loss": -0.0605, + "reward": 0.7566412687301636, + "reward_std": 0.21086536347866058, + "rewards/format_reward": 0.15824170410633087, + "rewards/judgement_reward": 0.5732895731925964, + "rewards/token_reward": 0.02510989084839821, + "step": 87 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.31867980957031, + "epoch": 0.4292682926829268, + "grad_norm": 2.1529982274280863, + "kl": 0.0255126953125, + "learning_rate": 1e-06, + "loss": -0.0807, + "reward": 0.8711549639701843, + "reward_std": 0.19136983156204224, + "rewards/format_reward": 0.16153840720653534, + "rewards/judgement_reward": 0.6887921690940857, + "rewards/token_reward": 0.02082417532801628, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.85164642333984, + "epoch": 0.43414634146341463, + "grad_norm": 1.9476865855709369, + "kl": 0.03466796875, + "learning_rate": 1e-06, + "loss": -0.0521, + "reward": 0.849588930606842, + "reward_std": 0.22347387671470642, + "rewards/format_reward": 0.1648351103067398, + "rewards/judgement_reward": 0.6607428193092346, + "rewards/token_reward": 0.02401098981499672, + "step": 89 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.53296661376953, + "epoch": 0.43902439024390244, + "grad_norm": 1.9988855691892198, + "kl": 0.3203125, + "learning_rate": 1e-06, + "loss": -0.0217, + "reward": 0.794158935546875, + "reward_std": 0.2382083535194397, + "rewards/format_reward": 0.16153840720653534, + "rewards/judgement_reward": 0.6096535921096802, + "rewards/token_reward": 0.022967033088207245, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.9065933227539, + "epoch": 0.44390243902439025, + "grad_norm": 2.3812234433330333, + "kl": 0.039794921875, + "learning_rate": 1e-06, + "loss": -0.066, + "reward": 0.9384704828262329, + "reward_std": 0.19025777280330658, + "rewards/format_reward": 0.1659340113401413, + "rewards/judgement_reward": 0.7457780838012695, + "rewards/token_reward": 0.026758242398500443, + "step": 91 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.04396057128906, + "epoch": 0.44878048780487806, + "grad_norm": 2.6148084459815206, + "kl": 0.04345703125, + "learning_rate": 1e-06, + "loss": -0.0984, + "reward": 1.0894677639007568, + "reward_std": 0.1785333752632141, + "rewards/format_reward": 0.18241752684116364, + "rewards/judgement_reward": 0.880236804485321, + "rewards/token_reward": 0.026813184842467308, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.24725341796875, + "epoch": 0.45365853658536587, + "grad_norm": 1.9266274957793252, + "kl": 0.10205078125, + "learning_rate": 1e-06, + "loss": 0.0333, + "reward": 0.6261184215545654, + "reward_std": 0.2962965965270996, + "rewards/format_reward": 0.1450549066066742, + "rewards/judgement_reward": 0.4455689489841461, + "rewards/token_reward": 0.03549450263381004, + "step": 93 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.0714340209961, + "epoch": 0.4585365853658537, + "grad_norm": 2.359476534290484, + "kl": 0.0693359375, + "learning_rate": 1e-06, + "loss": -0.0619, + "reward": 0.6673313975334167, + "reward_std": 0.20072828233242035, + "rewards/format_reward": 0.13736259937286377, + "rewards/judgement_reward": 0.5074961185455322, + "rewards/token_reward": 0.022472526878118515, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.46703338623047, + "epoch": 0.4634146341463415, + "grad_norm": 2.1867681379205126, + "kl": 0.045166015625, + "learning_rate": 1e-06, + "loss": -0.0384, + "reward": 0.8676297664642334, + "reward_std": 0.2376934140920639, + "rewards/format_reward": 0.1648351103067398, + "rewards/judgement_reward": 0.6711461544036865, + "rewards/token_reward": 0.03164835274219513, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.30769348144531, + "epoch": 0.4682926829268293, + "grad_norm": 1.7938305263175633, + "kl": 0.0294189453125, + "learning_rate": 1e-06, + "loss": -0.0293, + "reward": 0.8810251355171204, + "reward_std": 0.22088079154491425, + "rewards/format_reward": 0.1659339964389801, + "rewards/judgement_reward": 0.6851460337638855, + "rewards/token_reward": 0.02994505502283573, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.96154022216797, + "epoch": 0.47317073170731705, + "grad_norm": 1.9835104874280258, + "kl": 0.0262451171875, + "learning_rate": 1e-06, + "loss": -0.031, + "reward": 1.0301648378372192, + "reward_std": 0.23829680681228638, + "rewards/format_reward": 0.17912080883979797, + "rewards/judgement_reward": 0.8086262941360474, + "rewards/token_reward": 0.042417578399181366, + "step": 97 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.10989379882812, + "epoch": 0.47804878048780486, + "grad_norm": 1.8068829512126077, + "kl": 0.025390625, + "learning_rate": 1e-06, + "loss": -0.0435, + "reward": 0.8396397233009338, + "reward_std": 0.20062671601772308, + "rewards/format_reward": 0.1648351103067398, + "rewards/judgement_reward": 0.6437055468559265, + "rewards/token_reward": 0.031098900362849236, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.79670715332031, + "epoch": 0.48292682926829267, + "grad_norm": 1.8013193104560348, + "kl": 0.0294189453125, + "learning_rate": 1e-06, + "loss": -0.021, + "reward": 0.8207690119743347, + "reward_std": 0.18635249137878418, + "rewards/format_reward": 0.16263730823993683, + "rewards/judgement_reward": 0.6401647329330444, + "rewards/token_reward": 0.01796703413128853, + "step": 99 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.29121398925781, + "epoch": 0.4878048780487805, + "grad_norm": 1.8541360087649421, + "kl": 0.03173828125, + "learning_rate": 1e-06, + "loss": -0.0518, + "reward": 0.8696525692939758, + "reward_std": 0.2140418291091919, + "rewards/format_reward": 0.16813179850578308, + "rewards/judgement_reward": 0.6785534620285034, + "rewards/token_reward": 0.022967033088207245, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.42857360839844, + "epoch": 0.4926829268292683, + "grad_norm": 2.3437263624638858, + "kl": 0.03759765625, + "learning_rate": 1e-06, + "loss": 0.0011, + "reward": 0.7790926694869995, + "reward_std": 0.29135364294052124, + "rewards/format_reward": 0.16373620927333832, + "rewards/judgement_reward": 0.5860158205032349, + "rewards/token_reward": 0.029340656474232674, + "step": 101 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.17033386230469, + "epoch": 0.4975609756097561, + "grad_norm": 2.1875591465659765, + "kl": 0.05029296875, + "learning_rate": 1e-06, + "loss": -0.0458, + "reward": 0.9169327616691589, + "reward_std": 0.21496839821338654, + "rewards/format_reward": 0.17142850160598755, + "rewards/judgement_reward": 0.7243505120277405, + "rewards/token_reward": 0.021153846755623817, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.53845977783203, + "epoch": 0.5024390243902439, + "grad_norm": 2.2860621039378097, + "kl": 0.052978515625, + "learning_rate": 1e-06, + "loss": -0.0511, + "reward": 0.8492118120193481, + "reward_std": 0.2524387836456299, + "rewards/format_reward": 0.1648351103067398, + "rewards/judgement_reward": 0.6557502746582031, + "rewards/token_reward": 0.028626374900341034, + "step": 103 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.21977996826172, + "epoch": 0.5073170731707317, + "grad_norm": 1.9503004419157008, + "kl": 0.03857421875, + "learning_rate": 1e-06, + "loss": -0.1021, + "reward": 0.9117417335510254, + "reward_std": 0.21893596649169922, + "rewards/format_reward": 0.16813181340694427, + "rewards/judgement_reward": 0.7197636961936951, + "rewards/token_reward": 0.023846155032515526, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completion_length": 69.92308044433594, + "epoch": 0.5121951219512195, + "grad_norm": 2.2102406225271696, + "kl": 0.0732421875, + "learning_rate": 1e-06, + "loss": -0.0789, + "reward": 0.9815363883972168, + "reward_std": 0.1895749419927597, + "rewards/format_reward": 0.17472520470619202, + "rewards/judgement_reward": 0.7850527167320251, + "rewards/token_reward": 0.021758243441581726, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.5, + "epoch": 0.5170731707317073, + "grad_norm": 1.8701663684695582, + "kl": 0.029052734375, + "learning_rate": 1e-06, + "loss": -0.0526, + "reward": 1.0305202007293701, + "reward_std": 0.21711336076259613, + "rewards/format_reward": 0.17802190780639648, + "rewards/judgement_reward": 0.8127729296684265, + "rewards/token_reward": 0.039725273847579956, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.36813354492188, + "epoch": 0.5219512195121951, + "grad_norm": 2.1504947624651005, + "kl": 0.041015625, + "learning_rate": 1e-06, + "loss": -0.0108, + "reward": 0.8238873481750488, + "reward_std": 0.28225311636924744, + "rewards/format_reward": 0.1659340113401413, + "rewards/judgement_reward": 0.6267994046211243, + "rewards/token_reward": 0.0311538465321064, + "step": 107 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.24176025390625, + "epoch": 0.526829268292683, + "grad_norm": 1.7794056509498575, + "kl": 0.0311279296875, + "learning_rate": 1e-06, + "loss": -0.0823, + "reward": 0.9624238014221191, + "reward_std": 0.2303503304719925, + "rewards/format_reward": 0.17252740263938904, + "rewards/judgement_reward": 0.7584677934646606, + "rewards/token_reward": 0.03142856806516647, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.4120864868164, + "epoch": 0.5317073170731708, + "grad_norm": 2.504687498054589, + "kl": 0.05712890625, + "learning_rate": 1e-06, + "loss": -0.072, + "reward": 0.976593554019928, + "reward_std": 0.23477815091609955, + "rewards/format_reward": 0.1758241206407547, + "rewards/judgement_reward": 0.7797802090644836, + "rewards/token_reward": 0.020989011973142624, + "step": 109 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.32418060302734, + "epoch": 0.5365853658536586, + "grad_norm": 1.9342624784665794, + "kl": 0.0419921875, + "learning_rate": 1e-06, + "loss": -0.0906, + "reward": 1.0066561698913574, + "reward_std": 0.18863876163959503, + "rewards/format_reward": 0.18131859600543976, + "rewards/judgement_reward": 0.7958320379257202, + "rewards/token_reward": 0.029505494982004166, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.02747344970703, + "epoch": 0.5414634146341464, + "grad_norm": 3.6005569321956736, + "kl": 0.07275390625, + "learning_rate": 1e-06, + "loss": 0.0868, + "reward": 0.9240605235099792, + "reward_std": 0.2554757595062256, + "rewards/format_reward": 0.16813181340694427, + "rewards/judgement_reward": 0.7308735251426697, + "rewards/token_reward": 0.025054944679141045, + "step": 111 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.05494689941406, + "epoch": 0.5463414634146342, + "grad_norm": 2.384945004137537, + "kl": 0.0498046875, + "learning_rate": 1e-06, + "loss": 0.0406, + "reward": 0.934312105178833, + "reward_std": 0.25367388129234314, + "rewards/format_reward": 0.1758241057395935, + "rewards/judgement_reward": 0.7368395924568176, + "rewards/token_reward": 0.021648351103067398, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.47802734375, + "epoch": 0.551219512195122, + "grad_norm": 2.4494451662329446, + "kl": 0.04150390625, + "learning_rate": 1e-06, + "loss": -0.0659, + "reward": 0.9666280746459961, + "reward_std": 0.26005613803863525, + "rewards/format_reward": 0.176923006772995, + "rewards/judgement_reward": 0.7433313131332397, + "rewards/token_reward": 0.046373624354600906, + "step": 113 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.86264038085938, + "epoch": 0.5560975609756098, + "grad_norm": 1.8974487896916954, + "kl": 0.17578125, + "learning_rate": 1e-06, + "loss": -0.0702, + "reward": 1.1044301986694336, + "reward_std": 0.20463985204696655, + "rewards/format_reward": 0.18461531400680542, + "rewards/judgement_reward": 0.8778916597366333, + "rewards/token_reward": 0.04192307963967323, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.41758728027344, + "epoch": 0.5609756097560976, + "grad_norm": 2.0736124082447858, + "kl": 0.055908203125, + "learning_rate": 1e-06, + "loss": -0.0178, + "reward": 0.45087727904319763, + "reward_std": 0.16333483159542084, + "rewards/format_reward": 0.12087910622358322, + "rewards/judgement_reward": 0.3127453625202179, + "rewards/token_reward": 0.01725274696946144, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.31318664550781, + "epoch": 0.5658536585365853, + "grad_norm": 1.9880834210174607, + "kl": 0.265625, + "learning_rate": 1e-06, + "loss": -0.0914, + "reward": 1.0456217527389526, + "reward_std": 0.1933256834745407, + "rewards/format_reward": 0.17912080883979797, + "rewards/judgement_reward": 0.8432590365409851, + "rewards/token_reward": 0.023241758346557617, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.79670715332031, + "epoch": 0.5707317073170731, + "grad_norm": 2.0598303213859634, + "kl": 0.03173828125, + "learning_rate": 1e-06, + "loss": -0.0638, + "reward": 1.015529751777649, + "reward_std": 0.21438416838645935, + "rewards/format_reward": 0.1769230216741562, + "rewards/judgement_reward": 0.8019583821296692, + "rewards/token_reward": 0.03664834797382355, + "step": 117 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.15384674072266, + "epoch": 0.5756097560975609, + "grad_norm": 2.1302776398753327, + "kl": 0.0322265625, + "learning_rate": 1e-06, + "loss": -0.1029, + "reward": 1.0583690404891968, + "reward_std": 0.16479381918907166, + "rewards/format_reward": 0.1857142299413681, + "rewards/judgement_reward": 0.8432590365409851, + "rewards/token_reward": 0.029395602643489838, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.35165405273438, + "epoch": 0.5804878048780487, + "grad_norm": 2.139306907057807, + "kl": 0.040771484375, + "learning_rate": 1e-06, + "loss": -0.0908, + "reward": 0.9103569984436035, + "reward_std": 0.2012585699558258, + "rewards/format_reward": 0.1659339964389801, + "rewards/judgement_reward": 0.7116758227348328, + "rewards/token_reward": 0.03274725377559662, + "step": 119 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.20879364013672, + "epoch": 0.5853658536585366, + "grad_norm": 1.8649093478118766, + "kl": 0.04931640625, + "learning_rate": 1e-06, + "loss": -0.0399, + "reward": 0.9918122887611389, + "reward_std": 0.2688324749469757, + "rewards/format_reward": 0.176923006772995, + "rewards/judgement_reward": 0.7671419978141785, + "rewards/token_reward": 0.04774724692106247, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.04396057128906, + "epoch": 0.5902439024390244, + "grad_norm": 2.0906208019011614, + "kl": 0.05322265625, + "learning_rate": 1e-06, + "loss": -0.084, + "reward": 0.9650415778160095, + "reward_std": 0.21533620357513428, + "rewards/format_reward": 0.17362630367279053, + "rewards/judgement_reward": 0.7517996430397034, + "rewards/token_reward": 0.03961538150906563, + "step": 121 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.62088012695312, + "epoch": 0.5951219512195122, + "grad_norm": 2.6404896615388513, + "kl": 0.039306640625, + "learning_rate": 1e-06, + "loss": -0.0209, + "reward": 0.8149644732475281, + "reward_std": 0.21845516562461853, + "rewards/format_reward": 0.15824170410633087, + "rewards/judgement_reward": 0.6252389550209045, + "rewards/token_reward": 0.03148351609706879, + "step": 122 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.19230651855469, + "epoch": 0.6, + "grad_norm": 2.137456730046166, + "kl": 0.05419921875, + "learning_rate": 1e-06, + "loss": 0.0122, + "reward": 0.7231975197792053, + "reward_std": 0.2408137172460556, + "rewards/format_reward": 0.15274719893932343, + "rewards/judgement_reward": 0.5451754331588745, + "rewards/token_reward": 0.025274725630879402, + "step": 123 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.81318664550781, + "epoch": 0.6048780487804878, + "grad_norm": 1.8602453492738675, + "kl": 0.03662109375, + "learning_rate": 1e-06, + "loss": -0.0347, + "reward": 0.9557329416275024, + "reward_std": 0.21061010658740997, + "rewards/format_reward": 0.17472520470619202, + "rewards/judgement_reward": 0.7449086904525757, + "rewards/token_reward": 0.0360989011824131, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.18681335449219, + "epoch": 0.6097560975609756, + "grad_norm": 1.8947771050267905, + "kl": 0.04736328125, + "learning_rate": 1e-06, + "loss": -0.0427, + "reward": 0.8908948302268982, + "reward_std": 0.21556268632411957, + "rewards/format_reward": 0.16703291237354279, + "rewards/judgement_reward": 0.688202440738678, + "rewards/token_reward": 0.035659339278936386, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.83516693115234, + "epoch": 0.6146341463414634, + "grad_norm": 1.99350313429728, + "kl": 0.0693359375, + "learning_rate": 1e-06, + "loss": -0.0231, + "reward": 0.8506399393081665, + "reward_std": 0.24161569774150848, + "rewards/format_reward": 0.1648351103067398, + "rewards/judgement_reward": 0.6592661738395691, + "rewards/token_reward": 0.026538461446762085, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.53845977783203, + "epoch": 0.6195121951219512, + "grad_norm": 1.7998816564546938, + "kl": 0.0361328125, + "learning_rate": 1e-06, + "loss": -0.1204, + "reward": 1.039487600326538, + "reward_std": 0.2013106495141983, + "rewards/format_reward": 0.17912080883979797, + "rewards/judgement_reward": 0.8210808038711548, + "rewards/token_reward": 0.03928571194410324, + "step": 127 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.9120864868164, + "epoch": 0.624390243902439, + "grad_norm": 1.8452835234198748, + "kl": 0.0390625, + "learning_rate": 1e-06, + "loss": -0.12, + "reward": 0.9250213503837585, + "reward_std": 0.2341793179512024, + "rewards/format_reward": 0.17142850160598755, + "rewards/judgement_reward": 0.7072190642356873, + "rewards/token_reward": 0.046373624354600906, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.82967376708984, + "epoch": 0.6292682926829268, + "grad_norm": 1.9489042143231339, + "kl": 0.0311279296875, + "learning_rate": 1e-06, + "loss": -0.0785, + "reward": 0.994548499584198, + "reward_std": 0.21068567037582397, + "rewards/format_reward": 0.18021972477436066, + "rewards/judgement_reward": 0.7781199812889099, + "rewards/token_reward": 0.03620879352092743, + "step": 129 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.79670715332031, + "epoch": 0.6341463414634146, + "grad_norm": 1.9478743633543802, + "kl": 0.041259765625, + "learning_rate": 1e-06, + "loss": 0.0111, + "reward": 0.8490137457847595, + "reward_std": 0.2522919774055481, + "rewards/format_reward": 0.16373620927333832, + "rewards/judgement_reward": 0.6532995104789734, + "rewards/token_reward": 0.031978022307157516, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.68681335449219, + "epoch": 0.6390243902439025, + "grad_norm": 2.085728564809279, + "kl": 0.04150390625, + "learning_rate": 1e-06, + "loss": 0.074, + "reward": 0.5961366295814514, + "reward_std": 0.2355279177427292, + "rewards/format_reward": 0.1439560055732727, + "rewards/judgement_reward": 0.4331147074699402, + "rewards/token_reward": 0.01906593330204487, + "step": 131 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.93406677246094, + "epoch": 0.6439024390243903, + "grad_norm": 1.9504606440875714, + "kl": 0.03857421875, + "learning_rate": 1e-06, + "loss": -0.0681, + "reward": 0.9742907881736755, + "reward_std": 0.21939148008823395, + "rewards/format_reward": 0.17802190780639648, + "rewards/judgement_reward": 0.7539612650871277, + "rewards/token_reward": 0.042307693511247635, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.23077392578125, + "epoch": 0.6487804878048781, + "grad_norm": 2.1860995545870168, + "kl": 0.10595703125, + "learning_rate": 1e-06, + "loss": -0.0478, + "reward": 0.8597061634063721, + "reward_std": 0.1860429346561432, + "rewards/format_reward": 0.16043950617313385, + "rewards/judgement_reward": 0.6846511363983154, + "rewards/token_reward": 0.014615383930504322, + "step": 133 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.5824203491211, + "epoch": 0.6536585365853659, + "grad_norm": 2.056976443684514, + "kl": 0.04736328125, + "learning_rate": 1e-06, + "loss": -0.0996, + "reward": 1.0542576313018799, + "reward_std": 0.1535356342792511, + "rewards/format_reward": 0.18241751194000244, + "rewards/judgement_reward": 0.8472796678543091, + "rewards/token_reward": 0.024560438469052315, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.73077392578125, + "epoch": 0.6585365853658537, + "grad_norm": 2.354640541368104, + "kl": 0.034912109375, + "learning_rate": 1e-06, + "loss": -0.0639, + "reward": 1.033756971359253, + "reward_std": 0.12462829798460007, + "rewards/format_reward": 0.18241751194000244, + "rewards/judgement_reward": 0.8278777003288269, + "rewards/token_reward": 0.023461539298295975, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.63736724853516, + "epoch": 0.6634146341463415, + "grad_norm": 1.8711434115369097, + "kl": 0.2275390625, + "learning_rate": 1e-06, + "loss": -0.0747, + "reward": 0.8162251710891724, + "reward_std": 0.23913460969924927, + "rewards/format_reward": 0.16703291237354279, + "rewards/judgement_reward": 0.6229832768440247, + "rewards/token_reward": 0.02620879001915455, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.0054931640625, + "epoch": 0.6682926829268293, + "grad_norm": 2.170766069792843, + "kl": 0.158203125, + "learning_rate": 1e-06, + "loss": -0.0001, + "reward": 0.5743477940559387, + "reward_std": 0.2055026888847351, + "rewards/format_reward": 0.13736261427402496, + "rewards/judgement_reward": 0.4148424565792084, + "rewards/token_reward": 0.02214285545051098, + "step": 137 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.80220031738281, + "epoch": 0.6731707317073171, + "grad_norm": 2.249511172978621, + "kl": 0.040771484375, + "learning_rate": 1e-06, + "loss": -0.0299, + "reward": 0.9537181258201599, + "reward_std": 0.2547648847103119, + "rewards/format_reward": 0.17802190780639648, + "rewards/judgement_reward": 0.7325092554092407, + "rewards/token_reward": 0.04318681359291077, + "step": 138 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.71428680419922, + "epoch": 0.6780487804878049, + "grad_norm": 2.4468786224687973, + "kl": 0.39453125, + "learning_rate": 1e-06, + "loss": -0.0937, + "reward": 0.8086802959442139, + "reward_std": 0.2142382711172104, + "rewards/format_reward": 0.15824170410633087, + "rewards/judgement_reward": 0.6207680702209473, + "rewards/token_reward": 0.02967032790184021, + "step": 139 + }, + { + "clip_ratio": 0.0, + "completion_length": 71.93955993652344, + "epoch": 0.6829268292682927, + "grad_norm": 2.5243974633393496, + "kl": 0.046630859375, + "learning_rate": 1e-06, + "loss": -0.1273, + "reward": 0.9722238183021545, + "reward_std": 0.18802900612354279, + "rewards/format_reward": 0.18131862580776215, + "rewards/judgement_reward": 0.768817126750946, + "rewards/token_reward": 0.022087909281253815, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.4835205078125, + "epoch": 0.6878048780487804, + "grad_norm": 2.7374184261837455, + "kl": 0.05029296875, + "learning_rate": 1e-06, + "loss": -0.0257, + "reward": 0.9533888101577759, + "reward_std": 0.19217993319034576, + "rewards/format_reward": 0.17362630367279053, + "rewards/judgement_reward": 0.7552569508552551, + "rewards/token_reward": 0.02450549229979515, + "step": 141 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.15384674072266, + "epoch": 0.6926829268292682, + "grad_norm": 1.882857912628275, + "kl": 0.048583984375, + "learning_rate": 1e-06, + "loss": -0.1109, + "reward": 0.9248565435409546, + "reward_std": 0.22721460461616516, + "rewards/format_reward": 0.17032961547374725, + "rewards/judgement_reward": 0.7072190642356873, + "rewards/token_reward": 0.04730769246816635, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.5879135131836, + "epoch": 0.697560975609756, + "grad_norm": 1.9696456406180098, + "kl": 0.0498046875, + "learning_rate": 1e-06, + "loss": -0.0738, + "reward": 1.0071877241134644, + "reward_std": 0.24609951674938202, + "rewards/format_reward": 0.18021969497203827, + "rewards/judgement_reward": 0.775978684425354, + "rewards/token_reward": 0.05098900571465492, + "step": 143 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.44506072998047, + "epoch": 0.7024390243902439, + "grad_norm": 1.9874995933520714, + "kl": 0.046142578125, + "learning_rate": 1e-06, + "loss": -0.0414, + "reward": 0.9923238754272461, + "reward_std": 0.2381928414106369, + "rewards/format_reward": 0.17252740263938904, + "rewards/judgement_reward": 0.7843018770217896, + "rewards/token_reward": 0.03549450263381004, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.11538696289062, + "epoch": 0.7073170731707317, + "grad_norm": 2.170568236496205, + "kl": 0.044921875, + "learning_rate": 1e-06, + "loss": -0.0286, + "reward": 1.019083857536316, + "reward_std": 0.2215217649936676, + "rewards/format_reward": 0.18131862580776215, + "rewards/judgement_reward": 0.8043037056922913, + "rewards/token_reward": 0.03346153721213341, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.14286041259766, + "epoch": 0.7121951219512195, + "grad_norm": 1.9765889361911162, + "kl": 0.056640625, + "learning_rate": 1e-06, + "loss": -0.0658, + "reward": 0.6368016004562378, + "reward_std": 0.19664892554283142, + "rewards/format_reward": 0.1450549066066742, + "rewards/judgement_reward": 0.4633399248123169, + "rewards/token_reward": 0.028406593948602676, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.33516693115234, + "epoch": 0.7170731707317073, + "grad_norm": 1.749209988774624, + "kl": 0.0537109375, + "learning_rate": 1e-06, + "loss": -0.0663, + "reward": 0.962835967540741, + "reward_std": 0.2308100312948227, + "rewards/format_reward": 0.17252741754055023, + "rewards/judgement_reward": 0.7419567704200745, + "rewards/token_reward": 0.04835164546966553, + "step": 147 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.8901138305664, + "epoch": 0.7219512195121951, + "grad_norm": 2.0335137757386077, + "kl": 0.091796875, + "learning_rate": 1e-06, + "loss": -0.0357, + "reward": 0.9629313945770264, + "reward_std": 0.20034556090831757, + "rewards/format_reward": 0.18131862580776215, + "rewards/judgement_reward": 0.7572720646858215, + "rewards/token_reward": 0.024340655654668808, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.15933990478516, + "epoch": 0.7268292682926829, + "grad_norm": 1.7647654238864119, + "kl": 0.051025390625, + "learning_rate": 1e-06, + "loss": -0.0586, + "reward": 0.9167026877403259, + "reward_std": 0.1917964667081833, + "rewards/format_reward": 0.16373620927333832, + "rewards/judgement_reward": 0.7133510112762451, + "rewards/token_reward": 0.03961538150906563, + "step": 149 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.3901138305664, + "epoch": 0.7317073170731707, + "grad_norm": 1.9248149745795777, + "kl": 0.047119140625, + "learning_rate": 1e-06, + "loss": -0.1216, + "reward": 0.8943402767181396, + "reward_std": 0.21739540994167328, + "rewards/format_reward": 0.16813181340694427, + "rewards/judgement_reward": 0.6763730645179749, + "rewards/token_reward": 0.04983516409993172, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.5714340209961, + "epoch": 0.7365853658536585, + "grad_norm": 2.0080133802993836, + "kl": 0.056884765625, + "learning_rate": 1e-06, + "loss": -0.0591, + "reward": 0.967893660068512, + "reward_std": 0.24371013045310974, + "rewards/format_reward": 0.17252740263938904, + "rewards/judgement_reward": 0.7508604526519775, + "rewards/token_reward": 0.04450549930334091, + "step": 151 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.76374053955078, + "epoch": 0.7414634146341463, + "grad_norm": 1.923607313408308, + "kl": 0.044189453125, + "learning_rate": 1e-06, + "loss": -0.0686, + "reward": 0.9229432940483093, + "reward_std": 0.23856668174266815, + "rewards/format_reward": 0.16923069953918457, + "rewards/judgement_reward": 0.7147566676139832, + "rewards/token_reward": 0.03895604610443115, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.60989379882812, + "epoch": 0.7463414634146341, + "grad_norm": 1.9096459984701761, + "kl": 0.043212890625, + "learning_rate": 1e-06, + "loss": -0.0566, + "reward": 1.0493299961090088, + "reward_std": 0.22466498613357544, + "rewards/format_reward": 0.18461531400680542, + "rewards/judgement_reward": 0.8225165605545044, + "rewards/token_reward": 0.04219780117273331, + "step": 153 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.5988998413086, + "epoch": 0.751219512195122, + "grad_norm": 1.9835076997357493, + "kl": 0.04150390625, + "learning_rate": 1e-06, + "loss": -0.0931, + "reward": 0.8906086683273315, + "reward_std": 0.14505840837955475, + "rewards/format_reward": 0.1648351103067398, + "rewards/judgement_reward": 0.7020372152328491, + "rewards/token_reward": 0.023736264556646347, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.36264038085938, + "epoch": 0.7560975609756098, + "grad_norm": 1.840974141351389, + "kl": 0.044921875, + "learning_rate": 1e-06, + "loss": -0.091, + "reward": 1.1236339807510376, + "reward_std": 0.17279954254627228, + "rewards/format_reward": 0.1868131309747696, + "rewards/judgement_reward": 0.8986337780952454, + "rewards/token_reward": 0.03818681463599205, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.02747344970703, + "epoch": 0.7609756097560976, + "grad_norm": 1.963792827769517, + "kl": 0.049560546875, + "learning_rate": 1e-06, + "loss": -0.0361, + "reward": 0.855387270450592, + "reward_std": 0.22867831587791443, + "rewards/format_reward": 0.16153840720653534, + "rewards/judgement_reward": 0.6642332077026367, + "rewards/token_reward": 0.029615381732583046, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.95055389404297, + "epoch": 0.7658536585365854, + "grad_norm": 1.9306677990628487, + "kl": 0.04736328125, + "learning_rate": 1e-06, + "loss": -0.0329, + "reward": 0.9254786372184753, + "reward_std": 0.23398159444332123, + "rewards/format_reward": 0.17032961547374725, + "rewards/judgement_reward": 0.7126215100288391, + "rewards/token_reward": 0.042527470737695694, + "step": 157 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.71428680419922, + "epoch": 0.7707317073170732, + "grad_norm": 1.9488316654243145, + "kl": 0.04541015625, + "learning_rate": 1e-06, + "loss": -0.0659, + "reward": 1.0084419250488281, + "reward_std": 0.18995091319084167, + "rewards/format_reward": 0.18351641297340393, + "rewards/judgement_reward": 0.7943759560585022, + "rewards/token_reward": 0.03054944798350334, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.46154022216797, + "epoch": 0.775609756097561, + "grad_norm": 2.4866962371762926, + "kl": 0.107421875, + "learning_rate": 1e-06, + "loss": 0.0118, + "reward": 1.0291651487350464, + "reward_std": 0.22909829020500183, + "rewards/format_reward": 0.17032960057258606, + "rewards/judgement_reward": 0.8225165605545044, + "rewards/token_reward": 0.03631868213415146, + "step": 159 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.51099395751953, + "epoch": 0.7804878048780488, + "grad_norm": 2.1399222908924966, + "kl": 0.035888671875, + "learning_rate": 1e-06, + "loss": -0.1147, + "reward": 1.0689184665679932, + "reward_std": 0.2075626105070114, + "rewards/format_reward": 0.17912080883979797, + "rewards/judgement_reward": 0.8432589173316956, + "rewards/token_reward": 0.04653845727443695, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.28571319580078, + "epoch": 0.7853658536585366, + "grad_norm": 2.0654519129717954, + "kl": 0.038818359375, + "learning_rate": 1e-06, + "loss": -0.0747, + "reward": 1.0535253286361694, + "reward_std": 0.1914973109960556, + "rewards/format_reward": 0.18021972477436066, + "rewards/judgement_reward": 0.8373165726661682, + "rewards/token_reward": 0.03598900884389877, + "step": 161 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.79121398925781, + "epoch": 0.7902439024390244, + "grad_norm": 1.8597487323795248, + "kl": 0.037353515625, + "learning_rate": 1e-06, + "loss": -0.0976, + "reward": 0.981767475605011, + "reward_std": 0.15387368202209473, + "rewards/format_reward": 0.17142850160598755, + "rewards/judgement_reward": 0.7899540662765503, + "rewards/token_reward": 0.020384615287184715, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.78571319580078, + "epoch": 0.7951219512195122, + "grad_norm": 1.8877094518624145, + "kl": 0.0517578125, + "learning_rate": 1e-06, + "loss": -0.0441, + "reward": 0.611198365688324, + "reward_std": 0.23072822391986847, + "rewards/format_reward": 0.14065930247306824, + "rewards/judgement_reward": 0.431198388338089, + "rewards/token_reward": 0.03934066370129585, + "step": 163 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.0054931640625, + "epoch": 0.8, + "grad_norm": 1.7914504262564466, + "kl": 0.049072265625, + "learning_rate": 1e-06, + "loss": -0.0773, + "reward": 1.0531319379806519, + "reward_std": 0.203482985496521, + "rewards/format_reward": 0.1758241206407547, + "rewards/judgement_reward": 0.8310439586639404, + "rewards/token_reward": 0.04626372829079628, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.20330047607422, + "epoch": 0.8048780487804879, + "grad_norm": 1.8591700696334572, + "kl": 0.03662109375, + "learning_rate": 1e-06, + "loss": -0.0489, + "reward": 1.1166560649871826, + "reward_std": 0.1697649508714676, + "rewards/format_reward": 0.1857142299413681, + "rewards/judgement_reward": 0.8986338973045349, + "rewards/token_reward": 0.032307688146829605, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.22527313232422, + "epoch": 0.8097560975609757, + "grad_norm": 2.214129107678454, + "kl": 0.052978515625, + "learning_rate": 1e-06, + "loss": 0.0263, + "reward": 0.6829060912132263, + "reward_std": 0.25708603858947754, + "rewards/format_reward": 0.14285710453987122, + "rewards/judgement_reward": 0.5113676190376282, + "rewards/token_reward": 0.0286813173443079, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.74725341796875, + "epoch": 0.8146341463414634, + "grad_norm": 1.8131229003189935, + "kl": 0.048095703125, + "learning_rate": 1e-06, + "loss": -0.0818, + "reward": 0.8743994832038879, + "reward_std": 0.16046012938022614, + "rewards/format_reward": 0.1648351103067398, + "rewards/judgement_reward": 0.672916054725647, + "rewards/token_reward": 0.03664834797382355, + "step": 167 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.55494689941406, + "epoch": 0.8195121951219512, + "grad_norm": 1.8955641226851798, + "kl": 0.0556640625, + "learning_rate": 1e-06, + "loss": -0.0973, + "reward": 1.104522705078125, + "reward_std": 0.19338937103748322, + "rewards/format_reward": 0.18241751194000244, + "rewards/judgement_reward": 0.8802368640899658, + "rewards/token_reward": 0.04186813160777092, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.85165405273438, + "epoch": 0.824390243902439, + "grad_norm": 1.7813215626916248, + "kl": 0.05029296875, + "learning_rate": 1e-06, + "loss": -0.0779, + "reward": 0.8853285312652588, + "reward_std": 0.22625528275966644, + "rewards/format_reward": 0.1659340113401413, + "rewards/judgement_reward": 0.6722515225410461, + "rewards/token_reward": 0.04714285209774971, + "step": 169 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.70879364013672, + "epoch": 0.8292682926829268, + "grad_norm": 1.8224025120445444, + "kl": 0.05224609375, + "learning_rate": 1e-06, + "loss": -0.1272, + "reward": 1.0830038785934448, + "reward_std": 0.1721232831478119, + "rewards/format_reward": 0.18131862580776215, + "rewards/judgement_reward": 0.858058750629425, + "rewards/token_reward": 0.043626368045806885, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.5934066772461, + "epoch": 0.8341463414634146, + "grad_norm": 1.8203230311858847, + "kl": 0.050048828125, + "learning_rate": 1e-06, + "loss": -0.1045, + "reward": 1.1872518062591553, + "reward_std": 0.12512442469596863, + "rewards/format_reward": 0.1879120171070099, + "rewards/judgement_reward": 0.9585154056549072, + "rewards/token_reward": 0.040824174880981445, + "step": 171 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.5988998413086, + "epoch": 0.8390243902439024, + "grad_norm": 1.8580888818259809, + "kl": 0.06591796875, + "learning_rate": 1e-06, + "loss": 0.012, + "reward": 0.7364369034767151, + "reward_std": 0.24575328826904297, + "rewards/format_reward": 0.14945051074028015, + "rewards/judgement_reward": 0.5472610592842102, + "rewards/token_reward": 0.039725273847579956, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.8901138305664, + "epoch": 0.8439024390243902, + "grad_norm": 1.9076522374281615, + "kl": 0.0673828125, + "learning_rate": 1e-06, + "loss": -0.0474, + "reward": 0.700067400932312, + "reward_std": 0.17015255987644196, + "rewards/format_reward": 0.15714280307292938, + "rewards/judgement_reward": 0.5104518532752991, + "rewards/token_reward": 0.03247252479195595, + "step": 173 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.93406677246094, + "epoch": 0.848780487804878, + "grad_norm": 1.9086822135882167, + "kl": 0.06396484375, + "learning_rate": 1e-06, + "loss": -0.099, + "reward": 0.841740071773529, + "reward_std": 0.18049749732017517, + "rewards/format_reward": 0.16263730823993683, + "rewards/judgement_reward": 0.6345422863960266, + "rewards/token_reward": 0.04456043988466263, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.34066009521484, + "epoch": 0.8536585365853658, + "grad_norm": 1.8635805588199053, + "kl": 0.055419921875, + "learning_rate": 1e-06, + "loss": -0.0344, + "reward": 0.9539201855659485, + "reward_std": 0.20264722406864166, + "rewards/format_reward": 0.16813179850578308, + "rewards/judgement_reward": 0.7532058954238892, + "rewards/token_reward": 0.03258241340517998, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.26374053955078, + "epoch": 0.8585365853658536, + "grad_norm": 2.0330839117251336, + "kl": 0.361328125, + "learning_rate": 1e-06, + "loss": -0.0225, + "reward": 0.918322741985321, + "reward_std": 0.2709953188896179, + "rewards/format_reward": 0.17802190780639648, + "rewards/judgement_reward": 0.706509530544281, + "rewards/token_reward": 0.033791206777095795, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.0824203491211, + "epoch": 0.8634146341463415, + "grad_norm": 1.7553145835440749, + "kl": 0.04833984375, + "learning_rate": 1e-06, + "loss": -0.0939, + "reward": 1.1063908338546753, + "reward_std": 0.17723402380943298, + "rewards/format_reward": 0.18241752684116364, + "rewards/judgement_reward": 0.8802370429039001, + "rewards/token_reward": 0.04373626038432121, + "step": 177 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.05494689941406, + "epoch": 0.8682926829268293, + "grad_norm": 1.7701802023624165, + "kl": 0.0478515625, + "learning_rate": 1e-06, + "loss": -0.0524, + "reward": 1.1443761587142944, + "reward_std": 0.151431143283844, + "rewards/format_reward": 0.18791203200817108, + "rewards/judgement_reward": 0.9193762540817261, + "rewards/token_reward": 0.03708790987730026, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.87911987304688, + "epoch": 0.8731707317073171, + "grad_norm": 1.948906301032531, + "kl": 0.04833984375, + "learning_rate": 1e-06, + "loss": -0.0406, + "reward": 1.0914630889892578, + "reward_std": 0.17707206308841705, + "rewards/format_reward": 0.1857142299413681, + "rewards/judgement_reward": 0.8778916597366333, + "rewards/token_reward": 0.027857141569256783, + "step": 179 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.11538696289062, + "epoch": 0.8780487804878049, + "grad_norm": 1.9707362441929812, + "kl": 0.041748046875, + "learning_rate": 1e-06, + "loss": -0.0719, + "reward": 0.8997630476951599, + "reward_std": 0.19621287286281586, + "rewards/format_reward": 0.17252741754055023, + "rewards/judgement_reward": 0.6895432472229004, + "rewards/token_reward": 0.03769230842590332, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.06593322753906, + "epoch": 0.8829268292682927, + "grad_norm": 2.1153280959093315, + "kl": 0.04931640625, + "learning_rate": 1e-06, + "loss": -0.1035, + "reward": 0.8650724291801453, + "reward_std": 0.18447832763195038, + "rewards/format_reward": 0.16153840720653534, + "rewards/judgement_reward": 0.6753472089767456, + "rewards/token_reward": 0.02818680927157402, + "step": 181 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.80220031738281, + "epoch": 0.8878048780487805, + "grad_norm": 1.8505179859150593, + "kl": 0.10546875, + "learning_rate": 1e-06, + "loss": -0.1083, + "reward": 1.0732321739196777, + "reward_std": 0.15069392323493958, + "rewards/format_reward": 0.18131862580776215, + "rewards/judgement_reward": 0.8640012145042419, + "rewards/token_reward": 0.027912087738513947, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.72527313232422, + "epoch": 0.8926829268292683, + "grad_norm": 1.7933498219789568, + "kl": 0.1640625, + "learning_rate": 1e-06, + "loss": -0.0539, + "reward": 1.0424714088439941, + "reward_std": 0.25467538833618164, + "rewards/format_reward": 0.18351641297340393, + "rewards/judgement_reward": 0.8041195869445801, + "rewards/token_reward": 0.05483516305685043, + "step": 183 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.67582702636719, + "epoch": 0.8975609756097561, + "grad_norm": 2.600363026488421, + "kl": 0.04931640625, + "learning_rate": 1e-06, + "loss": -0.0476, + "reward": 0.755411684513092, + "reward_std": 0.23074375092983246, + "rewards/format_reward": 0.15714281797409058, + "rewards/judgement_reward": 0.5580490231513977, + "rewards/token_reward": 0.040219780057668686, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.43955993652344, + "epoch": 0.9024390243902439, + "grad_norm": 1.7627204492703208, + "kl": 0.0537109375, + "learning_rate": 1e-06, + "loss": -0.0844, + "reward": 1.1599979400634766, + "reward_std": 0.17514334619045258, + "rewards/format_reward": 0.18901091814041138, + "rewards/judgement_reward": 0.9170307517051697, + "rewards/token_reward": 0.0539560429751873, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.25824737548828, + "epoch": 0.9073170731707317, + "grad_norm": 2.0241371886975035, + "kl": 0.05517578125, + "learning_rate": 1e-06, + "loss": -0.1234, + "reward": 1.0605839490890503, + "reward_std": 0.17436476051807404, + "rewards/format_reward": 0.18131862580776215, + "rewards/judgement_reward": 0.8380013704299927, + "rewards/token_reward": 0.04126373305916786, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.9011001586914, + "epoch": 0.9121951219512195, + "grad_norm": 1.7110207760725646, + "kl": 0.055419921875, + "learning_rate": 1e-06, + "loss": -0.0842, + "reward": 0.7558942437171936, + "reward_std": 0.18241019546985626, + "rewards/format_reward": 0.1549450010061264, + "rewards/judgement_reward": 0.5489163398742676, + "rewards/token_reward": 0.052032966166734695, + "step": 187 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.96154022216797, + "epoch": 0.9170731707317074, + "grad_norm": 1.7123226348250025, + "kl": 0.26171875, + "learning_rate": 1e-06, + "loss": -0.1222, + "reward": 0.8997620940208435, + "reward_std": 0.2118684947490692, + "rewards/format_reward": 0.17142850160598755, + "rewards/judgement_reward": 0.6890478134155273, + "rewards/token_reward": 0.03928571194410324, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.17033386230469, + "epoch": 0.9219512195121952, + "grad_norm": 1.830366489112101, + "kl": 0.060302734375, + "learning_rate": 1e-06, + "loss": -0.0873, + "reward": 1.1136815547943115, + "reward_std": 0.16834889352321625, + "rewards/format_reward": 0.18131859600543976, + "rewards/judgement_reward": 0.897197961807251, + "rewards/token_reward": 0.035164833068847656, + "step": 189 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.31318664550781, + "epoch": 0.926829268292683, + "grad_norm": 1.6347844224064585, + "kl": 0.059814453125, + "learning_rate": 1e-06, + "loss": -0.0761, + "reward": 1.1308867931365967, + "reward_std": 0.17480330169200897, + "rewards/format_reward": 0.18241752684116364, + "rewards/judgement_reward": 0.8986338973045349, + "rewards/token_reward": 0.04983516409993172, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.97802734375, + "epoch": 0.9317073170731708, + "grad_norm": 1.6920004921526572, + "kl": 0.05615234375, + "learning_rate": 1e-06, + "loss": -0.0895, + "reward": 1.003623604774475, + "reward_std": 0.17075157165527344, + "rewards/format_reward": 0.17472520470619202, + "rewards/judgement_reward": 0.7810961008071899, + "rewards/token_reward": 0.04780219867825508, + "step": 191 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.5714340209961, + "epoch": 0.9365853658536586, + "grad_norm": 2.022501703444944, + "kl": 0.08837890625, + "learning_rate": 1e-06, + "loss": -0.039, + "reward": 0.8640207648277283, + "reward_std": 0.23843564093112946, + "rewards/format_reward": 0.16043950617313385, + "rewards/judgement_reward": 0.6474823355674744, + "rewards/token_reward": 0.056098904460668564, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.76923370361328, + "epoch": 0.9414634146341463, + "grad_norm": 1.6540302758951702, + "kl": 0.058349609375, + "learning_rate": 1e-06, + "loss": -0.0698, + "reward": 1.0908318758010864, + "reward_std": 0.18835541605949402, + "rewards/format_reward": 0.18241751194000244, + "rewards/judgement_reward": 0.8616559505462646, + "rewards/token_reward": 0.046758245676755905, + "step": 193 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.17033386230469, + "epoch": 0.9463414634146341, + "grad_norm": 2.029218121152698, + "kl": 0.06591796875, + "learning_rate": 1e-06, + "loss": 0.0222, + "reward": 0.7869437336921692, + "reward_std": 0.28270089626312256, + "rewards/format_reward": 0.16263730823993683, + "rewards/judgement_reward": 0.5819985866546631, + "rewards/token_reward": 0.042307689785957336, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.43955993652344, + "epoch": 0.9512195121951219, + "grad_norm": 1.9170587116508577, + "kl": 0.061767578125, + "learning_rate": 1e-06, + "loss": -0.0761, + "reward": 1.1304645538330078, + "reward_std": 0.1845540553331375, + "rewards/format_reward": 0.18791203200817108, + "rewards/judgement_reward": 0.8962885737419128, + "rewards/token_reward": 0.046263739466667175, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.04396057128906, + "epoch": 0.9560975609756097, + "grad_norm": 1.7642610809126764, + "kl": 0.053466796875, + "learning_rate": 1e-06, + "loss": -0.0974, + "reward": 1.0226733684539795, + "reward_std": 0.23680001497268677, + "rewards/format_reward": 0.18241751194000244, + "rewards/judgement_reward": 0.7795965075492859, + "rewards/token_reward": 0.06065933778882027, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.42857360839844, + "epoch": 0.9609756097560975, + "grad_norm": 1.8138413422032649, + "kl": 0.05712890625, + "learning_rate": 1e-06, + "loss": -0.0532, + "reward": 0.8977496027946472, + "reward_std": 0.20727433264255524, + "rewards/format_reward": 0.17032960057258606, + "rewards/judgement_reward": 0.6894528269767761, + "rewards/token_reward": 0.03796703368425369, + "step": 197 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.25274658203125, + "epoch": 0.9658536585365853, + "grad_norm": 1.896125990017999, + "kl": 0.053955078125, + "learning_rate": 1e-06, + "loss": -0.0677, + "reward": 0.8915479183197021, + "reward_std": 0.2226872593164444, + "rewards/format_reward": 0.16153840720653534, + "rewards/judgement_reward": 0.6870973110198975, + "rewards/token_reward": 0.042912084609270096, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.41758728027344, + "epoch": 0.9707317073170731, + "grad_norm": 1.854570297914855, + "kl": 0.048828125, + "learning_rate": 1e-06, + "loss": -0.0345, + "reward": 0.7600922584533691, + "reward_std": 0.22392849624156952, + "rewards/format_reward": 0.1560439020395279, + "rewards/judgement_reward": 0.575421929359436, + "rewards/token_reward": 0.028626374900341034, + "step": 199 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.27472686767578, + "epoch": 0.975609756097561, + "grad_norm": 1.8461878166271504, + "kl": 0.05322265625, + "learning_rate": 1e-06, + "loss": -0.0727, + "reward": 0.5569639205932617, + "reward_std": 0.14818796515464783, + "rewards/format_reward": 0.1340659111738205, + "rewards/judgement_reward": 0.3882276713848114, + "rewards/token_reward": 0.034670326858758926, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.75824737548828, + "epoch": 0.9804878048780488, + "grad_norm": 2.318927168904253, + "kl": 0.056396484375, + "learning_rate": 1e-06, + "loss": 0.0058, + "reward": 0.5673214197158813, + "reward_std": 0.23380212485790253, + "rewards/format_reward": 0.14285710453987122, + "rewards/judgement_reward": 0.393749862909317, + "rewards/token_reward": 0.030714284628629684, + "step": 201 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.66484069824219, + "epoch": 0.9853658536585366, + "grad_norm": 1.9516131221905255, + "kl": 0.051513671875, + "learning_rate": 1e-06, + "loss": -0.1456, + "reward": 1.2027462720870972, + "reward_std": 0.13449910283088684, + "rewards/format_reward": 0.19230760633945465, + "rewards/judgement_reward": 0.9585154056549072, + "rewards/token_reward": 0.051923077553510666, + "step": 202 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.9065933227539, + "epoch": 0.9902439024390244, + "grad_norm": 1.9128695892010321, + "kl": 0.052734375, + "learning_rate": 1e-06, + "loss": -0.0288, + "reward": 0.8996109366416931, + "reward_std": 0.25487908720970154, + "rewards/format_reward": 0.17362631857395172, + "rewards/judgement_reward": 0.7061491012573242, + "rewards/token_reward": 0.01983516290783882, + "step": 203 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.14286041259766, + "epoch": 0.9951219512195122, + "grad_norm": 1.7460376924567018, + "kl": 0.058349609375, + "learning_rate": 1e-06, + "loss": -0.0686, + "reward": 0.9356229305267334, + "reward_std": 0.17833828926086426, + "rewards/format_reward": 0.17032961547374725, + "rewards/judgement_reward": 0.7273810505867004, + "rewards/token_reward": 0.03791208565235138, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.55555725097656, + "epoch": 1.0, + "grad_norm": 1.8130933404400773, + "kl": 0.051513671875, + "learning_rate": 1e-06, + "loss": -0.0604, + "reward": 0.9366883635520935, + "reward_std": 0.1746482402086258, + "rewards/format_reward": 0.1560439020395279, + "rewards/judgement_reward": 0.7467983365058899, + "rewards/token_reward": 0.03384615480899811, + "step": 205 + } + ], + "logging_steps": 1, + "max_steps": 205, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 26, + "trial_name": null, + "trial_params": null +}