|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 205, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 71.69230651855469, |
|
"epoch": 0.004878048780487805, |
|
"grad_norm": 1.988409597361304, |
|
"kl": 0.0002803802490234375, |
|
"learning_rate": 5e-08, |
|
"loss": -0.0332, |
|
"reward": 0.38933777809143066, |
|
"reward_std": 0.2539410889148712, |
|
"rewards/format_reward": 0.11098899692296982, |
|
"rewards/judgement_reward": 0.26093122363090515, |
|
"rewards/token_reward": 0.017417579889297485, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 72.14835357666016, |
|
"epoch": 0.00975609756097561, |
|
"grad_norm": 2.131261717353683, |
|
"kl": 0.000675201416015625, |
|
"learning_rate": 1e-07, |
|
"loss": -0.0789, |
|
"reward": 0.3299258053302765, |
|
"reward_std": 0.24830038845539093, |
|
"rewards/format_reward": 0.09890110045671463, |
|
"rewards/judgement_reward": 0.2083873748779297, |
|
"rewards/token_reward": 0.02263736166059971, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 71.86813354492188, |
|
"epoch": 0.014634146341463415, |
|
"grad_norm": 1.9020051525729609, |
|
"kl": 0.00048828125, |
|
"learning_rate": 1.5e-07, |
|
"loss": -0.0115, |
|
"reward": 0.3796335756778717, |
|
"reward_std": 0.2533051073551178, |
|
"rewards/format_reward": 0.09340659528970718, |
|
"rewards/judgement_reward": 0.26880943775177, |
|
"rewards/token_reward": 0.017417583614587784, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 73.22527313232422, |
|
"epoch": 0.01951219512195122, |
|
"grad_norm": 1.926669794956742, |
|
"kl": 0.0002899169921875, |
|
"learning_rate": 2e-07, |
|
"loss": 0.002, |
|
"reward": 0.30424928665161133, |
|
"reward_std": 0.23502430319786072, |
|
"rewards/format_reward": 0.08681320399045944, |
|
"rewards/judgement_reward": 0.1996888816356659, |
|
"rewards/token_reward": 0.01774725317955017, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 69.5824203491211, |
|
"epoch": 0.024390243902439025, |
|
"grad_norm": 1.931358237356469, |
|
"kl": 0.000339508056640625, |
|
"learning_rate": 2.5e-07, |
|
"loss": -0.0284, |
|
"reward": 0.27388590574264526, |
|
"reward_std": 0.22234365344047546, |
|
"rewards/format_reward": 0.09450550377368927, |
|
"rewards/judgement_reward": 0.1600947231054306, |
|
"rewards/token_reward": 0.019285714253783226, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 74.84066009521484, |
|
"epoch": 0.02926829268292683, |
|
"grad_norm": 2.102547432960352, |
|
"kl": 0.000278472900390625, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0118, |
|
"reward": 0.5828344821929932, |
|
"reward_std": 0.27698564529418945, |
|
"rewards/format_reward": 0.11098900437355042, |
|
"rewards/judgement_reward": 0.4510761499404907, |
|
"rewards/token_reward": 0.020769229158759117, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 76.5769271850586, |
|
"epoch": 0.03414634146341464, |
|
"grad_norm": 2.187265324428969, |
|
"kl": 0.00026702880859375, |
|
"learning_rate": 3.5e-07, |
|
"loss": -0.0371, |
|
"reward": 0.38959935307502747, |
|
"reward_std": 0.274240106344223, |
|
"rewards/format_reward": 0.10659340023994446, |
|
"rewards/judgement_reward": 0.25861039757728577, |
|
"rewards/token_reward": 0.024395601823925972, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 74.87362670898438, |
|
"epoch": 0.03902439024390244, |
|
"grad_norm": 1.7011251355501271, |
|
"kl": 0.0003719329833984375, |
|
"learning_rate": 4e-07, |
|
"loss": 0.0269, |
|
"reward": 0.26069721579551697, |
|
"reward_std": 0.22703438997268677, |
|
"rewards/format_reward": 0.0912087932229042, |
|
"rewards/judgement_reward": 0.15185105800628662, |
|
"rewards/token_reward": 0.017637362703680992, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 72.25274658203125, |
|
"epoch": 0.04390243902439024, |
|
"grad_norm": 2.0455198863312387, |
|
"kl": 0.000316619873046875, |
|
"learning_rate": 4.5e-07, |
|
"loss": -0.0821, |
|
"reward": 0.34196677803993225, |
|
"reward_std": 0.25876346230506897, |
|
"rewards/format_reward": 0.09670329838991165, |
|
"rewards/judgement_reward": 0.22526347637176514, |
|
"rewards/token_reward": 0.019999999552965164, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 72.80769348144531, |
|
"epoch": 0.04878048780487805, |
|
"grad_norm": 1.8007886440567025, |
|
"kl": 0.000514984130859375, |
|
"learning_rate": 5e-07, |
|
"loss": -0.0259, |
|
"reward": 0.34238940477371216, |
|
"reward_std": 0.2460326850414276, |
|
"rewards/format_reward": 0.09560439735651016, |
|
"rewards/judgement_reward": 0.23140040040016174, |
|
"rewards/token_reward": 0.015384615398943424, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 69.19780731201172, |
|
"epoch": 0.05365853658536585, |
|
"grad_norm": 1.9673405461994626, |
|
"kl": 0.000698089599609375, |
|
"learning_rate": 5.5e-07, |
|
"loss": -0.0162, |
|
"reward": 0.18868696689605713, |
|
"reward_std": 0.15467074513435364, |
|
"rewards/format_reward": 0.07252748310565948, |
|
"rewards/judgement_reward": 0.10967598855495453, |
|
"rewards/token_reward": 0.006483516190201044, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 72.06593322753906, |
|
"epoch": 0.05853658536585366, |
|
"grad_norm": 1.9439922450073313, |
|
"kl": 0.000568389892578125, |
|
"learning_rate": 6e-07, |
|
"loss": -0.0065, |
|
"reward": 0.23565447330474854, |
|
"reward_std": 0.1971663385629654, |
|
"rewards/format_reward": 0.0912088081240654, |
|
"rewards/judgement_reward": 0.1310391128063202, |
|
"rewards/token_reward": 0.013406592421233654, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 74.83516693115234, |
|
"epoch": 0.06341463414634146, |
|
"grad_norm": 1.7492995284129258, |
|
"kl": 0.00131988525390625, |
|
"learning_rate": 6.5e-07, |
|
"loss": -0.0114, |
|
"reward": 0.4377107322216034, |
|
"reward_std": 0.2658415734767914, |
|
"rewards/format_reward": 0.09670329838991165, |
|
"rewards/judgement_reward": 0.320293128490448, |
|
"rewards/token_reward": 0.02071428671479225, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 76.25274658203125, |
|
"epoch": 0.06829268292682927, |
|
"grad_norm": 1.8852756312045653, |
|
"kl": 0.0023956298828125, |
|
"learning_rate": 7e-07, |
|
"loss": -0.1104, |
|
"reward": 0.25799688696861267, |
|
"reward_std": 0.22852738201618195, |
|
"rewards/format_reward": 0.08461539447307587, |
|
"rewards/judgement_reward": 0.1457441747188568, |
|
"rewards/token_reward": 0.027637362480163574, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 73.5934066772461, |
|
"epoch": 0.07317073170731707, |
|
"grad_norm": 1.8265395690821953, |
|
"kl": 0.003021240234375, |
|
"learning_rate": 7.5e-07, |
|
"loss": 0.0091, |
|
"reward": 0.3250506818294525, |
|
"reward_std": 0.2030942291021347, |
|
"rewards/format_reward": 0.08901099860668182, |
|
"rewards/judgement_reward": 0.2235121876001358, |
|
"rewards/token_reward": 0.012527472339570522, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 76.31318664550781, |
|
"epoch": 0.07804878048780488, |
|
"grad_norm": 1.8379653827038929, |
|
"kl": 0.0087890625, |
|
"learning_rate": 8e-07, |
|
"loss": -0.0047, |
|
"reward": 0.3267355263233185, |
|
"reward_std": 0.24120216071605682, |
|
"rewards/format_reward": 0.10109890252351761, |
|
"rewards/judgement_reward": 0.2026696354150772, |
|
"rewards/token_reward": 0.022967034950852394, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.23077392578125, |
|
"epoch": 0.08292682926829269, |
|
"grad_norm": 2.0529600972186666, |
|
"kl": 0.005584716796875, |
|
"learning_rate": 8.499999999999999e-07, |
|
"loss": 0.043, |
|
"reward": 0.3882504105567932, |
|
"reward_std": 0.24895574152469635, |
|
"rewards/format_reward": 0.1021978035569191, |
|
"rewards/judgement_reward": 0.2652834951877594, |
|
"rewards/token_reward": 0.020769229158759117, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 74.64835357666016, |
|
"epoch": 0.08780487804878048, |
|
"grad_norm": 2.1163090543204053, |
|
"kl": 0.01458740234375, |
|
"learning_rate": 9e-07, |
|
"loss": 0.04, |
|
"reward": 0.36192721128463745, |
|
"reward_std": 0.21096283197402954, |
|
"rewards/format_reward": 0.10659340769052505, |
|
"rewards/judgement_reward": 0.24775134027004242, |
|
"rewards/token_reward": 0.007582417689263821, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 79.26374053955078, |
|
"epoch": 0.09268292682926829, |
|
"grad_norm": 1.8165905290949755, |
|
"kl": 0.006988525390625, |
|
"learning_rate": 9.499999999999999e-07, |
|
"loss": 0.081, |
|
"reward": 0.3227555453777313, |
|
"reward_std": 0.20002064108848572, |
|
"rewards/format_reward": 0.10989010334014893, |
|
"rewards/judgement_reward": 0.20275558531284332, |
|
"rewards/token_reward": 0.01010989025235176, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 72.64835357666016, |
|
"epoch": 0.0975609756097561, |
|
"grad_norm": 1.967123469564035, |
|
"kl": 0.0120849609375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0601, |
|
"reward": 0.474505752325058, |
|
"reward_std": 0.2401813268661499, |
|
"rewards/format_reward": 0.11978019773960114, |
|
"rewards/judgement_reward": 0.33972567319869995, |
|
"rewards/token_reward": 0.015000000596046448, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 71.8956069946289, |
|
"epoch": 0.1024390243902439, |
|
"grad_norm": 1.7567747825280924, |
|
"kl": 0.0108642578125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0215, |
|
"reward": 0.27094441652297974, |
|
"reward_std": 0.20202507078647614, |
|
"rewards/format_reward": 0.10549449920654297, |
|
"rewards/judgement_reward": 0.15775761008262634, |
|
"rewards/token_reward": 0.007692308630794287, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 73.54945373535156, |
|
"epoch": 0.1073170731707317, |
|
"grad_norm": 1.8581558141105476, |
|
"kl": 0.08349609375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.048, |
|
"reward": 0.4122963845729828, |
|
"reward_std": 0.24994409084320068, |
|
"rewards/format_reward": 0.1230769008398056, |
|
"rewards/judgement_reward": 0.27993375062942505, |
|
"rewards/token_reward": 0.009285714477300644, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 67.73626708984375, |
|
"epoch": 0.11219512195121951, |
|
"grad_norm": 1.8992395015405976, |
|
"kl": 0.01470947265625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.014, |
|
"reward": 0.22567632794380188, |
|
"reward_std": 0.1741848886013031, |
|
"rewards/format_reward": 0.09560439735651016, |
|
"rewards/judgement_reward": 0.11776423454284668, |
|
"rewards/token_reward": 0.012307691387832165, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 71.0054931640625, |
|
"epoch": 0.11707317073170732, |
|
"grad_norm": 2.333890318615974, |
|
"kl": 0.01806640625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0425, |
|
"reward": 0.3388529121875763, |
|
"reward_std": 0.22322086989879608, |
|
"rewards/format_reward": 0.12087910622358322, |
|
"rewards/judgement_reward": 0.20572103559970856, |
|
"rewards/token_reward": 0.012252748012542725, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 72.61538696289062, |
|
"epoch": 0.12195121951219512, |
|
"grad_norm": 2.5484341340291334, |
|
"kl": 0.021728515625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0759, |
|
"reward": 0.6952180862426758, |
|
"reward_std": 0.2809818983078003, |
|
"rewards/format_reward": 0.15054939687252045, |
|
"rewards/judgement_reward": 0.5331300497055054, |
|
"rewards/token_reward": 0.011538460850715637, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 69.8956069946289, |
|
"epoch": 0.12682926829268293, |
|
"grad_norm": 2.00022024625464, |
|
"kl": 0.0308837890625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0145, |
|
"reward": 0.4654199182987213, |
|
"reward_std": 0.24022048711776733, |
|
"rewards/format_reward": 0.13076920807361603, |
|
"rewards/judgement_reward": 0.3248704969882965, |
|
"rewards/token_reward": 0.0097802197560668, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 65.3846206665039, |
|
"epoch": 0.13170731707317074, |
|
"grad_norm": 2.133845945151958, |
|
"kl": 0.028564453125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0379, |
|
"reward": 0.6326283812522888, |
|
"reward_std": 0.25742557644844055, |
|
"rewards/format_reward": 0.14945051074028015, |
|
"rewards/judgement_reward": 0.4757600724697113, |
|
"rewards/token_reward": 0.0074175819754600525, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 66.95604705810547, |
|
"epoch": 0.13658536585365855, |
|
"grad_norm": 2.1425979650194793, |
|
"kl": 0.021484375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0393, |
|
"reward": 0.42065250873565674, |
|
"reward_std": 0.17824424803256989, |
|
"rewards/format_reward": 0.1131868064403534, |
|
"rewards/judgement_reward": 0.30175137519836426, |
|
"rewards/token_reward": 0.0057142856530845165, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 71.22527313232422, |
|
"epoch": 0.14146341463414633, |
|
"grad_norm": 2.187328509574426, |
|
"kl": 0.0264892578125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0145, |
|
"reward": 0.5736148357391357, |
|
"reward_std": 0.250982403755188, |
|
"rewards/format_reward": 0.14175820350646973, |
|
"rewards/judgement_reward": 0.4122960567474365, |
|
"rewards/token_reward": 0.01956043764948845, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 71.989013671875, |
|
"epoch": 0.14634146341463414, |
|
"grad_norm": 2.0631362920367984, |
|
"kl": 0.0223388671875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0436, |
|
"reward": 0.8461222648620605, |
|
"reward_std": 0.25097036361694336, |
|
"rewards/format_reward": 0.16153840720653534, |
|
"rewards/judgement_reward": 0.6611222624778748, |
|
"rewards/token_reward": 0.023461539298295975, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 62.67582702636719, |
|
"epoch": 0.15121951219512195, |
|
"grad_norm": 2.5463613589005516, |
|
"kl": 0.037353515625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0348, |
|
"reward": 0.6614670753479004, |
|
"reward_std": 0.2477238029241562, |
|
"rewards/format_reward": 0.1439560055732727, |
|
"rewards/judgement_reward": 0.507401168346405, |
|
"rewards/token_reward": 0.01010989025235176, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 68.18681335449219, |
|
"epoch": 0.15609756097560976, |
|
"grad_norm": 2.157033445164365, |
|
"kl": 0.0264892578125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0477, |
|
"reward": 0.6428108811378479, |
|
"reward_std": 0.21757948398590088, |
|
"rewards/format_reward": 0.14285710453987122, |
|
"rewards/judgement_reward": 0.4923712909221649, |
|
"rewards/token_reward": 0.007582417223602533, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 70.52747344970703, |
|
"epoch": 0.16097560975609757, |
|
"grad_norm": 2.5147644681680013, |
|
"kl": 1.71875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0331, |
|
"reward": 0.8551515340805054, |
|
"reward_std": 0.20301613211631775, |
|
"rewards/format_reward": 0.15934060513973236, |
|
"rewards/judgement_reward": 0.6783931255340576, |
|
"rewards/token_reward": 0.017417581751942635, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 72.4120864868164, |
|
"epoch": 0.16585365853658537, |
|
"grad_norm": 2.1819457001917804, |
|
"kl": 0.71484375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0018, |
|
"reward": 0.48153403401374817, |
|
"reward_std": 0.24979981780052185, |
|
"rewards/format_reward": 0.12417580932378769, |
|
"rewards/judgement_reward": 0.34016045928001404, |
|
"rewards/token_reward": 0.017197802662849426, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.0824203491211, |
|
"epoch": 0.17073170731707318, |
|
"grad_norm": 2.0349402588988084, |
|
"kl": 0.023193359375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0397, |
|
"reward": 0.49935343861579895, |
|
"reward_std": 0.25207024812698364, |
|
"rewards/format_reward": 0.1340659111738205, |
|
"rewards/judgement_reward": 0.3458918631076813, |
|
"rewards/token_reward": 0.019395604729652405, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 78.0769271850586, |
|
"epoch": 0.17560975609756097, |
|
"grad_norm": 2.204221432744273, |
|
"kl": 0.035400390625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0172, |
|
"reward": 0.5841602087020874, |
|
"reward_std": 0.2741175889968872, |
|
"rewards/format_reward": 0.1450549066066742, |
|
"rewards/judgement_reward": 0.4198743999004364, |
|
"rewards/token_reward": 0.019230768084526062, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 74.20879364013672, |
|
"epoch": 0.18048780487804877, |
|
"grad_norm": 2.021038070522573, |
|
"kl": 0.038330078125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0224, |
|
"reward": 0.41530290246009827, |
|
"reward_std": 0.2347797006368637, |
|
"rewards/format_reward": 0.11758241057395935, |
|
"rewards/judgement_reward": 0.2791491150856018, |
|
"rewards/token_reward": 0.01857142709195614, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 76.3846206665039, |
|
"epoch": 0.18536585365853658, |
|
"grad_norm": 2.2222191685212156, |
|
"kl": 0.044189453125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0314, |
|
"reward": 0.6318976879119873, |
|
"reward_std": 0.26240062713623047, |
|
"rewards/format_reward": 0.1351647973060608, |
|
"rewards/judgement_reward": 0.4639304578304291, |
|
"rewards/token_reward": 0.032802194356918335, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 69.52747344970703, |
|
"epoch": 0.1902439024390244, |
|
"grad_norm": 1.8969688577935941, |
|
"kl": 0.138671875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0619, |
|
"reward": 0.7222000360488892, |
|
"reward_std": 0.268084853887558, |
|
"rewards/format_reward": 0.15384609997272491, |
|
"rewards/judgement_reward": 0.5535735487937927, |
|
"rewards/token_reward": 0.014780220575630665, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 74.75274658203125, |
|
"epoch": 0.1951219512195122, |
|
"grad_norm": 2.1456284767626763, |
|
"kl": 0.046142578125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0734, |
|
"reward": 0.8482385277748108, |
|
"reward_std": 0.24352890253067017, |
|
"rewards/format_reward": 0.1560439020395279, |
|
"rewards/judgement_reward": 0.6564801335334778, |
|
"rewards/token_reward": 0.0357142835855484, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 77.15933990478516, |
|
"epoch": 0.2, |
|
"grad_norm": 2.09640812853542, |
|
"kl": 0.16796875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0156, |
|
"reward": 0.8189014792442322, |
|
"reward_std": 0.2852559983730316, |
|
"rewards/format_reward": 0.15824170410633087, |
|
"rewards/judgement_reward": 0.6289563775062561, |
|
"rewards/token_reward": 0.031703293323516846, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 73.30769348144531, |
|
"epoch": 0.2048780487804878, |
|
"grad_norm": 2.05137630399559, |
|
"kl": 0.030029296875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0138, |
|
"reward": 0.9518312215805054, |
|
"reward_std": 0.21282121539115906, |
|
"rewards/format_reward": 0.16923069953918457, |
|
"rewards/judgement_reward": 0.759743332862854, |
|
"rewards/token_reward": 0.022857142612338066, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.30220031738281, |
|
"epoch": 0.2097560975609756, |
|
"grad_norm": 2.212876795222977, |
|
"kl": 0.03857421875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.042, |
|
"reward": 0.5973189473152161, |
|
"reward_std": 0.2632991671562195, |
|
"rewards/format_reward": 0.13186810910701752, |
|
"rewards/judgement_reward": 0.4431980848312378, |
|
"rewards/token_reward": 0.022252749651670456, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 77.78022003173828, |
|
"epoch": 0.2146341463414634, |
|
"grad_norm": 1.909917279230211, |
|
"kl": 0.02197265625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0097, |
|
"reward": 0.6979678869247437, |
|
"reward_std": 0.2731499671936035, |
|
"rewards/format_reward": 0.1461537927389145, |
|
"rewards/judgement_reward": 0.5200557112693787, |
|
"rewards/token_reward": 0.03175824135541916, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 72.13736724853516, |
|
"epoch": 0.21951219512195122, |
|
"grad_norm": 2.00637254655722, |
|
"kl": 0.031982421875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0395, |
|
"reward": 0.5393092036247253, |
|
"reward_std": 0.18171927332878113, |
|
"rewards/format_reward": 0.12197799980640411, |
|
"rewards/judgement_reward": 0.40848490595817566, |
|
"rewards/token_reward": 0.008846154436469078, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.3901138305664, |
|
"epoch": 0.22439024390243903, |
|
"grad_norm": 3.1756908083902964, |
|
"kl": 0.032958984375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0354, |
|
"reward": 0.5674500465393066, |
|
"reward_std": 0.21071720123291016, |
|
"rewards/format_reward": 0.13186810910701752, |
|
"rewards/judgement_reward": 0.4111863970756531, |
|
"rewards/token_reward": 0.024395601823925972, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 72.52747344970703, |
|
"epoch": 0.22926829268292684, |
|
"grad_norm": 2.006041173831088, |
|
"kl": 0.0302734375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0108, |
|
"reward": 0.5255321860313416, |
|
"reward_std": 0.24098682403564453, |
|
"rewards/format_reward": 0.12747250497341156, |
|
"rewards/judgement_reward": 0.3749277889728546, |
|
"rewards/token_reward": 0.02313186600804329, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 73.30220031738281, |
|
"epoch": 0.23414634146341465, |
|
"grad_norm": 2.080512783491811, |
|
"kl": 0.0257568359375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0174, |
|
"reward": 0.6429842710494995, |
|
"reward_std": 0.2190992385149002, |
|
"rewards/format_reward": 0.14615380764007568, |
|
"rewards/judgement_reward": 0.47715994715690613, |
|
"rewards/token_reward": 0.019670329988002777, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.19780731201172, |
|
"epoch": 0.23902439024390243, |
|
"grad_norm": 2.0510274772907042, |
|
"kl": 0.0220947265625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0791, |
|
"reward": 0.8461357355117798, |
|
"reward_std": 0.2294541895389557, |
|
"rewards/format_reward": 0.1648351103067398, |
|
"rewards/judgement_reward": 0.6526740789413452, |
|
"rewards/token_reward": 0.028626371175050735, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 73.05494689941406, |
|
"epoch": 0.24390243902439024, |
|
"grad_norm": 2.1607651542772115, |
|
"kl": 0.0244140625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0118, |
|
"reward": 0.7917323708534241, |
|
"reward_std": 0.2613984942436218, |
|
"rewards/format_reward": 0.15714280307292938, |
|
"rewards/judgement_reward": 0.6060182452201843, |
|
"rewards/token_reward": 0.02857142873108387, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 72.94505310058594, |
|
"epoch": 0.24878048780487805, |
|
"grad_norm": 2.4101187996457467, |
|
"kl": 0.06787109375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0128, |
|
"reward": 0.6344149112701416, |
|
"reward_std": 0.2056046426296234, |
|
"rewards/format_reward": 0.1340659111738205, |
|
"rewards/judgement_reward": 0.4821070730686188, |
|
"rewards/token_reward": 0.01824175752699375, |
|
"step": 51 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 71.18132019042969, |
|
"epoch": 0.25365853658536586, |
|
"grad_norm": 2.0276407359997446, |
|
"kl": 0.205078125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0588, |
|
"reward": 0.9314945340156555, |
|
"reward_std": 0.17086516320705414, |
|
"rewards/format_reward": 0.1659339964389801, |
|
"rewards/judgement_reward": 0.7470438480377197, |
|
"rewards/token_reward": 0.018516482785344124, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 74.30769348144531, |
|
"epoch": 0.25853658536585367, |
|
"grad_norm": 2.17785136004001, |
|
"kl": 0.031982421875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0705, |
|
"reward": 0.7751470804214478, |
|
"reward_std": 0.2271140068769455, |
|
"rewards/format_reward": 0.1549450010061264, |
|
"rewards/judgement_reward": 0.5904769897460938, |
|
"rewards/token_reward": 0.029725274071097374, |
|
"step": 53 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 71.989013671875, |
|
"epoch": 0.2634146341463415, |
|
"grad_norm": 2.6022497270890868, |
|
"kl": 0.0400390625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0112, |
|
"reward": 0.7827267646789551, |
|
"reward_std": 0.2629249691963196, |
|
"rewards/format_reward": 0.1549450010061264, |
|
"rewards/judgement_reward": 0.6101441979408264, |
|
"rewards/token_reward": 0.017637362703680992, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 77.96703338623047, |
|
"epoch": 0.2682926829268293, |
|
"grad_norm": 1.9363620619923776, |
|
"kl": 0.032958984375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0014, |
|
"reward": 0.579794704914093, |
|
"reward_std": 0.26253968477249146, |
|
"rewards/format_reward": 0.13846150040626526, |
|
"rewards/judgement_reward": 0.41259682178497314, |
|
"rewards/token_reward": 0.028736261650919914, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 74.9945068359375, |
|
"epoch": 0.2731707317073171, |
|
"grad_norm": 2.2800506149734696, |
|
"kl": 0.06005859375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.035, |
|
"reward": 0.7825473546981812, |
|
"reward_std": 0.2477075755596161, |
|
"rewards/format_reward": 0.15934060513973236, |
|
"rewards/judgement_reward": 0.6038658618927002, |
|
"rewards/token_reward": 0.01934065856039524, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.19230651855469, |
|
"epoch": 0.2780487804878049, |
|
"grad_norm": 1.9316889087097737, |
|
"kl": 0.03369140625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0151, |
|
"reward": 0.7461085319519043, |
|
"reward_std": 0.22608627378940582, |
|
"rewards/format_reward": 0.14835159480571747, |
|
"rewards/judgement_reward": 0.5816579461097717, |
|
"rewards/token_reward": 0.016098899766802788, |
|
"step": 57 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 71.45604705810547, |
|
"epoch": 0.28292682926829266, |
|
"grad_norm": 2.891000369424163, |
|
"kl": 0.09228515625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0333, |
|
"reward": 0.7758737206459045, |
|
"reward_std": 0.27175047993659973, |
|
"rewards/format_reward": 0.15714281797409058, |
|
"rewards/judgement_reward": 0.5963131785392761, |
|
"rewards/token_reward": 0.02241758443415165, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 78.02198028564453, |
|
"epoch": 0.28780487804878047, |
|
"grad_norm": 2.08130255800055, |
|
"kl": 0.0277099609375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0011, |
|
"reward": 0.7745002508163452, |
|
"reward_std": 0.24488268792629242, |
|
"rewards/format_reward": 0.16153840720653534, |
|
"rewards/judgement_reward": 0.5817528367042542, |
|
"rewards/token_reward": 0.031208788976073265, |
|
"step": 59 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 72.19230651855469, |
|
"epoch": 0.2926829268292683, |
|
"grad_norm": 2.325364169755121, |
|
"kl": 0.048583984375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.054, |
|
"reward": 0.7218204736709595, |
|
"reward_std": 0.22538162767887115, |
|
"rewards/format_reward": 0.1439560055732727, |
|
"rewards/judgement_reward": 0.5538533926010132, |
|
"rewards/token_reward": 0.02401098981499672, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 76.14835357666016, |
|
"epoch": 0.2975609756097561, |
|
"grad_norm": 2.199589014022037, |
|
"kl": 0.0289306640625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0617, |
|
"reward": 0.6991139054298401, |
|
"reward_std": 0.22941938042640686, |
|
"rewards/format_reward": 0.1450549066066742, |
|
"rewards/judgement_reward": 0.5255423784255981, |
|
"rewards/token_reward": 0.028516482561826706, |
|
"step": 61 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 69.95055389404297, |
|
"epoch": 0.3024390243902439, |
|
"grad_norm": 2.0508974708861576, |
|
"kl": 0.02587890625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.04, |
|
"reward": 0.6912637948989868, |
|
"reward_std": 0.22075381875038147, |
|
"rewards/format_reward": 0.15274719893932343, |
|
"rewards/judgement_reward": 0.5231318473815918, |
|
"rewards/token_reward": 0.015384615398943424, |
|
"step": 62 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 72.80769348144531, |
|
"epoch": 0.3073170731707317, |
|
"grad_norm": 2.2205627029567467, |
|
"kl": 0.035888671875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0696, |
|
"reward": 0.7073810696601868, |
|
"reward_std": 0.22409050166606903, |
|
"rewards/format_reward": 0.15714280307292938, |
|
"rewards/judgement_reward": 0.514194130897522, |
|
"rewards/token_reward": 0.036043956875801086, |
|
"step": 63 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 70.52747344970703, |
|
"epoch": 0.3121951219512195, |
|
"grad_norm": 2.0916293721018544, |
|
"kl": 0.032958984375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0537, |
|
"reward": 0.998910129070282, |
|
"reward_std": 0.2128116488456726, |
|
"rewards/format_reward": 0.17032960057258606, |
|
"rewards/judgement_reward": 0.800338625907898, |
|
"rewards/token_reward": 0.028241755440831184, |
|
"step": 64 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 71.0934066772461, |
|
"epoch": 0.3170731707317073, |
|
"grad_norm": 3.671459315517818, |
|
"kl": 0.056640625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0158, |
|
"reward": 0.8368141651153564, |
|
"reward_std": 0.24064061045646667, |
|
"rewards/format_reward": 0.16153840720653534, |
|
"rewards/judgement_reward": 0.6522535681724548, |
|
"rewards/token_reward": 0.02302197553217411, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 73.41758728027344, |
|
"epoch": 0.32195121951219513, |
|
"grad_norm": 2.0126846364586273, |
|
"kl": 0.026123046875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0154, |
|
"reward": 0.809525728225708, |
|
"reward_std": 0.2356753945350647, |
|
"rewards/format_reward": 0.16373620927333832, |
|
"rewards/judgement_reward": 0.6260641813278198, |
|
"rewards/token_reward": 0.019725274294614792, |
|
"step": 66 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 76.15933990478516, |
|
"epoch": 0.32682926829268294, |
|
"grad_norm": 1.851476238680362, |
|
"kl": 0.0267333984375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0033, |
|
"reward": 0.6307772397994995, |
|
"reward_std": 0.23441998660564423, |
|
"rewards/format_reward": 0.14835159480571747, |
|
"rewards/judgement_reward": 0.45868924260139465, |
|
"rewards/token_reward": 0.023736262694001198, |
|
"step": 67 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 74.77472686767578, |
|
"epoch": 0.33170731707317075, |
|
"grad_norm": 1.9067629127088346, |
|
"kl": 0.027587890625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0166, |
|
"reward": 0.6367327570915222, |
|
"reward_std": 0.207670658826828, |
|
"rewards/format_reward": 0.15054941177368164, |
|
"rewards/judgement_reward": 0.46409520506858826, |
|
"rewards/token_reward": 0.022087913006544113, |
|
"step": 68 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.63186645507812, |
|
"epoch": 0.33658536585365856, |
|
"grad_norm": 2.192665819859577, |
|
"kl": 0.058349609375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0979, |
|
"reward": 0.8123928904533386, |
|
"reward_std": 0.23223978281021118, |
|
"rewards/format_reward": 0.16263730823993683, |
|
"rewards/judgement_reward": 0.6219531297683716, |
|
"rewards/token_reward": 0.027802197262644768, |
|
"step": 69 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 69.26374053955078, |
|
"epoch": 0.34146341463414637, |
|
"grad_norm": 2.339003134381284, |
|
"kl": 0.02392578125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0611, |
|
"reward": 0.9417235255241394, |
|
"reward_std": 0.18349522352218628, |
|
"rewards/format_reward": 0.16813181340694427, |
|
"rewards/judgement_reward": 0.7577126622200012, |
|
"rewards/token_reward": 0.01587912067770958, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 72.05494689941406, |
|
"epoch": 0.3463414634146341, |
|
"grad_norm": 2.3107542796374916, |
|
"kl": 0.0244140625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0253, |
|
"reward": 0.8421667218208313, |
|
"reward_std": 0.22592146694660187, |
|
"rewards/format_reward": 0.15384609997272491, |
|
"rewards/judgement_reward": 0.6690347790718079, |
|
"rewards/token_reward": 0.019285714253783226, |
|
"step": 71 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.9120864868164, |
|
"epoch": 0.35121951219512193, |
|
"grad_norm": 1.8500979548941974, |
|
"kl": 0.0191650390625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0408, |
|
"reward": 0.8825385570526123, |
|
"reward_std": 0.23543551564216614, |
|
"rewards/format_reward": 0.17362630367279053, |
|
"rewards/judgement_reward": 0.6919341087341309, |
|
"rewards/token_reward": 0.01697802171111107, |
|
"step": 72 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.62088012695312, |
|
"epoch": 0.35609756097560974, |
|
"grad_norm": 1.7964524052061408, |
|
"kl": 0.0262451171875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0512, |
|
"reward": 0.7030321359634399, |
|
"reward_std": 0.20544546842575073, |
|
"rewards/format_reward": 0.14615380764007568, |
|
"rewards/judgement_reward": 0.5344606637954712, |
|
"rewards/token_reward": 0.0224175825715065, |
|
"step": 73 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 69.87911987304688, |
|
"epoch": 0.36097560975609755, |
|
"grad_norm": 1.9935947664757243, |
|
"kl": 0.19921875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.055, |
|
"reward": 0.9933971762657166, |
|
"reward_std": 0.22045965492725372, |
|
"rewards/format_reward": 0.17252740263938904, |
|
"rewards/judgement_reward": 0.7981772422790527, |
|
"rewards/token_reward": 0.022692309692502022, |
|
"step": 74 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 77.4011001586914, |
|
"epoch": 0.36585365853658536, |
|
"grad_norm": 2.001748403246846, |
|
"kl": 0.0308837890625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0664, |
|
"reward": 0.6825388073921204, |
|
"reward_std": 0.2821784019470215, |
|
"rewards/format_reward": 0.1560439020395279, |
|
"rewards/judgement_reward": 0.49479159712791443, |
|
"rewards/token_reward": 0.031703293323516846, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 71.41758728027344, |
|
"epoch": 0.37073170731707317, |
|
"grad_norm": 2.2248051545634824, |
|
"kl": 0.0302734375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0628, |
|
"reward": 0.7976081371307373, |
|
"reward_std": 0.24464865028858185, |
|
"rewards/format_reward": 0.16263730823993683, |
|
"rewards/judgement_reward": 0.611564040184021, |
|
"rewards/token_reward": 0.02340659312903881, |
|
"step": 76 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 73.8901138305664, |
|
"epoch": 0.375609756097561, |
|
"grad_norm": 2.0499939459920835, |
|
"kl": 0.03662109375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0634, |
|
"reward": 0.6835858821868896, |
|
"reward_std": 0.21102023124694824, |
|
"rewards/format_reward": 0.1351647973060608, |
|
"rewards/judgement_reward": 0.5263329744338989, |
|
"rewards/token_reward": 0.022087913006544113, |
|
"step": 77 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 77.62637329101562, |
|
"epoch": 0.3804878048780488, |
|
"grad_norm": 2.266766438478251, |
|
"kl": 0.02880859375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0109, |
|
"reward": 0.8073420524597168, |
|
"reward_std": 0.23677751421928406, |
|
"rewards/format_reward": 0.1560439020395279, |
|
"rewards/judgement_reward": 0.6212978959083557, |
|
"rewards/token_reward": 0.029999997466802597, |
|
"step": 78 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.93955993652344, |
|
"epoch": 0.3853658536585366, |
|
"grad_norm": 2.4759945771013867, |
|
"kl": 0.06640625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0217, |
|
"reward": 0.6546903252601624, |
|
"reward_std": 0.26877561211586, |
|
"rewards/format_reward": 0.15164829790592194, |
|
"rewards/judgement_reward": 0.4823826551437378, |
|
"rewards/token_reward": 0.020659340545535088, |
|
"step": 79 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 74.65384674072266, |
|
"epoch": 0.3902439024390244, |
|
"grad_norm": 1.945518643660678, |
|
"kl": 0.047119140625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0002, |
|
"reward": 0.7612788081169128, |
|
"reward_std": 0.1894712746143341, |
|
"rewards/format_reward": 0.13736259937286377, |
|
"rewards/judgement_reward": 0.6034765243530273, |
|
"rewards/token_reward": 0.02043955959379673, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 73.78571319580078, |
|
"epoch": 0.3951219512195122, |
|
"grad_norm": 2.0901126990153127, |
|
"kl": 0.11474609375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0731, |
|
"reward": 1.0548338890075684, |
|
"reward_std": 0.21029122173786163, |
|
"rewards/format_reward": 0.1769230216741562, |
|
"rewards/judgement_reward": 0.8410976529121399, |
|
"rewards/token_reward": 0.03681318834424019, |
|
"step": 81 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 74.8956069946289, |
|
"epoch": 0.4, |
|
"grad_norm": 1.7461151621773074, |
|
"kl": 0.03271484375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0471, |
|
"reward": 0.7646914720535278, |
|
"reward_std": 0.2543509900569916, |
|
"rewards/format_reward": 0.15714281797409058, |
|
"rewards/judgement_reward": 0.5825485587120056, |
|
"rewards/token_reward": 0.02499999850988388, |
|
"step": 82 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.9065933227539, |
|
"epoch": 0.40487804878048783, |
|
"grad_norm": 2.0106379862864663, |
|
"kl": 0.0311279296875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0125, |
|
"reward": 0.7661330699920654, |
|
"reward_std": 0.2671906650066376, |
|
"rewards/format_reward": 0.15934060513973236, |
|
"rewards/judgement_reward": 0.5814077854156494, |
|
"rewards/token_reward": 0.02538461610674858, |
|
"step": 83 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 77.64835357666016, |
|
"epoch": 0.4097560975609756, |
|
"grad_norm": 1.9108709969888935, |
|
"kl": 0.283203125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0065, |
|
"reward": 0.8318064212799072, |
|
"reward_std": 0.23601563274860382, |
|
"rewards/format_reward": 0.15934060513973236, |
|
"rewards/judgement_reward": 0.6468064785003662, |
|
"rewards/token_reward": 0.025659339502453804, |
|
"step": 84 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 77.02747344970703, |
|
"epoch": 0.4146341463414634, |
|
"grad_norm": 1.890024675225077, |
|
"kl": 0.1630859375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0754, |
|
"reward": 0.9830853343009949, |
|
"reward_std": 0.22543151676654816, |
|
"rewards/format_reward": 0.17362630367279053, |
|
"rewards/judgement_reward": 0.7781403064727783, |
|
"rewards/token_reward": 0.031318679451942444, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.0824203491211, |
|
"epoch": 0.4195121951219512, |
|
"grad_norm": 2.47037218372719, |
|
"kl": 0.044921875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0005, |
|
"reward": 0.6921790242195129, |
|
"reward_std": 0.26250430941581726, |
|
"rewards/format_reward": 0.14725270867347717, |
|
"rewards/judgement_reward": 0.5239920616149902, |
|
"rewards/token_reward": 0.02093406394124031, |
|
"step": 86 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 71.28571319580078, |
|
"epoch": 0.424390243902439, |
|
"grad_norm": 2.079365060925443, |
|
"kl": 0.0267333984375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0605, |
|
"reward": 0.7566412687301636, |
|
"reward_std": 0.21086536347866058, |
|
"rewards/format_reward": 0.15824170410633087, |
|
"rewards/judgement_reward": 0.5732895731925964, |
|
"rewards/token_reward": 0.02510989084839821, |
|
"step": 87 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 71.31867980957031, |
|
"epoch": 0.4292682926829268, |
|
"grad_norm": 2.1529982274280863, |
|
"kl": 0.0255126953125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0807, |
|
"reward": 0.8711549639701843, |
|
"reward_std": 0.19136983156204224, |
|
"rewards/format_reward": 0.16153840720653534, |
|
"rewards/judgement_reward": 0.6887921690940857, |
|
"rewards/token_reward": 0.02082417532801628, |
|
"step": 88 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 71.85164642333984, |
|
"epoch": 0.43414634146341463, |
|
"grad_norm": 1.9476865855709369, |
|
"kl": 0.03466796875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0521, |
|
"reward": 0.849588930606842, |
|
"reward_std": 0.22347387671470642, |
|
"rewards/format_reward": 0.1648351103067398, |
|
"rewards/judgement_reward": 0.6607428193092346, |
|
"rewards/token_reward": 0.02401098981499672, |
|
"step": 89 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.53296661376953, |
|
"epoch": 0.43902439024390244, |
|
"grad_norm": 1.9988855691892198, |
|
"kl": 0.3203125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0217, |
|
"reward": 0.794158935546875, |
|
"reward_std": 0.2382083535194397, |
|
"rewards/format_reward": 0.16153840720653534, |
|
"rewards/judgement_reward": 0.6096535921096802, |
|
"rewards/token_reward": 0.022967033088207245, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 74.9065933227539, |
|
"epoch": 0.44390243902439025, |
|
"grad_norm": 2.3812234433330333, |
|
"kl": 0.039794921875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.066, |
|
"reward": 0.9384704828262329, |
|
"reward_std": 0.19025777280330658, |
|
"rewards/format_reward": 0.1659340113401413, |
|
"rewards/judgement_reward": 0.7457780838012695, |
|
"rewards/token_reward": 0.026758242398500443, |
|
"step": 91 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 73.04396057128906, |
|
"epoch": 0.44878048780487806, |
|
"grad_norm": 2.6148084459815206, |
|
"kl": 0.04345703125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0984, |
|
"reward": 1.0894677639007568, |
|
"reward_std": 0.1785333752632141, |
|
"rewards/format_reward": 0.18241752684116364, |
|
"rewards/judgement_reward": 0.880236804485321, |
|
"rewards/token_reward": 0.026813184842467308, |
|
"step": 92 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 80.24725341796875, |
|
"epoch": 0.45365853658536587, |
|
"grad_norm": 1.9266274957793252, |
|
"kl": 0.10205078125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0333, |
|
"reward": 0.6261184215545654, |
|
"reward_std": 0.2962965965270996, |
|
"rewards/format_reward": 0.1450549066066742, |
|
"rewards/judgement_reward": 0.4455689489841461, |
|
"rewards/token_reward": 0.03549450263381004, |
|
"step": 93 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.0714340209961, |
|
"epoch": 0.4585365853658537, |
|
"grad_norm": 2.359476534290484, |
|
"kl": 0.0693359375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0619, |
|
"reward": 0.6673313975334167, |
|
"reward_std": 0.20072828233242035, |
|
"rewards/format_reward": 0.13736259937286377, |
|
"rewards/judgement_reward": 0.5074961185455322, |
|
"rewards/token_reward": 0.022472526878118515, |
|
"step": 94 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 76.46703338623047, |
|
"epoch": 0.4634146341463415, |
|
"grad_norm": 2.1867681379205126, |
|
"kl": 0.045166015625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0384, |
|
"reward": 0.8676297664642334, |
|
"reward_std": 0.2376934140920639, |
|
"rewards/format_reward": 0.1648351103067398, |
|
"rewards/judgement_reward": 0.6711461544036865, |
|
"rewards/token_reward": 0.03164835274219513, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 77.30769348144531, |
|
"epoch": 0.4682926829268293, |
|
"grad_norm": 1.7938305263175633, |
|
"kl": 0.0294189453125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0293, |
|
"reward": 0.8810251355171204, |
|
"reward_std": 0.22088079154491425, |
|
"rewards/format_reward": 0.1659339964389801, |
|
"rewards/judgement_reward": 0.6851460337638855, |
|
"rewards/token_reward": 0.02994505502283573, |
|
"step": 96 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 79.96154022216797, |
|
"epoch": 0.47317073170731705, |
|
"grad_norm": 1.9835104874280258, |
|
"kl": 0.0262451171875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.031, |
|
"reward": 1.0301648378372192, |
|
"reward_std": 0.23829680681228638, |
|
"rewards/format_reward": 0.17912080883979797, |
|
"rewards/judgement_reward": 0.8086262941360474, |
|
"rewards/token_reward": 0.042417578399181366, |
|
"step": 97 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 76.10989379882812, |
|
"epoch": 0.47804878048780486, |
|
"grad_norm": 1.8068829512126077, |
|
"kl": 0.025390625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0435, |
|
"reward": 0.8396397233009338, |
|
"reward_std": 0.20062671601772308, |
|
"rewards/format_reward": 0.1648351103067398, |
|
"rewards/judgement_reward": 0.6437055468559265, |
|
"rewards/token_reward": 0.031098900362849236, |
|
"step": 98 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 69.79670715332031, |
|
"epoch": 0.48292682926829267, |
|
"grad_norm": 1.8013193104560348, |
|
"kl": 0.0294189453125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.021, |
|
"reward": 0.8207690119743347, |
|
"reward_std": 0.18635249137878418, |
|
"rewards/format_reward": 0.16263730823993683, |
|
"rewards/judgement_reward": 0.6401647329330444, |
|
"rewards/token_reward": 0.01796703413128853, |
|
"step": 99 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 71.29121398925781, |
|
"epoch": 0.4878048780487805, |
|
"grad_norm": 1.8541360087649421, |
|
"kl": 0.03173828125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0518, |
|
"reward": 0.8696525692939758, |
|
"reward_std": 0.2140418291091919, |
|
"rewards/format_reward": 0.16813179850578308, |
|
"rewards/judgement_reward": 0.6785534620285034, |
|
"rewards/token_reward": 0.022967033088207245, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.42857360839844, |
|
"epoch": 0.4926829268292683, |
|
"grad_norm": 2.3437263624638858, |
|
"kl": 0.03759765625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0011, |
|
"reward": 0.7790926694869995, |
|
"reward_std": 0.29135364294052124, |
|
"rewards/format_reward": 0.16373620927333832, |
|
"rewards/judgement_reward": 0.5860158205032349, |
|
"rewards/token_reward": 0.029340656474232674, |
|
"step": 101 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 70.17033386230469, |
|
"epoch": 0.4975609756097561, |
|
"grad_norm": 2.1875591465659765, |
|
"kl": 0.05029296875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0458, |
|
"reward": 0.9169327616691589, |
|
"reward_std": 0.21496839821338654, |
|
"rewards/format_reward": 0.17142850160598755, |
|
"rewards/judgement_reward": 0.7243505120277405, |
|
"rewards/token_reward": 0.021153846755623817, |
|
"step": 102 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 72.53845977783203, |
|
"epoch": 0.5024390243902439, |
|
"grad_norm": 2.2860621039378097, |
|
"kl": 0.052978515625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0511, |
|
"reward": 0.8492118120193481, |
|
"reward_std": 0.2524387836456299, |
|
"rewards/format_reward": 0.1648351103067398, |
|
"rewards/judgement_reward": 0.6557502746582031, |
|
"rewards/token_reward": 0.028626374900341034, |
|
"step": 103 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 69.21977996826172, |
|
"epoch": 0.5073170731707317, |
|
"grad_norm": 1.9503004419157008, |
|
"kl": 0.03857421875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.1021, |
|
"reward": 0.9117417335510254, |
|
"reward_std": 0.21893596649169922, |
|
"rewards/format_reward": 0.16813181340694427, |
|
"rewards/judgement_reward": 0.7197636961936951, |
|
"rewards/token_reward": 0.023846155032515526, |
|
"step": 104 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 69.92308044433594, |
|
"epoch": 0.5121951219512195, |
|
"grad_norm": 2.2102406225271696, |
|
"kl": 0.0732421875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0789, |
|
"reward": 0.9815363883972168, |
|
"reward_std": 0.1895749419927597, |
|
"rewards/format_reward": 0.17472520470619202, |
|
"rewards/judgement_reward": 0.7850527167320251, |
|
"rewards/token_reward": 0.021758243441581726, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 78.5, |
|
"epoch": 0.5170731707317073, |
|
"grad_norm": 1.8701663684695582, |
|
"kl": 0.029052734375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0526, |
|
"reward": 1.0305202007293701, |
|
"reward_std": 0.21711336076259613, |
|
"rewards/format_reward": 0.17802190780639648, |
|
"rewards/judgement_reward": 0.8127729296684265, |
|
"rewards/token_reward": 0.039725273847579956, |
|
"step": 106 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.36813354492188, |
|
"epoch": 0.5219512195121951, |
|
"grad_norm": 2.1504947624651005, |
|
"kl": 0.041015625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0108, |
|
"reward": 0.8238873481750488, |
|
"reward_std": 0.28225311636924744, |
|
"rewards/format_reward": 0.1659340113401413, |
|
"rewards/judgement_reward": 0.6267994046211243, |
|
"rewards/token_reward": 0.0311538465321064, |
|
"step": 107 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 79.24176025390625, |
|
"epoch": 0.526829268292683, |
|
"grad_norm": 1.7794056509498575, |
|
"kl": 0.0311279296875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0823, |
|
"reward": 0.9624238014221191, |
|
"reward_std": 0.2303503304719925, |
|
"rewards/format_reward": 0.17252740263938904, |
|
"rewards/judgement_reward": 0.7584677934646606, |
|
"rewards/token_reward": 0.03142856806516647, |
|
"step": 108 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 72.4120864868164, |
|
"epoch": 0.5317073170731708, |
|
"grad_norm": 2.504687498054589, |
|
"kl": 0.05712890625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.072, |
|
"reward": 0.976593554019928, |
|
"reward_std": 0.23477815091609955, |
|
"rewards/format_reward": 0.1758241206407547, |
|
"rewards/judgement_reward": 0.7797802090644836, |
|
"rewards/token_reward": 0.020989011973142624, |
|
"step": 109 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 76.32418060302734, |
|
"epoch": 0.5365853658536586, |
|
"grad_norm": 1.9342624784665794, |
|
"kl": 0.0419921875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0906, |
|
"reward": 1.0066561698913574, |
|
"reward_std": 0.18863876163959503, |
|
"rewards/format_reward": 0.18131859600543976, |
|
"rewards/judgement_reward": 0.7958320379257202, |
|
"rewards/token_reward": 0.029505494982004166, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.02747344970703, |
|
"epoch": 0.5414634146341464, |
|
"grad_norm": 3.6005569321956736, |
|
"kl": 0.07275390625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0868, |
|
"reward": 0.9240605235099792, |
|
"reward_std": 0.2554757595062256, |
|
"rewards/format_reward": 0.16813181340694427, |
|
"rewards/judgement_reward": 0.7308735251426697, |
|
"rewards/token_reward": 0.025054944679141045, |
|
"step": 111 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 76.05494689941406, |
|
"epoch": 0.5463414634146342, |
|
"grad_norm": 2.384945004137537, |
|
"kl": 0.0498046875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0406, |
|
"reward": 0.934312105178833, |
|
"reward_std": 0.25367388129234314, |
|
"rewards/format_reward": 0.1758241057395935, |
|
"rewards/judgement_reward": 0.7368395924568176, |
|
"rewards/token_reward": 0.021648351103067398, |
|
"step": 112 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 81.47802734375, |
|
"epoch": 0.551219512195122, |
|
"grad_norm": 2.4494451662329446, |
|
"kl": 0.04150390625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0659, |
|
"reward": 0.9666280746459961, |
|
"reward_std": 0.26005613803863525, |
|
"rewards/format_reward": 0.176923006772995, |
|
"rewards/judgement_reward": 0.7433313131332397, |
|
"rewards/token_reward": 0.046373624354600906, |
|
"step": 113 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 76.86264038085938, |
|
"epoch": 0.5560975609756098, |
|
"grad_norm": 1.8974487896916954, |
|
"kl": 0.17578125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0702, |
|
"reward": 1.1044301986694336, |
|
"reward_std": 0.20463985204696655, |
|
"rewards/format_reward": 0.18461531400680542, |
|
"rewards/judgement_reward": 0.8778916597366333, |
|
"rewards/token_reward": 0.04192307963967323, |
|
"step": 114 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 73.41758728027344, |
|
"epoch": 0.5609756097560976, |
|
"grad_norm": 2.0736124082447858, |
|
"kl": 0.055908203125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0178, |
|
"reward": 0.45087727904319763, |
|
"reward_std": 0.16333483159542084, |
|
"rewards/format_reward": 0.12087910622358322, |
|
"rewards/judgement_reward": 0.3127453625202179, |
|
"rewards/token_reward": 0.01725274696946144, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 72.31318664550781, |
|
"epoch": 0.5658536585365853, |
|
"grad_norm": 1.9880834210174607, |
|
"kl": 0.265625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0914, |
|
"reward": 1.0456217527389526, |
|
"reward_std": 0.1933256834745407, |
|
"rewards/format_reward": 0.17912080883979797, |
|
"rewards/judgement_reward": 0.8432590365409851, |
|
"rewards/token_reward": 0.023241758346557617, |
|
"step": 116 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 76.79670715332031, |
|
"epoch": 0.5707317073170731, |
|
"grad_norm": 2.0598303213859634, |
|
"kl": 0.03173828125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0638, |
|
"reward": 1.015529751777649, |
|
"reward_std": 0.21438416838645935, |
|
"rewards/format_reward": 0.1769230216741562, |
|
"rewards/judgement_reward": 0.8019583821296692, |
|
"rewards/token_reward": 0.03664834797382355, |
|
"step": 117 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 74.15384674072266, |
|
"epoch": 0.5756097560975609, |
|
"grad_norm": 2.1302776398753327, |
|
"kl": 0.0322265625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.1029, |
|
"reward": 1.0583690404891968, |
|
"reward_std": 0.16479381918907166, |
|
"rewards/format_reward": 0.1857142299413681, |
|
"rewards/judgement_reward": 0.8432590365409851, |
|
"rewards/token_reward": 0.029395602643489838, |
|
"step": 118 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.35165405273438, |
|
"epoch": 0.5804878048780487, |
|
"grad_norm": 2.139306907057807, |
|
"kl": 0.040771484375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0908, |
|
"reward": 0.9103569984436035, |
|
"reward_std": 0.2012585699558258, |
|
"rewards/format_reward": 0.1659339964389801, |
|
"rewards/judgement_reward": 0.7116758227348328, |
|
"rewards/token_reward": 0.03274725377559662, |
|
"step": 119 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 81.20879364013672, |
|
"epoch": 0.5853658536585366, |
|
"grad_norm": 1.8649093478118766, |
|
"kl": 0.04931640625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0399, |
|
"reward": 0.9918122887611389, |
|
"reward_std": 0.2688324749469757, |
|
"rewards/format_reward": 0.176923006772995, |
|
"rewards/judgement_reward": 0.7671419978141785, |
|
"rewards/token_reward": 0.04774724692106247, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 79.04396057128906, |
|
"epoch": 0.5902439024390244, |
|
"grad_norm": 2.0906208019011614, |
|
"kl": 0.05322265625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.084, |
|
"reward": 0.9650415778160095, |
|
"reward_std": 0.21533620357513428, |
|
"rewards/format_reward": 0.17362630367279053, |
|
"rewards/judgement_reward": 0.7517996430397034, |
|
"rewards/token_reward": 0.03961538150906563, |
|
"step": 121 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 80.62088012695312, |
|
"epoch": 0.5951219512195122, |
|
"grad_norm": 2.6404896615388513, |
|
"kl": 0.039306640625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0209, |
|
"reward": 0.8149644732475281, |
|
"reward_std": 0.21845516562461853, |
|
"rewards/format_reward": 0.15824170410633087, |
|
"rewards/judgement_reward": 0.6252389550209045, |
|
"rewards/token_reward": 0.03148351609706879, |
|
"step": 122 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 78.19230651855469, |
|
"epoch": 0.6, |
|
"grad_norm": 2.137456730046166, |
|
"kl": 0.05419921875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0122, |
|
"reward": 0.7231975197792053, |
|
"reward_std": 0.2408137172460556, |
|
"rewards/format_reward": 0.15274719893932343, |
|
"rewards/judgement_reward": 0.5451754331588745, |
|
"rewards/token_reward": 0.025274725630879402, |
|
"step": 123 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 79.81318664550781, |
|
"epoch": 0.6048780487804878, |
|
"grad_norm": 1.8602453492738675, |
|
"kl": 0.03662109375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0347, |
|
"reward": 0.9557329416275024, |
|
"reward_std": 0.21061010658740997, |
|
"rewards/format_reward": 0.17472520470619202, |
|
"rewards/judgement_reward": 0.7449086904525757, |
|
"rewards/token_reward": 0.0360989011824131, |
|
"step": 124 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 80.18681335449219, |
|
"epoch": 0.6097560975609756, |
|
"grad_norm": 1.8947771050267905, |
|
"kl": 0.04736328125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0427, |
|
"reward": 0.8908948302268982, |
|
"reward_std": 0.21556268632411957, |
|
"rewards/format_reward": 0.16703291237354279, |
|
"rewards/judgement_reward": 0.688202440738678, |
|
"rewards/token_reward": 0.035659339278936386, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 77.83516693115234, |
|
"epoch": 0.6146341463414634, |
|
"grad_norm": 1.99350313429728, |
|
"kl": 0.0693359375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0231, |
|
"reward": 0.8506399393081665, |
|
"reward_std": 0.24161569774150848, |
|
"rewards/format_reward": 0.1648351103067398, |
|
"rewards/judgement_reward": 0.6592661738395691, |
|
"rewards/token_reward": 0.026538461446762085, |
|
"step": 126 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 78.53845977783203, |
|
"epoch": 0.6195121951219512, |
|
"grad_norm": 1.7998816564546938, |
|
"kl": 0.0361328125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.1204, |
|
"reward": 1.039487600326538, |
|
"reward_std": 0.2013106495141983, |
|
"rewards/format_reward": 0.17912080883979797, |
|
"rewards/judgement_reward": 0.8210808038711548, |
|
"rewards/token_reward": 0.03928571194410324, |
|
"step": 127 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 81.9120864868164, |
|
"epoch": 0.624390243902439, |
|
"grad_norm": 1.8452835234198748, |
|
"kl": 0.0390625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.12, |
|
"reward": 0.9250213503837585, |
|
"reward_std": 0.2341793179512024, |
|
"rewards/format_reward": 0.17142850160598755, |
|
"rewards/judgement_reward": 0.7072190642356873, |
|
"rewards/token_reward": 0.046373624354600906, |
|
"step": 128 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 79.82967376708984, |
|
"epoch": 0.6292682926829268, |
|
"grad_norm": 1.9489042143231339, |
|
"kl": 0.0311279296875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0785, |
|
"reward": 0.994548499584198, |
|
"reward_std": 0.21068567037582397, |
|
"rewards/format_reward": 0.18021972477436066, |
|
"rewards/judgement_reward": 0.7781199812889099, |
|
"rewards/token_reward": 0.03620879352092743, |
|
"step": 129 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 79.79670715332031, |
|
"epoch": 0.6341463414634146, |
|
"grad_norm": 1.9478743633543802, |
|
"kl": 0.041259765625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0111, |
|
"reward": 0.8490137457847595, |
|
"reward_std": 0.2522919774055481, |
|
"rewards/format_reward": 0.16373620927333832, |
|
"rewards/judgement_reward": 0.6532995104789734, |
|
"rewards/token_reward": 0.031978022307157516, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 81.68681335449219, |
|
"epoch": 0.6390243902439025, |
|
"grad_norm": 2.085728564809279, |
|
"kl": 0.04150390625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.074, |
|
"reward": 0.5961366295814514, |
|
"reward_std": 0.2355279177427292, |
|
"rewards/format_reward": 0.1439560055732727, |
|
"rewards/judgement_reward": 0.4331147074699402, |
|
"rewards/token_reward": 0.01906593330204487, |
|
"step": 131 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 80.93406677246094, |
|
"epoch": 0.6439024390243903, |
|
"grad_norm": 1.9504606440875714, |
|
"kl": 0.03857421875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0681, |
|
"reward": 0.9742907881736755, |
|
"reward_std": 0.21939148008823395, |
|
"rewards/format_reward": 0.17802190780639648, |
|
"rewards/judgement_reward": 0.7539612650871277, |
|
"rewards/token_reward": 0.042307693511247635, |
|
"step": 132 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 71.23077392578125, |
|
"epoch": 0.6487804878048781, |
|
"grad_norm": 2.1860995545870168, |
|
"kl": 0.10595703125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0478, |
|
"reward": 0.8597061634063721, |
|
"reward_std": 0.1860429346561432, |
|
"rewards/format_reward": 0.16043950617313385, |
|
"rewards/judgement_reward": 0.6846511363983154, |
|
"rewards/token_reward": 0.014615383930504322, |
|
"step": 133 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 76.5824203491211, |
|
"epoch": 0.6536585365853659, |
|
"grad_norm": 2.056976443684514, |
|
"kl": 0.04736328125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0996, |
|
"reward": 1.0542576313018799, |
|
"reward_std": 0.1535356342792511, |
|
"rewards/format_reward": 0.18241751194000244, |
|
"rewards/judgement_reward": 0.8472796678543091, |
|
"rewards/token_reward": 0.024560438469052315, |
|
"step": 134 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.73077392578125, |
|
"epoch": 0.6585365853658537, |
|
"grad_norm": 2.354640541368104, |
|
"kl": 0.034912109375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0639, |
|
"reward": 1.033756971359253, |
|
"reward_std": 0.12462829798460007, |
|
"rewards/format_reward": 0.18241751194000244, |
|
"rewards/judgement_reward": 0.8278777003288269, |
|
"rewards/token_reward": 0.023461539298295975, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.63736724853516, |
|
"epoch": 0.6634146341463415, |
|
"grad_norm": 1.8711434115369097, |
|
"kl": 0.2275390625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0747, |
|
"reward": 0.8162251710891724, |
|
"reward_std": 0.23913460969924927, |
|
"rewards/format_reward": 0.16703291237354279, |
|
"rewards/judgement_reward": 0.6229832768440247, |
|
"rewards/token_reward": 0.02620879001915455, |
|
"step": 136 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 77.0054931640625, |
|
"epoch": 0.6682926829268293, |
|
"grad_norm": 2.170766069792843, |
|
"kl": 0.158203125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0001, |
|
"reward": 0.5743477940559387, |
|
"reward_std": 0.2055026888847351, |
|
"rewards/format_reward": 0.13736261427402496, |
|
"rewards/judgement_reward": 0.4148424565792084, |
|
"rewards/token_reward": 0.02214285545051098, |
|
"step": 137 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 78.80220031738281, |
|
"epoch": 0.6731707317073171, |
|
"grad_norm": 2.249511172978621, |
|
"kl": 0.040771484375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0299, |
|
"reward": 0.9537181258201599, |
|
"reward_std": 0.2547648847103119, |
|
"rewards/format_reward": 0.17802190780639648, |
|
"rewards/judgement_reward": 0.7325092554092407, |
|
"rewards/token_reward": 0.04318681359291077, |
|
"step": 138 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 74.71428680419922, |
|
"epoch": 0.6780487804878049, |
|
"grad_norm": 2.4468786224687973, |
|
"kl": 0.39453125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0937, |
|
"reward": 0.8086802959442139, |
|
"reward_std": 0.2142382711172104, |
|
"rewards/format_reward": 0.15824170410633087, |
|
"rewards/judgement_reward": 0.6207680702209473, |
|
"rewards/token_reward": 0.02967032790184021, |
|
"step": 139 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 71.93955993652344, |
|
"epoch": 0.6829268292682927, |
|
"grad_norm": 2.5243974633393496, |
|
"kl": 0.046630859375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.1273, |
|
"reward": 0.9722238183021545, |
|
"reward_std": 0.18802900612354279, |
|
"rewards/format_reward": 0.18131862580776215, |
|
"rewards/judgement_reward": 0.768817126750946, |
|
"rewards/token_reward": 0.022087909281253815, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 77.4835205078125, |
|
"epoch": 0.6878048780487804, |
|
"grad_norm": 2.7374184261837455, |
|
"kl": 0.05029296875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0257, |
|
"reward": 0.9533888101577759, |
|
"reward_std": 0.19217993319034576, |
|
"rewards/format_reward": 0.17362630367279053, |
|
"rewards/judgement_reward": 0.7552569508552551, |
|
"rewards/token_reward": 0.02450549229979515, |
|
"step": 141 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 82.15384674072266, |
|
"epoch": 0.6926829268292682, |
|
"grad_norm": 1.882857912628275, |
|
"kl": 0.048583984375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.1109, |
|
"reward": 0.9248565435409546, |
|
"reward_std": 0.22721460461616516, |
|
"rewards/format_reward": 0.17032961547374725, |
|
"rewards/judgement_reward": 0.7072190642356873, |
|
"rewards/token_reward": 0.04730769246816635, |
|
"step": 142 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 81.5879135131836, |
|
"epoch": 0.697560975609756, |
|
"grad_norm": 1.9696456406180098, |
|
"kl": 0.0498046875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0738, |
|
"reward": 1.0071877241134644, |
|
"reward_std": 0.24609951674938202, |
|
"rewards/format_reward": 0.18021969497203827, |
|
"rewards/judgement_reward": 0.775978684425354, |
|
"rewards/token_reward": 0.05098900571465492, |
|
"step": 143 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 80.44506072998047, |
|
"epoch": 0.7024390243902439, |
|
"grad_norm": 1.9874995933520714, |
|
"kl": 0.046142578125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0414, |
|
"reward": 0.9923238754272461, |
|
"reward_std": 0.2381928414106369, |
|
"rewards/format_reward": 0.17252740263938904, |
|
"rewards/judgement_reward": 0.7843018770217896, |
|
"rewards/token_reward": 0.03549450263381004, |
|
"step": 144 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 79.11538696289062, |
|
"epoch": 0.7073170731707317, |
|
"grad_norm": 2.170568236496205, |
|
"kl": 0.044921875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0286, |
|
"reward": 1.019083857536316, |
|
"reward_std": 0.2215217649936676, |
|
"rewards/format_reward": 0.18131862580776215, |
|
"rewards/judgement_reward": 0.8043037056922913, |
|
"rewards/token_reward": 0.03346153721213341, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 81.14286041259766, |
|
"epoch": 0.7121951219512195, |
|
"grad_norm": 1.9765889361911162, |
|
"kl": 0.056640625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0658, |
|
"reward": 0.6368016004562378, |
|
"reward_std": 0.19664892554283142, |
|
"rewards/format_reward": 0.1450549066066742, |
|
"rewards/judgement_reward": 0.4633399248123169, |
|
"rewards/token_reward": 0.028406593948602676, |
|
"step": 146 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 82.33516693115234, |
|
"epoch": 0.7170731707317073, |
|
"grad_norm": 1.749209988774624, |
|
"kl": 0.0537109375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0663, |
|
"reward": 0.962835967540741, |
|
"reward_std": 0.2308100312948227, |
|
"rewards/format_reward": 0.17252741754055023, |
|
"rewards/judgement_reward": 0.7419567704200745, |
|
"rewards/token_reward": 0.04835164546966553, |
|
"step": 147 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 76.8901138305664, |
|
"epoch": 0.7219512195121951, |
|
"grad_norm": 2.0335137757386077, |
|
"kl": 0.091796875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0357, |
|
"reward": 0.9629313945770264, |
|
"reward_std": 0.20034556090831757, |
|
"rewards/format_reward": 0.18131862580776215, |
|
"rewards/judgement_reward": 0.7572720646858215, |
|
"rewards/token_reward": 0.024340655654668808, |
|
"step": 148 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 83.15933990478516, |
|
"epoch": 0.7268292682926829, |
|
"grad_norm": 1.7647654238864119, |
|
"kl": 0.051025390625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0586, |
|
"reward": 0.9167026877403259, |
|
"reward_std": 0.1917964667081833, |
|
"rewards/format_reward": 0.16373620927333832, |
|
"rewards/judgement_reward": 0.7133510112762451, |
|
"rewards/token_reward": 0.03961538150906563, |
|
"step": 149 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 83.3901138305664, |
|
"epoch": 0.7317073170731707, |
|
"grad_norm": 1.9248149745795777, |
|
"kl": 0.047119140625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.1216, |
|
"reward": 0.8943402767181396, |
|
"reward_std": 0.21739540994167328, |
|
"rewards/format_reward": 0.16813181340694427, |
|
"rewards/judgement_reward": 0.6763730645179749, |
|
"rewards/token_reward": 0.04983516409993172, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 80.5714340209961, |
|
"epoch": 0.7365853658536585, |
|
"grad_norm": 2.0080133802993836, |
|
"kl": 0.056884765625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0591, |
|
"reward": 0.967893660068512, |
|
"reward_std": 0.24371013045310974, |
|
"rewards/format_reward": 0.17252740263938904, |
|
"rewards/judgement_reward": 0.7508604526519775, |
|
"rewards/token_reward": 0.04450549930334091, |
|
"step": 151 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 79.76374053955078, |
|
"epoch": 0.7414634146341463, |
|
"grad_norm": 1.923607313408308, |
|
"kl": 0.044189453125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0686, |
|
"reward": 0.9229432940483093, |
|
"reward_std": 0.23856668174266815, |
|
"rewards/format_reward": 0.16923069953918457, |
|
"rewards/judgement_reward": 0.7147566676139832, |
|
"rewards/token_reward": 0.03895604610443115, |
|
"step": 152 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 80.60989379882812, |
|
"epoch": 0.7463414634146341, |
|
"grad_norm": 1.9096459984701761, |
|
"kl": 0.043212890625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0566, |
|
"reward": 1.0493299961090088, |
|
"reward_std": 0.22466498613357544, |
|
"rewards/format_reward": 0.18461531400680542, |
|
"rewards/judgement_reward": 0.8225165605545044, |
|
"rewards/token_reward": 0.04219780117273331, |
|
"step": 153 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.5988998413086, |
|
"epoch": 0.751219512195122, |
|
"grad_norm": 1.9835076997357493, |
|
"kl": 0.04150390625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0931, |
|
"reward": 0.8906086683273315, |
|
"reward_std": 0.14505840837955475, |
|
"rewards/format_reward": 0.1648351103067398, |
|
"rewards/judgement_reward": 0.7020372152328491, |
|
"rewards/token_reward": 0.023736264556646347, |
|
"step": 154 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 79.36264038085938, |
|
"epoch": 0.7560975609756098, |
|
"grad_norm": 1.840974141351389, |
|
"kl": 0.044921875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.091, |
|
"reward": 1.1236339807510376, |
|
"reward_std": 0.17279954254627228, |
|
"rewards/format_reward": 0.1868131309747696, |
|
"rewards/judgement_reward": 0.8986337780952454, |
|
"rewards/token_reward": 0.03818681463599205, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 79.02747344970703, |
|
"epoch": 0.7609756097560976, |
|
"grad_norm": 1.963792827769517, |
|
"kl": 0.049560546875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0361, |
|
"reward": 0.855387270450592, |
|
"reward_std": 0.22867831587791443, |
|
"rewards/format_reward": 0.16153840720653534, |
|
"rewards/judgement_reward": 0.6642332077026367, |
|
"rewards/token_reward": 0.029615381732583046, |
|
"step": 156 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 82.95055389404297, |
|
"epoch": 0.7658536585365854, |
|
"grad_norm": 1.9306677990628487, |
|
"kl": 0.04736328125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0329, |
|
"reward": 0.9254786372184753, |
|
"reward_std": 0.23398159444332123, |
|
"rewards/format_reward": 0.17032961547374725, |
|
"rewards/judgement_reward": 0.7126215100288391, |
|
"rewards/token_reward": 0.042527470737695694, |
|
"step": 157 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 78.71428680419922, |
|
"epoch": 0.7707317073170732, |
|
"grad_norm": 1.9488316654243145, |
|
"kl": 0.04541015625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0659, |
|
"reward": 1.0084419250488281, |
|
"reward_std": 0.18995091319084167, |
|
"rewards/format_reward": 0.18351641297340393, |
|
"rewards/judgement_reward": 0.7943759560585022, |
|
"rewards/token_reward": 0.03054944798350334, |
|
"step": 158 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 77.46154022216797, |
|
"epoch": 0.775609756097561, |
|
"grad_norm": 2.4866962371762926, |
|
"kl": 0.107421875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0118, |
|
"reward": 1.0291651487350464, |
|
"reward_std": 0.22909829020500183, |
|
"rewards/format_reward": 0.17032960057258606, |
|
"rewards/judgement_reward": 0.8225165605545044, |
|
"rewards/token_reward": 0.03631868213415146, |
|
"step": 159 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 78.51099395751953, |
|
"epoch": 0.7804878048780488, |
|
"grad_norm": 2.1399222908924966, |
|
"kl": 0.035888671875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.1147, |
|
"reward": 1.0689184665679932, |
|
"reward_std": 0.2075626105070114, |
|
"rewards/format_reward": 0.17912080883979797, |
|
"rewards/judgement_reward": 0.8432589173316956, |
|
"rewards/token_reward": 0.04653845727443695, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 77.28571319580078, |
|
"epoch": 0.7853658536585366, |
|
"grad_norm": 2.0654519129717954, |
|
"kl": 0.038818359375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0747, |
|
"reward": 1.0535253286361694, |
|
"reward_std": 0.1914973109960556, |
|
"rewards/format_reward": 0.18021972477436066, |
|
"rewards/judgement_reward": 0.8373165726661682, |
|
"rewards/token_reward": 0.03598900884389877, |
|
"step": 161 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 75.79121398925781, |
|
"epoch": 0.7902439024390244, |
|
"grad_norm": 1.8597487323795248, |
|
"kl": 0.037353515625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0976, |
|
"reward": 0.981767475605011, |
|
"reward_std": 0.15387368202209473, |
|
"rewards/format_reward": 0.17142850160598755, |
|
"rewards/judgement_reward": 0.7899540662765503, |
|
"rewards/token_reward": 0.020384615287184715, |
|
"step": 162 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 79.78571319580078, |
|
"epoch": 0.7951219512195122, |
|
"grad_norm": 1.8877094518624145, |
|
"kl": 0.0517578125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0441, |
|
"reward": 0.611198365688324, |
|
"reward_std": 0.23072822391986847, |
|
"rewards/format_reward": 0.14065930247306824, |
|
"rewards/judgement_reward": 0.431198388338089, |
|
"rewards/token_reward": 0.03934066370129585, |
|
"step": 163 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 82.0054931640625, |
|
"epoch": 0.8, |
|
"grad_norm": 1.7914504262564466, |
|
"kl": 0.049072265625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0773, |
|
"reward": 1.0531319379806519, |
|
"reward_std": 0.203482985496521, |
|
"rewards/format_reward": 0.1758241206407547, |
|
"rewards/judgement_reward": 0.8310439586639404, |
|
"rewards/token_reward": 0.04626372829079628, |
|
"step": 164 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 79.20330047607422, |
|
"epoch": 0.8048780487804879, |
|
"grad_norm": 1.8591700696334572, |
|
"kl": 0.03662109375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0489, |
|
"reward": 1.1166560649871826, |
|
"reward_std": 0.1697649508714676, |
|
"rewards/format_reward": 0.1857142299413681, |
|
"rewards/judgement_reward": 0.8986338973045349, |
|
"rewards/token_reward": 0.032307688146829605, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 83.22527313232422, |
|
"epoch": 0.8097560975609757, |
|
"grad_norm": 2.214129107678454, |
|
"kl": 0.052978515625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0263, |
|
"reward": 0.6829060912132263, |
|
"reward_std": 0.25708603858947754, |
|
"rewards/format_reward": 0.14285710453987122, |
|
"rewards/judgement_reward": 0.5113676190376282, |
|
"rewards/token_reward": 0.0286813173443079, |
|
"step": 166 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 80.74725341796875, |
|
"epoch": 0.8146341463414634, |
|
"grad_norm": 1.8131229003189935, |
|
"kl": 0.048095703125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0818, |
|
"reward": 0.8743994832038879, |
|
"reward_std": 0.16046012938022614, |
|
"rewards/format_reward": 0.1648351103067398, |
|
"rewards/judgement_reward": 0.672916054725647, |
|
"rewards/token_reward": 0.03664834797382355, |
|
"step": 167 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 79.55494689941406, |
|
"epoch": 0.8195121951219512, |
|
"grad_norm": 1.8955641226851798, |
|
"kl": 0.0556640625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0973, |
|
"reward": 1.104522705078125, |
|
"reward_std": 0.19338937103748322, |
|
"rewards/format_reward": 0.18241751194000244, |
|
"rewards/judgement_reward": 0.8802368640899658, |
|
"rewards/token_reward": 0.04186813160777092, |
|
"step": 168 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 84.85165405273438, |
|
"epoch": 0.824390243902439, |
|
"grad_norm": 1.7813215626916248, |
|
"kl": 0.05029296875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0779, |
|
"reward": 0.8853285312652588, |
|
"reward_std": 0.22625528275966644, |
|
"rewards/format_reward": 0.1659340113401413, |
|
"rewards/judgement_reward": 0.6722515225410461, |
|
"rewards/token_reward": 0.04714285209774971, |
|
"step": 169 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 80.70879364013672, |
|
"epoch": 0.8292682926829268, |
|
"grad_norm": 1.8224025120445444, |
|
"kl": 0.05224609375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.1272, |
|
"reward": 1.0830038785934448, |
|
"reward_std": 0.1721232831478119, |
|
"rewards/format_reward": 0.18131862580776215, |
|
"rewards/judgement_reward": 0.858058750629425, |
|
"rewards/token_reward": 0.043626368045806885, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 79.5934066772461, |
|
"epoch": 0.8341463414634146, |
|
"grad_norm": 1.8203230311858847, |
|
"kl": 0.050048828125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.1045, |
|
"reward": 1.1872518062591553, |
|
"reward_std": 0.12512442469596863, |
|
"rewards/format_reward": 0.1879120171070099, |
|
"rewards/judgement_reward": 0.9585154056549072, |
|
"rewards/token_reward": 0.040824174880981445, |
|
"step": 171 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 83.5988998413086, |
|
"epoch": 0.8390243902439024, |
|
"grad_norm": 1.8580888818259809, |
|
"kl": 0.06591796875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.012, |
|
"reward": 0.7364369034767151, |
|
"reward_std": 0.24575328826904297, |
|
"rewards/format_reward": 0.14945051074028015, |
|
"rewards/judgement_reward": 0.5472610592842102, |
|
"rewards/token_reward": 0.039725273847579956, |
|
"step": 172 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 79.8901138305664, |
|
"epoch": 0.8439024390243902, |
|
"grad_norm": 1.9076522374281615, |
|
"kl": 0.0673828125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0474, |
|
"reward": 0.700067400932312, |
|
"reward_std": 0.17015255987644196, |
|
"rewards/format_reward": 0.15714280307292938, |
|
"rewards/judgement_reward": 0.5104518532752991, |
|
"rewards/token_reward": 0.03247252479195595, |
|
"step": 173 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 81.93406677246094, |
|
"epoch": 0.848780487804878, |
|
"grad_norm": 1.9086822135882167, |
|
"kl": 0.06396484375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.099, |
|
"reward": 0.841740071773529, |
|
"reward_std": 0.18049749732017517, |
|
"rewards/format_reward": 0.16263730823993683, |
|
"rewards/judgement_reward": 0.6345422863960266, |
|
"rewards/token_reward": 0.04456043988466263, |
|
"step": 174 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 83.34066009521484, |
|
"epoch": 0.8536585365853658, |
|
"grad_norm": 1.8635805588199053, |
|
"kl": 0.055419921875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0344, |
|
"reward": 0.9539201855659485, |
|
"reward_std": 0.20264722406864166, |
|
"rewards/format_reward": 0.16813179850578308, |
|
"rewards/judgement_reward": 0.7532058954238892, |
|
"rewards/token_reward": 0.03258241340517998, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 82.26374053955078, |
|
"epoch": 0.8585365853658536, |
|
"grad_norm": 2.0330839117251336, |
|
"kl": 0.361328125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0225, |
|
"reward": 0.918322741985321, |
|
"reward_std": 0.2709953188896179, |
|
"rewards/format_reward": 0.17802190780639648, |
|
"rewards/judgement_reward": 0.706509530544281, |
|
"rewards/token_reward": 0.033791206777095795, |
|
"step": 176 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 83.0824203491211, |
|
"epoch": 0.8634146341463415, |
|
"grad_norm": 1.7553145835440749, |
|
"kl": 0.04833984375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0939, |
|
"reward": 1.1063908338546753, |
|
"reward_std": 0.17723402380943298, |
|
"rewards/format_reward": 0.18241752684116364, |
|
"rewards/judgement_reward": 0.8802370429039001, |
|
"rewards/token_reward": 0.04373626038432121, |
|
"step": 177 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 80.05494689941406, |
|
"epoch": 0.8682926829268293, |
|
"grad_norm": 1.7701802023624165, |
|
"kl": 0.0478515625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0524, |
|
"reward": 1.1443761587142944, |
|
"reward_std": 0.151431143283844, |
|
"rewards/format_reward": 0.18791203200817108, |
|
"rewards/judgement_reward": 0.9193762540817261, |
|
"rewards/token_reward": 0.03708790987730026, |
|
"step": 178 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 76.87911987304688, |
|
"epoch": 0.8731707317073171, |
|
"grad_norm": 1.948906301032531, |
|
"kl": 0.04833984375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0406, |
|
"reward": 1.0914630889892578, |
|
"reward_std": 0.17707206308841705, |
|
"rewards/format_reward": 0.1857142299413681, |
|
"rewards/judgement_reward": 0.8778916597366333, |
|
"rewards/token_reward": 0.027857141569256783, |
|
"step": 179 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 80.11538696289062, |
|
"epoch": 0.8780487804878049, |
|
"grad_norm": 1.9707362441929812, |
|
"kl": 0.041748046875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0719, |
|
"reward": 0.8997630476951599, |
|
"reward_std": 0.19621287286281586, |
|
"rewards/format_reward": 0.17252741754055023, |
|
"rewards/judgement_reward": 0.6895432472229004, |
|
"rewards/token_reward": 0.03769230842590332, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 78.06593322753906, |
|
"epoch": 0.8829268292682927, |
|
"grad_norm": 2.1153280959093315, |
|
"kl": 0.04931640625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.1035, |
|
"reward": 0.8650724291801453, |
|
"reward_std": 0.18447832763195038, |
|
"rewards/format_reward": 0.16153840720653534, |
|
"rewards/judgement_reward": 0.6753472089767456, |
|
"rewards/token_reward": 0.02818680927157402, |
|
"step": 181 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 77.80220031738281, |
|
"epoch": 0.8878048780487805, |
|
"grad_norm": 1.8505179859150593, |
|
"kl": 0.10546875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.1083, |
|
"reward": 1.0732321739196777, |
|
"reward_std": 0.15069392323493958, |
|
"rewards/format_reward": 0.18131862580776215, |
|
"rewards/judgement_reward": 0.8640012145042419, |
|
"rewards/token_reward": 0.027912087738513947, |
|
"step": 182 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 82.72527313232422, |
|
"epoch": 0.8926829268292683, |
|
"grad_norm": 1.7933498219789568, |
|
"kl": 0.1640625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0539, |
|
"reward": 1.0424714088439941, |
|
"reward_std": 0.25467538833618164, |
|
"rewards/format_reward": 0.18351641297340393, |
|
"rewards/judgement_reward": 0.8041195869445801, |
|
"rewards/token_reward": 0.05483516305685043, |
|
"step": 183 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 83.67582702636719, |
|
"epoch": 0.8975609756097561, |
|
"grad_norm": 2.600363026488421, |
|
"kl": 0.04931640625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0476, |
|
"reward": 0.755411684513092, |
|
"reward_std": 0.23074375092983246, |
|
"rewards/format_reward": 0.15714281797409058, |
|
"rewards/judgement_reward": 0.5580490231513977, |
|
"rewards/token_reward": 0.040219780057668686, |
|
"step": 184 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 84.43955993652344, |
|
"epoch": 0.9024390243902439, |
|
"grad_norm": 1.7627204492703208, |
|
"kl": 0.0537109375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0844, |
|
"reward": 1.1599979400634766, |
|
"reward_std": 0.17514334619045258, |
|
"rewards/format_reward": 0.18901091814041138, |
|
"rewards/judgement_reward": 0.9170307517051697, |
|
"rewards/token_reward": 0.0539560429751873, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 78.25824737548828, |
|
"epoch": 0.9073170731707317, |
|
"grad_norm": 2.0241371886975035, |
|
"kl": 0.05517578125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.1234, |
|
"reward": 1.0605839490890503, |
|
"reward_std": 0.17436476051807404, |
|
"rewards/format_reward": 0.18131862580776215, |
|
"rewards/judgement_reward": 0.8380013704299927, |
|
"rewards/token_reward": 0.04126373305916786, |
|
"step": 186 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 83.9011001586914, |
|
"epoch": 0.9121951219512195, |
|
"grad_norm": 1.7110207760725646, |
|
"kl": 0.055419921875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0842, |
|
"reward": 0.7558942437171936, |
|
"reward_std": 0.18241019546985626, |
|
"rewards/format_reward": 0.1549450010061264, |
|
"rewards/judgement_reward": 0.5489163398742676, |
|
"rewards/token_reward": 0.052032966166734695, |
|
"step": 187 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 80.96154022216797, |
|
"epoch": 0.9170731707317074, |
|
"grad_norm": 1.7123226348250025, |
|
"kl": 0.26171875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.1222, |
|
"reward": 0.8997620940208435, |
|
"reward_std": 0.2118684947490692, |
|
"rewards/format_reward": 0.17142850160598755, |
|
"rewards/judgement_reward": 0.6890478134155273, |
|
"rewards/token_reward": 0.03928571194410324, |
|
"step": 188 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 80.17033386230469, |
|
"epoch": 0.9219512195121952, |
|
"grad_norm": 1.830366489112101, |
|
"kl": 0.060302734375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0873, |
|
"reward": 1.1136815547943115, |
|
"reward_std": 0.16834889352321625, |
|
"rewards/format_reward": 0.18131859600543976, |
|
"rewards/judgement_reward": 0.897197961807251, |
|
"rewards/token_reward": 0.035164833068847656, |
|
"step": 189 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 85.31318664550781, |
|
"epoch": 0.926829268292683, |
|
"grad_norm": 1.6347844224064585, |
|
"kl": 0.059814453125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0761, |
|
"reward": 1.1308867931365967, |
|
"reward_std": 0.17480330169200897, |
|
"rewards/format_reward": 0.18241752684116364, |
|
"rewards/judgement_reward": 0.8986338973045349, |
|
"rewards/token_reward": 0.04983516409993172, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 84.97802734375, |
|
"epoch": 0.9317073170731708, |
|
"grad_norm": 1.6920004921526572, |
|
"kl": 0.05615234375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0895, |
|
"reward": 1.003623604774475, |
|
"reward_std": 0.17075157165527344, |
|
"rewards/format_reward": 0.17472520470619202, |
|
"rewards/judgement_reward": 0.7810961008071899, |
|
"rewards/token_reward": 0.04780219867825508, |
|
"step": 191 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 84.5714340209961, |
|
"epoch": 0.9365853658536586, |
|
"grad_norm": 2.022501703444944, |
|
"kl": 0.08837890625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.039, |
|
"reward": 0.8640207648277283, |
|
"reward_std": 0.23843564093112946, |
|
"rewards/format_reward": 0.16043950617313385, |
|
"rewards/judgement_reward": 0.6474823355674744, |
|
"rewards/token_reward": 0.056098904460668564, |
|
"step": 192 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 84.76923370361328, |
|
"epoch": 0.9414634146341463, |
|
"grad_norm": 1.6540302758951702, |
|
"kl": 0.058349609375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0698, |
|
"reward": 1.0908318758010864, |
|
"reward_std": 0.18835541605949402, |
|
"rewards/format_reward": 0.18241751194000244, |
|
"rewards/judgement_reward": 0.8616559505462646, |
|
"rewards/token_reward": 0.046758245676755905, |
|
"step": 193 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 86.17033386230469, |
|
"epoch": 0.9463414634146341, |
|
"grad_norm": 2.029218121152698, |
|
"kl": 0.06591796875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0222, |
|
"reward": 0.7869437336921692, |
|
"reward_std": 0.28270089626312256, |
|
"rewards/format_reward": 0.16263730823993683, |
|
"rewards/judgement_reward": 0.5819985866546631, |
|
"rewards/token_reward": 0.042307689785957336, |
|
"step": 194 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 82.43955993652344, |
|
"epoch": 0.9512195121951219, |
|
"grad_norm": 1.9170587116508577, |
|
"kl": 0.061767578125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0761, |
|
"reward": 1.1304645538330078, |
|
"reward_std": 0.1845540553331375, |
|
"rewards/format_reward": 0.18791203200817108, |
|
"rewards/judgement_reward": 0.8962885737419128, |
|
"rewards/token_reward": 0.046263739466667175, |
|
"step": 195 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 85.04396057128906, |
|
"epoch": 0.9560975609756097, |
|
"grad_norm": 1.7642610809126764, |
|
"kl": 0.053466796875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0974, |
|
"reward": 1.0226733684539795, |
|
"reward_std": 0.23680001497268677, |
|
"rewards/format_reward": 0.18241751194000244, |
|
"rewards/judgement_reward": 0.7795965075492859, |
|
"rewards/token_reward": 0.06065933778882027, |
|
"step": 196 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 82.42857360839844, |
|
"epoch": 0.9609756097560975, |
|
"grad_norm": 1.8138413422032649, |
|
"kl": 0.05712890625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0532, |
|
"reward": 0.8977496027946472, |
|
"reward_std": 0.20727433264255524, |
|
"rewards/format_reward": 0.17032960057258606, |
|
"rewards/judgement_reward": 0.6894528269767761, |
|
"rewards/token_reward": 0.03796703368425369, |
|
"step": 197 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 86.25274658203125, |
|
"epoch": 0.9658536585365853, |
|
"grad_norm": 1.896125990017999, |
|
"kl": 0.053955078125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0677, |
|
"reward": 0.8915479183197021, |
|
"reward_std": 0.2226872593164444, |
|
"rewards/format_reward": 0.16153840720653534, |
|
"rewards/judgement_reward": 0.6870973110198975, |
|
"rewards/token_reward": 0.042912084609270096, |
|
"step": 198 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 79.41758728027344, |
|
"epoch": 0.9707317073170731, |
|
"grad_norm": 1.854570297914855, |
|
"kl": 0.048828125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0345, |
|
"reward": 0.7600922584533691, |
|
"reward_std": 0.22392849624156952, |
|
"rewards/format_reward": 0.1560439020395279, |
|
"rewards/judgement_reward": 0.575421929359436, |
|
"rewards/token_reward": 0.028626374900341034, |
|
"step": 199 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 82.27472686767578, |
|
"epoch": 0.975609756097561, |
|
"grad_norm": 1.8461878166271504, |
|
"kl": 0.05322265625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0727, |
|
"reward": 0.5569639205932617, |
|
"reward_std": 0.14818796515464783, |
|
"rewards/format_reward": 0.1340659111738205, |
|
"rewards/judgement_reward": 0.3882276713848114, |
|
"rewards/token_reward": 0.034670326858758926, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 79.75824737548828, |
|
"epoch": 0.9804878048780488, |
|
"grad_norm": 2.318927168904253, |
|
"kl": 0.056396484375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0058, |
|
"reward": 0.5673214197158813, |
|
"reward_std": 0.23380212485790253, |
|
"rewards/format_reward": 0.14285710453987122, |
|
"rewards/judgement_reward": 0.393749862909317, |
|
"rewards/token_reward": 0.030714284628629684, |
|
"step": 201 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 80.66484069824219, |
|
"epoch": 0.9853658536585366, |
|
"grad_norm": 1.9516131221905255, |
|
"kl": 0.051513671875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.1456, |
|
"reward": 1.2027462720870972, |
|
"reward_std": 0.13449910283088684, |
|
"rewards/format_reward": 0.19230760633945465, |
|
"rewards/judgement_reward": 0.9585154056549072, |
|
"rewards/token_reward": 0.051923077553510666, |
|
"step": 202 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 77.9065933227539, |
|
"epoch": 0.9902439024390244, |
|
"grad_norm": 1.9128695892010321, |
|
"kl": 0.052734375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0288, |
|
"reward": 0.8996109366416931, |
|
"reward_std": 0.25487908720970154, |
|
"rewards/format_reward": 0.17362631857395172, |
|
"rewards/judgement_reward": 0.7061491012573242, |
|
"rewards/token_reward": 0.01983516290783882, |
|
"step": 203 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 80.14286041259766, |
|
"epoch": 0.9951219512195122, |
|
"grad_norm": 1.7460376924567018, |
|
"kl": 0.058349609375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0686, |
|
"reward": 0.9356229305267334, |
|
"reward_std": 0.17833828926086426, |
|
"rewards/format_reward": 0.17032961547374725, |
|
"rewards/judgement_reward": 0.7273810505867004, |
|
"rewards/token_reward": 0.03791208565235138, |
|
"step": 204 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 80.55555725097656, |
|
"epoch": 1.0, |
|
"grad_norm": 1.8130933404400773, |
|
"kl": 0.051513671875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0604, |
|
"reward": 0.9366883635520935, |
|
"reward_std": 0.1746482402086258, |
|
"rewards/format_reward": 0.1560439020395279, |
|
"rewards/judgement_reward": 0.7467983365058899, |
|
"rewards/token_reward": 0.03384615480899811, |
|
"step": 205 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 205, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 26, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |