{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 625, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 7.936507936507937e-08, "logits/chosen": 0.3974232077598572, "logits/rejected": 0.3553540110588074, "logps/chosen": -777.8718872070312, "logps/rejected": -1263.3857421875, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 7.936507936507937e-07, "logits/chosen": 0.2189813256263733, "logits/rejected": 0.11798671633005142, "logps/chosen": -1287.5775146484375, "logps/rejected": -2137.94189453125, "loss": 0.4998, "rewards/accuracies": 0.4236111044883728, "rewards/chosen": 0.001081566559150815, "rewards/margins": 0.002832952421158552, "rewards/rejected": -0.0017513858620077372, "step": 10 }, { "epoch": 0.03, "learning_rate": 1.5873015873015873e-06, "logits/chosen": 0.254694402217865, "logits/rejected": 0.17226830124855042, "logps/chosen": -1060.751708984375, "logps/rejected": -1972.5423583984375, "loss": 0.4987, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.01862182281911373, "rewards/margins": 0.011560038663446903, "rewards/rejected": -0.03018186055123806, "step": 20 }, { "epoch": 0.05, "learning_rate": 2.380952380952381e-06, "logits/chosen": 0.24693536758422852, "logits/rejected": 0.14691275358200073, "logps/chosen": -1164.8314208984375, "logps/rejected": -2265.688232421875, "loss": 0.4942, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.06745004653930664, "rewards/margins": 0.06839548051357269, "rewards/rejected": -0.13584552705287933, "step": 30 }, { "epoch": 0.06, "learning_rate": 3.1746031746031746e-06, "logits/chosen": 0.2351571023464203, "logits/rejected": 0.08826713263988495, "logps/chosen": -1165.0933837890625, "logps/rejected": -2538.54443359375, "loss": 0.4882, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.18789814412593842, "rewards/margins": 0.15809166431427002, "rewards/rejected": -0.34598982334136963, "step": 40 }, { "epoch": 0.08, "learning_rate": 3.968253968253968e-06, "logits/chosen": 0.21212966740131378, "logits/rejected": 0.055394046008586884, "logps/chosen": -1518.728271484375, "logps/rejected": -2687.27783203125, "loss": 0.4803, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.5128198862075806, "rewards/margins": 0.28584352135658264, "rewards/rejected": -0.7986633777618408, "step": 50 }, { "epoch": 0.1, "learning_rate": 4.761904761904762e-06, "logits/chosen": 0.14459244906902313, "logits/rejected": 0.03840586543083191, "logps/chosen": -2544.2294921875, "logps/rejected": -3382.06640625, "loss": 0.4881, "rewards/accuracies": 0.5, "rewards/chosen": -1.1230318546295166, "rewards/margins": 0.08744711428880692, "rewards/rejected": -1.2104789018630981, "step": 60 }, { "epoch": 0.11, "learning_rate": 4.998086282661188e-06, "logits/chosen": 0.03217538818717003, "logits/rejected": -0.029030317440629005, "logps/chosen": -2578.112060546875, "logps/rejected": -3333.111328125, "loss": 0.4889, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2647404670715332, "rewards/margins": 0.06161295250058174, "rewards/rejected": -1.3263534307479858, "step": 70 }, { "epoch": 0.13, "learning_rate": 4.988720025682995e-06, "logits/chosen": 0.1540413349866867, "logits/rejected": 0.00599607964977622, "logps/chosen": -2171.90966796875, "logps/rejected": -3310.91259765625, "loss": 0.4818, "rewards/accuracies": 0.53125, "rewards/chosen": -1.0253573656082153, "rewards/margins": 0.32645487785339355, "rewards/rejected": -1.3518123626708984, "step": 80 }, { "epoch": 0.14, "learning_rate": 4.9715789537359126e-06, "logits/chosen": 0.18023057281970978, "logits/rejected": -0.0267815999686718, "logps/chosen": -2072.573974609375, "logps/rejected": -3439.62744140625, "loss": 0.477, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.79749995470047, "rewards/margins": 0.3549983501434326, "rewards/rejected": -1.152498483657837, "step": 90 }, { "epoch": 0.16, "learning_rate": 4.946716615897932e-06, "logits/chosen": 0.27958863973617554, "logits/rejected": 0.08261282742023468, "logps/chosen": -1780.5172119140625, "logps/rejected": -2961.440185546875, "loss": 0.4749, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.6027330160140991, "rewards/margins": 0.38158130645751953, "rewards/rejected": -0.9843141436576843, "step": 100 }, { "epoch": 0.18, "learning_rate": 4.9142106826480114e-06, "logits/chosen": 0.139088436961174, "logits/rejected": 0.03682307153940201, "logps/chosen": -2555.55810546875, "logps/rejected": -3190.896240234375, "loss": 0.4867, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.9442774653434753, "rewards/margins": 0.06666886806488037, "rewards/rejected": -1.0109463930130005, "step": 110 }, { "epoch": 0.19, "learning_rate": 4.874162703221823e-06, "logits/chosen": 0.1611190289258957, "logits/rejected": -0.012361553497612476, "logps/chosen": -2050.61865234375, "logps/rejected": -3838.018798828125, "loss": 0.4732, "rewards/accuracies": 0.59375, "rewards/chosen": -0.9395005106925964, "rewards/margins": 0.6280331015586853, "rewards/rejected": -1.5675336122512817, "step": 120 }, { "epoch": 0.21, "learning_rate": 4.826697788369752e-06, "logits/chosen": 0.09378266334533691, "logits/rejected": -0.05030001327395439, "logps/chosen": -2598.26953125, "logps/rejected": -3382.542236328125, "loss": 0.479, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1976029872894287, "rewards/margins": 0.14105060696601868, "rewards/rejected": -1.338653564453125, "step": 130 }, { "epoch": 0.22, "learning_rate": 4.7719642195082224e-06, "logits/chosen": 0.14611801505088806, "logits/rejected": 0.03608284890651703, "logps/chosen": -1731.415771484375, "logps/rejected": -2871.3349609375, "loss": 0.4783, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.7977763414382935, "rewards/margins": 0.4263841211795807, "rewards/rejected": -1.2241604328155518, "step": 140 }, { "epoch": 0.24, "learning_rate": 4.710132985485355e-06, "logits/chosen": 0.1052960604429245, "logits/rejected": -0.002506089163944125, "logps/chosen": -1893.218505859375, "logps/rejected": -2854.30859375, "loss": 0.4777, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6661534905433655, "rewards/margins": 0.35293903946876526, "rewards/rejected": -1.0190925598144531, "step": 150 }, { "epoch": 0.26, "learning_rate": 4.641397248408122e-06, "logits/chosen": 0.017198827117681503, "logits/rejected": -0.11535916477441788, "logps/chosen": -2269.91650390625, "logps/rejected": -3919.4375, "loss": 0.4746, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.9236265420913696, "rewards/margins": 0.3831055164337158, "rewards/rejected": -1.306731939315796, "step": 160 }, { "epoch": 0.27, "learning_rate": 4.5659717401997655e-06, "logits/chosen": 0.01966998353600502, "logits/rejected": -0.08529923856258392, "logps/chosen": -2126.437255859375, "logps/rejected": -3403.6328125, "loss": 0.4795, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.0373507738113403, "rewards/margins": 0.41469430923461914, "rewards/rejected": -1.452045202255249, "step": 170 }, { "epoch": 0.29, "learning_rate": 4.4840920917726425e-06, "logits/chosen": 0.05112285539507866, "logits/rejected": -0.052227288484573364, "logps/chosen": -2470.00537109375, "logps/rejected": -3359.12109375, "loss": 0.4825, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1501070261001587, "rewards/margins": 0.24429550766944885, "rewards/rejected": -1.3944026231765747, "step": 180 }, { "epoch": 0.3, "learning_rate": 4.396014096912182e-06, "logits/chosen": 0.04223916679620743, "logits/rejected": -0.13196751475334167, "logps/chosen": -2119.87841796875, "logps/rejected": -3257.087158203125, "loss": 0.4842, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.0418171882629395, "rewards/margins": 0.2414085417985916, "rewards/rejected": -1.2832257747650146, "step": 190 }, { "epoch": 0.32, "learning_rate": 4.302012913171584e-06, "logits/chosen": 0.01791330613195896, "logits/rejected": -0.14596834778785706, "logps/chosen": -2362.22314453125, "logps/rejected": -3574.25927734375, "loss": 0.4675, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.061951994895935, "rewards/margins": 0.25661829113960266, "rewards/rejected": -1.3185702562332153, "step": 200 }, { "epoch": 0.34, "learning_rate": 4.202382202273702e-06, "logits/chosen": 0.012257062830030918, "logits/rejected": -0.15737880766391754, "logps/chosen": -2264.546630859375, "logps/rejected": -4172.1708984375, "loss": 0.4655, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0106382369995117, "rewards/margins": 0.853549599647522, "rewards/rejected": -1.8641879558563232, "step": 210 }, { "epoch": 0.35, "learning_rate": 4.097433212705492e-06, "logits/chosen": -0.0625365823507309, "logits/rejected": -0.27418404817581177, "logps/chosen": -2211.255615234375, "logps/rejected": -4339.79638671875, "loss": 0.4675, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0741349458694458, "rewards/margins": 0.7618507742881775, "rewards/rejected": -1.835985779762268, "step": 220 }, { "epoch": 0.37, "learning_rate": 3.987493807371033e-06, "logits/chosen": 0.07172416150569916, "logits/rejected": 0.0005642950418405235, "logps/chosen": -2197.752197265625, "logps/rejected": -3114.567626953125, "loss": 0.471, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.9756608009338379, "rewards/margins": 0.24378642439842224, "rewards/rejected": -1.2194470167160034, "step": 230 }, { "epoch": 0.38, "learning_rate": 3.872907439340758e-06, "logits/chosen": 0.01632564514875412, "logits/rejected": -0.18748557567596436, "logps/chosen": -2377.099853515625, "logps/rejected": -4215.26953125, "loss": 0.4747, "rewards/accuracies": 0.5625, "rewards/chosen": -1.054962396621704, "rewards/margins": 0.6837267279624939, "rewards/rejected": -1.7386891841888428, "step": 240 }, { "epoch": 0.4, "learning_rate": 3.75403207889666e-06, "logits/chosen": -0.012413917109370232, "logits/rejected": -0.13336405158042908, "logps/chosen": -2371.0888671875, "logps/rejected": -3082.55322265625, "loss": 0.4731, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.9990957379341125, "rewards/margins": 0.2219332903623581, "rewards/rejected": -1.2210289239883423, "step": 250 }, { "epoch": 0.42, "learning_rate": 3.631239095225417e-06, "logits/chosen": 0.007460703607648611, "logits/rejected": -0.1302846223115921, "logps/chosen": -2123.05810546875, "logps/rejected": -3470.561279296875, "loss": 0.4727, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.0149245262145996, "rewards/margins": 0.44816678762435913, "rewards/rejected": -1.463091254234314, "step": 260 }, { "epoch": 0.43, "learning_rate": 3.5049120962530608e-06, "logits/chosen": -0.0549989752471447, "logits/rejected": -0.2491796910762787, "logps/chosen": -2456.557861328125, "logps/rejected": -4269.2412109375, "loss": 0.4724, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.0748237371444702, "rewards/margins": 0.7018817067146301, "rewards/rejected": -1.7767053842544556, "step": 270 }, { "epoch": 0.45, "learning_rate": 3.3754457302455464e-06, "logits/chosen": 0.028259318321943283, "logits/rejected": -0.15549519658088684, "logps/chosen": -2343.92724609375, "logps/rejected": -3815.20751953125, "loss": 0.4775, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.9266031384468079, "rewards/margins": 0.4884551167488098, "rewards/rejected": -1.4150583744049072, "step": 280 }, { "epoch": 0.46, "learning_rate": 3.2432444529190714e-06, "logits/chosen": 0.09060511738061905, "logits/rejected": -0.13999487459659576, "logps/chosen": -1729.5335693359375, "logps/rejected": -3556.65771484375, "loss": 0.4629, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.5673225522041321, "rewards/margins": 0.5148328542709351, "rewards/rejected": -1.082155466079712, "step": 290 }, { "epoch": 0.48, "learning_rate": 3.1087212639117057e-06, "logits/chosen": 0.10731784254312515, "logits/rejected": -0.12288031727075577, "logps/chosen": -1606.456787109375, "logps/rejected": -3169.525146484375, "loss": 0.4802, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.47551971673965454, "rewards/margins": 0.5530696511268616, "rewards/rejected": -1.0285893678665161, "step": 300 }, { "epoch": 0.5, "learning_rate": 2.9722964165636263e-06, "logits/chosen": -0.040035147219896317, "logits/rejected": -0.13161209225654602, "logps/chosen": -2169.19677734375, "logps/rejected": -3303.66064453125, "loss": 0.4767, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.8213248252868652, "rewards/margins": 0.4187691807746887, "rewards/rejected": -1.2400939464569092, "step": 310 }, { "epoch": 0.51, "learning_rate": 2.8343961050366275e-06, "logits/chosen": -0.0054204524494707584, "logits/rejected": -0.2861900329589844, "logps/chosen": -1592.34375, "logps/rejected": -3716.65869140625, "loss": 0.4721, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.7492900490760803, "rewards/margins": 0.8597043752670288, "rewards/rejected": -1.608994483947754, "step": 320 }, { "epoch": 0.53, "learning_rate": 2.695451132874385e-06, "logits/chosen": -0.1356322020292282, "logits/rejected": -0.292384535074234, "logps/chosen": -1807.0335693359375, "logps/rejected": -3166.426513671875, "loss": 0.48, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.7727370858192444, "rewards/margins": 0.4709666669368744, "rewards/rejected": -1.2437037229537964, "step": 330 }, { "epoch": 0.54, "learning_rate": 2.5558955671628964e-06, "logits/chosen": -0.07628178596496582, "logits/rejected": -0.20791587233543396, "logps/chosen": -1737.909423828125, "logps/rejected": -3203.9970703125, "loss": 0.4668, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.7647561430931091, "rewards/margins": 0.49256449937820435, "rewards/rejected": -1.2573206424713135, "step": 340 }, { "epoch": 0.56, "learning_rate": 2.4161653824955654e-06, "logits/chosen": -0.05267338082194328, "logits/rejected": -0.25987547636032104, "logps/chosen": -2673.15380859375, "logps/rejected": -3792.9609375, "loss": 0.4653, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1372772455215454, "rewards/margins": 0.37971851229667664, "rewards/rejected": -1.5169956684112549, "step": 350 }, { "epoch": 0.58, "learning_rate": 2.2766970989791697e-06, "logits/chosen": -0.15440881252288818, "logits/rejected": -0.3189676105976105, "logps/chosen": -2074.7294921875, "logps/rejected": -3670.177001953125, "loss": 0.47, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.8276017904281616, "rewards/margins": 0.6911835670471191, "rewards/rejected": -1.5187852382659912, "step": 360 }, { "epoch": 0.59, "learning_rate": 2.1379264185356545e-06, "logits/chosen": -0.03989617899060249, "logits/rejected": -0.29078492522239685, "logps/chosen": -2240.99951171875, "logps/rejected": -3693.21875, "loss": 0.4675, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.982672393321991, "rewards/margins": 0.5129915475845337, "rewards/rejected": -1.4956640005111694, "step": 370 }, { "epoch": 0.61, "learning_rate": 2.000286863759934e-06, "logits/chosen": -0.18141961097717285, "logits/rejected": -0.34517520666122437, "logps/chosen": -2447.95458984375, "logps/rejected": -4179.51220703125, "loss": 0.4721, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.249446153640747, "rewards/margins": 0.7962532639503479, "rewards/rejected": -2.04569935798645, "step": 380 }, { "epoch": 0.62, "learning_rate": 1.8642084235859764e-06, "logits/chosen": -0.10085698217153549, "logits/rejected": -0.4116978645324707, "logps/chosen": -2039.9761962890625, "logps/rejected": -4429.87451171875, "loss": 0.4625, "rewards/accuracies": 0.625, "rewards/chosen": -1.0368822813034058, "rewards/margins": 1.2251580953598022, "rewards/rejected": -2.262040615081787, "step": 390 }, { "epoch": 0.64, "learning_rate": 1.7301162099921013e-06, "logits/chosen": -0.07682979851961136, "logits/rejected": -0.35371267795562744, "logps/chosen": -2304.14306640625, "logps/rejected": -3896.870361328125, "loss": 0.4642, "rewards/accuracies": 0.5625, "rewards/chosen": -1.131152868270874, "rewards/margins": 0.7709552645683289, "rewards/rejected": -1.9021081924438477, "step": 400 }, { "epoch": 0.66, "learning_rate": 1.5984291299420117e-06, "logits/chosen": -0.07511943578720093, "logits/rejected": -0.28092044591903687, "logps/chosen": -2233.59814453125, "logps/rejected": -3915.47900390625, "loss": 0.4691, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.0048208236694336, "rewards/margins": 0.6850495934486389, "rewards/rejected": -1.6898702383041382, "step": 410 }, { "epoch": 0.67, "learning_rate": 1.4695585767104092e-06, "logits/chosen": -0.005914182402193546, "logits/rejected": -0.24251346290111542, "logps/chosen": -1424.0343017578125, "logps/rejected": -3496.74755859375, "loss": 0.4581, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6118324398994446, "rewards/margins": 0.6595968008041382, "rewards/rejected": -1.2714293003082275, "step": 420 }, { "epoch": 0.69, "learning_rate": 1.3439071446815452e-06, "logits/chosen": -0.05694418027997017, "logits/rejected": -0.27310264110565186, "logps/chosen": -1811.8056640625, "logps/rejected": -3876.968017578125, "loss": 0.4613, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.7894518971443176, "rewards/margins": 0.9580751657485962, "rewards/rejected": -1.7475271224975586, "step": 430 }, { "epoch": 0.7, "learning_rate": 1.2218673716356919e-06, "logits/chosen": -0.04861157387495041, "logits/rejected": -0.22785380482673645, "logps/chosen": -2162.256103515625, "logps/rejected": -3135.13330078125, "loss": 0.4752, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.9659271240234375, "rewards/margins": 0.3405894339084625, "rewards/rejected": -1.3065165281295776, "step": 440 }, { "epoch": 0.72, "learning_rate": 1.103820512452661e-06, "logits/chosen": -0.0886421948671341, "logits/rejected": -0.24363021552562714, "logps/chosen": -2278.705078125, "logps/rejected": -3871.57861328125, "loss": 0.4697, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.0452690124511719, "rewards/margins": 0.5815836191177368, "rewards/rejected": -1.6268523931503296, "step": 450 }, { "epoch": 0.74, "learning_rate": 9.901353480633468e-07, "logits/chosen": -0.01507838536053896, "logits/rejected": -0.27143269777297974, "logps/chosen": -1932.046875, "logps/rejected": -3638.012939453125, "loss": 0.4705, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.8395353555679321, "rewards/margins": 0.6771036982536316, "rewards/rejected": -1.5166391134262085, "step": 460 }, { "epoch": 0.75, "learning_rate": 8.811670333701544e-07, "logits/chosen": -0.06362856924533844, "logits/rejected": -0.2936163544654846, "logps/chosen": -1988.303955078125, "logps/rejected": -4044.038330078125, "loss": 0.4641, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.8915748596191406, "rewards/margins": 0.8164836168289185, "rewards/rejected": -1.7080585956573486, "step": 470 }, { "epoch": 0.77, "learning_rate": 7.772559877354341e-07, "logits/chosen": -0.030277037993073463, "logits/rejected": -0.21407613158226013, "logps/chosen": -2318.84912109375, "logps/rejected": -4174.81298828125, "loss": 0.4745, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.9398199319839478, "rewards/margins": 0.7878143787384033, "rewards/rejected": -1.7276341915130615, "step": 480 }, { "epoch": 0.78, "learning_rate": 6.787268315040604e-07, "logits/chosen": -0.1001216396689415, "logits/rejected": -0.24970397353172302, "logps/chosen": -2327.34375, "logps/rejected": -3589.403564453125, "loss": 0.4775, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.9565193057060242, "rewards/margins": 0.5348533987998962, "rewards/rejected": -1.4913727045059204, "step": 490 }, { "epoch": 0.8, "learning_rate": 5.858873718824829e-07, "logits/chosen": -0.08827606588602066, "logits/rejected": -0.2271912395954132, "logps/chosen": -2329.033935546875, "logps/rejected": -3762.891357421875, "loss": 0.4701, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.9019953608512878, "rewards/margins": 0.6341419219970703, "rewards/rejected": -1.536137342453003, "step": 500 }, { "epoch": 0.82, "learning_rate": 4.990276413423817e-07, "logits/chosen": 0.006130737718194723, "logits/rejected": -0.2663702070713043, "logps/chosen": -1954.1243896484375, "logps/rejected": -4118.31689453125, "loss": 0.4608, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7334184050559998, "rewards/margins": 0.8962429761886597, "rewards/rejected": -1.6296613216400146, "step": 510 }, { "epoch": 0.83, "learning_rate": 4.184189915529796e-07, "logits/chosen": -0.022313248366117477, "logits/rejected": -0.2796049416065216, "logps/chosen": -2069.07861328125, "logps/rejected": -4346.296875, "loss": 0.4631, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.8130847215652466, "rewards/margins": 0.9093513488769531, "rewards/rejected": -1.7224359512329102, "step": 520 }, { "epoch": 0.85, "learning_rate": 3.4431324567258176e-07, "logits/chosen": -0.006228646729141474, "logits/rejected": -0.18093259632587433, "logps/chosen": -1853.189453125, "logps/rejected": -3369.416748046875, "loss": 0.4685, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.7921710014343262, "rewards/margins": 0.5718700885772705, "rewards/rejected": -1.3640410900115967, "step": 530 }, { "epoch": 0.86, "learning_rate": 2.769419116476052e-07, "logits/chosen": -0.0418720506131649, "logits/rejected": -0.30269795656204224, "logps/chosen": -2341.14501953125, "logps/rejected": -4083.86083984375, "loss": 0.4729, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.9471173286437988, "rewards/margins": 0.6616966128349304, "rewards/rejected": -1.6088138818740845, "step": 540 }, { "epoch": 0.88, "learning_rate": 2.1651545897676512e-07, "logits/chosen": -0.08359251916408539, "logits/rejected": -0.31541210412979126, "logps/chosen": -1795.650390625, "logps/rejected": -3922.41259765625, "loss": 0.4713, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.6783340573310852, "rewards/margins": 0.7878134846687317, "rewards/rejected": -1.4661474227905273, "step": 550 }, { "epoch": 0.9, "learning_rate": 1.6322266119983222e-07, "logits/chosen": -0.08543523401021957, "logits/rejected": -0.22058483958244324, "logps/chosen": -2356.09619140625, "logps/rejected": -2975.605712890625, "loss": 0.4693, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.9343336224555969, "rewards/margins": 0.29671743512153625, "rewards/rejected": -1.231050968170166, "step": 560 }, { "epoch": 0.91, "learning_rate": 1.1723000616502167e-07, "logits/chosen": -0.07000622898340225, "logits/rejected": -0.2409258633852005, "logps/chosen": -2480.648681640625, "logps/rejected": -3360.589111328125, "loss": 0.4772, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.0692129135131836, "rewards/margins": 0.2927553355693817, "rewards/rejected": -1.3619682788848877, "step": 570 }, { "epoch": 0.93, "learning_rate": 7.868117591737585e-08, "logits/chosen": -0.02028440684080124, "logits/rejected": -0.17536571621894836, "logps/chosen": -2015.087646484375, "logps/rejected": -3354.944580078125, "loss": 0.4748, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7683865427970886, "rewards/margins": 0.602486252784729, "rewards/rejected": -1.3708727359771729, "step": 580 }, { "epoch": 0.94, "learning_rate": 4.769659783295383e-08, "logits/chosen": -0.12467147409915924, "logits/rejected": -0.2483750879764557, "logps/chosen": -2129.997314453125, "logps/rejected": -3120.9951171875, "loss": 0.4782, "rewards/accuracies": 0.5, "rewards/chosen": -0.9002014398574829, "rewards/margins": 0.3127484619617462, "rewards/rejected": -1.2129498720169067, "step": 590 }, { "epoch": 0.96, "learning_rate": 2.4373068401120358e-08, "logits/chosen": -0.02018025331199169, "logits/rejected": -0.2830565869808197, "logps/chosen": -1880.4671630859375, "logps/rejected": -3650.389404296875, "loss": 0.4589, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7423778772354126, "rewards/margins": 0.7750624418258667, "rewards/rejected": -1.5174401998519897, "step": 600 }, { "epoch": 0.98, "learning_rate": 8.78345083022425e-09, "logits/chosen": -0.017909971997141838, "logits/rejected": -0.2379368245601654, "logps/chosen": -2240.98193359375, "logps/rejected": -3461.29345703125, "loss": 0.467, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.001007080078125, "rewards/margins": 0.4783032536506653, "rewards/rejected": -1.479310154914856, "step": 610 }, { "epoch": 0.99, "learning_rate": 9.764474213677654e-10, "logits/chosen": -0.011101929470896721, "logits/rejected": -0.18822148442268372, "logps/chosen": -1725.9417724609375, "logps/rejected": -3190.5283203125, "loss": 0.4695, "rewards/accuracies": 0.5, "rewards/chosen": -0.661989688873291, "rewards/margins": 0.6126888394355774, "rewards/rejected": -1.2746784687042236, "step": 620 }, { "epoch": 1.0, "step": 625, "total_flos": 0.0, "train_loss": 0.4739649757385254, "train_runtime": 8016.6369, "train_samples_per_second": 2.495, "train_steps_per_second": 0.078 } ], "logging_steps": 10, "max_steps": 625, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }